def __init__(self):
    """Initialise the notification plugin manager.

    Loads the notification plugins and saves the names of all registered
    plugins as the available notification media.  Any of the handled
    errors is published as a debug ``ExceptionMessage`` and re-raised.
    """
    super(NotificationPluginManager, self).__init__()
    self.daemon = True
    try:
        self.load_plugins()
        self.complete = threading.Event()
        media_names = [
            plugin.name for plugin in NotificationPlugin.plugins
        ]
        NS.notifier.objects.NotificationMedia(media=media_names).save()
    except (AttributeError, SyntaxError, ValueError, KeyError,
            ImportError, etcd.EtcdException) as ex:
        failure = {
            "message": 'Failed to intialize notification manager',
            "exception": ex
        }
        Event(
            ExceptionMessage(
                priority="debug",
                publisher="notifier",
                payload=failure
            )
        )
        raise ex
def _run(self):
    """Periodically compute per-cluster summaries and the system summary.

    Runs until ``self._complete`` is set.  On every pass each known
    cluster is parsed and its summary saved; clusters whose data is
    missing are skipped silently, other handled errors are published as
    debug events.  The collected summaries are then handed to the sds
    monitoring manager and the loop sleeps for 60 seconds.
    """
    while not self._complete.is_set():
        summaries = []
        for cluster_id in central_store_util.get_cluster_ids():
            # Yield briefly between clusters.
            gevent.sleep(0.1)
            try:
                summary = self.parse_cluster(cluster_id)
                summaries.append(summary.copy())
                summary.save(update=False)
            except EtcdKeyNotFound:
                continue
            except (EtcdException, AttributeError) as ex:
                failure = {
                    "message": 'Error caught computing summary.',
                    "exception": ex
                }
                Event(
                    ExceptionMessage(
                        priority="debug",
                        publisher=NS.publisher_id,
                        payload=failure
                    )
                )
        NS.sds_monitoring_manager.compute_system_summary(summaries)
        gevent.sleep(60)
def init_monitoring(self):
    """Initialise monitoring on every node not initialised so far.

    Nodes whose id is already listed in
    ``self.monitoring_config_init_nodes`` are skipped; the others are
    initialised and then recorded in that list.  A
    ``TendrlPerformanceMonitoringException`` is published as a debug
    event and re-raised.
    """
    try:
        for node_det in central_store_util.get_nodes_details():
            already_done = (
                node_det['node_id'] in self.monitoring_config_init_nodes
            )
            if already_done:
                continue
            self.init_monitoring_on_node(node_det)
            self.monitoring_config_init_nodes.append(node_det['node_id'])
    except TendrlPerformanceMonitoringException as ex:
        failure = {
            "message": 'Failed to intialize monitoring '
                       'configuration on nodes. ',
            "exception": ex
        }
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload=failure
            )
        )
        raise ex
def osds_by_pool(self):
    """Map every pool to the OSDs that may hold its data.

    For each pool, the CRUSH rules of the pool's ruleset are scanned and
    the last rule whose ``min_size``/``max_size`` range contains the pool
    size selects the OSD set.  If no rule matches, an error event is
    published and all known OSDs are reported for that pool.

    :return dict of pool ID to OSD IDs in the pool
    """
    pool_osds = {}
    for pool_id, pool in self.pools_by_id.items():
        matched = None
        for rule in self.data['crush']['rules']:
            if rule['ruleset'] != pool['crush_ruleset']:
                continue
            if rule['min_size'] <= pool['size'] <= rule['max_size']:
                matched = self.osds_by_rule_id[rule['rule_id']]

        if matched is None:
            # Fallthrough: the pool size is outside every rule of its
            # ruleset.  Report all OSDs rather than failing.
            Event(
                Message(
                    priority="error",
                    publisher=NS.publisher_id,
                    payload={
                        "message": "Cannot determine OSDS for pool %s" %
                                   pool_id
                    }
                )
            )
            matched = self.osds_by_id.keys()

        pool_osds[pool_id] = matched
    return pool_osds
def ceph_create_ssh_setup_jobs(parameters):
    """Queue one ``tendrl.flows.SetupSsh`` job per remote node.

    A job is created for every entry of ``parameters['Node[]']`` except
    the current node.  Each job receives a copy of ``parameters``
    narrowed to its node plus the ssh setup script obtained from the
    ceph provisioner plugin.

    :param parameters: flow parameters; reads ``Node[]``, ``job_id`` and
        ``flow_id``
    :return list of the ids of the jobs that were created
    """
    created_job_ids = []
    setup_script = NS.ceph_provisioner.get_plugin().setup()

    for node in parameters['Node[]']:
        # The current node does not get a job of its own.
        if node == NS.node_context.node_id:
            continue

        node_params = parameters.copy()
        node_params['Node[]'] = [node]
        node_params['ssh_setup_script'] = setup_script

        job_payload = {
            "tags": ["tendrl/node_%s" % node],
            "run": "tendrl.flows.SetupSsh",
            "status": "new",
            "parameters": node_params,
            "parent": parameters['job_id'],
            "type": "node"
        }
        new_job_id = str(uuid.uuid4())
        Job(job_id=new_job_id, status="new", payload=job_payload).save()
        created_job_ids.append(new_job_id)

        Event(
            Message(
                job_id=parameters['job_id'],
                flow_id=parameters['flow_id'],
                priority="info",
                publisher=NS.publisher_id,
                payload={
                    "message": "Created SSH setup job %s for node %s" %
                               (new_job_id, node)
                }
            )
        )
    return created_job_ids
def log(log_priority, publisher_id, log_payload, job_id=None,
        flow_id=None, parent_id=None, cluster_id=None):
    """Function used for logging errors/output/info.

    Publishes a ``Message`` event annotated with the caller's file name,
    line number and function.  If publishing fails for any reason the
    text is written to stderr (priority "error") or stdout (any other
    priority) instead, so that logging never raises because of the
    event machinery.

    Args:
        log_priority [Type : String]: Priority of the Log Message
            (error/info)
        publisher_id: Identifier of the publisher (mandatory)
        log_payload [Type: Dict] : Payload can contain
            parameters like message that is to be logged
        job_id: Optional id of the job the message belongs to
        flow_id: Optional id of the flow the message belongs to
        parent_id: Optional id of the parent job
        cluster_id: Optional id of the cluster the message belongs to
    """
    frame_info = getframeinfo(stack()[1][0])
    caller_details = {
        "filename": frame_info.filename,
        "line_no": frame_info.lineno,
        "function": frame_info.function
    }
    try:
        Event(
            Message(log_priority, publisher_id, log_payload, job_id,
                    flow_id, parent_id, cluster_id,
                    caller=caller_details))
    except Exception:
        # Deliberate best-effort fallback: never let logging itself fail.
        text = log_payload.get("message")
        if text is None:
            # No "message" entry: dump the payload rather than passing
            # None to write(), which would raise TypeError.
            text = log_payload
        if log_priority.lower() == "error":
            stream = sys.stderr
        else:
            stream = sys.stdout
        stream.write("%s" % (text,))
def run(self):
    """Wait until the requested volume shows up in the central store.

    Reads ``clusters/<integration_id>/Volumes`` once per second, up to
    600 times, and returns as soon as a stored volume carries the name
    given in ``self.parameters['Volume.volname']``.

    :return True when the volume is found
    :raises AtomExecutionFailedError: if the volume is not found after
        600 attempts
    """
    for _attempt in range(600):
        volumes = None
        try:
            volumes = NS._int.client.read(
                "clusters/%s/Volumes" % NS.tendrl_context.integration_id
            )
        except etcd.EtcdKeyNotFound:
            # ignore as no volumes available till now
            pass

        if volumes:
            for entry in volumes.leaves:
                stored_id = entry.key.split("Volumes/")[-1]
                stored_volume = Volume(vol_id=stored_id).load()
                if stored_volume.name == self.parameters['Volume.volname']:
                    return True

        time.sleep(1)

    # All attempts used up: report the timeout and fail the atom.
    Event(
        Message(
            priority="error",
            publisher=NS.publisher_id,
            payload={
                "message": "Volume %s not reflected in tendrl yet. "
                           "Timing out" %
                           self.parameters['Volume.volname']
            },
            job_id=self.parameters['job_id'],
            flow_id=self.parameters['flow_id'],
            cluster_id=NS.tendrl_context.integration_id
        )
    )
    raise AtomExecutionFailedError(
        "Volume %s not reflected in tendrl yet. Timing out" %
        self.parameters['Volume.volname']
    )
def __init__(self):
    """Initialise the alerting notification plugin manager.

    Loads the notification plugins, publishes their names on
    ``NS.notification_medium``, saves them as ``NotificationMedia`` and
    saves the alert notification configuration.  Any handled error is
    published as an error ``ExceptionMessage`` and converted into an
    ``AlertingError``.
    """
    super(NotificationPluginManager, self).__init__()
    try:
        self.load_plugins()
        media_names = [
            plugin.name for plugin in NotificationPlugin.plugins
        ]
        NS.notification_medium = media_names
        NotificationMedia(media=media_names).save()
        self.save_alertnotificationconfig()
    except (SyntaxError, ValueError, KeyError, etcd.EtcdKeyNotFound,
            etcd.EtcdConnectionFailed, etcd.EtcdException,
            NotificationPluginError) as ex:
        failure = {
            "message": 'Failed to intialize notification manager',
            "exception": ex
        }
        Event(
            ExceptionMessage(
                priority="error",
                publisher="alerting",
                payload=failure
            )
        )
        raise AlertingError(str(ex))
def _application(self, env, start_response): try: if env['PATH_INFO'] != '/grafana_callback': start_response('404 Not Found', [('Content-Type', 'text/html')]) response = [b'<h1>Alert Not Found</h1>'] else: data = env['wsgi.input'].read() data = json.loads(data) self.alert_handler.handle_alert(data["ruleId"]) start_response('200 OK', [('Content-Type', 'text/html')]) response = [b'<h1>Alert Received</h1>'] except (IOError, AssertionError) as ex: Event( ExceptionMessage(priority="error", publisher=NS.publisher_id, payload={ "message": "Unable to read alert from socket", "exception": ex })) response = [b'<h1>Error in reading alert from socket</h1>'] return response
def _create_node_id(self):
    """Create, register and persist a new node id for this machine.

    A fresh UUID is written to the central store under
    ``/indexes/machine_id/<machine_id>`` (the write is made with
    ``prevExist=False``), stored locally in ``/var/lib/tendrl/node_id``
    and cached in the module-level ``NODE_ID``.

    :return the new node id as a string
    """
    global NODE_ID

    node_id = str(uuid.uuid4())
    index_key = "/indexes/machine_id/%s" % self.machine_id
    NS._int.wclient.write(index_key, node_id, prevExist=False)
    try:
        Event(
            Message(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": "Registered Node (%s) with "
                               "machine_id==%s" %
                               (node_id, self.machine_id)
                }
            )
        )
    except KeyError:
        # Fall back to stdout when the event cannot be published.
        sys.stdout.write("message: Registered Node (%s) with "
                         "machine_id==%s" % (node_id, self.machine_id))

    local_node_id = "/var/lib/tendrl/node_id"
    if not os.path.exists(os.path.dirname(local_node_id)):
        os.makedirs(os.path.dirname(local_node_id))
    # node_id is text, so the file is opened in text mode; writing a str
    # to a binary-mode file raises TypeError on Python 3.
    with open(local_node_id, 'w') as f:
        f.write(node_id)

    NODE_ID = node_id
    return node_id
def run(self):
    """Wait until the cluster is reported as managed.

    Loads the cluster identified by
    ``self.parameters['TendrlContext.integration_id']`` once per second,
    up to 600 times, and returns as soon as its ``is_managed`` attribute
    equals ``"yes"``.

    :return True when the cluster is managed
    :raises AtomExecutionFailedError: if that does not happen within 600
        attempts
    """
    for _attempt in range(600):
        cluster = None
        try:
            cluster = NS.tendrl.objects.Cluster(
                integration_id=self.parameters[
                    "TendrlContext.integration_id"
                ]
            ).load()
        except etcd.EtcdKeyNotFound:
            # Not there yet; keep waiting until the attempts run out.
            pass

        if cluster and cluster.is_managed == "yes":
            return True

        time.sleep(1)

    Event(
        Message(
            priority="error",
            publisher=NS.publisher_id,
            payload={
                "message": "Cluster data sync still incomplete. "
                           "Timing out"
            },
            job_id=self.parameters['job_id'],
            flow_id=self.parameters['flow_id'],
            cluster_id=NS.tendrl_context.integration_id,
        )
    )
    raise AtomExecutionFailedError(
        "Cluster data sync still incomplete. Timing out"
    )
def complete_jid(self, result):
    """Record the outcome of a finished remote execution.

    Implementations must always update .jid appropriately here: either
    to the jid of a new job, or to None.

    :param result: result of the remote execution; stored on
        ``self.result``
    """
    self.result = result
    summary = "Request %s JID %s completed with result=%s" % (
        self.id, self.jid, self.result)
    Event(
        Message(
            priority="info",
            publisher=NS.publisher_id,
            payload={"message": summary}
        )
    )
    self.jid = None

    # Default behaviour for requests that do not override this method:
    # the completion of a JID means the whole request is done.
    self.complete()
def on_map(self, sync_type, sync_object):
    """Advance the PG creation request when a new cluster map arrives.

    In phase ``PG_MAP_WAIT`` a ``PgSummary`` is used to count the PGs of
    the pool that have left state 'creating' and, once the current goal
    is reached, to either finish, issue the post-create commands or
    request the next batch of PGs; an ``OsdMap`` is used to detect an
    unexpected change of ``pg_num``.  In phase ``OSD_MAP_WAIT`` the OSD
    map is used to verify that ``pg_num`` really changed and to decide
    whether the request is complete.

    :param sync_type: type of the received map
    :param sync_object: the received map
    :raises NotImplementedError: for a map type that is not handled in
        phase ``PG_MAP_WAIT`` or for an unknown phase
    """
    def _debug(text):
        # Every progress report of this handler is a debug event.
        Event(
            Message(
                priority="debug",
                publisher=NS.publisher_id,
                payload={"message": text}
            )
        )

    _debug("PgCreatingRequest %s %s" % (sync_type.str, self._phase))

    if self._phase == self.PG_MAP_WAIT:
        if sync_type == PgSummary:
            # Count the PGs in this pool which are not in state 'creating'
            pg_summary = sync_object
            pgs_not_creating = 0
            pool_states = pg_summary.data['by_pool'][self._pool_id]
            for state_tuple, count in pool_states.items():
                states = state_tuple.split("+")
                if 'creating' not in states:
                    pgs_not_creating += count
            self._pg_progress.set_created_pg_count(pgs_not_creating)
            _debug(
                "PgCreatingRequest.on_map: "
                "pg_counter=%s/%s (final %s)" % (
                    pgs_not_creating,
                    self._pg_progress.goal,
                    self._pg_progress.final
                )
            )

            if pgs_not_creating >= self._pg_progress.goal:
                if self._pg_progress.is_final_block():
                    _debug("PgCreatingRequest.on_map Creations complete")
                    if self._post_create_commands:
                        _debug(
                            "PgCreatingRequest.on_map Issuing "
                            "post-create commands"
                        )
                        self._submit(self._post_create_commands)
                        self._phase = self.JID_WAIT
                    else:
                        _debug("PgCreatingRequest.on_map All done")
                        self.complete()
                else:
                    _debug(
                        "PgCreatingREQUEST.on_map Issuing more creates"
                    )
                    self._pg_progress.advance_goal()
                    # Request another tranche of PGs up to _block_size
                    self._submit([('osd pool set', {
                        'pool': self._pool_name,
                        'var': 'pg_num',
                        'val': self._pg_progress.goal
                    })])
                    self._phase = self.JID_WAIT
        elif sync_type == OsdMap:
            # Keep an eye on the OsdMap to check that pg_num is what we
            # expect: if something else changed pg_num, the PG creation
            # check could get confused and never complete.
            osd_map = sync_object
            pool = osd_map.pools_by_id[self._pool_id]
            if pool['pg_num'] != self._pg_progress.expected_count():
                self.set_error(
                    "PG creation interrupted (unexpected change to pg_num)"
                )
                self.complete()
                return
        else:
            # The placeholders used to be {1} and {2}; with only two
            # arguments that raised IndexError instead of this error.
            raise NotImplementedError(
                "Unexpected map {0} in state {1}".format(
                    sync_type, self._phase))
    elif self._phase == self.OSD_MAP_WAIT:
        # Read back the pg_num for my pool from the OSD map
        osd_map = sync_object
        pool = osd_map.pools_by_id[self._pool_id]

        # In Ceph <= 0.67.7, "osd pool set pg_num" will return
        # success even if it hasn't really increased pg_num,
        # so we must examine the OSD map to see if it really succeeded
        if pool['pg_num'] != self._pg_progress.expected_count():
            self.set_error("PG creation failed (check that there"
                           " aren't already PGs in 'creating' state)")
            self.complete()
            return

        assert self._await_version
        ready = osd_map.version >= self._await_version
        if ready:
            # OSD map advancement either means a PG creation round
            # completed, or that the post_create_commands completed.
            # Distinguish by looking at pg_progress.
            if self._pg_progress.is_complete():
                # This was the OSD map update from the
                # post_create_commands, so we're all done!
                self.complete()
            else:
                # This was the OSD map update from a PG creation command,
                # so start waiting for the pgs
                self._phase = self.PG_MAP_WAIT
    else:
        raise NotImplementedError("Unexpected {0} in phase {1}".format(
            sync_type, self._phase))
def _get_utilization_data(self):
    """Run the ceph ``df`` command and parse cluster and pool usage.

    :return dict ``{'cluster': {...}, 'pools': {...}}``.  The cluster
        entry holds ``total``, ``used``, ``available`` and ``pcnt_used``;
        the pools entry maps each pool name to ``available``, ``used``
        and ``pcnt_used``.  When the pool section cannot be parsed the
        cluster data is returned with an empty pools dict.
    :raises rados.Error: if the command fails or the cluster section
        cannot be parsed
    """
    from ceph_argparse import json_command
    import rados

    def _debug(text):
        Event(
            Message(
                priority="debug",
                publisher=NS.publisher_id,
                payload={"message": text}
            )
        )

    _conf_file = os.path.join("/etc/ceph",
                              NS.tendrl_context.cluster_name + ".conf")
    # TODO(shtripat) use ceph.ceph_command instead of rados/json_command
    cluster_handle = rados.Rados(
        name=ceph.RADOS_NAME,
        clustername=NS.tendrl_context.cluster_name,
        conffile=_conf_file)
    cluster_handle.connect()
    try:
        ret, outbuf, outs = json_command(cluster_handle,
                                         prefix='df',
                                         argdict={},
                                         timeout=ceph.RADOS_TIMEOUT)
    finally:
        # Release the connection even when the command itself raises.
        cluster_handle.shutdown()
    if ret != 0:
        raise rados.Error(outs)

    # Join the two-word column titles so that str.split() keeps them as
    # one field each.  Replacing 'RAW USED' also covers '%RAW USED'.
    outbuf = outbuf.replace('RAW USED', 'RAW_USED')
    outbuf = outbuf.replace('MAX AVAIL', 'MAX_AVAIL')
    lines = outbuf.split('\n')

    index = 0
    cluster_stat = {}
    pool_stat = {}
    pool_stat_available = False
    while index < len(lines):
        line = lines[index]
        if line == "" or line == '\n':
            index += 1
            continue

        if "GLOBAL" in line:
            # The next line names the columns, the one after holds the
            # values.
            index += 1
            if len(lines) < 3 or index >= len(lines):
                raise rados.Error("Failed to parse pool stats data")
            cluster_fields = lines[index].split()
            cluster_size_idx = self._idx_in_list(cluster_fields, 'SIZE')
            cluster_avail_idx = self._idx_in_list(cluster_fields, 'AVAIL')
            cluster_used_idx = self._idx_in_list(
                cluster_fields, 'RAW_USED')
            cluster_pcnt_used_idx = self._idx_in_list(
                cluster_fields, '%RAW_USED')
            if cluster_size_idx == -1 or cluster_avail_idx == -1 or \
                    cluster_used_idx == -1 or cluster_pcnt_used_idx == -1:
                raise rados.Error("Missing fields in cluster stat")
            index += 1
            if index >= len(lines):
                _debug("No cluster stats to parse")
                return {'cluster': cluster_stat, 'pools': {}}
            line = lines[index]
            cluster_fields = line.split()
            if len(cluster_fields) < 4:
                _debug("Missing fields in cluster stat")
                return {'cluster': cluster_stat, 'pools': {}}
            cluster_stat['total'] = self._to_bytes(
                cluster_fields[cluster_size_idx])
            cluster_stat['used'] = self._to_bytes(
                cluster_fields[cluster_used_idx])
            cluster_stat['available'] = self._to_bytes(
                cluster_fields[cluster_avail_idx])
            cluster_stat['pcnt_used'] = cluster_fields[
                cluster_pcnt_used_idx]

        if "POOLS" in line:
            pool_stat_available = True
            index += 1
            if index >= len(lines):
                _debug("No pool stats to parse")
                return {'cluster': cluster_stat, 'pools': {}}
            pool_fields = lines[index].split()
            pool_name_idx = self._idx_in_list(pool_fields, 'NAME')
            pool_id_idx = self._idx_in_list(pool_fields, 'ID')
            pool_used_idx = self._idx_in_list(pool_fields, 'USED')
            pool_pcnt_used_idx = self._idx_in_list(pool_fields, '%USED')
            pool_max_avail_idx = self._idx_in_list(
                pool_fields, 'MAX_AVAIL')
            if pool_name_idx == -1 or pool_id_idx == -1 or \
                    pool_used_idx == -1 or pool_pcnt_used_idx == -1 or \
                    pool_max_avail_idx == -1:
                _debug("Missing fields in pool stat")
                return {'cluster': cluster_stat, 'pools': {}}
            index += 1

        if pool_stat_available is True:
            if index >= len(lines):
                # The column header was the last line: no pool rows.
                break
            line = lines[index]
            pool_fields = line.split()
            if len(pool_fields) < 5:
                _debug("Missing fields in pool stat")
                return {'cluster': cluster_stat, 'pools': {}}
            loc_dict = {}
            loc_dict['available'] = self._to_bytes(
                pool_fields[pool_max_avail_idx])
            loc_dict['used'] = self._to_bytes(pool_fields[pool_used_idx])
            loc_dict['pcnt_used'] = pool_fields[pool_pcnt_used_idx]
            pool_stat[pool_fields[pool_name_idx]] = loc_dict
        index += 1

    return {'cluster': cluster_stat, 'pools': pool_stat}
def on_sync_object(self, data):
    """Store a sync object received from the cluster.

    The object is injected into the local state and forwarded to the
    pending requests.  If it was accepted as new, it is saved as a
    ``SyncObject``; a "health" object additionally updates
    ``GlobalDetails`` and an "osd_map" object refreshes the stored pools
    and OSDs, deleting the stored ones that are no longer part of the
    map.  A stale object only produces a debug event.

    :param data: dict with the keys ``fsid``, ``type``, ``version`` and
        ``data``
    """
    assert data['fsid'] == self.fsid

    sync_object = copy.deepcopy(data['data'])
    sync_type = SYNC_OBJECT_STR_TYPE[data['type']]
    new_object = self.inject_sync_object(data['type'],
                                         data['version'],
                                         sync_object)
    self._request_coll.on_map(sync_type, new_object)

    if not new_object:
        Event(
            Message(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": "ClusterMonitor.on_sync_object: "
                               "stale object received for %s" %
                               data['type']
                }
            )
        )
        return

    # Check and raise any alerts if required
    # TODO(team) Enable the conditions below when alerting is needed
    # for cluster health, mon status, pool status etc
    # if sync_type.str == "health":
    #     self._on_health(sync_object)
    # if sync_type.str == "mon_status":
    #     self._on_mon_status(sync_object)
    if sync_type.str == "osd_map":
        # self._on_pool_status(sync_object)
        self._on_osd_map(sync_object)

    NS.ceph.objects.SyncObject(
        updated=now(),
        sync_type=sync_type.str,
        version=new_object.version if isinstance(
            new_object.version, int) else None,
        when=now(),
        data=data['data']).save(update=False)

    if sync_type.str == "health":
        NS.ceph.objects.GlobalDetails(
            status=sync_object['overall_status']).save()

    if sync_type.str != "osd_map":
        return

    integration_id = NS.tendrl_context.integration_id

    # Pool out of band deletion handling
    try:
        pools = NS._int.client.read("clusters/%s/Pools" % integration_id)
        old_pool_ids = set(
            int(pool.key.split("/")[-1]) for pool in pools.leaves
        )
        new_pool_ids = set(
            raw_pool['pool'] for raw_pool in sync_object.get('pools', [])
        )
        for stale_pool_id in old_pool_ids - new_pool_ids:
            NS._int.client.delete(
                "clusters/%s/Pools/%s" % (integration_id, stale_pool_id),
                recursive=True)
    except etcd.EtcdKeyNotFound as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    # The literal used to be split with a backslash
                    # inside the string, which is not a clean message.
                    "message": "No pools found for ceph cluster %s" %
                               integration_id,
                    "exception": ex
                }
            )
        )

    for raw_pool in sync_object.get('pools', []):
        Event(
            Message(
                priority="info",
                publisher=NS.publisher_id,
                payload={
                    "message": "Updating Pool %s" % raw_pool['pool_name']
                }
            )
        )
        pool_type = 'replicated'
        if 'erasure_code_profile' in raw_pool and \
                raw_pool['erasure_code_profile'] != "":
            pool_type = 'erasure_coded'
        quota_enabled = False
        if ('quota_max_objects' in raw_pool and
                raw_pool['quota_max_objects'] > 0) or \
                ('quota_max_bytes' in raw_pool and
                 raw_pool['quota_max_bytes'] > 0):
            quota_enabled = True
        NS.ceph.objects.Pool(
            pool_id=raw_pool['pool'],
            pool_name=raw_pool['pool_name'],
            pg_num=raw_pool['pg_num'],
            type=pool_type,
            erasure_code_profile=raw_pool.get('erasure_code_profile'),
            min_size=raw_pool['min_size'],
            size=raw_pool.get('size', None),
            quota_enabled=quota_enabled,
            quota_max_objects=raw_pool['quota_max_objects'],
            quota_max_bytes=raw_pool['quota_max_bytes'],
        ).save()

    # Osd out of band deletion handling
    try:
        osds = NS._int.client.read("clusters/%s/Osds" % integration_id)
        old_osds = set(
            str(osd.key.split("/")[-1]) for osd in osds.leaves
        )
        new_osds = set(
            raw_osd['uuid'] for raw_osd in sync_object.get('osds', [])
        )
        for stale_osd in old_osds - new_osds:
            NS._int.client.delete(
                "clusters/%s/Osds/%s" % (integration_id, stale_osd),
                recursive=True)
    except etcd.EtcdKeyNotFound as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": "key not found in etcd",
                    "exception": ex
                }
            )
        )

    for raw_osd in sync_object.get('osds', []):
        Event(
            Message(
                priority="info",
                publisher=NS.publisher_id,
                payload={"message": "Updating OSD %s" % raw_osd['osd']}
            )
        )
        # NOTE(review): gethostbyaddr raises socket.herror when the
        # address has no reverse DNS entry, which aborts the remaining
        # OSD updates - confirm whether a fallback is wanted here.
        osd_host = socket.gethostbyaddr(
            raw_osd['public_addr'].split(':')[0])[0]
        NS.ceph.objects.Osd(
            id=raw_osd['osd'],
            uuid=raw_osd['uuid'],
            hostname=osd_host,
            public_addr=raw_osd['public_addr'],
            cluster_addr=raw_osd['cluster_addr'],
            heartbeat_front_addr=raw_osd['heartbeat_front_addr'],
            heartbeat_back_addr=raw_osd['heartbeat_back_addr'],
            down_at=raw_osd['down_at'],
            up_from=raw_osd['up_from'],
            lost_at=raw_osd['lost_at'],
            osd_up=raw_osd['up'],
            osd_in=raw_osd['in'],
            up_thru=raw_osd['up_thru'],
            weight=str(raw_osd['weight']),
            primary_affinity=str(raw_osd['primary_affinity']),
            state=raw_osd['state'],
            last_clean_begin=raw_osd['last_clean_begin'],
            last_clean_end=raw_osd['last_clean_end']).save()
def get_node_network():
    """Collect details of the node's network interfaces.

    Parses the output of ``hwinfo --network`` and merges the hardware
    details of every device into the matching entry returned by
    ``get_node_interface()``.  Devices without a matching entry are
    left out.  If the command reports an error (other than the
    tolerated "vdsmdummy: command not found"), the error is published
    as a debug event and an empty list is returned.

    return [{"ipv4": ["ipv4address", ...],
             "ipv6": ["ipv6address, ..."],
             "netmask": ["subnet", ...],
             "subnet": "subnet",
             "status":"up/down",
             "interface_id": "",
             "sysfs_id": "",
             "device_link": "",
             "interface_type": "",
             "model": "",
             "driver_modules": "",
             "drive": "",
             "hw_address": "",
             "link_detected": ""
             }, ...
            ]
    """
    # (marker searched in the line, target key, strip double quotes).
    # The order matters: the first marker found in a line wins, so
    # "Driver Modules" has to be listed before "Driver".
    field_markers = (
        ("Unique ID", 'interface_id', False),
        ("SysFS ID", 'sysfs_id', False),
        ("SysFS Device Link", 'device_link', False),
        ("Hardware Class", 'interface_type', False),
        ("Model", 'model', True),
        ("Driver Modules", 'driver_modules', True),
        ("Driver", 'driver', True),
        ("Device File", 'interface', False),
        ("HW Address", 'hw_address', False),
        ("Link detected", 'link_detected', False),
    )

    rv = []
    network_interfaces = get_node_interface()
    cmd = cmd_utils.Command('hwinfo --network')
    out, err, rc = cmd.run()
    if err and "vdsmdummy: command not found" not in err:
        Event(
            Message(
                priority="debug",
                publisher=NS.publisher_id,
                payload={"message": err}
            )
        )
        return rv

    for interface in out.split('\n\n'):
        # NOTE(review): the defaults carry a "drive" key while the
        # parser fills "driver"; left unchanged to keep the schema.
        devlist = {
            "interface_id": "",
            "sysfs_id": "",
            "device_link": "",
            "interface_type": "",
            "model": "",
            "driver_modules": "",
            "drive": "",
            "interface": "",
            "hw_address": "",
            "link_detected": ""
        }
        for line in interface.split('\n'):
            for marker, key, strip_quotes in field_markers:
                if marker not in line:
                    continue
                # Everything after the FIRST colon is the value, so
                # values that contain ':' themselves (e.g. a MAC
                # address) are kept whole, and a line without any
                # colon yields "" instead of raising IndexError.
                value = line.partition(':')[2].lstrip()
                if strip_quotes:
                    value = value.replace('"', "")
                devlist[key] = value
                break

        interface_name = devlist["interface"]
        if interface_name in network_interfaces:
            network_interfaces[interface_name].update(devlist)
            rv.append(network_interfaces[interface_name])
    return rv
def process_job(job):
    """Pick up a queued job and run its flow on this node.

    The job is ignored when it is already locked, finished or being
    processed.  On nodes tagged ``tendrl/monitor``, jobs with timeout
    "yes" get a ``valid_until`` deadline ten minutes ahead on first
    sight and are marked "failed" once that deadline has passed while
    they are still "new".  A "new" job of this node's type whose tags
    match the node's tags is locked, its flow is run and the job is
    marked "finished" or, on any error, "failed" with the traceback
    stored in ``job.errors``.

    :param job: queue entry; only ``job.key`` is read to get the job id
    :raises FlowExecutionFailedError: if the final status of the job
        cannot be written because its current status is unexpected
    """
    jid = job.key.split('/')[-1]
    job_status_key = "/queue/%s/status" % jid
    job_lock_key = "/queue/%s/locked_by" % jid
    NS.node_context = NS.node_context.load()

    # Check job not already locked by some agent
    try:
        _locked_by = etcd_utils.read(job_lock_key).value
        if _locked_by:
            return
    except etcd.EtcdKeyNotFound:
        pass

    # Check job not already "finished", or "processing"
    try:
        _status = etcd_utils.read(job_status_key).value
        if _status in ["finished", "processing"]:
            return
    except etcd.EtcdKeyNotFound:
        pass

    _timeout = None
    try:
        _job_timeout_key = "/queue/%s/timeout" % jid
        _timeout = etcd_utils.read(_job_timeout_key).value
        if _timeout:
            _timeout = _timeout.lower()
    except etcd.EtcdKeyNotFound:
        pass

    # tendrl-node-agent tagged as tendrl/monitor will ensure
    # >10 min old "new" jobs are timed out and marked as
    # "failed" (the parent job of these jobs will also be
    # marked as "failed")
    if "tendrl/monitor" in NS.node_context.tags and \
            _timeout == "yes":
        _job_valid_until_key = "/queue/%s/valid_until" % jid
        _valid_until = None
        try:
            _valid_until = etcd_utils.read(_job_valid_until_key).value
        except etcd.EtcdKeyNotFound:
            pass

        _epoch_start = datetime.datetime(1970, 1, 1).replace(tzinfo=utc)
        if _valid_until:
            _now_epoch = (time_utils.now() - _epoch_start).total_seconds()
            if int(_now_epoch) >= int(_valid_until):
                # Job has "new" status since 10 minutes,
                # mark status as "failed" and Job.error =
                # "Timed out"
                try:
                    etcd_utils.write(job_status_key,
                                     "failed",
                                     prevValue="new")
                except etcd.EtcdCompareFailed:
                    pass
                else:
                    job = NS.tendrl.objects.Job(job_id=jid).load()
                    _msg = str("Timed-out (>10min as 'new')")
                    job.errors = _msg
                    job.save()
                    if job.payload.get('parent') is None:
                        alert_utils.alert_job_status(
                            "failed",
                            "Job timed out (job_id: %s)" % jid,
                            integration_id=NS.tendrl_context.
                            integration_id or
                            job.payload['parameters'].get(
                                'TendrlContext.integration_id'
                            ),
                            cluster_name=NS.tendrl_context.
                            cluster_name or
                            job.payload['parameters'].get(
                                'TendrlContext.cluster_name'
                            )
                        )
                    return
        else:
            # First sighting of the job: give it ten minutes.
            _now_plus_10 = time_utils.now() + \
                datetime.timedelta(minutes=10)
            # noinspection PyTypeChecker
            _now_plus_10_epoch = (_now_plus_10 -
                                  _epoch_start).total_seconds()
            etcd_utils.write(_job_valid_until_key,
                             int(_now_plus_10_epoch))

    job = NS.tendrl.objects.Job(job_id=jid).load()
    if not (job.payload["type"] == NS.type and job.status == "new"):
        return

    # Job routing
    # Flows created by tendrl-api use 'tags' from flow
    # definition to target jobs
    _tag_match = False
    if job.payload.get("tags", []):
        for flow_tag in job.payload['tags']:
            if flow_tag in NS.node_context.tags:
                _tag_match = True

    if not _tag_match:
        _job_tags = ", ".join(job.payload.get("tags", []))
        _msg = "Node (%s)(type: %s)(tags: %s) will not " \
               "process job-%s (tags: %s)" % \
               (NS.node_context.node_id, NS.type,
                NS.node_context.tags, jid,
                _job_tags)
        logger.log(
            "info",
            NS.publisher_id,
            {"message": _msg}
        )
        return

    job_status_key = "/queue/%s/status" % job.job_id
    job_lock_key = "/queue/%s/locked_by" % job.job_id
    try:
        lock_info = dict(node_id=NS.node_context.node_id,
                         fqdn=NS.node_context.fqdn,
                         tags=NS.node_context.tags,
                         type=NS.type)
        etcd_utils.write(job_status_key, "processing",
                         prevValue="new")
        etcd_utils.write(job_lock_key, json.dumps(lock_info))
    except etcd.EtcdCompareFailed:
        # job is already being processed by some tendrl
        # agent
        return

    the_flow = None
    try:
        current_ns, flow_name, obj_name = \
            _extract_fqdn(job.payload['run'])

        if obj_name:
            runnable_flow = current_ns.ns.get_obj_flow(
                obj_name, flow_name)
        else:
            runnable_flow = current_ns.ns.get_flow(flow_name)

        the_flow = runnable_flow(parameters=job.payload['parameters'],
                                 job_id=job.job_id)
        logger.log(
            "info",
            NS.publisher_id,
            {"message": "Processing Job %s" % job.job_id},
            job_id=job.job_id,
            flow_id=the_flow.parameters['flow_id']
        )
        logger.log(
            "info",
            NS.publisher_id,
            {"message": "Running Flow %s" % job.payload['run']},
            job_id=job.job_id,
            flow_id=the_flow.parameters['flow_id']
        )
        the_flow.run()
        try:
            etcd_utils.write(job_status_key,
                             "finished",
                             prevValue="processing")
        except etcd.EtcdCompareFailed:
            # This should not happen!
            _msg = "Cannot mark job as 'finished', " \
                   "current job status invalid"
            raise FlowExecutionFailedError(_msg)

        logger.log(
            "info",
            NS.publisher_id,
            {"message": "Job (%s): Finished "
                        "Flow %s" % (
                            job.job_id,
                            job.payload['run'])},
            job_id=job.job_id,
            flow_id=the_flow.parameters['flow_id'],
        )
        if job.payload.get('parent') is None:
            alert_utils.alert_job_status(
                "finished",
                "Job finished successfully (job_id: %s)" % job.job_id,
                integration_id=NS.tendrl_context.integration_id or
                job.payload['parameters'].get(
                    'TendrlContext.integration_id'
                ),
                cluster_name=NS.tendrl_context.cluster_name or
                job.payload['parameters'].get(
                    'TendrlContext.cluster_name'
                )
            )
    except Exception as e:
        # The flow/atom execution errors are covered by Exception as
        # well.  format_exc() takes a ``limit``, not the exception, so
        # it is called without arguments.
        _trace = str(traceback.format_exc())
        _msg = "Failure in Job %s Flow %s with error:" % \
               (job.job_id, job.payload['run'])
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={"message": _msg + _trace,
                         "exception": e
                         }
            )
        )
        if the_flow:
            logger.log(
                "error",
                NS.publisher_id,
                {"message": _msg + "\n" + _trace},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id']
            )
        else:
            logger.log(
                "error",
                NS.publisher_id,
                {"message": _msg + "\n" + _trace}
            )

        try:
            etcd_utils.write(job_status_key,
                             "failed",
                             prevValue="processing")
        except etcd.EtcdCompareFailed:
            # This should not happen!
            _msg = "Cannot mark job as 'failed', current " \
                   "job status invalid"
            raise FlowExecutionFailedError(_msg)
        else:
            job = job.load()
            job.errors = _trace
            if job.payload.get('parent') is None:
                alert_utils.alert_job_status(
                    "failed",
                    "Job failed (job_id: %s)" % job.job_id,
                    integration_id=NS.tendrl_context.integration_id or
                    job.payload['parameters'].get(
                        'TendrlContext.integration_id'
                    ),
                    cluster_name=NS.tendrl_context.cluster_name or
                    job.payload['parameters'].get(
                        'TendrlContext.cluster_name'
                    )
                )
            job.save()
def main():
    """Entry point of the ceph integration service.

    Sets up the namespaces, verifies that the node belongs to an sds
    cluster known to Tendrl, saves context, definitions and config,
    starts the integration manager and then waits until SIGTERM or
    SIGINT is received.

    :raises Exception: if the tendrl context cannot be loaded or holds
        no integration id
    """
    def _refuse_to_start():
        # The node is not part of any cluster: report it and abort.
        Event(
            Message(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": "Node %s is not part of any sds cluster" %
                               NS.node_context.node_id
                }
            )
        )
        raise Exception(
            "Integration cannot be started, "
            "please Import or Create sds cluster in Tendrl "
            "and include Node %s" % NS.node_context.node_id
        )

    ceph_integration.CephIntegrationNS()
    TendrlNS()

    NS.type = "sds"
    NS.publisher_id = "ceph_integration"

    from tendrl.ceph_integration import sds_sync

    NS.state_sync_thread = sds_sync.CephIntegrationSdsSyncStateThread()
    NS.node_context.save()

    # Check if Integration is part of any Tendrl imported/created sds
    # cluster
    try:
        NS.tendrl_context = NS.tendrl_context.load()
        Event(
            Message(
                priority="info",
                publisher=NS.publisher_id,
                payload={
                    "message": "Integration %s is part of sds cluster" %
                               NS.tendrl_context.integration_id
                }
            )
        )
    except etcd.EtcdKeyNotFound:
        _refuse_to_start()

    if NS.tendrl_context.integration_id is None:
        _refuse_to_start()

    NS.tendrl_context.save()
    NS.ceph.definitions.save()
    NS.ceph.config.save()

    if NS.config.data.get("with_internal_profiling", False):
        from tendrl.commons import profiler
        profiler.start()

    manager = CephIntegrationManager()
    manager.start()

    complete = gevent.event.Event()

    def shutdown():
        Event(
            Message(
                priority="info",
                publisher=NS.publisher_id,
                payload={"message": "Signal handler: stopping"}
            )
        )
        complete.set()

    for signum in (signal.SIGTERM, signal.SIGINT):
        gevent.signal(signum, shutdown)

    while not complete.is_set():
        complete.wait(timeout=1)
def load(self):
    """Load this object's attributes from the central store.

    For objects whose class name does not contain "Message", the hash
    of the in-memory object is first compared with the stored hash; if
    both match, ``self`` is returned unchanged.  Otherwise every key
    rendered by a copy of the object is read and set on that copy.
    Values of directory items are collected in a dict on the attribute,
    and attributes defined as "json" or "list" are decoded from JSON.

    :return ``self`` if the stored hash matches, else the populated copy
    """
    if "Message" not in self.__class__.__name__:
        try:
            # Generate current in memory object hash
            self.hash = self._hash()
            _hash_key = "/{0}/hash".format(self.value)
            _stored_hash = None
            try:
                _stored_hash = NS._int.client.read(_hash_key).value
            except etcd.EtcdKeyNotFound:
                # No hash stored yet.
                pass
            except (etcd.EtcdConnectionFailed, etcd.EtcdException):
                NS._int.reconnect()
                _stored_hash = NS._int.client.read(_hash_key).value
            if self.hash == _stored_hash:
                # No changes in stored object and current object,
                # so there is nothing to load
                return self
        except TypeError:
            # no hash for this object, load it as usual
            pass

    _copy = self._copy_vars()

    for item in _copy.render():
        try:
            Event(
                Message(
                    priority="debug",
                    publisher=NS.publisher_id,
                    payload={"message": "Reading %s" % item['key']}
                )
            )
        except KeyError:
            sys.stdout.write("Reading %s" % item['key'])

        try:
            etcd_resp = NS._int.client.read(item['key'], quorum=True)
        except etcd.EtcdKeyNotFound:
            continue
        except (etcd.EtcdConnectionFailed, etcd.EtcdException):
            NS._int.reconnect()
            etcd_resp = NS._int.client.read(item['key'], quorum=True)

        value = etcd_resp.value
        if item['dir']:
            key = item['key'].split('/')[-1]
            existing = getattr(_copy, item['name'], None)
            if isinstance(existing, dict):
                existing[key] = value
            else:
                # dict(key=value) would store the value under the
                # literal name "key"; the directory entry's own name is
                # wanted.  A missing or non-dict attribute is replaced.
                setattr(_copy, item['name'], {key: value})
            continue

        # convert list, dict (json) to python based on definitions
        _type = self._defs.get("attrs", {}).get(
            item['name'], {}).get("type")
        if _type:
            if _type.lower() in ['json', 'list']:
                if value:
                    try:
                        value = json.loads(value.decode('utf-8'))
                    except ValueError as ex:
                        # Instances have no __name__; use the class name.
                        _msg = "Error load() attr %s for object %s" % \
                               (item['name'], self.__class__.__name__)
                        Event(
                            ExceptionMessage(
                                priority="debug",
                                publisher=NS.publisher_id,
                                payload={
                                    "message": _msg,
                                    "exception": ex
                                }
                            )
                        )
                else:
                    if _type.lower() == "list":
                        value = list()
                    if _type.lower() == "json":
                        value = dict()

        setattr(_copy, item['name'], value)
    return _copy
def run(self):
    """Check whether the volume given in the parameters is stopped.

    :return True if the stored volume has status "Stopped"; False if
        the volume does not exist or has any other status
    """
    volname = self.parameters['Volume.volname']

    def _report(priority, text):
        # Every message of this atom carries the same job/flow/cluster.
        Event(
            Message(
                priority=priority,
                publisher=NS.publisher_id,
                payload={"message": text},
                job_id=self.parameters["job_id"],
                flow_id=self.parameters["flow_id"],
                cluster_id=NS.tendrl_context.integration_id,
            )
        )

    _report("info", "Checking if volume %s stopped" % volname)

    try:
        fetched_volume = Volume(
            vol_id=self.parameters['Volume.vol_id']
        ).load()
    except etcd.EtcdKeyNotFound:
        _report("error", "Volume %s does not exist" %
                self.parameters['Volume.volname'])
        return False

    if fetched_volume.status != "Stopped":
        _report("warning", "Volume %s is already started" %
                self.parameters['Volume.volname'])
        return False

    _report("info", "Volume %s is stopped" %
            self.parameters['Volume.volname'])
    return True
def run(self):
    """Expand a gluster cluster with the nodes given in the flow parameters.

    Steps, in order:
      1. acquire the node lock and validate ``TendrlContext.integration_id``
         and ``TendrlContext.sds_name``;
      2. create SSH setup jobs and poll them every 3 seconds until all are
         "finished" (any "failed" job aborts the flow);
      3. run ``gluster_help.expand_gluster``;
      4. poll every 3 seconds until every node in ``Node[]`` has a
         DetectedCluster entry with the same detected cluster id;
      5. release the node lock and queue a ``tendrl.flows.ImportCluster``
         child job for the new nodes.

    :raises FlowExecutionFailedError: on invalid parameters, a failed SSH
        setup job, or nodes reporting different detected cluster ids.
    :raises Exception: any other error is logged and re-raised so the job
        is marked as failed.
    """
    try:
        # Lock nodes
        flow_utils.acquire_node_lock(self.parameters)
        integration_id = self.parameters['TendrlContext.integration_id']
        if integration_id is None:
            raise FlowExecutionFailedError(
                "TendrlContext.integration_id cannot be empty")

        supported_sds = NS.compiled_definitions.get_parsed_defs(
        )['namespace.tendrl']['supported_sds']
        sds_name = self.parameters["TendrlContext.sds_name"]
        if sds_name not in supported_sds:
            raise FlowExecutionFailedError("SDS (%s) not supported" %
                                           sds_name)

        # NOTE(review): the empty-list initialisation is immediately
        # overwritten by the call below.
        ssh_job_ids = []
        ssh_job_ids = \
            flow_utils.gluster_create_ssh_setup_jobs(
                self.parameters,
                skip_current_node=True
            )

        # Poll the SSH setup jobs; there is no timeout on this loop.
        while True:
            time.sleep(3)
            all_status = {}
            for job_id in ssh_job_ids:
                job = NS.tendrl.objects.Job(job_id=job_id).load()
                all_status[job_id] = job.status

            _failed = {
                _jid: status
                for _jid, status in all_status.iteritems()
                if status == "failed"
            }
            if _failed:
                raise FlowExecutionFailedError(
                    "SSH setup failed for jobs %s cluster %s" %
                    (str(_failed), integration_id))
            if all(
                [status == "finished" for status in all_status.values()]):
                logger.log("info",
                           NS.publisher_id, {
                               "message":
                               "SSH setup completed for all "
                               "nodes in cluster %s" % integration_id
                           },
                           job_id=self.parameters['job_id'],
                           flow_id=self.parameters['flow_id'])
                break

        # SSH setup jobs finished above, now install sds
        # bits and create cluster
        logger.log("info",
                   NS.publisher_id, {
                       "message":
                       "Expanding Gluster Storage"
                       " Cluster %s" % integration_id
                   },
                   job_id=self.parameters['job_id'],
                   flow_id=self.parameters['flow_id'])
        gluster_help.expand_gluster(self.parameters)
        logger.log(
            "info",
            NS.publisher_id, {
                "message":
                "SDS install/config completed on newly "
                "expanded nodes, Please wait while "
                "tendrl-node-agents detect sds details on the newly "
                "expanded nodes %s" % self.parameters['Node[]']
            },
            job_id=self.parameters['job_id'],
            flow_id=self.parameters['flow_id'])

        # Wait till detected cluster in populated for nodes
        # (no timeout: loops until every node reports, or ids diverge)
        while True:
            time.sleep(3)
            all_status = []
            detected_cluster = ""
            different_cluster_id = False
            dc = ""
            for node in self.parameters['Node[]']:
                try:
                    dc = NS.tendrl.objects.DetectedCluster(
                        node_id=node).load()
                    if not detected_cluster:
                        # First node seen sets the reference cluster id
                        detected_cluster = dc.detected_cluster_id
                    else:
                        if detected_cluster != dc.detected_cluster_id:
                            all_status.append(False)
                            different_cluster_id = True
                            break
                    all_status.append(True)
                except etcd.EtcdKeyNotFound:
                    # Node has not published its detected cluster yet
                    all_status.append(False)
            if different_cluster_id:
                raise FlowExecutionFailedError(
                    "Seeing different detected cluster id in"
                    " different nodes. %s and %s" %
                    (detected_cluster, dc.detected_cluster_id))

            if all_status:
                if all(all_status):
                    break

        # Create the params list for import cluster flow
        new_params = dict()
        new_params['Node[]'] = self.parameters['Node[]']
        new_params['TendrlContext.integration_id'] = integration_id
        # Get node context for one of the nodes from list
        dc = NS.tendrl.objects.DetectedCluster(
            node_id=self.parameters['Node[]'][0]).load()
        sds_pkg_name = dc.sds_pkg_name
        new_params['import_after_expand'] = True
        sds_pkg_version = dc.sds_pkg_version
        new_params['DetectedCluster.sds_pkg_name'] = \
            sds_pkg_name
        new_params['DetectedCluster.sds_pkg_version'] = \
            sds_pkg_version

        # The import job is tagged for every newly expanded node
        tags = []
        for node in self.parameters['Node[]']:
            tags.append("tendrl/node_%s" % node)
        payload = {
            "tags": tags,
            "run": "tendrl.flows.ImportCluster",
            "status": "new",
            "parameters": new_params,
            "parent": self.parameters['job_id'],
            "type": "node"
        }
        _job_id = str(uuid.uuid4())
        # release lock before import cluster
        flow_utils.release_node_lock(self.parameters)
        NS.tendrl.objects.Job(job_id=_job_id, status="new",
                              payload=payload).save()
        logger.log(
            "info",
            NS.publisher_id, {
                "message":
                "Please wait while Tendrl imports ("
                "job_id: %s) newly expanded "
                "%s storage nodes in cluster %s" %
                (_job_id, sds_pkg_name,
                 NS.tendrl.objects.Cluster(
                     integration_id=integration_id).load().short_name)
            },
            job_id=self.parameters['job_id'],
            flow_id=self.parameters['flow_id'])
    except Exception as ex:
        Event(
            ExceptionMessage(priority="error",
                             publisher=NS.publisher_id,
                             payload={
                                 "message": ex.message,
                                 "exception": ex
                             }))
        # raising exception to mark job as failed
        raise ex
    finally:
        # release lock if any exception came
        # NOTE(review): on the success path the lock was already released
        # above, so this is a second release call — presumably
        # release_node_lock tolerates that; confirm.
        flow_utils.release_node_lock(self.parameters)
def sync_volumes(
    volumes, index,
    vol_options,
    sync_ttl,
    cluster_short_name,
    devicetree
):
    """Sync one gluster volume and its local bricks into the central store.

    :param volumes: flat mapping keyed by strings of the form
        ``'volume<index>.<field>'`` / ``'volume<index>.brick<n>.<field>'``.
    :param index: volume index used to build the keys above.
    :param vol_options: flat mapping keyed by
        ``'volume<index>.options.key<n>'`` / ``'...value<n>'`` plus
        ``'volume<index>.options.count'``.
    :param sync_ttl: TTL passed to the ``save(ttl=...)`` calls; it is
        increased by 4 for every brick that is synced.
    :param cluster_short_name: cluster name used in alert messages.
    :param devicetree: passed through to
        ``brick_device_details.update_brick_device_details``.
    :return: the brick index at which iteration stopped, or ``None`` when
        the sync is skipped because a job is in progress on the volume.
    """
    NS.node_context = NS.tendrl.objects.NodeContext().load()
    tag_list = NS.node_context.tags
    # Raise alerts for volume state change.
    # Volume level data is only written by the node carrying the
    # "provisioner/<integration_id>" tag.
    cluster_provisioner = "provisioner/%s" % NS.tendrl_context.integration_id
    if cluster_provisioner in tag_list:
        try:
            _volume = NS.tendrl.objects.GlusterVolume(
                NS.tendrl_context.integration_id,
                vol_id=volumes['volume%s.id' % index]
            ).load()
            if _volume.locked_by and 'job_id' in _volume.locked_by and \
                    _volume.current_job.get('status', '') == 'in_progress':
                # There is a job active on volume. skip the sync
                return
            stored_volume_status = _volume.status
            current_status = volumes['volume%s.status' % index]
            if stored_volume_status not in [None, ""] and \
                    current_status != stored_volume_status:
                msg = ("Status of volume: %s in cluster %s "
                       "changed from %s to %s") % (
                           volumes['volume%s.name' % index],
                           cluster_short_name,
                           stored_volume_status,
                           current_status)
                instance = "volume_%s" % volumes[
                    'volume%s.name' % index
                ]
                event_utils.emit_event(
                    "volume_status",
                    current_status,
                    msg,
                    instance,
                    'WARNING' if current_status == 'Stopped'
                    else 'INFO',
                    tags={"entity_type": RESOURCE_TYPE_VOLUME,
                          "volume_name": volumes['volume%s.name' % index]
                          }
                )
        except (KeyError, etcd.EtcdKeyNotFound) as ex:
            # A missing key in `volumes` is propagated; a volume that is
            # not yet in the central store is ignored.
            if isinstance(ex, KeyError):
                raise ex
            pass

        volume = NS.tendrl.objects.GlusterVolume(
            NS.tendrl_context.integration_id,
            vol_id=volumes['volume%s.id' % index]
        ).load()
        # Any volume with arbiter bricks is reported as type "arbiter"
        volume.vol_type = "arbiter" \
            if int(volumes['volume%s.arbiter_count' % index]) > 0 \
            else volumes['volume%s.type' % index]
        volume.name = volumes['volume%s.name' % index]
        volume.transport_type = volumes['volume%s.transport_type' % index]
        volume.status = volumes['volume%s.status' % index]
        volume.brick_count = volumes['volume%s.brickcount' % index]
        volume.snap_count = volumes['volume%s.snap_count' % index]
        volume.stripe_count = volumes['volume%s.stripe_count' % index]
        volume.replica_count = volumes['volume%s.replica_count' % index]
        volume.subvol_count = volumes['volume%s.subvol_count' % index]
        volume.arbiter_count = volumes['volume%s.arbiter_count' % index]
        volume.disperse_count = volumes['volume%s.disperse_count' % index]
        volume.redundancy_count = volumes['volume%s.redundancy_count' % index]
        volume.quorum_status = volumes['volume%s.quorum_status' % index]
        volume.snapd_status = volumes[
            'volume%s.snapd_svc.online_status' % index]
        volume.snapd_inited = volumes['volume%s.snapd_svc.inited' % index]
        # Remember the previously stored profiling flag so a change can be
        # alerted on below.
        if NS.tendrl.objects.GlusterVolume(
            NS.tendrl_context.integration_id,
            vol_id=volumes['volume%s.id' % index]
        ).exists():
            existing_vol = NS.tendrl.objects.GlusterVolume(
                NS.tendrl_context.integration_id,
                vol_id=volumes['volume%s.id' % index]
            ).load()
            volume_profiling_old_value = existing_vol.profiling_enabled
        else:
            volume_profiling_old_value = volume.profiling_enabled
        if ('volume%s.profile_enabled' % index) in volumes:
            value = int(volumes['volume%s.profile_enabled' % index])
            if value == 1:
                volume_profiling_new_value = "yes"
            else:
                volume_profiling_new_value = "no"
        else:
            volume_profiling_new_value = None
        volume.profiling_enabled = volume_profiling_new_value
        if volume_profiling_old_value not in [None, ""] and \
                volume_profiling_old_value != volume_profiling_new_value:
            # Raise alert for the same value change
            msg = ("Value of volume profiling for volume: %s "
                   "of cluster %s changed from %s to %s" % (
                       volumes['volume%s.name' % index],
                       cluster_short_name,
                       volume_profiling_old_value,
                       volume_profiling_new_value))
            instance = "volume_%s" % \
                volumes['volume%s.name' % index]
            # NOTE(review): this volume-level event is tagged with
            # RESOURCE_TYPE_BRICK — confirm whether RESOURCE_TYPE_VOLUME
            # was intended.
            event_utils.emit_event(
                "volume_profiling_status",
                volume_profiling_new_value,
                msg,
                instance,
                'INFO',
                tags={
                    "entity_type": RESOURCE_TYPE_BRICK,
                    "volume_name": volumes[
                        'volume%s.name' % index
                    ]
                }
            )
        volume.save(ttl=sync_ttl)

        # Save the default values of volume options
        # NOTE(review): range(1, count) stops at count - 1 — confirm
        # whether the last option is skipped on purpose.
        vol_opt_dict = {}
        for opt_count in \
                range(1, int(vol_options['volume%s.options.count' % index])):
            vol_opt_dict[
                vol_options[
                    'volume%s.options.key%s' % (index, opt_count)
                ]
            ] = vol_options[
                'volume%s.options.value%s' % (index, opt_count)
            ]
        volume.options = vol_opt_dict
        volume.save()

    rebal_det = NS.gluster.objects.RebalanceDetails(
        vol_id=volumes['volume%s.id' % index],
        rebal_id=volumes['volume%s.rebalance.id' % index],
        rebal_status=volumes['volume%s.rebalance.status' % index],
        rebal_failures=volumes['volume%s.rebalance.failures' % index],
        rebal_skipped=volumes['volume%s.rebalance.skipped' % index],
        rebal_lookedup=volumes['volume%s.rebalance.lookedup' % index],
        rebal_files=volumes['volume%s.rebalance.files' % index],
        rebal_data=volumes['volume%s.rebalance.data' % index],
        time_left=volumes.get('volume%s.rebalance.time_left' % index),
    )
    rebal_det.save(ttl=sync_ttl)
    georep_details.save_georep_details(volumes, index)

    b_index = 1
    # ipv4 address of current node
    # NOTE(review): network_ip is collected here but not read anywhere
    # else in this function.
    try:
        network_ip = []
        networks = NS.tendrl.objects.NodeNetwork().load_all()
        for network in networks:
            if network.ipv4:
                network_ip.extend(network.ipv4)
    except etcd.EtcdKeyNotFound as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": "Could not find "
                    "any ipv4 networks for node"
                    " %s" % NS.node_context.node_id,
                    "exception": ex
                }
            )
        )
    # Walk brick1, brick2, ... until a brick key is missing from
    # `volumes`; the resulting KeyError ends the loop.
    while True:
        try:
            # Update brick node wise
            hostname = volumes[
                'volume%s.brick%s.hostname' % (index, b_index)
            ]
            ip = socket.gethostbyname(hostname)
            try:
                node_id = etcd_utils.read("indexes/ip/%s" % ip).value
                fqdn = NS.tendrl.objects.ClusterNodeContext(
                    node_id=node_id
                ).load().fqdn
                cluster_node_ids = etcd_utils.read(
                    "indexes/tags/tendrl/integration/%s" %
                    NS.tendrl_context.integration_id
                ).value
                cluster_node_ids = json.loads(cluster_node_ids)
                # Only bricks hosted on this node, and on a node that is
                # part of this cluster, are synced here.
                if NS.node_context.fqdn != fqdn or \
                        node_id not in cluster_node_ids:
                    b_index += 1
                    continue
            except(TypeError, etcd.EtcdKeyNotFound):
                b_index += 1
                continue
            # NOTE(review): relies on Python 2 integer division; under
            # true division the subvolume number below would be a float.
            sub_vol_size = (int(
                volumes['volume%s.brickcount' % index]
            )) / int(
                volumes['volume%s.subvol_count' % index]
            )
            # Brick name is "<fqdn>:<path with '/' replaced by '_'>"
            brick_name = NS.node_context.fqdn
            brick_name += ":"
            brick_name += volumes['volume%s.brick%s' '.path' % (
                index,
                b_index
            )].split(":")[-1].replace("/", "_")

            # Raise alerts if the brick path changes
            # (the check below compares brick *status* values)
            try:
                stored_brick = NS.tendrl.objects.GlusterBrick(
                    NS.tendrl_context.integration_id,
                    NS.node_context.fqdn,
                    brick_dir=brick_name.split(":_")[-1]
                ).load()
                current_status = volumes.get(
                    'volume%s.brick%s.status' % (index, b_index)
                )
                if stored_brick.status and \
                        current_status != stored_brick.status:
                    msg = ("Brick:%s in volume:%s has %s"
                           ) % (
                               volumes['volume%s.brick%s' '.path' % (
                                   index,
                                   b_index
                               )],
                               volumes['volume%s.' 'name' % index],
                               current_status)
                    instance = "volume_%s|brick_%s" % (
                        volumes['volume%s.name' % index],
                        volumes['volume%s.brick%s.path' % (
                            index,
                            b_index
                        )]
                    )
                    event_utils.emit_event(
                        "brick_status",
                        current_status,
                        msg,
                        instance,
                        'WARNING' if current_status == 'Stopped'
                        else 'INFO',
                        tags={"entity_type": RESOURCE_TYPE_BRICK,
                              "volume_name": volumes[
                                  'volume%s.' 'name' % index]
                              }
                    )

            except etcd.EtcdKeyNotFound:
                pass

            # Register the brick under its subvolume in the volume tree
            brk_pth = "clusters/%s/Volumes/%s/Bricks/subvolume%s/%s"

            vol_brick_path = brk_pth % (
                NS.tendrl_context.integration_id,
                volumes['volume%s.id' % index],
                str((b_index - 1) / sub_vol_size),
                brick_name
            )

            etcd_utils.write(vol_brick_path, "")
            brick = NS.tendrl.objects.GlusterBrick(
                NS.tendrl_context.integration_id,
                NS.node_context.fqdn,
                brick_dir=brick_name.split(":_")[-1]
            ).load()
            brick.integration_id = NS.tendrl_context.integration_id
            brick.fqdn = NS.node_context.fqdn
            brick.brick_dir = brick_name.split(":_")[-1]
            brick.name = brick_name
            brick.vol_id = volumes['volume%s.id' % index]
            brick.sequence_number = b_index
            brick.brick_path = volumes[
                'volume%s.brick%s.path' % (index, b_index)
            ]
            brick.hostname = volumes.get(
                'volume%s.brick%s.hostname' % (index, b_index)
            )
            brick.port = volumes.get(
                'volume%s.brick%s.port' % (index, b_index)
            )
            brick.vol_name = volumes['volume%s.name' % index]
            brick.used = True
            brick.node_id = NS.node_context.node_id
            brick.status = volumes.get(
                'volume%s.brick%s.status' % (index, b_index)
            )
            brick.filesystem_type = volumes.get(
                'volume%s.brick%s.filesystem_type' % (index, b_index)
            )
            brick.mount_opts = volumes.get(
                'volume%s.brick%s.mount_options' % (index, b_index)
            )
            brick.utilization = brick_utilization.brick_utilization(
                volumes['volume%s.brick%s.path' % (index, b_index)]
            )
            brick.client_count = volumes.get(
                'volume%s.brick%s.client_count' % (index, b_index)
            )
            brick.is_arbiter = volumes.get(
                'volume%s.brick%s.is_arbiter' % (index, b_index)
            )
            brick.save(ttl=sync_ttl)
            # sync brick device details
            brick_device_details.\
                update_brick_device_details(
                    brick_name,
                    volumes[
                        'volume%s.brick%s.path' % (
                            index, b_index)
                    ],
                    devicetree,
                    sync_ttl
                )

            # Sync the brick client details
            c_index = 1
            # NOTE(review): compares the raw mapping value (possibly None
            # or a string) with 0; this only works under Python 2
            # ordering rules.
            if volumes.get(
                'volume%s.brick%s.client_count' % (index, b_index)
            ) > 0:
                # Walk client1, client2, ... until a key is missing
                while True:
                    try:
                        NS.gluster.objects.ClientConnection(
                            brick_name=brick_name,
                            fqdn=NS.node_context.fqdn,
                            brick_dir=brick_name.split(":_")[-1],
                            hostname=volumes[
                                'volume%s.brick%s.client%s.hostname' % (
                                    index, b_index, c_index
                                )
                            ],
                            bytesread=volumes[
                                'volume%s.brick%s.client%s.bytesread' % (
                                    index, b_index, c_index
                                )
                            ],
                            byteswrite=volumes[
                                'volume%s.brick%s.client%s.byteswrite' % (
                                    index, b_index, c_index
                                )
                            ],
                            opversion=volumes[
                                'volume%s.brick%s.client%s.opversion' % (
                                    index, b_index, c_index
                                )
                            ]
                        ).save(ttl=sync_ttl)
                    except KeyError:
                        break
                    c_index += 1
            # Each synced brick extends the TTL used for later saves
            sync_ttl += 4
            b_index += 1
        except KeyError:
            break
    return b_index
def save(self, update=True, ttl=None):
    """Write this object's attributes to the central store (etcd).

    For every object whose class name does not contain "Message", the
    in-memory hash is compared with the hash stored under
    ``/<self.value>/hash``; when they match nothing is written (only the
    TTL is refreshed when ``ttl`` is given).

    :param update: when true, attributes that are ``None`` on this
        object are filled from the currently stored object before
        writing, provided the stored value is truthy.
    :param ttl: when truthy, ``etcd_utils.refresh(self.value, ttl)`` is
        called after writing (or after the unchanged-hash shortcut).
    :raises etcd.EtcdException: when a read/write still fails after one
        reconnect attempt.
    """
    self.render()
    if "Message" not in self.__class__.__name__:
        try:
            # Generate current in memory object hash
            self.hash = self._hash()
            _hash_key = "/{0}/hash".format(self.value)
            _stored_hash = None
            try:
                _stored_hash = NS._int.client.read(_hash_key).value
            except (etcd.EtcdConnectionFailed, etcd.EtcdException) as ex:
                # A missing hash key just means "nothing stored yet";
                # anything else is retried once after reconnecting.
                if not isinstance(ex, etcd.EtcdKeyNotFound):
                    NS._int.reconnect()
                    _stored_hash = NS._int.client.read(_hash_key).value
            if self.hash == _stored_hash:
                # No changes in stored object and current object,
                # dont save current object to central store
                if ttl:
                    etcd_utils.refresh(self.value, ttl)
                return
        except TypeError:
            # no hash for this object, save the current hash as is
            pass

    if update:
        current_obj = self.load()
        for attr, val in vars(self).iteritems():
            if isinstance(val, (types.FunctionType,
                                types.BuiltinFunctionType,
                                types.MethodType,
                                types.BuiltinMethodType,
                                types.UnboundMethodType)) or \
                    attr.startswith("_") or attr in ['value', 'list']:
                continue

            if val is None and hasattr(current_obj, attr):
                # if self.attr is None, use attr value from central
                # store (i.e. current_obj.attr)
                if getattr(current_obj, attr):
                    setattr(self, attr, getattr(current_obj, attr))

    self.updated_at = str(time_utils.now())
    for item in self.render():
        # Note: Log messages in this file have try-except blocks to run
        # in the condition when the node_agent has not been started and
        # name spaces are being created.
        try:
            Event(
                Message(priority="debug",
                        publisher=NS.publisher_id,
                        payload={
                            "message":
                            "Writing %s to %s" %
                            (item['key'], item['value'])
                        }))
        except KeyError:
            sys.stdout.write("Writing %s to %s" %
                             (item['key'], item['value']))

        # convert list, dict (json) to python based on definitions
        _type = self._defs.get("attrs", {}).get(item['name'],
                                                {}).get("type")
        if _type:
            if _type.lower() in ['json', 'list']:
                if item['value']:
                    try:
                        item['value'] = json.dumps(item['value'])
                    except ValueError as ex:
                        # Instances have no __name__; use the class name
                        # so the error path itself cannot raise.
                        _msg = "Error save() attr %s for object %s" % \
                               (item['name'], self.__class__.__name__)
                        Event(
                            ExceptionMessage(priority="debug",
                                             publisher=NS.publisher_id,
                                             payload={
                                                 "message": _msg,
                                                 "exception": ex
                                             }))
        try:
            NS._int.wclient.write(item['key'], item['value'], quorum=True)
        except (etcd.EtcdConnectionFailed, etcd.EtcdException):
            # Retry the write once on a fresh write-client connection
            NS._int.wreconnect()
            NS._int.wclient.write(item['key'], item['value'], quorum=True)
    if ttl:
        etcd_utils.refresh(self.value, ttl)
def brick_status_alert(hostname):
    """Alert on, and mark as stopped, the started bricks of a host.

    Takes an etcd lock on ``clusters/<integration_id>/Bricks/all/<hostname>``,
    loads all bricks stored for ``hostname`` and, for every brick whose
    status equals ``BRICK_STARTED``, emits a ``brick_status`` WARNING event
    and saves the brick with status ``BRICK_STOPPED.title()``.

    Errors of type EtcdException/KeyError/ValueError/AttributeError are
    logged and not propagated; the lock is released in every case.

    :param hostname: fqdn of the node whose bricks are checked.
    """
    try:
        # fetching brick details of disconnected node
        lock = None
        lock_path = "clusters/%s/Bricks/all/%s" % (
            NS.tendrl_context.integration_id, hostname)
        lock = etcd.Lock(NS._int.client, lock_path)
        lock.acquire(blocking=True, lock_ttl=60)
        if lock.is_acquired:
            host_bricks = NS.tendrl.objects.GlusterBrick(
                NS.tendrl_context.integration_id,
                fqdn=hostname
            ).load_all()
            for brick in host_bricks:
                if brick.status.lower() != BRICK_STARTED:
                    continue
                # raise an alert for brick
                alert_text = "Brick:%s in volume:%s has %s" % (
                    brick.brick_path,
                    brick.vol_name,
                    BRICK_STOPPED.title()
                )
                alert_instance = "volume_%s|brick_%s" % (
                    brick.vol_name,
                    brick.brick_path,
                )
                alert_tags = {
                    "entity_type": RESOURCE_TYPE_BRICK,
                    "volume_name": brick.vol_name,
                    "node_id": brick.node_id,
                    "fqdn": brick.hostname
                }
                event_utils.emit_event(
                    "brick_status",
                    BRICK_STOPPED.title(),
                    alert_text,
                    alert_instance,
                    'WARNING',
                    tags=alert_tags
                )
                # Update brick status as stopped
                brick.status = BRICK_STOPPED.title()
                brick.save()
            lock.release()
    except (
        etcd.EtcdException,
        KeyError,
        ValueError,
        AttributeError
    ) as ex:
        failure_payload = {
            "message": "Unable to raise an brick status "
                       "alert for host %s" % hostname,
            "exception": ex
        }
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload=failure_payload
            )
        )
    finally:
        # Safety net for the error paths where the lock is still held
        if isinstance(lock, etcd.lock.Lock) and lock.is_acquired:
            lock.release()
def shutdown():
    """Signal handler: log the stop request and set the `complete` event."""
    stop_notice = Message(
        priority="info",
        publisher=NS.publisher_id,
        payload={"message": "Signal handler: stopping"},
    )
    Event(stop_notice)
    complete.set()
def sync():
    """Run SDS detection plugins and record the detected cluster.

    For the first plugin that reports a non-empty ``detected_cluster_id``
    this function:
      * resolves (or creates) the integration id through the
        ``indexes/detected_cluster_id_to_integration_id/<id>`` key,
      * updates and saves ``NS.tendrl_context``,
      * adds the detected-cluster and integration tags to
        ``NS.node_context`` and saves it,
      * saves a DetectedCluster object, and
      * marks the Cluster as ``is_managed = "no"`` unless it is already
        managed.

    All exceptions are logged; nothing is raised to the caller.
    """
    try:
        Event(
            Message(priority="debug",
                    publisher=NS.publisher_id,
                    payload={"message": "Running SDS detection"}))
        try:
            sds_discovery_manager = sds_manager.SDSDiscoveryManager()
        except ValueError as ex:
            Event(
                ExceptionMessage(priority="debug",
                                 publisher=NS.publisher_id,
                                 payload={
                                     "message":
                                     "Failed to init SDSDiscoveryManager.",
                                     "exception": ex
                                 }))
            return

        # Execute the SDS discovery plugins and tag the nodes with data
        for plugin in sds_discovery_manager.get_available_plugins():
            sds_details = plugin.discover_storage_system()
            if ('detected_cluster_id' in sds_details and
                    sds_details['detected_cluster_id'] != ""):
                # NOTE(review): always true here — the outer condition
                # already required a key to be present in sds_details.
                if sds_details:
                    try:
                        dc = NS.tendrl.objects.DetectedCluster().load()
                        # dc_changed: a different cluster id was stored
                        # for this node earlier.
                        dc_changed = False
                        if dc.detected_cluster_id:
                            if dc.detected_cluster_id != sds_details.get(
                                    'detected_cluster_id'):
                                dc_changed = True
                        else:
                            time.sleep(3)

                        integration_index_key = \
                            "indexes/detected_cluster_id_to_integration_id/" \
                            "%s" % sds_details['detected_cluster_id']
                        try:
                            if dc_changed:
                                # Re-point the index at the integration
                                # id this node already uses.
                                integration_id = \
                                    NS.tendrl_context.integration_id
                                NS._int.wclient.write(integration_index_key,
                                                      integration_id)
                            else:
                                # Try to be the first node to publish an
                                # integration id for this cluster.
                                integration_id = str(uuid.uuid4())
                                NS._int.wclient.write(integration_index_key,
                                                      integration_id,
                                                      prevExist=False)
                        except etcd.EtcdAlreadyExist:
                            # Another node published first; adopt its id
                            if not dc_changed:
                                integration_id = NS._int.client.read(
                                    integration_index_key).value
                        finally:
                            # Runs even when the write above raised; by
                            # then integration_id is already assigned on
                            # both branches.
                            NS.tendrl_context.integration_id = integration_id
                            NS.tendrl_context.cluster_id = sds_details.get(
                                'detected_cluster_id')
                            NS.tendrl_context.cluster_name = sds_details.get(
                                'detected_cluster_name')
                            NS.tendrl_context.sds_name = sds_details.get(
                                'pkg_name')
                            NS.tendrl_context.sds_version = sds_details.get(
                                'pkg_version')
                            NS.tendrl_context.save()

                        NS.node_context = NS.node_context.load()
                        integration_tag = "tendrl/integration/%s" % \
                                          integration_id
                        detected_cluster_tag = "detected_cluster/%s" % \
                                               sds_details[
                                                   'detected_cluster_id']
                        NS.node_context.tags += [
                            detected_cluster_tag, integration_tag
                        ]
                        # De-duplicate the tag list
                        NS.node_context.tags = list(set(NS.node_context.tags))
                        NS.node_context.save()
                        _cluster = NS.tendrl.objects.Cluster(
                            integration_id=NS.tendrl_context.integration_id
                        ).load()

                        NS.tendrl.objects.DetectedCluster(
                            detected_cluster_id=sds_details.get(
                                'detected_cluster_id'),
                            detected_cluster_name=sds_details.get(
                                'detected_cluster_name'),
                            sds_pkg_name=sds_details.get('pkg_name'),
                            sds_pkg_version=sds_details.get('pkg_version'),
                        ).save()

                        if _cluster.is_managed == "yes":
                            continue
                        else:
                            _cluster.is_managed = "no"
                            _cluster.save()

                    except (etcd.EtcdException, KeyError) as ex:
                        Event(
                            ExceptionMessage(priority="debug",
                                             publisher=NS.publisher_id,
                                             payload={
                                                 "message":
                                                 "Failed SDS detection",
                                                 "exception": ex
                                             }))
                    # Stop after the first plugin that detected a cluster
                    break
    except Exception as ex:
        Event(
            ExceptionMessage(priority="error",
                             publisher=NS.publisher_id,
                             payload={
                                 "message":
                                 "node_sync "
                                 "SDS detection failed: " + ex.message,
                                 "exception": ex
                             }))
def sync():
    """Sync this node's network interfaces into the central store.

    * Saves a NodeNetwork object per interface and publishes
      ``/indexes/ip/<ipv4>`` -> node id for each IPv4 address (existing
      index entries are left untouched).
    * Saves a GlobalNetwork object for every interface whose subnet is
      not the empty string.
    * Tries to delete this node's directory under each ``/networks``
      entry and then the entry itself; both deletes are ``dir=True`` and
      are skipped when the key is missing or the directory is not empty.

    Objects are saved with a TTL of ``sync_interval`` (default 10) + 250.
    All exceptions are logged; nothing is raised to the caller.
    """
    try:
        _keep_alive_for = int(NS.config.data.get("sync_interval", 10)) + 250
        interfaces = get_node_network()
        for interface in interfaces:
            NS.tendrl.objects.NodeNetwork(**interface).save(
                ttl=_keep_alive_for)
            if interface['ipv4']:
                for ipv4 in interface['ipv4']:
                    index_key = "/indexes/ip/%s" % ipv4
                    try:
                        NS._int.wclient.write(index_key,
                                              NS.node_context.node_id,
                                              prevExist=False)
                    except etcd.EtcdAlreadyExist:
                        # Index entry already published; keep it
                        pass
                        # TODO(team) add ipv6 support
                        # if interface['ipv6']:
                        #    for ipv6 in interface['ipv6']:
                        #        index_key = "/indexes/ip/%s/%s" % (ipv6,
                        #
                        # NS.node_context.node_id)
                        #        NS._int.wclient.write(index_key, 1)

        # global network
        for interface in interfaces:
            # Compare by value: "is not" tested object identity against a
            # string literal, which is not guaranteed to match for equal
            # strings.
            if interface["subnet"] != "":
                NS.node_agent.objects.GlobalNetwork(**interface).save(
                    ttl=_keep_alive_for)
        try:
            networks = NS._int.client.read("/networks")
            for network in networks.leaves:
                try:
                    # it will delete any node with empty network detail in
                    # subnet, if one entry present then deletion never happen
                    NS._int.wclient.delete(
                        "%s/%s" % (network.key, NS.node_context.node_id),
                        dir=True)
                    # it will delete any subnet dir when it is empty
                    # if one entry present then deletion never happen
                    NS._int.wclient.delete(network.key, dir=True)
                except (etcd.EtcdKeyNotFound, etcd.EtcdDirNotEmpty):
                    continue
        except etcd.EtcdKeyNotFound as ex:
            Event(
                ExceptionMessage(priority="debug",
                                 publisher=NS.publisher_id,
                                 payload={
                                     "message":
                                     "Given key is not present in "
                                     "etcd .",
                                     "exception": ex
                                 }))
    except Exception as ex:
        _msg = "node_sync networks sync failed: " + ex.message
        Event(
            ExceptionMessage(priority="error",
                             publisher=NS.publisher_id,
                             payload={
                                 "message": _msg,
                                 "exception": ex
                             }))
def _sync_rbds(self):
    """Sync the RBDs of every stored pool into the central store.

    For each pool under ``clusters/<integration_id>/Pools``:
      * fetch the current RBDs via ``self._get_rbds(pool_name)``,
      * delete stored RBD entries that are no longer reported,
      * save an Rbd object for every reported RBD,
      * re-read the stored RBDs and delete any whose loaded name is not
        among the reported ones.

    A missing Pools key ends the sync silently.
    """
    integration_id = NS.tendrl_context.integration_id
    try:
        pools = NS._int.client.read(
            "clusters/%s/Pools" % NS.tendrl_context.integration_id,
            recursive=True)
        for pool_node in pools._children:
            pool_id = pool_node['key'].split('/')[-1]
            name_key = "clusters/%s/Pools/%s/pool_name" % (
                NS.tendrl_context.integration_id, pool_id)
            pool_name = NS._int.client.read(name_key).value
            rbd_details = self._get_rbds(pool_name)

            # Rbd out of band delete handling
            try:
                stored = NS._int.client.read(
                    "clusters/%s/Pools/%s/Rbds" %
                    (NS.tendrl_context.integration_id, pool_id))
                stored_names = [
                    leaf.key.split("/")[-1] for leaf in stored.leaves
                ]
                reported_names = [
                    rbd_name for rbd_name, _ in rbd_details.iteritems()
                ]
                for stale_name in set(stored_names) - set(reported_names):
                    NS._int.client.delete(
                        "clusters/%s/Pools/%s/Rbds/%s" %
                        (NS.tendrl_context.integration_id, pool_id,
                         stale_name),
                        recursive=True)
            except etcd.EtcdKeyNotFound as ex:
                Event(
                    ExceptionMessage(
                        priority="debug",
                        publisher=NS.publisher_id,
                        payload={
                            "message":
                            "No rbds found for ceph cluster %s" %
                            NS.tendrl_context.integration_id,
                            "exception":
                            ex
                        }))

            for rbd_name, detail in rbd_details.iteritems():
                rbd_obj = NS.ceph.objects.Rbd(
                    name=rbd_name,
                    size=detail['size'],
                    pool_id=pool_id,
                    flags=detail['flags'],
                    provisioned=self._to_bytes(detail['provisioned'])
                    if detail.get("provisioned") else None,
                    used=self._to_bytes(detail['used']))
                rbd_obj.save()

            try:
                stored = NS._int.client.read(
                    "clusters/%s/Pools/%s/Rbds" %
                    (NS.tendrl_context.integration_id, pool_id))
            except etcd.EtcdKeyNotFound:
                # no rbds for pool, continue
                continue

            for leaf in stored.leaves:
                loaded = NS.ceph.objects.Rbd(
                    pool_id=pool_id,
                    name=leaf.key.split("Rbds/")[-1]).load()
                if loaded.name in rbd_details.keys():
                    continue
                NS._int.client.delete(
                    "clusters/%s/Pools/%s/Rbds/%s" %
                    (NS.tendrl_context.integration_id, pool_id,
                     loaded.name),
                    recursive=True)
    except etcd.EtcdKeyNotFound:
        pass
def sync(sync_ttl=None):
    """Sync service state, node tags and the tag indexes for this node.

    * Saves a Service object for every entry of ``TENDRL_SERVICES`` and
      collects the tags of running services ("tendrl/server" also
      implies "tendrl/monitor").
    * On non-monitor nodes that belong to a cluster, tries to claim the
      ``provisioner/<integration_id>`` tag by creating its index key.
    * Merges the collected tags into ``NS.node_context.tags`` and saves
      the node context when the tag set changed.
    * Queues a ``tendrl.flows.ConfigureMonitoring`` job when this node
      just became provisioner of a managed cluster.
    * Updates ``/indexes/tags/<tag>`` for every tag of the node and drops
      provisioner tags owned by another node.

    :param sync_ttl: base TTL; when truthy, index keys owned by this node
        alone are refreshed with ``sync_ttl + 50``.  With the default
        ``None`` no TTL is applied.

    All exceptions are logged; nothing is raised to the caller.
    """
    try:
        tags = []
        # update node agent service details
        logger.log("debug", NS.publisher_id,
                   {"message": "node_sync, Updating Service data"})
        for service in TENDRL_SERVICES:
            s = NS.tendrl.objects.Service(service=service)
            if s.running:
                service_tag = NS.compiled_definitions.get_parsed_defs(
                )['namespace.tendrl']['tags'][service.strip("@*")]
                tags.append(service_tag)

                if service_tag == "tendrl/server":
                    tags.append("tendrl/monitor")
            s.save()

        if "tendrl/monitor" not in tags and \
                NS.tendrl_context.integration_id:
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=NS.tendrl_context.integration_id).load()
            # Try to claim orphan "provisioner_%integration_id" tag
            _tag = "provisioner/%s" % _cluster.integration_id
            _is_new_provisioner = False
            NS.node_context = NS.tendrl.objects.NodeContext().load()
            if _tag not in NS.node_context.tags:
                try:
                    _index_key = "/indexes/tags/%s" % _tag
                    _node_id = json.dumps([NS.node_context.node_id])
                    etcd_utils.write(_index_key, _node_id, prevExist=False)
                    # sync_ttl defaults to None: guard the arithmetic so
                    # a successful claim is not lost to a TypeError after
                    # the index key has already been written.
                    if sync_ttl:
                        etcd_utils.refresh(_index_key, sync_ttl + 50)
                    tags.append(_tag)
                    _is_new_provisioner = True
                except etcd.EtcdAlreadyExist:
                    # Another node already holds the provisioner tag
                    pass

        # updating node context with latest tags
        logger.log(
            "debug", NS.publisher_id,
            {"message": "node_sync, updating node context "
                        "data with tags"})
        NS.node_context = NS.tendrl.objects.NodeContext().load()
        current_tags = list(NS.node_context.tags)
        tags += current_tags
        NS.node_context.tags = list(set(tags))
        NS.node_context.tags.sort()
        current_tags.sort()
        # Only write the node context when the tag set actually changed
        if NS.node_context.tags != current_tags:
            NS.node_context.save()

        if "tendrl/monitor" not in tags and \
                NS.tendrl_context.integration_id:
            _cluster = _cluster.load()
            if _is_new_provisioner and _cluster.is_managed == "yes":
                _msg = "node_sync, NEW provisioner node found! "\
                    "re-configuring monitoring (job-id: %s) on this node"
                payload = {
                    "tags": ["tendrl/node_%s" % NS.node_context.node_id],
                    "run": "tendrl.flows.ConfigureMonitoring",
                    "status": "new",
                    "parameters": {
                        'TendrlContext.integration_id':
                        NS.tendrl_context.integration_id
                    },
                    "type": "node"
                }
                _job_id = str(uuid.uuid4())
                NS.tendrl.objects.Job(job_id=_job_id,
                                      status="new",
                                      payload=payload).save()
                logger.log("debug", NS.publisher_id,
                           {"message": _msg % _job_id})

        # Update /indexes/tags/:tag = [node_ids]
        # Iterate over a snapshot: stale provisioner tags are removed
        # from NS.node_context.tags inside the loop, and removing from
        # the list being iterated would skip the following tag.
        for tag in list(NS.node_context.tags):
            index_key = "/indexes/tags/%s" % tag
            _node_ids = []
            try:
                _node_ids = etcd_utils.read(index_key).value
                _node_ids = json.loads(_node_ids)
            except etcd.EtcdKeyNotFound:
                pass

            if _node_ids:
                if "provisioner" in tag:
                    # Check if this is a stale provisioner
                    if NS.node_context.node_id != _node_ids[0]:
                        NS.node_context.tags.remove(tag)
                        NS.node_context.save()
                        continue
                if NS.node_context.node_id in _node_ids:
                    # Already indexed; just keep a sole-owner key alive
                    if sync_ttl and len(_node_ids) == 1:
                        etcd_utils.refresh(index_key, sync_ttl + 50)
                    continue
                else:
                    _node_ids += [NS.node_context.node_id]
            else:
                _node_ids = [NS.node_context.node_id]
            _node_ids = list(set(_node_ids))

            etcd_utils.write(index_key, json.dumps(_node_ids))
            if sync_ttl and len(_node_ids) == 1:
                etcd_utils.refresh(index_key, sync_ttl + 50)
        logger.log("debug", NS.publisher_id,
                   {"message": "node_sync, Updating detected "
                               "platform"})
    except Exception as ex:
        Event(
            ExceptionMessage(priority="error",
                             publisher=NS.publisher_id,
                             payload={
                                 "message":
                                 "node_sync service and indexes "
                                 "sync failed: " + ex.message,
                                 "exception": ex
                             }))
def _sync_ec_profiles(self):
    """Sync erasure-code profiles into the central store.

    Runs two ceph CLI commands:

    1. ``ceph osd erasure-code-profile ls`` - expected output is one
       profile name per line, e.g.::

           default
           k4m2

    2. ``ceph osd erasure-code-profile get {name}`` - expected output is
       one ``key=value`` pair per line, e.g.::

           k=2
           m=1
           plugin=jerasure
           directory={dir}

    Stored profiles that are no longer reported are deleted, reported
    ones are saved, and any of the required (k, m) combinations
    (2,1), (4,2), (6,3), (8,4) that is missing is created through Crud.
    """
    required_ec_profiles = [(2, 1), (4, 2), (6, 3), (8, 4)]
    ec_profile_details = {}

    ls_out = ceph.ceph_command(NS.tendrl_context.cluster_name,
                               ['osd', 'erasure-code-profile', 'ls'])
    if ls_out['err'] == "":
        profile_names = [
            line for line in ls_out['out'].split('\n') if line != ""
        ]
        for profile_name in profile_names:
            get_out = ceph.ceph_command(
                NS.tendrl_context.cluster_name,
                ['osd', 'erasure-code-profile', 'get', profile_name])
            if get_out['err'] != "":
                continue
            info = {}
            for line in get_out['out'].split('\n'):
                if line == "":
                    continue
                fields = line.split('=')
                info[fields[0]] = fields[1].strip()
                # Recorded only once at least one pair was parsed
                ec_profile_details[profile_name] = info

    # Ec profile out of band delete handling
    try:
        stored = NS._int.client.read(
            "clusters/%s/ECProfiles" % (NS.tendrl_context.integration_id))
        stored_names = [leaf.key.split("/")[-1] for leaf in stored.leaves]
        reported_names = [
            name for name, _ in ec_profile_details.iteritems()
        ]
        for stale_name in set(stored_names) - set(reported_names):
            NS._int.client.delete(
                "clusters/%s/ECProfiles/%s" %
                (NS.tendrl_context.integration_id, stale_name),
                recursive=True)
    except etcd.EtcdKeyNotFound as ex:
        Event(
            ExceptionMessage(priority="debug",
                             publisher=NS.publisher_id,
                             payload={
                                 "message": "key not found in etcd",
                                 "exception": ex
                             }))

    available_ec_profiles = []
    for name, detail in ec_profile_details.iteritems():
        profile_obj = NS.ceph.objects.ECProfile(
            name=name,
            k=detail['k'],
            m=detail['m'],
            plugin=detail.get('plugin'),
            directory=detail.get('directory'),
            ruleset_failure_domain=detail.get('ruleset_failure_domain'))
        profile_obj.save()
        available_ec_profiles.append((int(detail['k']), int(detail['m'])))

    # Create the missing ec_profile_details
    missing_ec_profiles = [
        wanted for wanted in required_ec_profiles
        if wanted not in available_ec_profiles
    ]
    for k_val, m_val in missing_ec_profiles:
        attrs = {
            'name': "k%sm%s" % (k_val, m_val),
            'k': k_val,
            'm': m_val,
            'plugin': 'jerasure',
            'directory': '/usr/lib/ceph/erasure-code',
        }
        crud = Crud()
        crud.create("ec_profile", attrs)