def __init__(self, priority, publisher, payload, job_id=None, flow_id=None,
             parent_id=None, integration_id=None, message_id=None,
             timestamp=None, node_id=None, caller=None, *args, **kwargs):
    """A message routed through the tendrl event bus.

    When ``message_id`` is omitted, a fresh uuid4 id and a current
    timestamp are generated.  When ``caller`` is omitted, the frame
    that invoked this constructor is captured for diagnostics.  When
    ``node_id`` is omitted, the local node's id is used.
    """
    super(Message, self).__init__(*args, **kwargs)
    if message_id is not None:
        # Caller supplied identity: take id and timestamp verbatim
        self.message_id = message_id
        self.timestamp = timestamp
    else:
        self.message_id = str(uuid.uuid4())
        self.timestamp = now()
    if caller is not None:
        self.caller = caller
    else:
        # Record from which function, line and file this was raised
        frame_info = getframeinfo(stack()[1][0])
        self.caller = {
            "filename": frame_info.filename,
            "line_no": frame_info.lineno,
            "function": frame_info.function
        }
    self.priority = priority
    self.publisher = publisher
    # Fall back to this node's identity only when no explicit node given
    self.node_id = node_id if node_id is not None else NS.node_context.node_id
    self.job_id = job_id
    self.flow_id = flow_id
    self.parent_id = parent_id
    self.integration_id = integration_id
    self.payload = payload
def __init__(self, node_id=None, fqdn=None, ipv4_addr=None, updated_at=None,
             tags=None, status=None, sync_status=None, last_sync=None,
             first_sync_done=None, is_managed=None, *args, **kwargs):
    """Per-cluster view of a node's context.

    Every field not supplied explicitly is copied from the freshly
    loaded node-local ``NS.node_context`` (``updated_at`` defaults to
    the current time).
    """
    super(ClusterNodeContext, self).__init__(*args, **kwargs)
    local_ctx = NS.node_context.load()
    self.node_id = node_id or local_ctx.node_id
    self.fqdn = fqdn or local_ctx.fqdn
    self.ipv4_addr = ipv4_addr or local_ctx.ipv4_addr
    self.updated_at = updated_at or str(time_utils.now())
    self.tags = tags or local_ctx.tags
    self.status = status or local_ctx.status
    self.sync_status = sync_status or local_ctx.sync_status
    self.last_sync = last_sync or local_ctx.last_sync
    self.first_sync_done = first_sync_done
    self.is_managed = is_managed
    # etcd key template; cluster and node ids are substituted on render
    self.value = 'clusters/{0}/nodes/{1}/NodeContext'
def test_constructor_Messsage():
    """Exercise Message constructor defaulting and explicit overrides."""
    init()
    # Defaults: message_id generated, caller frame captured
    msg = Message(priority="info",
                  publisher="node_context",
                  payload={"message": "Test Message"})
    assert msg.priority == "info"
    assert msg.publisher == "node_context"
    assert msg.caller is not None
    assert msg.message_id is not None
    # Explicit message_id / timestamp are taken verbatim
    msg = Message("info", "node_context", message_id=1, timestamp=now(),
                  payload={"message": "Test Message"})
    assert msg.message_id == 1
    assert isinstance(msg.timestamp, datetime.datetime)
    # An explicit caller dict is accepted unchanged
    frame = getframeinfo(stack()[1][0])
    caller_info = {
        "filename": frame.filename,
        "line_no": frame.lineno,
        "function": frame.function
    }
    msg = Message("info", "node_context",
                  payload={"message": "Test Message"},
                  caller=caller_info)
    # An explicit node_id wins over NS.node_context.node_id
    msg = Message(priority="info",
                  publisher="node_context",
                  payload={"message": "Test Message"},
                  node_id="Test id")
    assert msg.node_id == "Test id"
def _emit_event(self, severity, resource, curr_value, msg, plugin_instance=None):
    """Publish a status-type alert Event for ``resource``.

    Silently does nothing until this node has an id in NS.node_context.
    """
    if not NS.node_context.node_id:
        return
    alert = {
        'source': NS.publisher_id,
        'pid': os.getpid(),
        'time_stamp': now().isoformat(),
        'alert_type': 'status',
        'severity': SEVERITIES[severity],
        'resource': resource,
        'current_value': curr_value,
        'tags': dict(message=msg,
                     cluster_id=NS.tendrl_context.integration_id,
                     cluster_name=NS.tendrl_context.cluster_name,
                     sds_name=NS.tendrl_context.sds_name,
                     fqdn=socket.getfqdn())
    }
    if plugin_instance:
        alert['tags']['plugin_instance'] = plugin_instance
    alert['node_id'] = NS.node_context.node_id
    Event(Message("notice", "alerting", {'message': json.dumps(alert)}))
def _read(*args, **kwargs):
    """etcd ``read()`` stand-in driving a canned job-status sequence.

    ``args[1]`` is the etcd key being read; the module-level counters
    ``status_flag`` / ``status_valid`` advance on each call so that
    successive reads of the same key return different canned answers.
    """
    test_job._complete._Event__flag = True
    global status_flag
    global status_valid
    key = args[1]
    if key == "/queue/job/status":
        if status_flag == 0:
            status_flag = 1
            return maps.NamedDict(leaves=[maps.NamedDict(key="test/job")],
                                  value="finished")
        if status_flag == 1:
            status_flag = 2
            return maps.NamedDict(leaves=[maps.NamedDict(key="test/job")],
                                  value="unfinished")
        if status_flag == 2:
            # Third read: pretend the key disappeared
            raise etcd.EtcdKeyNotFound
    elif key in ("/queue", "/queue/job/locked_by"):
        return maps.NamedDict(leaves=[maps.NamedDict(key="test/job")],
                              value=False)
    elif key == "/queue/job/valid_until":
        if status_valid == 0:
            status_valid = 1
            return maps.NamedDict(leaves=[maps.NamedDict(key="test/job")],
                                  value=False)
        if status_valid == 1:
            # Second read: seconds since the unix epoch, as etcd stores it
            epoch = datetime.datetime(1970, 1, 1).replace(tzinfo=utc)
            return maps.NamedDict(
                leaves=[maps.NamedDict(key="test/job")],
                value=(time_utils.now() - epoch).total_seconds())
def __init__(
    self,
    plugin_name=None,
    node_id=None,
    job_id='',
    time_stamp=None,
    *args,
    **kwargs
):
    """Monitoring-plugin configuration job record for a node.

    BUG FIX: the signature previously read ``time_stamp=str(now())``.
    Python evaluates default expressions once, at function-definition
    (import) time, so every instance created without an explicit
    ``time_stamp`` shared the module-load timestamp.  A ``None``
    sentinel is used instead and the timestamp is computed per
    instance; callers passing an explicit value behave exactly as
    before.
    """
    # TODO(anmol_b): Add status to track configuration status and retrial
    # count if auto-retrials are required. node-monitoring which is the
    # consumer of these monitoring configuration jobs, will update success
    # or failure in configuration when the job is picked by it. And retrial
    # counter would be incremented from the only supposed way to load
    # monitoring utils#util#initiate_config_generation jobs to /queue.
    # There unpicked timed-out jobs can then be updated from configuration
    # threads - configure_node_monitoiring and configure_cluster_monitoring
    # before deciding to attempt/re-attempt config generation by looking at
    # job's' status using the job_id here..
    super(NodeMonitoringPlugin, self).__init__(*args, **kwargs)
    self.node_id = node_id
    self.plugin_name = plugin_name
    self.job_id = job_id
    # Fresh timestamp per instance unless the caller supplied one
    self.time_stamp = time_stamp if time_stamp is not None else str(now())
    self.value = 'monitoring/plugin_configurations/nodes/{0}/{1}'
def _run(self): Event( Message( priority="info", publisher=NS.publisher_id, payload={"message": "%s running" % self.__class__.__name__})) # Check if monitor key exists, if not sync try: NS._int.client.read("clusters/%s/_mon_key" % NS.tendrl_context.integration_id) except etcd.EtcdKeyNotFound: out, err, rc = cmd_utils.Command( "ceph auth get mon. --cluster %s" % NS.tendrl_context.cluster_name).run() if rc != 0: Event( Message(priority="debug", publisher=NS.publisher_id, payload={ "message": "Couldn't get monitor key. Error:%s" % err })) else: if out and out != "": mon_sec = out.split('\n')[1].strip().split( ' = ')[1].strip() NS._int.wclient.write( "clusters/%s/_mon_key" % NS.tendrl_context.integration_id, mon_sec) while not self._complete.is_set(): gevent.sleep(int(NS.config.data.get("sync_interval", 10))) try: NS._int.wclient.write("clusters/%s/sync_status" % NS.tendrl_context.integration_id, "in_progress", prevExist=False) except (etcd.EtcdAlreadyExist, etcd.EtcdCompareFailed) as ex: pass cluster_data = ceph.heartbeat(NS.tendrl_context.cluster_id) self.on_heartbeat(cluster_data) _cluster = NS.tendrl.objects.Cluster( integration_id=NS.tendrl_context.integration_id) if _cluster.exists(): _cluster.sync_status = "done" _cluster.last_sync = str(now()) _cluster.save() Event( Message( priority="info", publisher=NS.publisher_id, payload={"message": "%s complete" % self.__class__.__name__}))
def update(self, new_alert, existing_alert):
    """Carry the existing alert's id onto ``new_alert``.

    If the update is at (or below) INFO severity — i.e. the condition
    cleared — the alert is auto-acknowledged on behalf of the system.
    Returns the updated ``new_alert``.
    """
    new_sev = alert_severity_map[new_alert.severity]
    clearing = (new_sev <= alert_severity_map[existing_alert.severity]
                and new_sev == alert_severity_map['INFO'])
    if clearing:
        new_alert.ackedby = "TENDRL"
        new_alert.acked = True
        new_alert.acked_at = now()
        new_alert.ack_comment = ['System acked']
    new_alert.alert_id = existing_alert.alert_id
    return new_alert
def update(self, new_alert, existing_alert):
    """Carry the existing alert's id onto ``new_alert``, auto-acking
    INFO-level updates (the condition has cleared).

    Returns the updated ``new_alert``.
    """
    severity_now = alert_severity_map[new_alert.severity]
    severity_was = alert_severity_map[existing_alert.severity]
    if (severity_now == alert_severity_map['INFO']
            and severity_now <= severity_was):
        # Dropped back to INFO: acknowledge on behalf of the system
        new_alert.ackedby = "TENDRL"
        new_alert.acked = True
        new_alert.acked_at = now()
        new_alert.ack_comment = ['System acked']
    new_alert.alert_id = existing_alert.alert_id
    return new_alert
def update(self, new_alert, existing_alert):
    """Carry the existing alert's id and (usually) timestamp onto
    ``new_alert``.

    When severity strictly drops to INFO the new timestamp is kept and
    the alert is auto-acknowledged by the system; otherwise the
    existing alert's timestamp is preserved.  Returns ``new_alert``.
    """
    time_stamp = existing_alert.time_stamp
    sev_new = alert_severity_map[new_alert.severity]
    sev_old = alert_severity_map[existing_alert.severity]
    if sev_new < sev_old and sev_new == alert_severity_map['INFO']:
        # Severity strictly dropped to INFO: keep the new time, auto-ack
        time_stamp = new_alert.time_stamp
        new_alert.ackedby = constants.TENDRL
        new_alert.acked = True
        new_alert.acked_at = now()
        new_alert.ack_comment = ['System acked']
    new_alert.alert_id = existing_alert.alert_id
    new_alert.time_stamp = time_stamp
    return new_alert
def save(self, update=True, ttl=None):
    """Persist this object's rendered attributes to the central store.

    Skips rewriting the data/hash bookkeeping keys when the locally
    computed hash already matches the central store's.  Watchable
    attributes are written key-by-key (JSON/list values serialised
    first); attributes listed in ``self._attrs_with_ttl`` are written
    with a per-key TTL so they expire on their own.
    """
    hash_key_changed = True
    if "Message" not in self.__class__.__name__:
        # If local object.hash is equal to
        # central_store object.hash, return
        if self.hash_compare_with_central_store(ttl=ttl):
            # No change in hashkey
            hash_key_changed = False
    rendered_obj = self.render()
    watchables = self._defs.get("watch_attrs", [])
    if self.__class__.__name__ in ['Config', 'Definition'] or \
            len(watchables) > 0:
        for item in rendered_obj:
            if item['name'] in watchables:
                _type = self._defs.get("attrs", {}).get(item['name'],
                                                        {}).get("type")
                if _type and _type.lower() in ['json', 'list'] and \
                        item['value']:
                    try:
                        item['value'] = json.dumps(item['value'])
                    except ValueError:
                        # NOTE(review): self.__name__ is unusual on an
                        # instance — likely self.__class__.__name__ was
                        # intended; confirm before relying on this path
                        _msg = "Error save() attr %s for object %s" % \
                            (item['name'], self.__name__)
                        logger.log("debug", NS.publisher_id,
                                   {"message": _msg})
                if self._ttl and item['name'] in self._attrs_with_ttl:
                    etcd_utils.write(item['key'], item['value'],
                                     quorum=True, ttl=self._ttl)
                else:
                    etcd_utils.write(item['key'], item['value'],
                                     quorum=True)
    if hash_key_changed:
        # Content changed: rewrite full JSON blob plus bookkeeping keys
        data_key = self.value + '/data'
        etcd_utils.write(data_key, self.json)
        updated_at_key = self.value + '/updated_at'
        hash_key = self.value + '/hash'
        etcd_utils.write(updated_at_key, str(time_utils.now()))
        if hasattr(self, 'hash'):
            etcd_utils.write(hash_key, self.hash)
    if ttl:
        # Refresh expiry of the whole object subtree
        etcd_utils.refresh(self.value, ttl)
    self.watch_attrs()
def save(self, update=True, ttl=None):
    """Persist this object's rendered attributes to the central store.

    Skips rewriting the data/hash bookkeeping keys when the locally
    computed hash already matches the central store's.  Watchable
    attributes are written key-by-key, with JSON/list values
    serialised to JSON strings first.
    """
    hash_key_changed = True
    if "Message" not in self.__class__.__name__:
        # If local object.hash is equal to
        # central_store object.hash, return
        if self.hash_compare_with_central_store(ttl=ttl):
            # No change in hashkey
            hash_key_changed = False
    rendered_obj = self.render()
    watchables = self._defs.get("watch_attrs", [])
    if self.__class__.__name__ in ['Config', 'Definition'] or \
            len(watchables) > 0:
        for item in rendered_obj:
            if item['name'] in watchables:
                _type = self._defs.get("attrs", {}).get(
                    item['name'],
                    {}
                ).get("type")
                if _type and _type.lower() in ['json', 'list'] and \
                        item['value']:
                    try:
                        item['value'] = json.dumps(item['value'])
                    except ValueError:
                        # NOTE(review): self.__name__ is unusual on an
                        # instance — likely self.__class__.__name__ was
                        # intended; confirm before relying on this path
                        _msg = "Error save() attr %s for object %s" % \
                            (item['name'], self.__name__)
                        logger.log(
                            "debug",
                            NS.publisher_id,
                            {"message": _msg}
                        )
                etcd_utils.write(item['key'], item['value'], quorum=True)
    if hash_key_changed:
        # Content changed: rewrite full JSON blob plus bookkeeping keys
        data_key = self.value + '/data'
        etcd_utils.write(data_key, self.json)
        updated_at_key = self.value + '/updated_at'
        hash_key = self.value + '/hash'
        etcd_utils.write(updated_at_key, str(time_utils.now()))
        if hasattr(self, 'hash'):
            etcd_utils.write(hash_key, self.hash)
    if ttl:
        # Refresh expiry of the whole object subtree
        etcd_utils.refresh(self.value, ttl)
    self.watch_attrs()
def __init__(self, node_id=None, fqdn=None, ipv4_addr=None, tags=None,
             status=None, sync_status=None, last_sync=None, updated_at=None,
             pkey=None, *args, **kwargs):
    """Context describing this node: identity, address, tags, sync state.

    Fields default to discovered values: node id from local state
    (created if absent), fqdn/ipv4 from the OS, and tags merged from
    the caller, the service config and any tags already stored in etcd.

    BUG FIX: the original did ``self.tags = tags or []`` followed by
    ``self.tags += ...``, which extended the *caller's* list in place
    whenever a ``tags`` argument was supplied.  The merge is now built
    in a fresh list so the argument is never mutated.
    """
    super(NodeContext, self).__init__(*args, **kwargs)
    self.node_id = node_id or self._get_node_id() or self._create_node_id()
    self.fqdn = fqdn or socket.getfqdn()
    self.ipv4_addr = ipv4_addr or socket.gethostbyname(self.fqdn)
    # Tags already persisted for this node, if any (stored JSON-encoded)
    curr_tags = []
    try:
        curr_tags = NS._int.client.read("/nodes/%s/NodeContext/tags" %
                                        self.node_id).value
    except etcd.EtcdKeyNotFound:
        pass
    try:
        curr_tags = json.loads(curr_tags)
    except (ValueError, TypeError):
        # No existing tags
        pass
    merged_tags = list(tags or [])
    merged_tags += NS.config.data.get('tags', [])
    merged_tags += curr_tags
    self.tags = list(set(merged_tags))
    self.status = status or "UP"
    self.sync_status = sync_status
    self.last_sync = last_sync
    self.updated_at = updated_at or str(time_utils.now())
    self.pkey = pkey or self.fqdn
    self.value = 'nodes/{0}/NodeContext'
def init(patch_write, patch_refresh, patch_client):
    """Build a fake ``NS`` namespace for the tests and return a canned
    message dict (this variant keys the cluster as ``cluster_id``)."""
    patch_write.return_value = True
    patch_refresh.return_value = True
    patch_client.return_value = etcd.Client()
    setattr(__builtin__, "NS", maps.NamedDict())
    setattr(NS, "_int", maps.NamedDict())
    NS._int.etcd_kwargs = {'port': 1, 'host': 2, 'allow_reconnect': True}
    NS._int.client = etcd.Client(**NS._int.etcd_kwargs)
    NS._int.wclient = etcd.Client(**NS._int.etcd_kwargs)
    NS["config"] = maps.NamedDict()
    NS.config["data"] = maps.NamedDict()
    NS.config.data['message_retention_time'] = "infinite"
    NS.node_agent = maps.NamedDict()
    NS.node_agent.objects = importlib.import_module(
        "tendrl.commons.tests.fixtures.cluster_message")
    NS.node_context = maps.NamedDict()
    NS.node_context.node_id = 1
    # Capture the caller's frame for the message's "caller" field
    frame = getframeinfo(stack()[1][0])
    message = maps.NamedDict()
    message.update({
        "priority": "info",
        "cluster_id": "test_cluster",
        "message_id": "test_id",
        "timestamp": now(),
        "publisher": "node_context",
        "node_id": "test_id",
        "payload": {"message": "test_message"},
        "job_id": "test_job_id",
        "flow_id": "test_flow_id",
        "parent_id": "test_parent_id",
        "caller": {"filename": frame.filename,
                   "line_no": frame.lineno,
                   "function": frame.function}
    })
    return message
def init(patch_write, patch_refresh, patch_client):
    """Build a fake ``NS`` namespace for the tests and return a canned
    message dict (this variant keys the cluster as ``integration_id``)."""
    patch_write.return_value = True
    patch_refresh.return_value = True
    patch_client.return_value = etcd.Client()
    setattr(__builtin__, "NS", maps.NamedDict())
    setattr(NS, "_int", maps.NamedDict())
    NS._int.etcd_kwargs = {'port': 1, 'host': 2, 'allow_reconnect': True}
    NS._int.client = etcd.Client(**NS._int.etcd_kwargs)
    NS._int.wclient = etcd.Client(**NS._int.etcd_kwargs)
    NS["config"] = maps.NamedDict()
    NS.config["data"] = maps.NamedDict()
    NS.config.data['message_retention_time'] = "infinite"
    NS.node_agent = maps.NamedDict()
    NS.node_agent.objects = importlib.import_module(
        "tendrl.commons.tests.fixtures.cluster_message")
    NS.node_context = maps.NamedDict()
    NS.node_context.node_id = 1
    message = maps.NamedDict()
    message["priority"] = "info"
    message["integration_id"] = "test_cluster"
    message["message_id"] = "test_id"
    message["timestamp"] = now()
    message["publisher"] = "node_context"
    message["node_id"] = "test_id"
    message["payload"] = {"message": "test_message"}
    message["job_id"] = "test_job_id"
    message["flow_id"] = "test_flow_id"
    message["parent_id"] = "test_parent_id"
    # Capture the caller's frame for the message's "caller" field
    caller_frame = getframeinfo(stack()[1][0])
    message["caller"] = {"filename": caller_frame.filename,
                         "line_no": caller_frame.lineno,
                         "function": caller_frame.function}
    return message
def run(self):
    """Node sync thread main loop.

    Repeatedly refreshes the node context, runs platform/SDS detection
    and service/index sync, persists hardware inventory (os/cpu/memory,
    disks, networks) and — on the monitor-tagged node — checks the
    status of all managed nodes and clusters.
    """
    Event(
        Message(
            priority="info",
            publisher=NS.publisher_id,
            payload={"message": "%s running" % self.__class__.__name__}))
    NS.node_context = NS.node_context.load()
    current_tags = list(NS.node_context.tags)
    current_tags += ["tendrl/node_%s" % NS.node_context.node_id]
    NS.node_context.tags = list(set(current_tags))
    NS.node_context.status = "UP"
    NS.node_context.save()
    # Initialize alert count
    try:
        key = '/nodes/%s/alert_counters' % NS.node_context.node_id
        etcd_utils.read(key)
    except (EtcdException) as ex:
        if type(ex) == EtcdKeyNotFound:
            NodeAlertCounters(node_id=NS.node_context.node_id).save()
    _sleep = 0
    while not self._complete.is_set():
        # TTL padded well past the interval so keys survive a full cycle
        _sync_ttl = int(NS.config.data.get("sync_interval", 10)) + 100
        if _sleep > 5:
            _sleep = int(NS.config.data.get("sync_interval", 10))
        else:
            _sleep += 1
        NS.node_context = NS.node_context.load()
        NS.node_context.sync_status = "in_progress"
        NS.node_context.status = "UP"
        NS.node_context.save(ttl=_sync_ttl)
        NS.tendrl_context = NS.tendrl_context.load()
        sync_cluster_contexts_thread = threading.Thread(
            target=cluster_contexts_sync.sync,
            args=(_sync_ttl, ))
        sync_cluster_contexts_thread.daemon = True
        sync_cluster_contexts_thread.start()
        sync_cluster_contexts_thread.join()
        platform_detect_thread = threading.Thread(
            target=platform_detect.sync)
        # NOTE(review): this sets .daemon on the *module*, not the thread;
        # likely platform_detect_thread.daemon = True was intended — confirm
        platform_detect.daemon = True
        platform_detect_thread.start()
        platform_detect_thread.join()
        sds_detect_thread = threading.Thread(target=sds_detect.sync)
        sds_detect_thread.daemon = True
        sds_detect_thread.start()
        sds_detect_thread.join()
        sync_service_and_index_thread = threading.Thread(
            target=services_and_index_sync.sync,
            args=(_sync_ttl, ))
        sync_service_and_index_thread.daemon = True
        sync_service_and_index_thread.start()
        sync_service_and_index_thread.join()
        try:
            NS.tendrl.objects.Os().save()
            NS.tendrl.objects.Cpu().save()
            NS.tendrl.objects.Memory().save()
        except Exception as ex:
            Event(
                ExceptionMessage(priority="error",
                                 publisher=NS.publisher_id,
                                 payload={
                                     "message": "node_sync "
                                                "os/cpu/memory sync "
                                                "failed: " + ex.message,
                                     "exception": ex
                                 }))
            # Mark this cycle failed but keep the node reported as UP
            NS.node_context = NS.node_context.load()
            NS.node_context.sync_status = "failed"
            NS.node_context.last_sync = str(time_utils.now())
            NS.node_context.status = "UP"
            NS.node_context.save(ttl=_sync_ttl)
            time.sleep(_sleep)
        sync_disks_thread = threading.Thread(target=disk_sync.sync)
        sync_disks_thread.daemon = True
        sync_disks_thread.start()
        sync_disks_thread.join()
        sync_networks_thread = threading.Thread(target=network_sync.sync)
        sync_networks_thread.daemon = True
        sync_networks_thread.start()
        sync_networks_thread.join()
        NS.node_context = NS.node_context.load()
        NS.node_context.sync_status = "done"
        NS.node_context.last_sync = str(time_utils.now())
        NS.node_context.status = "UP"
        NS.node_context.save(ttl=_sync_ttl)
        sync_cluster_contexts_thread = threading.Thread(
            target=cluster_contexts_sync.sync,
            args=(_sync_ttl, ))
        sync_cluster_contexts_thread.daemon = True
        sync_cluster_contexts_thread.start()
        sync_cluster_contexts_thread.join()
        if "tendrl/monitor" in NS.node_context.tags:
            # Monitor node additionally watches everyone else's health
            check_all_managed_node_status_thread = threading.Thread(
                target=check_all_managed_nodes_status.run)
            check_all_managed_node_status_thread.daemon = True
            check_all_managed_node_status_thread.start()
            check_all_managed_node_status_thread.join()
            check_cluster_status_thread = threading.Thread(
                target=check_cluster_status.run)
            check_cluster_status_thread.daemon = True
            check_cluster_status_thread.start()
            check_cluster_status_thread.join()
            # Start the gluster integrations sync exactly once
            if not NS.gluster_sds_sync_running:
                NS.gluster_integrations_sync_thread = \
                    gluster_integrations_sds_sync.\
                    GlusterIntegrtaionsSyncThread()
                NS.gluster_integrations_sync_thread.start()
                NS.gluster_sds_sync_running = True
        time.sleep(_sleep)
    Event(
        Message(
            priority="info",
            publisher=NS.publisher_id,
            payload={"message": "%s complete" % self.__class__.__name__}))
def volume_delete(self, event):
    """Handle a gluster volume-delete event.

    Marks the matching volume object deleted, removes its bricks from
    etcd, and schedules grafana-dashboard and graphite cleanup jobs for
    each brick and finally for the volume itself.
    """
    time.sleep(self.sync_interval)
    fetched_volumes = NS.tendrl.objects.GlusterVolume(
        NS.tendrl_context.integration_id).load_all()
    for fetched_volume in fetched_volumes:
        if fetched_volume.name == event['message']['name']:
            fetched_volume.deleted = True
            fetched_volume.deleted_at = time_utils.now()
            fetched_volume.save()
            try:
                sub_volumes = etcd_utils.read(
                    "/clusters/{0}/Volumes/{1}/Bricks".format(
                        NS.tendrl_context.integration_id,
                        fetched_volume.vol_id))
                for sub_volume in sub_volumes.leaves:
                    bricks = etcd_utils.read(sub_volume.key)
                    for brick in bricks.leaves:
                        # Brick leaf keys end in "<fqdn>:<path>"
                        fqdn = brick.key.split('/')[-1].split(':')[0]
                        path = brick.key.split('/')[-1].split(':')[-1][1:]
                        # Delete brick dashboard from grafana
                        brick_obj = NS.tendrl.objects.GlusterBrick(
                            NS.tendrl_context.integration_id,
                            fqdn, path).load()
                        # Delete brick
                        brick_path = "clusters/{0}/Bricks/"\
                            "all/{1}/{2}".format(
                                NS.tendrl_context.integration_id,
                                fqdn,
                                path
                            )
                        etcd_utils.delete(brick_path, recursive=True)
                        brick_full_path = fqdn + ":" + brick_obj.\
                            brick_path.split(":")[-1]
                        job_id = monitoring_utils.update_dashboard(
                            "%s|%s" % (event['message']['name'],
                                       brick_full_path),
                            RESOURCE_TYPE_BRICK,
                            NS.tendrl_context.integration_id,
                            "delete")
                        logger.log(
                            "debug",
                            NS.publisher_id,
                            {
                                "message": "Update dashboard job %s"
                                           " for brick %s "
                                           "in cluster %s created" %
                                           (job_id,
                                            brick.key.split('/')[-1],
                                            NS.tendrl_context.integration_id)
                            })
                        # Delete brick from graphite
                        job_id = monitoring_utils.\
                            delete_resource_from_graphite(
                                "%s|%s" % (
                                    event['message']['name'],
                                    brick_full_path
                                ),
                                RESOURCE_TYPE_BRICK,
                                NS.tendrl_context.integration_id,
                                "delete"
                            )
                        logger.log(
                            "debug",
                            NS.publisher_id,
                            {
                                "message": "Delete resource "
                                           "from graphite job %s "
                                           "for brick %s in cluster %s "
                                           "created" %
                                           (job_id,
                                            brick.key.split('/')[-1],
                                            NS.tendrl_context.integration_id)
                            })
            except etcd.EtcdKeyNotFound:
                # Volume had no bricks recorded; nothing to clean up
                pass
    # Delete volume dashboard from grafana
    job_id = monitoring_utils.update_dashboard(
        event['message']['name'],
        RESOURCE_TYPE_VOLUME,
        NS.tendrl_context.integration_id,
        "delete")
    logger.log("debug", NS.publisher_id,
               {"message": "Update dashboard job %s "
                           "created" % job_id})
    # Delete volume details from graphite
    job_id = monitoring_utils.delete_resource_from_graphite(
        event['message']['name'],
        RESOURCE_TYPE_VOLUME,
        NS.tendrl_context.integration_id,
        "delete")
    logger.log("debug", NS.publisher_id, {
        "message": "Delete resource from graphite job %s "
                   "created" % job_id
    })
def process_job(jid):
    """Pick up, lock and execute the queued job with id ``jid``.

    The monitor-tagged node additionally times out jobs stuck in the
    "new" state for more than 10 minutes.  A job is claimed by
    compare-and-swap on its status/lock keys; losing the race means
    another agent is handling it and we simply return.
    """
    job = NS.tendrl.objects.Job(job_id=jid).load()
    if job.status in [None, ""]:
        job.status = "new"
        job.save()
    NS.node_context = NS.node_context.load()
    # Check job not already "finished", or "processing"
    try:
        if job.status in ["finished", "processing", "failed"]:
            return
    except etcd.EtcdKeyNotFound:
        pass
    try:
        _timeout = None
        _timeout = job.timeout
        if _timeout:
            _timeout = _timeout.lower()
    except etcd.EtcdKeyNotFound:
        pass
    # tendrl-node-agent tagged as tendrl/monitor will ensure
    # >10 min old "new" jobs are timed out and marked as
    # "failed" (the parent job of these jobs will also be
    # marked as "failed")
    if "tendrl/monitor" in NS.node_context.tags and \
            _timeout == "yes" and job.status == "new":
        _valid_until = job.valid_until
        if _valid_until:
            _now_epoch = (time_utils.now() -
                          datetime.datetime(1970, 1, 1).replace(
                              tzinfo=utc)).total_seconds()
            if int(_now_epoch) >= int(_valid_until):
                # Job has "new" status since 10 minutes,
                # mark status as "failed" and Job.error =
                # "Timed out"
                try:
                    job = job.load()
                    if job.status == "new":
                        job.status = "failed"
                        job.save()
                except etcd.EtcdCompareFailed:
                    pass
                else:
                    job = NS.tendrl.objects.Job(job_id=jid).load()
                    if job.status == "new":
                        _msg = str("Timed-out (>10min as 'new')")
                        job.errors = _msg
                        job.save()
                        if job.payload.get('parent') is None:
                            integration_id = \
                                NS.tendrl_context.integration_id
                            alert_utils.alert_job_status(
                                "failed",
                                "Job timed out (job_id: %s)" % jid,
                                integration_id=integration_id or
                                job.payload['parameters'].get(
                                    'TendrlContext.integration_id'
                                ),
                                cluster_name=NS.tendrl_context.cluster_name
                                or job.payload['parameters'].get(
                                    'TendrlContext.cluster_name'
                                )
                            )
                return
        else:
            # No valid_until yet: stamp one 10 minutes from now
            _now_plus_10 = time_utils.now() + datetime.timedelta(minutes=10)
            _epoch_start = datetime.datetime(1970, 1, 1).replace(tzinfo=utc)
            _now_plus_10_epoch = (_now_plus_10 -
                                  _epoch_start).total_seconds()
            # To avoid server and storage node do save same time
            time.sleep(7)
            job = job.load()
            if job.status == "new":
                job.valid_until = int(_now_plus_10_epoch)
                job.save()
    job = NS.tendrl.objects.Job(job_id=jid).load()
    if job.payload["type"] == NS.type and \
            job.status == "new":
        # Job routing
        # Flows created by tendrl-api use 'tags' from flow
        # definition to target jobs
        _tag_match = False
        if job.payload.get("tags", []):
            for flow_tag in job.payload['tags']:
                if flow_tag in NS.node_context.tags:
                    _tag_match = True
        if not _tag_match:
            _job_tags = ", ".join(job.payload.get("tags", []))
            _msg = "Node (%s)(type: %s)(tags: %s) will not " \
                   "process job-%s (tags: %s)" % \
                   (NS.node_context.node_id, NS.type,
                    NS.node_context.tags, jid, _job_tags)
            logger.log(
                "debug",
                NS.publisher_id,
                {"message": _msg}
            )
            return
        # Claim the job via CAS on its status and lock keys
        try:
            try:
                job_status_key = "/queue/%s/status" % job.job_id
                etcd_utils.write(job_status_key,
                                 "processing",
                                 prevValue="new")
            except etcd.EtcdKeyNotFound:
                # if status watchable attribute not present
                # then it will be created when job save happens
                pass
            lock_info = dict(node_id=NS.node_context.node_id,
                             fqdn=NS.node_context.fqdn,
                             type=NS.type)
            job = job.load()
            job.locked_by = lock_info
            job.status = "processing"
            job.save(ttl=DEFAULT_JOB_TTL)
        except etcd.EtcdCompareFailed:
            # job is already being processed by some tendrl
            # agent
            return
        the_flow = None
        try:
            current_ns, flow_name, obj_name = \
                _extract_fqdn(job.payload['run'])
            if obj_name:
                runnable_flow = current_ns.ns.get_obj_flow(
                    obj_name, flow_name)
            else:
                runnable_flow = current_ns.ns.get_flow(flow_name)
            # Re-check we still hold the lock before running
            job = job.load()
            lock_info = dict(node_id=NS.node_context.node_id,
                             fqdn=NS.node_context.fqdn,
                             type=NS.type)
            if job.locked_by != lock_info:
                return
            the_flow = runnable_flow(parameters=job.payload[
                'parameters'], job_id=job.job_id)
            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Starting Job %s" % job.job_id},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id']
            )
            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Running %s" %
                 job.payload['run'].split('.')[-1]},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id']
            )
            the_flow.run()
            try:
                job = job.load()
                job.status = "finished"
                job.save()
            except etcd.EtcdCompareFailed:
                # This should not happen!
                _msg = "Cannot mark job as 'finished', " \
                       "current job status invalid"
                raise FlowExecutionFailedError(_msg)
            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Job (%s) for %s finished. " % (
                    job.job_id,
                    job.payload['run'].split('.')[-1])},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id'],
            )
            if job.payload.get('parent') is None:
                alert_utils.alert_job_status(
                    "finished",
                    "%s (job ID: %s) completed successfully " % (
                        job.payload['run'].split('.')[-1],
                        job.job_id),
                    integration_id=NS.tendrl_context.integration_id or
                    job.payload['parameters'].get(
                        'TendrlContext.integration_id'
                    ),
                    cluster_name=NS.tendrl_context.cluster_name or
                    job.payload['parameters'].get(
                        'TendrlContext.cluster_name'
                    )
                )
        except (FlowExecutionFailedError,
                AtomExecutionFailedError,
                Exception) as e:
            _trace = str(traceback.format_exc(e))
            _msg = "Failure in Job %s Flow %s with error:" % \
                   (job.job_id, job.payload['run'])
            Event(
                ExceptionMessage(
                    priority="error",
                    publisher=NS.publisher_id,
                    payload={"message": _msg + _trace,
                             "exception": e
                             }
                )
            )
            if the_flow:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": _msg + "\n" + _trace},
                    job_id=job.job_id,
                    flow_id=the_flow.parameters['flow_id']
                )
            else:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": _msg + "\n" + _trace}
                )
            try:
                job = job.load()
                job.status = "failed"
                job.save()
            except etcd.EtcdCompareFailed:
                # This should not happen!
                _msg = "Cannot mark job as 'failed', current" \
                       "job status invalid"
                raise FlowExecutionFailedError(_msg)
            else:
                job = job.load()
                job.errors = _trace
                if job.payload.get('parent') is None:
                    alert_utils.alert_job_status(
                        "failed",
                        "Job failed (job_id: %s)" % job.job_id,
                        integration_id=NS.tendrl_context.integration_id or
                        job.payload['parameters'].get(
                            'TendrlContext.integration_id'
                        ),
                        cluster_name=NS.tendrl_context.cluster_name or
                        job.payload['parameters'].get(
                            'TendrlContext.cluster_name'
                        )
                    )
                job.save()
def __init__(self):
    """Initialise with the current time and no alert attached yet."""
    self.alert = None
    self.time_stamp = now()
def volume_delete(self, event):
    """Handle a gluster volume-delete event.

    Marks the matching volume object deleted, removes its bricks from
    etcd, and schedules grafana-dashboard and graphite cleanup jobs for
    each brick and finally for the volume itself.
    """
    time.sleep(self.sync_interval)
    fetched_volumes = NS.tendrl.objects.GlusterVolume(
        NS.tendrl_context.integration_id
    ).load_all()
    for fetched_volume in fetched_volumes:
        if fetched_volume.name == event['message']['name']:
            fetched_volume.deleted = True
            fetched_volume.deleted_at = time_utils.now()
            fetched_volume.save()
            try:
                sub_volumes = etcd_utils.read(
                    "/clusters/{0}/Volumes/{1}/Bricks".format(
                        NS.tendrl_context.integration_id,
                        fetched_volume.vol_id
                    )
                )
                for sub_volume in sub_volumes.leaves:
                    bricks = etcd_utils.read(
                        sub_volume.key
                    )
                    for brick in bricks.leaves:
                        # Brick leaf keys end in "<fqdn>:<path>"
                        fqdn = brick.key.split('/')[-1].split(':')[0]
                        path = brick.key.split('/')[-1].split(':')[-1][1:]
                        # Delete brick dashboard from grafana
                        brick_obj = NS.tendrl.objects.GlusterBrick(
                            NS.tendrl_context.integration_id,
                            fqdn,
                            path
                        ).load()
                        # Delete brick
                        brick_path = "clusters/{0}/Bricks/"\
                            "all/{1}/{2}".format(
                                NS.tendrl_context.integration_id,
                                fqdn,
                                path
                            )
                        etcd_utils.delete(
                            brick_path,
                            recursive=True
                        )
                        brick_full_path = fqdn + ":" + brick_obj.\
                            brick_path.split(":")[-1]
                        job_id = monitoring_utils.update_dashboard(
                            "%s|%s" % (
                                event['message']['name'],
                                brick_full_path
                            ),
                            RESOURCE_TYPE_BRICK,
                            NS.tendrl_context.integration_id,
                            "delete"
                        )
                        logger.log(
                            "debug",
                            NS.publisher_id,
                            {
                                "message": "Update dashboard job %s"
                                           " for brick %s "
                                           "in cluster %s created" % (
                                               job_id,
                                               brick.key.split('/')[-1],
                                               NS.tendrl_context.
                                               integration_id
                                           )
                            }
                        )
                        # Delete brick from graphite
                        job_id = monitoring_utils.\
                            delete_resource_from_graphite(
                                "%s|%s" % (
                                    event['message']['name'],
                                    brick_full_path
                                ),
                                RESOURCE_TYPE_BRICK,
                                NS.tendrl_context.integration_id,
                                "delete"
                            )
                        logger.log(
                            "debug",
                            NS.publisher_id,
                            {
                                "message": "Delete resource "
                                           "from graphite job %s "
                                           "for brick %s in cluster %s "
                                           "created" % (
                                               job_id,
                                               brick.key.split('/')[-1],
                                               NS.tendrl_context.
                                               integration_id
                                           )
                            }
                        )
            except etcd.EtcdKeyNotFound:
                # Volume had no bricks recorded; nothing to clean up
                pass
    # Delete volume dashboard from grafana
    job_id = monitoring_utils.update_dashboard(
        event['message']['name'],
        RESOURCE_TYPE_VOLUME,
        NS.tendrl_context.integration_id,
        "delete"
    )
    logger.log(
        "debug",
        NS.publisher_id,
        {
            "message": "Update dashboard job %s "
                       "created" % job_id
        }
    )
    # Delete volume details from graphite
    job_id = monitoring_utils.delete_resource_from_graphite(
        event['message']['name'],
        RESOURCE_TYPE_VOLUME,
        NS.tendrl_context.integration_id,
        "delete"
    )
    logger.log(
        "debug",
        NS.publisher_id,
        {
            "message": "Delete resource from graphite job %s "
                       "created" % job_id
        }
    )
def process_job(jid):
    """Claim and execute the queued Job identified by ``jid``.

    Lifecycle: normalise a missing status to "new"; on nodes tagged
    'tendrl/monitor', time out parent jobs older than 10 minutes;
    route the job by payload type and tags; claim it with an etcd
    compare-and-swap plus a ``locked_by`` stamp; run the flow named in
    ``job.payload['run']``; record finished/failed status and raise
    job-status alerts for parent-less jobs.

    :param jid: job id (suffix of the /queue etcd key) to process.
    """
    job = NS.tendrl.objects.Job(job_id=jid).load()
    if job.status in [None, ""]:
        job.status = "new"
        job.save()
    NS.node_context = NS.node_context.load()
    # Check job not already "finished", or "processing"
    try:
        if job.status in ["finished", "processing", "failed"]:
            return
    except etcd.EtcdKeyNotFound:
        pass
    try:
        _timeout = None
        _timeout = job.timeout
        if _timeout:
            _timeout = _timeout.lower()
    except etcd.EtcdKeyNotFound:
        pass
    # tendrl-node-agent tagged as tendrl/monitor will ensure
    # >10 min old "new" parent jobs are timed out and marked
    # as "failed"
    if "tendrl/monitor" in NS.node_context.tags and _timeout == "yes" and \
            job.status == "new" and job.payload.get('parent') is None:
        _valid_until = job.valid_until
        if _valid_until:
            # Current time as a Unix epoch, compared to the deadline.
            _now_epoch = (time_utils.now() - datetime.datetime(
                1970, 1, 1).replace(tzinfo=utc)).total_seconds()
            if int(_now_epoch) >= int(_valid_until):
                # Job has "new" status since 10 minutes,
                # mark status as "failed" and Job.error =
                # "Timed out"
                _msg = str("Timed-out (>10min as 'new')")
                job.errors = _msg
                job.status = "failed"
                job.save()
                integration_id = NS.tendrl_context.integration_id
                alert_utils.alert_job_status(
                    "failed",
                    "Job timed out (job_id: %s)" % jid,
                    integration_id=integration_id or
                    job.payload['parameters'].get(
                        'TendrlContext.integration_id'),
                    cluster_name=NS.tendrl_context.cluster_name or
                    job.payload['parameters'].get(
                        'TendrlContext.cluster_name'))
                return
        else:
            # First sighting of this job: stamp a deadline 10 minutes
            # from now (as an epoch) for the timeout check above.
            _now_plus_10 = time_utils.now() + datetime.timedelta(minutes=10)
            _epoch_start = datetime.datetime(1970, 1, 1).replace(tzinfo=utc)
            _now_plus_10_epoch = (_now_plus_10 - _epoch_start).total_seconds()
            job = NS.tendrl.objects.Job(job_id=jid).load()
            if job.status == "new":
                # To avoid server and storage node do save same time
                job.valid_until = int(_now_plus_10_epoch)
                job.save()
    job = NS.tendrl.objects.Job(job_id=jid).load()
    if job.payload["type"] == NS.type and \
            job.status == "new":
        # Job routing
        # Flows created by tendrl-api use 'tags' from flow
        # definition to target jobs
        _tag_match = False
        if job.payload.get("tags", []):
            for flow_tag in job.payload['tags']:
                if flow_tag in NS.node_context.tags:
                    _tag_match = True
        if not _tag_match:
            _job_tags = ", ".join(job.payload.get("tags", []))
            _msg = "Node (%s)(type: %s)(tags: %s) will not " \
                   "process job-%s (tags: %s)" % \
                (NS.node_context.node_id, NS.type,
                 NS.node_context.tags, jid, _job_tags)
            logger.log("debug", NS.publisher_id, {"message": _msg})
            return
        try:
            try:
                job_status_key = "/queue/%s/status" % job.job_id
                etcd_utils.write(job_status_key,
                                 "processing",
                                 prevValue="new")
            except etcd.EtcdKeyNotFound:
                # if status watchable attribute not present
                # then it will be created when job save happens
                pass
            lock_info = dict(node_id=NS.node_context.node_id,
                             fqdn=NS.node_context.fqdn,
                             type=NS.type)
            job = NS.tendrl.objects.Job(job_id=jid).load()
            job.locked_by = lock_info
            job.status = "processing"
            job.save(ttl=DEFAULT_JOB_TTL)
        except etcd.EtcdCompareFailed:
            # job is already being processed by some tendrl
            # agent
            return
        the_flow = None
        try:
            current_ns, flow_name, obj_name = \
                _extract_fqdn(job.payload['run'])
            if obj_name:
                runnable_flow = current_ns.ns.get_obj_flow(obj_name,
                                                           flow_name)
            else:
                runnable_flow = current_ns.ns.get_flow(flow_name)
            # NOTE(review): sleep + re-read of locked_by appears to
            # arbitrate a claim race with another agent — confirm.
            time.sleep(2)
            job = NS.tendrl.objects.Job(job_id=jid).load()
            lock_info = dict(node_id=NS.node_context.node_id,
                             fqdn=NS.node_context.fqdn,
                             type=NS.type)
            if job.locked_by != lock_info:
                return
            the_flow = runnable_flow(parameters=job.payload['parameters'],
                                     job_id=job.job_id)
            # Tendrl server does not have fqdn in node_context
            logger.log("info", NS.publisher_id,
                       {"message": "Starting %s Job: %s on %s" %
                        (job.payload['run'].split('.')[-1],
                         job.job_id,
                         NS.node_context.fqdn or "server")},
                       job_id=job.job_id,
                       flow_id=the_flow.parameters['flow_id'])
            logger.log("info", NS.publisher_id,
                       {"message": "Running %s job: %s on %s" %
                        (job.payload['run'].split('.')[-1],
                         job.job_id,
                         NS.node_context.fqdn or "server")},
                       job_id=job.job_id,
                       flow_id=the_flow.parameters['flow_id'])
            the_flow.run()
            try:
                job = NS.tendrl.objects.Job(job_id=jid).load()
                job.status = "finished"
                job.save()
            except etcd.EtcdCompareFailed:
                # This should not happen!
                _msg = "Cannot mark job as 'finished', " \
                       "current job status invalid"
                raise FlowExecutionFailedError(_msg)
            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Job (%s) for %s finished. " %
                 (job.job_id,
                  job.payload['run'].split('.')[-1])},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id'],
            )
            if job.payload.get('parent') is None:
                alert_utils.alert_job_status(
                    "finished",
                    "%s (job ID: %s) completed successfully " %
                    (job.payload['run'].split('.')[-1],
                     job.job_id),
                    integration_id=NS.tendrl_context.integration_id or
                    job.payload['parameters'].get(
                        'TendrlContext.integration_id'),
                    cluster_name=NS.tendrl_context.cluster_name or
                    job.payload['parameters'].get(
                        'TendrlContext.cluster_name'))
        except (FlowExecutionFailedError,
                AtomExecutionFailedError,
                Exception) as e:
            _trace = str(traceback.format_exc(e))
            _msg = "Failure in Job %s Flow %s with error:" % \
                (job.job_id, job.payload['run'])
            Event(
                ExceptionMessage(priority="error",
                                 publisher=NS.publisher_id,
                                 payload={
                                     "message": _msg + _trace,
                                     "exception": e
                                 }))
            if the_flow:
                logger.log("error", NS.publisher_id,
                           {"message": _msg + "\n" + _trace},
                           job_id=job.job_id,
                           flow_id=the_flow.parameters['flow_id'])
            else:
                logger.log("error", NS.publisher_id,
                           {"message": _msg + "\n" + _trace})
            try:
                job = NS.tendrl.objects.Job(job_id=jid).load()
                job.status = "failed"
                job.save()
            except etcd.EtcdCompareFailed:
                # This should not happen!
                _msg = "Cannot mark job as 'failed', current" \
                       "job status invalid"
                raise FlowExecutionFailedError(_msg)
            else:
                job = NS.tendrl.objects.Job(job_id=jid).load()
                job.errors = _trace
                if job.payload.get('parent') is None:
                    alert_utils.alert_job_status(
                        "failed",
                        "Job failed (job_id: %s)" % job.job_id,
                        integration_id=NS.tendrl_context.integration_id or
                        job.payload['parameters'].get(
                            'TendrlContext.integration_id'),
                        cluster_name=NS.tendrl_context.cluster_name or
                        job.payload['parameters'].get(
                            'TendrlContext.cluster_name'))
                job.save()
def on_sync_object(self, data):
    """Ingest one ceph sync object (health / osd_map / ...).

    Injects the versioned object into the local map, persists it as a
    SyncObject, and for 'health' / 'osd_map' types also refreshes the
    derived GlobalDetails, Pool and Osd objects in etcd, deleting any
    pools/OSDs that no longer appear in the incoming map
    (out-of-band deletion handling).

    :param data: dict with 'fsid', 'type', 'version' and 'data' keys
                 describing the sync object.
    """
    assert data['fsid'] == self.fsid
    sync_object = copy.deepcopy(data['data'])
    sync_type = SYNC_OBJECT_STR_TYPE[data['type']]
    new_object = self.inject_sync_object(data['type'],
                                         data['version'],
                                         sync_object)
    self._request_coll.on_map(sync_type, new_object)
    # new_object is falsy when the incoming version is stale.
    if new_object:
        # Check and raise any alerts if required
        # TODO(team) Enabled the below if condition as when
        # alerting needed for cluster health, mon status, pool
        # status etc
        # if sync_type.str == "health":
        #     self._on_health(sync_object)
        # if sync_type.str == "mon_status":
        #     self._on_mon_status(sync_object)
        if sync_type.str == "osd_map":
            # self._on_pool_status(sync_object)
            self._on_osd_map(sync_object)
        NS.ceph.objects.SyncObject(
            updated=now(),
            sync_type=sync_type.str,
            version=new_object.version if isinstance(new_object.version,
                                                     int) else None,
            when=now(),
            data=data['data']).save(update=False)
        if sync_type.str == "health":
            NS.ceph.objects.GlobalDetails(
                status=sync_object['overall_status']).save()
        if sync_type.str == "osd_map":
            # Pool out of band deletion handling
            try:
                pools = NS._int.client.read(
                    "clusters/%s/Pools" % NS.tendrl_context.integration_id)
                # Pools stored in etcd but missing from the new map
                # were deleted outside tendrl; drop them.
                old_pool_ids = []
                for pool in pools.leaves:
                    old_pool_ids.append(int(pool.key.split("/")[-1]))
                new_pool_ids = []
                for raw_pool in sync_object.get('pools', []):
                    new_pool_ids.append(raw_pool['pool'])
                delete_pool_ids = set(old_pool_ids) - set(new_pool_ids)
                for id in delete_pool_ids:
                    NS._int.client.delete(
                        "clusters/%s/Pools/%s" % (
                            NS.tendrl_context.integration_id, id),
                        recursive=True)
            except etcd.EtcdKeyNotFound as ex:
                Event(
                    ExceptionMessage(priority="debug",
                                     publisher=NS.publisher_id,
                                     payload={
                                         "message": "No pools found \
for ceph cluster %s" % NS.tendrl_context.integration_id,
                                         "exception": ex
                                     }))
            for raw_pool in sync_object.get('pools', []):
                Event(
                    Message(priority="info",
                            publisher=NS.publisher_id,
                            payload={
                                "message": "Updating Pool %s" %
                                raw_pool['pool_name']
                            }))
                pool_type = 'replicated'
                if 'erasure_code_profile' in raw_pool and \
                        raw_pool['erasure_code_profile'] != "":
                    pool_type = 'erasure_coded'
                quota_enabled = False
                if ('quota_max_objects' in raw_pool and
                        raw_pool['quota_max_objects'] > 0) or \
                        ('quota_max_bytes' in raw_pool and
                         raw_pool['quota_max_bytes'] > 0):
                    quota_enabled = True
                NS.ceph.objects.Pool(
                    pool_id=raw_pool['pool'],
                    pool_name=raw_pool['pool_name'],
                    pg_num=raw_pool['pg_num'],
                    type=pool_type,
                    erasure_code_profile=raw_pool.get(
                        'erasure_code_profile'),
                    min_size=raw_pool['min_size'],
                    size=raw_pool.get('size', None),
                    quota_enabled=quota_enabled,
                    quota_max_objects=raw_pool['quota_max_objects'],
                    quota_max_bytes=raw_pool['quota_max_bytes'],
                ).save()
            # Osd out of band deletion handling
            try:
                osds = NS._int.client.read(
                    "clusters/%s/Osds" % NS.tendrl_context.integration_id)
                # OSDs are matched by uuid, not numeric id.
                old_osds = []
                for osd in osds.leaves:
                    old_osds.append(str(osd.key.split("/")[-1]))
                new_osds = []
                for raw_osd in sync_object.get('osds', []):
                    new_osds.append(raw_osd['uuid'])
                delete_osds = set(old_osds) - set(new_osds)
                for id in delete_osds:
                    NS._int.client.delete(
                        "clusters/%s/Osds/%s" % (
                            NS.tendrl_context.integration_id, id),
                        recursive=True)
            except etcd.EtcdKeyNotFound as ex:
                Event(
                    ExceptionMessage(priority="debug",
                                     publisher=NS.publisher_id,
                                     payload={
                                         "message": "key not found in etcd",
                                         "exception": ex
                                     }))
            for raw_osd in sync_object.get('osds', []):
                Event(
                    Message(priority="info",
                            publisher=NS.publisher_id,
                            payload={
                                "message": "Updating OSD %s" %
                                raw_osd['osd']
                            }))
                # Reverse-resolve the OSD host from its public address.
                osd_host = socket.gethostbyaddr(
                    raw_osd['public_addr'].split(':')[0])[0]
                NS.ceph.objects.Osd(
                    id=raw_osd['osd'],
                    uuid=raw_osd['uuid'],
                    hostname=osd_host,
                    public_addr=raw_osd['public_addr'],
                    cluster_addr=raw_osd['cluster_addr'],
                    heartbeat_front_addr=raw_osd['heartbeat_front_addr'],
                    heartbeat_back_addr=raw_osd['heartbeat_back_addr'],
                    down_at=raw_osd['down_at'],
                    up_from=raw_osd['up_from'],
                    lost_at=raw_osd['lost_at'],
                    osd_up=raw_osd['up'],
                    osd_in=raw_osd['in'],
                    up_thru=raw_osd['up_thru'],
                    weight=str(raw_osd['weight']),
                    primary_affinity=str(raw_osd['primary_affinity']),
                    state=raw_osd['state'],
                    last_clean_begin=raw_osd['last_clean_begin'],
                    last_clean_end=raw_osd['last_clean_end']).save()
    else:
        Event(
            Message(priority="debug",
                    publisher=NS.publisher_id,
                    payload={
                        "message": "ClusterMonitor.on_sync_object: "
                        "stale object received for %s" % data['type']
                    }))
def save(self, update=True, ttl=None): self.render() if "Message" not in self.__class__.__name__: # If local object.hash is equal to # central_store object.hash, return if self.hash_compare_with_central_store(ttl=ttl): return if update: current_obj = self.load() for attr, val in vars(self).iteritems(): if isinstance(val, (types.FunctionType, types.BuiltinFunctionType, types.MethodType, types.BuiltinMethodType, types.UnboundMethodType)) or \ attr.startswith("_") or attr in ['value', 'list']: continue if val is None and hasattr(current_obj, attr): # if self.attr is None, use attr value from central # store (i.e. current_obj.attr) if getattr(current_obj, attr): setattr(self, attr, getattr(current_obj, attr)) self.updated_at = str(time_utils.now()) for item in self.render(): ''' Note: Log messages in this file have try-except blocks to run in the condition when the node_agent has not been started and name spaces are being created. ''' try: logger.log("debug", NS.publisher_id, { "message": "Writing %s to %s" % (item['key'], item['value']) }) except KeyError: sys.stdout.write("Writing %s to %s \n" % (item['key'], item['value'])) # convert list, dict (json) to python based on definitions _type = self._defs.get("attrs", {}).get(item['name'], {}).get("type") if _type: if _type.lower() in ['json', 'list']: if item['value']: try: item['value'] = json.dumps(item['value']) except ValueError as ex: _msg = "Error save() attr %s for object %s" % \ (item['name'], self.__name__) Event( ExceptionMessage(priority="debug", publisher=NS.publisher_id, payload={ "message": _msg, "exception": ex })) try: NS._int.wclient.write(item['key'], item['value'], quorum=True) except (etcd.EtcdConnectionFailed, etcd.EtcdException): NS._int.wreconnect() NS._int.wclient.write(item['key'], item['value'], quorum=True) if ttl: etcd_utils.refresh(self.value, ttl) self.watch_attrs()
def run(self):
    """Main node-sync loop for the node agent.

    Marks this node UP, emits a node_status event, then loops until
    ``self._complete`` is set: refreshes NodeContext/TendrlContext
    with a TTL, and runs the sync helpers (services/index, managed
    node and cluster status checks on monitor nodes, cluster
    contexts, platform detect, sds detect, os/cpu/memory, disks,
    networks) each in a short-lived joined thread.

    Fix vs. original: the original set ``platform_detect.daemon =
    True`` — an attribute on the *module* — instead of marking the
    thread daemonic; corrected to ``platform_detect_thread.daemon``.
    """
    logger.log("info", NS.publisher_id,
               {"message": "%s running" % self.__class__.__name__})
    NS.node_context = NS.node_context.load()
    current_tags = list(NS.node_context.tags)
    current_tags += ["tendrl/node_%s" % NS.node_context.node_id]
    NS.node_context.tags = list(set(current_tags))
    NS.node_context.status = "UP"
    NS.node_context.save()
    _sleep = 0
    msg = "{0} is UP".format(NS.node_context.fqdn)
    event_utils.emit_event("node_status", "UP", msg,
                           "node_{0}".format(NS.node_context.fqdn),
                           "INFO", node_id=NS.node_context.node_id)
    while not self._complete.is_set():
        # TTL outlives one sync interval so the keys survive a slow
        # iteration but expire if the agent dies.
        _sync_ttl = int(NS.config.data.get("sync_interval", 10)) + 100
        if _sleep > 5:
            _sleep = int(NS.config.data.get("sync_interval", 10))
        else:
            _sleep += 1
        NS.node_context = NS.node_context.load()
        NS.node_context.sync_status = "in_progress"
        current_tags = list(NS.node_context.tags)
        current_tags += ["tendrl/node_%s" % NS.node_context.node_id]
        NS.node_context.tags = list(set(current_tags))
        NS.node_context.status = "UP"
        NS.node_context.save(ttl=_sync_ttl)
        NS.tendrl_context = NS.tendrl_context.load()
        sync_service_and_index_thread = threading.Thread(
            target=services_and_index_sync.sync,
            args=(_sync_ttl, )
        )
        sync_service_and_index_thread.daemon = True
        sync_service_and_index_thread.start()
        sync_service_and_index_thread.join()
        NS.node_context = NS.node_context.load()
        if "tendrl/monitor" in NS.node_context.tags:
            # Only the monitor node polices the other nodes/clusters.
            check_all_managed_node_status_thread = threading.Thread(
                target=check_all_managed_nodes_status.run
            )
            check_all_managed_node_status_thread.daemon = True
            check_all_managed_node_status_thread.start()
            check_all_managed_node_status_thread.join()
            check_cluster_status_thread = threading.Thread(
                target=check_cluster_status.run
            )
            check_cluster_status_thread.daemon = True
            check_cluster_status_thread.start()
            check_cluster_status_thread.join()
        if "tendrl/monitor" not in NS.node_context.tags:
            sync_cluster_contexts_thread = threading.Thread(
                target=cluster_contexts_sync.sync,
                args=(_sync_ttl, )
            )
            sync_cluster_contexts_thread.daemon = True
            sync_cluster_contexts_thread.start()
            sync_cluster_contexts_thread.join()
        platform_detect_thread = threading.Thread(
            target=platform_detect.sync
        )
        # Bug fix: was `platform_detect.daemon = True`, which set an
        # attribute on the module instead of the thread.
        platform_detect_thread.daemon = True
        platform_detect_thread.start()
        platform_detect_thread.join()
        if "tendrl/monitor" not in NS.node_context.tags:
            sds_detect_thread = threading.Thread(
                target=sds_detect.sync,
                args=(_sleep, )
            )
            sds_detect_thread.daemon = True
            sds_detect_thread.start()
            sds_detect_thread.join()
        NS.tendrl_context = NS.tendrl_context.load()
        try:
            NS.tendrl.objects.Os().save()
            NS.tendrl.objects.Cpu().save()
            NS.tendrl.objects.Memory().save()
        except Exception as ex:
            Event(
                ExceptionMessage(priority="error",
                                 publisher=NS.publisher_id,
                                 payload={
                                     "message": "node_sync "
                                     "os/cpu/memory sync failed: " +
                                     ex.message,
                                     "exception": ex
                                 }))
            # Record the failed sync but keep the node marked UP and
            # retry on the next loop iteration.
            NS.node_context = NS.node_context.load()
            NS.node_context.sync_status = "failed"
            NS.node_context.last_sync = str(time_utils.now())
            NS.node_context.status = "UP"
            NS.node_context.save(ttl=_sync_ttl)
            time.sleep(_sleep)
        sync_disks_thread = threading.Thread(target=disk_sync.sync)
        sync_disks_thread.daemon = True
        sync_disks_thread.start()
        sync_disks_thread.join()
        sync_networks_thread = threading.Thread(target=network_sync.sync)
        sync_networks_thread.daemon = True
        sync_networks_thread.start()
        sync_networks_thread.join()
        NS.node_context = NS.node_context.load()
        NS.node_context.sync_status = "done"
        NS.node_context.last_sync = str(time_utils.now())
        NS.node_context.status = "UP"
        NS.node_context.save(ttl=_sync_ttl)
        if "tendrl/monitor" not in NS.node_context.tags:
            sync_cluster_contexts_thread = threading.Thread(
                target=cluster_contexts_sync.sync,
                args=(_sync_ttl, )
            )
            sync_cluster_contexts_thread.daemon = True
            sync_cluster_contexts_thread.start()
            sync_cluster_contexts_thread.join()
        # Update node alert count
        if not NS.tendrl.objects.ClusterNodeAlertCounters().exists():
            update_cluster_node_alert_count()
        time.sleep(_sleep)
    logger.log("info", NS.publisher_id,
               {"message": "%s complete" % self.__class__.__name__})
def save(self, update=True, ttl=None):
    """Persist this object to the etcd central store (hash-guarded).

    Computes the in-memory hash and compares it with the hash stored
    under ``/<value>/hash``; when unchanged, only refreshes the TTL
    and returns. When ``update`` is True, attributes that are None
    locally are filled from the stored copy before writing.

    :param update: merge None attributes from the stored copy first.
    :param ttl: optional etcd TTL (seconds) to refresh on the key.
    """
    self.render()
    if "Message" not in self.__class__.__name__:
        try:
            # Generate current in memory object hash
            self.hash = self._hash()
            _hash_key = "/{0}/hash".format(self.value)
            _stored_hash = None
            try:
                _stored_hash = NS._int.client.read(_hash_key).value
            except (etcd.EtcdConnectionFailed,
                    etcd.EtcdException) as ex:
                # A missing hash key is fine; anything else gets one
                # reconnect + retry.
                if type(ex) != etcd.EtcdKeyNotFound:
                    NS._int.reconnect()
                    _stored_hash = NS._int.client.read(_hash_key).value
            if self.hash == _stored_hash:
                # No changes in stored object and current object,
                # dont save current object to central store
                if ttl:
                    etcd_utils.refresh(self.value, ttl)
                return
        except TypeError:
            # no hash for this object, save the current hash as is
            pass
    if update:
        current_obj = self.load()
        for attr, val in vars(self).iteritems():
            # Skip callables, private attrs and the bookkeeping
            # 'value'/'list' fields - only data attributes merge.
            if isinstance(val, (types.FunctionType,
                                types.BuiltinFunctionType,
                                types.MethodType,
                                types.BuiltinMethodType,
                                types.UnboundMethodType)) or \
                    attr.startswith("_") or attr in ['value', 'list']:
                continue
            if val is None and hasattr(current_obj, attr):
                # if self.attr is None, use attr value from central
                # store (i.e. current_obj.attr)
                if getattr(current_obj, attr):
                    setattr(self, attr, getattr(current_obj, attr))
    self.updated_at = str(time_utils.now())
    for item in self.render():
        '''
            Note: Log messages in this file have try-except
            blocks to run in the condition when the node_agent
            has not been started and name spaces are being
            created.
        '''
        try:
            Event(
                Message(priority="debug",
                        publisher=NS.publisher_id,
                        payload={
                            "message": "Writing %s to %s" %
                            (item['key'], item['value'])
                        }))
        except KeyError:
            sys.stdout.write("Writing %s to %s" %
                             (item['key'], item['value']))
        # convert list, dict (json) to python based on definitions
        _type = self._defs.get("attrs", {}).get(item['name'],
                                                {}).get("type")
        if _type:
            if _type.lower() in ['json', 'list']:
                if item['value']:
                    try:
                        item['value'] = json.dumps(item['value'])
                    except ValueError as ex:
                        _msg = "Error save() attr %s for object %s" % \
                            (item['name'], self.__name__)
                        Event(
                            ExceptionMessage(priority="debug",
                                             publisher=NS.publisher_id,
                                             payload={
                                                 "message": _msg,
                                                 "exception": ex
                                             }))
        try:
            NS._int.wclient.write(item['key'], item['value'], quorum=True)
        except (etcd.EtcdConnectionFailed, etcd.EtcdException):
            # One reconnect attempt, then retry the same write.
            NS._int.wreconnect()
            NS._int.wclient.write(item['key'], item['value'], quorum=True)
    if ttl:
        etcd_utils.refresh(self.value, ttl)
def test_now(): date = time_utils.now() assert isinstance(date, datetime.datetime)
def process_job(job):
    """Claim and execute a queued job given its etcd queue node.

    Raw-etcd variant of job processing: inspects /queue/<jid>/locked_by
    and /queue/<jid>/status directly, times out stale jobs on nodes
    tagged 'tendrl/monitor', routes by payload type/tags, claims the
    job with a compare-and-swap status write plus a locked_by record,
    runs the flow from ``job.payload['run']`` and records
    finished/failed status and alerts.

    :param job: etcd result node for the /queue/<jid> key.
    """
    jid = job.key.split('/')[-1]
    job_status_key = "/queue/%s/status" % jid
    job_lock_key = "/queue/%s/locked_by" % jid
    NS.node_context = NS.node_context.load()
    # Check job not already locked by some agent
    try:
        _locked_by = etcd_utils.read(job_lock_key).value
        if _locked_by:
            return
    except etcd.EtcdKeyNotFound:
        pass
    # Check job not already "finished", or "processing"
    try:
        _status = etcd_utils.read(job_status_key).value
        if _status in ["finished", "processing"]:
            return
    except etcd.EtcdKeyNotFound:
        pass
    try:
        _job_timeout_key = "/queue/%s/timeout" % jid
        _timeout = None
        _timeout = etcd_utils.read(_job_timeout_key).value
        if _timeout:
            _timeout = _timeout.lower()
    except etcd.EtcdKeyNotFound:
        pass
    # tendrl-node-agent tagged as tendrl/monitor will ensure
    # >10 min old "new" jobs are timed out and marked as
    # "failed" (the parent job of these jobs will also be
    # marked as "failed")
    if "tendrl/monitor" in NS.node_context.tags and \
            _timeout == "yes":
        _job_valid_until_key = "/queue/%s/valid_until" % jid
        _valid_until = None
        try:
            _valid_until = etcd_utils.read(
                _job_valid_until_key).value
        except etcd.EtcdKeyNotFound:
            pass
        if _valid_until:
            # Current time as Unix epoch for deadline comparison.
            _now_epoch = (time_utils.now() -
                          datetime.datetime(1970, 1,
                                            1).replace(
                              tzinfo=utc)).total_seconds()
            if int(_now_epoch) >= int(_valid_until):
                # Job has "new" status since 10 minutes,
                # mark status as "failed" and Job.error =
                # "Timed out"
                try:
                    etcd_utils.write(job_status_key,
                                     "failed",
                                     prevValue="new")
                except etcd.EtcdCompareFailed:
                    # Another agent changed the status first; let it
                    # own the transition.
                    pass
                else:
                    job = NS.tendrl.objects.Job(job_id=jid).load()
                    _msg = str("Timed-out (>10min as 'new')")
                    job.errors = _msg
                    job.save()
                    if job.payload.get('parent') is None:
                        alert_utils.alert_job_status(
                            "failed",
                            "Job timed out (job_id: %s)" % jid,
                            integration_id=NS.tendrl_context.integration_id or
                            job.payload['parameters'].get(
                                'TendrlContext.integration_id'
                            ),
                            cluster_name=NS.tendrl_context.cluster_name or
                            job.payload['parameters'].get(
                                'TendrlContext.cluster_name'
                            )
                        )
                return
        else:
            # No deadline yet: stamp one 10 minutes from now (epoch).
            _now_plus_10 = time_utils.now() + datetime.timedelta(minutes=10)
            _epoch_start = datetime.datetime(1970, 1, 1).replace(tzinfo=utc)
            # noinspection PyTypeChecker
            _now_plus_10_epoch = (_now_plus_10 -
                                  _epoch_start).total_seconds()
            etcd_utils.write(_job_valid_until_key,
                             int(_now_plus_10_epoch))
    job = NS.tendrl.objects.Job(job_id=jid).load()
    if job.payload["type"] == NS.type and \
            job.status == "new":
        # Job routing
        # Flows created by tendrl-api use 'tags' from flow
        # definition to target jobs
        _tag_match = False
        if job.payload.get("tags", []):
            for flow_tag in job.payload['tags']:
                if flow_tag in NS.node_context.tags:
                    _tag_match = True
        if not _tag_match:
            _job_tags = ", ".join(job.payload.get("tags", []))
            _msg = "Node (%s)(type: %s)(tags: %s) will not " \
                   "process job-%s (tags: %s)" % \
                (NS.node_context.node_id, NS.type,
                 NS.node_context.tags, jid, _job_tags)
            logger.log(
                "info",
                NS.publisher_id,
                {"message": _msg}
            )
            return
        job_status_key = "/queue/%s/status" % job.job_id
        job_lock_key = "/queue/%s/locked_by" % job.job_id
        try:
            # Atomic claim: status CAS "new" -> "processing" plus a
            # locked_by record identifying this agent.
            lock_info = dict(node_id=NS.node_context.node_id,
                             fqdn=NS.node_context.fqdn,
                             tags=NS.node_context.tags,
                             type=NS.type)
            etcd_utils.write(job_status_key, "processing",
                             prevValue="new")
            etcd_utils.write(job_lock_key, json.dumps(lock_info))
        except etcd.EtcdCompareFailed:
            # job is already being processed by some tendrl
            # agent
            return
        the_flow = None
        try:
            current_ns, flow_name, obj_name = \
                _extract_fqdn(job.payload['run'])
            if obj_name:
                runnable_flow = current_ns.ns.get_obj_flow(
                    obj_name, flow_name)
            else:
                runnable_flow = current_ns.ns.get_flow(flow_name)
            the_flow = runnable_flow(parameters=job.payload[
                'parameters'], job_id=job.job_id)
            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Processing Job %s" % job.job_id},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id']
            )
            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Running Flow %s" % job.payload['run']},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id']
            )
            the_flow.run()
            try:
                etcd_utils.write(job_status_key,
                                 "finished",
                                 prevValue="processing")
            except etcd.EtcdCompareFailed:
                # This should not happen!
                _msg = "Cannot mark job as 'finished', " \
                       "current job status invalid"
                raise FlowExecutionFailedError(_msg)
            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Job (%s): Finished "
                            "Flow %s" % (
                                job.job_id, job.payload['run'])},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id'],
            )
            if job.payload.get('parent') is None:
                alert_utils.alert_job_status(
                    "finished",
                    "Job finished successfully (job_id: %s)" %
                    job.job_id,
                    integration_id=NS.tendrl_context.integration_id or
                    job.payload['parameters'].get(
                        'TendrlContext.integration_id'
                    ),
                    cluster_name=NS.tendrl_context.cluster_name or
                    job.payload['parameters'].get(
                        'TendrlContext.cluster_name'
                    )
                )
        except (FlowExecutionFailedError,
                AtomExecutionFailedError,
                Exception) as e:
            _trace = str(traceback.format_exc(e))
            _msg = "Failure in Job %s Flow %s with error:" % \
                (job.job_id, job.payload['run'])
            Event(
                ExceptionMessage(
                    priority="error",
                    publisher=NS.publisher_id,
                    payload={"message": _msg + _trace,
                             "exception": e
                             }
                )
            )
            if the_flow:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": _msg + "\n" + _trace},
                    job_id=job.job_id,
                    flow_id=the_flow.parameters['flow_id']
                )
            else:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": _msg + "\n" + _trace}
                )
            try:
                etcd_utils.write(job_status_key,
                                 "failed",
                                 prevValue="processing")
            except etcd.EtcdCompareFailed:
                # This should not happen!
                _msg = "Cannot mark job as 'failed', current" \
                       "job status invalid"
                raise FlowExecutionFailedError(_msg)
            else:
                job = job.load()
                job.errors = _trace
                if job.payload.get('parent') is None:
                    alert_utils.alert_job_status(
                        "failed",
                        "Job failed (job_id: %s)" % job.job_id,
                        integration_id=NS.tendrl_context.integration_id or
                        job.payload['parameters'].get(
                            'TendrlContext.integration_id'
                        ),
                        cluster_name=NS.tendrl_context.cluster_name or
                        job.payload['parameters'].get(
                            'TendrlContext.cluster_name'
                        )
                    )
                job.save()
def test_now(): date = time_utils.now() assert type(date) == datetime.datetime