Example #1
 def __init__(self, priority, publisher, payload, job_id=None,
              flow_id=None, parent_id=None, integration_id=None,
              message_id=None, timestamp=None, node_id=None,
              caller=None, *args, **kwargs):
     super(Message, self).__init__(*args, **kwargs)
     if message_id is None:
         self.message_id = str(uuid.uuid4())
         self.timestamp = now()
     else:
         self.message_id = message_id
         self.timestamp = timestamp
     if caller is None:
         # Record the function, line and file that raised this
         caller = getframeinfo(stack()[1][0])
         self.caller = {"filename": caller.filename,
                        "line_no": caller.lineno,
                        "function": caller.function}
     else:
         self.caller = caller
     self.priority = priority
     self.publisher = publisher
     self.node_id = node_id
     if self.node_id is None:
         self.node_id = NS.node_context.node_id
     self.job_id = job_id
     self.flow_id = flow_id
     self.parent_id = parent_id
     self.integration_id = integration_id
     self.payload = payload
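
A note on the pattern above: the constructor either mints a fresh message_id/timestamp pair or preserves the ones passed in, so replayed messages keep their original identity. A minimal, tendrl-independent sketch of the same idea (the SketchMessage name and the stdlib stand-ins for uuid/now are mine):

import datetime
import uuid

class SketchMessage(object):
    def __init__(self, priority, publisher, payload,
                 message_id=None, timestamp=None):
        # New messages get a fresh id and timestamp; replayed
        # messages keep the values they were created with.
        if message_id is None:
            self.message_id = str(uuid.uuid4())
            self.timestamp = datetime.datetime.utcnow()
        else:
            self.message_id = message_id
            self.timestamp = timestamp
        self.priority = priority
        self.publisher = publisher
        self.payload = payload

msg = SketchMessage("info", "node_context", {"message": "hello"})
assert msg.message_id and msg.timestamp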
Example #2
 def __init__(self,
              node_id=None,
              fqdn=None,
              ipv4_addr=None,
              updated_at=None,
              tags=None,
              status=None,
              sync_status=None,
              last_sync=None,
              first_sync_done=None,
              is_managed=None,
              *args,
              **kwargs):
     super(ClusterNodeContext, self).__init__(*args, **kwargs)
     _node_context = NS.node_context.load()
     self.node_id = node_id or _node_context.node_id
     self.fqdn = fqdn or _node_context.fqdn
     self.ipv4_addr = ipv4_addr or _node_context.ipv4_addr
     self.updated_at = updated_at or str(time_utils.now())
     self.tags = tags or _node_context.tags
     self.status = status or _node_context.status
     self.sync_status = sync_status or _node_context.sync_status
     self.last_sync = last_sync or _node_context.last_sync
     self.first_sync_done = first_sync_done
     self.is_managed = is_managed
     self.value = 'clusters/{0}/nodes/{1}/NodeContext'
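
One caveat worth noting: the "x or default" idiom treats every falsy value (empty list, empty string, 0, False) as "not supplied", which matters for fields like tags, first_sync_done or is_managed. A small illustration with hypothetical values:

fallback_tags = ["tendrl/monitor"]

tags = []                        # caller explicitly meant "no tags"
assert (tags or fallback_tags) == fallback_tags  # empty list gets replaced

# An explicit None check preserves the caller's empty list:
tags = tags if tags is not None else fallback_tags
assert tags == []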
Example #3
def test_constructor_Message():
    init()
    msg = Message(priority="info",
                  publisher="node_context",
                  payload={"message": "Test Message"})
    assert msg.priority == "info"
    assert msg.publisher == "node_context"
    assert msg.caller is not None
    assert msg.message_id is not None
    msg = Message("info",
                  "node_context",
                  message_id=1,
                  timestamp=now(),
                  payload={"message": "Test Message"})
    assert msg.message_id == 1
    assert isinstance(msg.timestamp, datetime.datetime)
    obj_caller = getframeinfo(stack()[1][0])
    obj_caller = {
        "filename": obj_caller.filename,
        "line_no": obj_caller.lineno,
        "function": obj_caller.function
    }
    msg = Message("info",
                  "node_context",
                  payload={"message": "Test Message"},
                  caller=obj_caller)
    msg = Message(priority="info",
                  publisher="node_context",
                  payload={"message": "Test Message"},
                  node_id="Test id")
    assert msg.node_id == "Test id"
Example #4
    def _emit_event(self,
                    severity,
                    resource,
                    curr_value,
                    msg,
                    plugin_instance=None):
        if not NS.node_context.node_id:
            return

        alert = {}
        alert['source'] = NS.publisher_id
        alert['pid'] = os.getpid()
        alert['time_stamp'] = now().isoformat()
        alert['alert_type'] = 'status'
        alert['severity'] = SEVERITIES[severity]
        alert['resource'] = resource
        alert['current_value'] = curr_value
        alert['tags'] = dict(message=msg,
                             cluster_id=NS.tendrl_context.integration_id,
                             cluster_name=NS.tendrl_context.cluster_name,
                             sds_name=NS.tendrl_context.sds_name,
                             fqdn=socket.getfqdn())
        if plugin_instance:
            alert['tags']['plugin_instance'] = plugin_instance
        alert['node_id'] = NS.node_context.node_id
        Event(Message("notice", "alerting", {'message': json.dumps(alert)}))
Example #5
 def __init__(self, priority, publisher, payload, job_id=None,
              flow_id=None, parent_id=None, integration_id=None,
              message_id=None, timestamp=None, node_id=None,
              caller=None, *args, **kwargs):
     super(Message, self).__init__(*args, **kwargs)
     if message_id is None:
         self.message_id = str(uuid.uuid4())
         self.timestamp = now()
     else:
         self.message_id = message_id
         self.timestamp = timestamp
     if caller is None:
         # Record the function, line and file that raised this
         caller = getframeinfo(stack()[1][0])
         self.caller = {"filename": caller.filename,
                        "line_no": caller.lineno,
                        "function": caller.function}
     else:
         self.caller = caller
     self.priority = priority
     self.publisher = publisher
     self.node_id = node_id
     if self.node_id is None:
         self.node_id = NS.node_context.node_id
     self.job_id = job_id
     self.flow_id = flow_id
     self.parent_id = parent_id
     self.integration_id = integration_id
     self.payload = payload
Example #6
def _read(*args, **kwargs):
    test_job._complete._Event__flag = True
    global status_flag
    global status_valid
    if args[1] == "/queue/job/status" and status_flag == 0:
        status_flag = 1
        return maps.NamedDict(leaves=[maps.NamedDict(key="test/job")],
                              value="finished")
    elif args[1] == "/queue/job/status" and status_flag == 1:
        status_flag = 2
        return maps.NamedDict(leaves=[maps.NamedDict(key="test/job")],
                              value="unfinished")
    elif args[1] == "/queue/job/status" and status_flag == 2:
        raise etcd.EtcdKeyNotFound
    elif args[1] == "/queue" or args[1] == "/queue/job/locked_by":
        return maps.NamedDict(leaves=[maps.NamedDict(key="test/job")],
                              value=False)
    elif args[1] == "/queue/job/valid_until" and status_valid == 0:
        status_valid = 1
        return maps.NamedDict(leaves=[maps.NamedDict(key="test/job")],
                              value=False)
    elif args[1] == "/queue/job/valid_until" and status_valid == 1:
        return maps.NamedDict(
            leaves=[maps.NamedDict(key="test/job")],
            value=(time_utils.now() - datetime.datetime(
                1970, 1, 1).replace(tzinfo=utc)).total_seconds())
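
The valid_until branch above returns seconds since the Unix epoch, computed against a timezone-aware epoch start. A standalone equivalent, assuming the utc tzinfo comes from pytz (one common source for the utc name used in these snippets):

import datetime
from pytz import utc

def epoch_seconds(ts):
    # Mirrors (time_utils.now() - datetime(1970, 1, 1) w/ utc).total_seconds()
    epoch = datetime.datetime(1970, 1, 1).replace(tzinfo=utc)
    return (ts - epoch).total_seconds()

assert epoch_seconds(datetime.datetime.now(utc)) > 0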
Example #7
 def __init__(
     self,
     plugin_name=None,
     node_id=None,
     job_id='',
     time_stamp=str(now()),
     *args,
     **kwargs
 ):
     # TODO(anmol_b): Add a status to track configuration status and a
     # retrial count if auto-retrials are required. node-monitoring, which
     # is the consumer of these monitoring configuration jobs, will update
     # configuration success or failure when it picks up the job. The
     # retrial counter would be incremented from the only supposed way to
     # load monitoring utils#util#initiate_config_generation jobs to /queue.
     # Unpicked timed-out jobs can then be updated from the configuration
     # threads - configure_node_monitoring and configure_cluster_monitoring -
     # before deciding to attempt/re-attempt config generation by looking
     # at the job's status using the job_id here.
     super(NodeMonitoringPlugin, self).__init__(*args, **kwargs)
     self.node_id = node_id
     self.plugin_name = plugin_name
     self.job_id = job_id
     self.time_stamp = time_stamp
     self.value = 'monitoring/plugin_configurations/nodes/{0}/{1}'
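
Beware the time_stamp=str(now()) default in the signature above: Python evaluates default arguments once, at definition time, so every instance built without an explicit value shares the same import-time timestamp. A sketch of the pitfall and the usual fix (the now() stand-in is mine):

import datetime
import time

def now():
    return datetime.datetime.utcnow()

def make_plugin(time_stamp=str(now())):   # evaluated once, at definition
    return time_stamp

a = make_plugin()
time.sleep(0.01)
b = make_plugin()
assert a == b  # both calls share the definition-time timestamp

def make_plugin_fixed(time_stamp=None):   # compute per call instead
    return time_stamp or str(now())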
Example #8
    def _run(self):
        Event(
            Message(
                priority="info",
                publisher=NS.publisher_id,
                payload={"message": "%s running" % self.__class__.__name__}))

        # Check if monitor key exists, if not sync
        try:
            NS._int.client.read("clusters/%s/_mon_key" %
                                NS.tendrl_context.integration_id)
        except etcd.EtcdKeyNotFound:
            out, err, rc = cmd_utils.Command(
                "ceph auth get mon. --cluster %s" %
                NS.tendrl_context.cluster_name).run()

            if rc != 0:
                Event(
                    Message(priority="debug",
                            publisher=NS.publisher_id,
                            payload={
                                "message":
                                "Couldn't get monitor key. Error:%s" % err
                            }))
            else:
                if out and out != "":
                    mon_sec = out.split('\n')[1].strip().split(
                        ' = ')[1].strip()
                    NS._int.wclient.write(
                        "clusters/%s/_mon_key" %
                        NS.tendrl_context.integration_id, mon_sec)

        while not self._complete.is_set():
            gevent.sleep(int(NS.config.data.get("sync_interval", 10)))
            try:
                NS._int.wclient.write("clusters/%s/sync_status" %
                                      NS.tendrl_context.integration_id,
                                      "in_progress",
                                      prevExist=False)
            except (etcd.EtcdAlreadyExist, etcd.EtcdCompareFailed) as ex:
                pass

            cluster_data = ceph.heartbeat(NS.tendrl_context.cluster_id)

            self.on_heartbeat(cluster_data)

            _cluster = NS.tendrl.objects.Cluster(
                integration_id=NS.tendrl_context.integration_id)
            if _cluster.exists():
                _cluster.sync_status = "done"
                _cluster.last_sync = str(now())
                _cluster.save()
        Event(
            Message(
                priority="info",
                publisher=NS.publisher_id,
                payload={"message": "%s complete" % self.__class__.__name__}))
Example #9
 def update(self, new_alert, existing_alert):
     if (alert_severity_map[new_alert.severity] <=
             alert_severity_map[existing_alert.severity]
             and alert_severity_map[new_alert.severity]
             == alert_severity_map['INFO']):
         new_alert.ackedby = "TENDRL"
         new_alert.acked = True
         new_alert.acked_at = now()
         new_alert.ack_comment = ['System acked']
     new_alert.alert_id = existing_alert.alert_id
     return new_alert
Example #10
 def update(self, new_alert, existing_alert):
     if (
         alert_severity_map[new_alert.severity] <= alert_severity_map[
             existing_alert.severity] and
         alert_severity_map[new_alert.severity] == alert_severity_map[
             'INFO']
     ):
         new_alert.ackedby = "TENDRL"
         new_alert.acked = True
         new_alert.acked_at = now()
         new_alert.ack_comment = ['System acked']
     new_alert.alert_id = existing_alert.alert_id
     return new_alert
Example #11
 def update(self, new_alert, existing_alert):
     time_stamp = existing_alert.time_stamp
     if (alert_severity_map[new_alert.severity] <
             alert_severity_map[existing_alert.severity]
             and alert_severity_map[new_alert.severity]
             == alert_severity_map['INFO']):
         time_stamp = new_alert.time_stamp
         new_alert.ackedby = constants.TENDRL
         new_alert.acked = True
         new_alert.acked_at = now()
         new_alert.ack_comment = ['System acked']
     new_alert.alert_id = existing_alert.alert_id
     new_alert.time_stamp = time_stamp
     return new_alert
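
The three update() variants above share one auto-ack rule: a new alert whose severity maps to INFO and does not exceed the existing alert's severity is acknowledged by the system (Examples #9 and #10 compare with <=, Example #11 with strict <). A sketch with a hypothetical severity map consistent with those comparisons:

alert_severity_map = {"INFO": 0, "WARNING": 1, "CRITICAL": 2}

def should_auto_ack(new_sev, existing_sev):
    return (alert_severity_map[new_sev] <= alert_severity_map[existing_sev]
            and alert_severity_map[new_sev] == alert_severity_map["INFO"])

assert should_auto_ack("INFO", "WARNING")       # recovery back to INFO
assert not should_auto_ack("CRITICAL", "INFO")  # escalations stay unacked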
Example #12
    def save(self, update=True, ttl=None):
        hash_key_changed = True
        if "Message" not in self.__class__.__name__:
            # If local object.hash is equal to
            # central_store object.hash, return
            if self.hash_compare_with_central_store(ttl=ttl):
                # No change in hashkey
                hash_key_changed = False
        rendered_obj = self.render()
        watchables = self._defs.get("watch_attrs", [])
        if self.__class__.__name__ in ['Config', 'Definition'] or \
            len(watchables) > 0:
            for item in rendered_obj:
                if item['name'] in watchables:
                    _type = self._defs.get("attrs", {}).get(item['name'],
                                                            {}).get("type")
                    if _type and _type.lower() in ['json', 'list'] and \
                        item['value']:
                        try:
                            item['value'] = json.dumps(item['value'])
                        except ValueError:
                            _msg = "Error save() attr %s for object %s" % \
                                   (item['name'], self.__name__)
                            logger.log("debug", NS.publisher_id,
                                       {"message": _msg})
                    if self._ttl and item['name'] in self._attrs_with_ttl:
                        etcd_utils.write(item['key'],
                                         item['value'],
                                         quorum=True,
                                         ttl=self._ttl)
                    else:
                        etcd_utils.write(item['key'],
                                         item['value'],
                                         quorum=True)
        if hash_key_changed:
            data_key = self.value + '/data'
            etcd_utils.write(data_key, self.json)
            updated_at_key = self.value + '/updated_at'
            hash_key = self.value + '/hash'
            etcd_utils.write(updated_at_key, str(time_utils.now()))
            if hasattr(self, 'hash'):
                etcd_utils.write(hash_key, self.hash)

            if ttl:
                etcd_utils.refresh(self.value, ttl)

        self.watch_attrs()
Example #13
    def save(self, update=True, ttl=None):
        hash_key_changed = True
        if "Message" not in self.__class__.__name__:
            # If local object.hash is equal to
            # central_store object.hash, return
            if self.hash_compare_with_central_store(ttl=ttl):
                # No change in hashkey
                hash_key_changed = False
        rendered_obj = self.render()
        watchables = self._defs.get("watch_attrs", [])
        if self.__class__.__name__ in ['Config', 'Definition'] or \
            len(watchables) > 0:
            for item in rendered_obj:
                if item['name'] in watchables:
                    _type = self._defs.get("attrs", {}).get(
                        item['name'],
                        {}
                    ).get("type")
                    if _type and _type.lower() in ['json', 'list'] and \
                        item['value']:
                        try:
                            item['value'] = json.dumps(item['value'])
                        except ValueError:
                            _msg = "Error save() attr %s for object %s" % \
                                   (item['name'], self.__name__)
                            logger.log(
                                "debug",
                                NS.publisher_id,
                                {"message": _msg}
                            )
                    etcd_utils.write(item['key'], item['value'], quorum=True)
        if hash_key_changed:
            data_key = self.value + '/data'
            etcd_utils.write(data_key, self.json)
            updated_at_key = self.value + '/updated_at'
            hash_key = self.value + '/hash'
            etcd_utils.write(updated_at_key, str(time_utils.now()))
            if hasattr(self, 'hash'):
                etcd_utils.write(hash_key, self.hash)

            if ttl:
                etcd_utils.refresh(self.value, ttl)

        self.watch_attrs()
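
The hash_key_changed guard in both save() variants skips rewriting the data/updated_at/hash keys when the object's content hash matches the central store. A minimal sketch of that idea (a dict standing in for etcd, md5 as an arbitrary content hash):

import hashlib
import json

store = {}

def save(key, obj):
    payload = json.dumps(obj, sort_keys=True)
    new_hash = hashlib.md5(payload.encode()).hexdigest()
    if store.get(key + "/hash") == new_hash:
        return False            # unchanged; skip the write
    store[key + "/data"] = payload
    store[key + "/hash"] = new_hash
    return True

assert save("nodes/1", {"status": "UP"}) is True
assert save("nodes/1", {"status": "UP"}) is False  # second save is a no-op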
Example #14
    def __init__(self,
                 node_id=None,
                 fqdn=None,
                 ipv4_addr=None,
                 tags=None,
                 status=None,
                 sync_status=None,
                 last_sync=None,
                 updated_at=None,
                 pkey=None,
                 *args,
                 **kwargs):
        super(NodeContext, self).__init__(*args, **kwargs)
        self.node_id = node_id or self._get_node_id() or self._create_node_id()
        self.fqdn = fqdn or socket.getfqdn()
        self.ipv4_addr = ipv4_addr or socket.gethostbyname(self.fqdn)

        curr_tags = []
        try:
            curr_tags = NS._int.client.read("/nodes/%s/NodeContext/tags" %
                                            self.node_id).value
        except etcd.EtcdKeyNotFound:
            pass

        try:
            curr_tags = json.loads(curr_tags)
        except (ValueError, TypeError):
            # No existing tags
            pass
        self.tags = tags or []
        self.tags += NS.config.data.get('tags', [])
        self.tags += curr_tags
        self.tags = list(set(self.tags))

        self.status = status or "UP"
        self.sync_status = sync_status
        self.last_sync = last_sync
        self.updated_at = updated_at or str(time_utils.now())
        self.pkey = pkey or self.fqdn
        self.value = 'nodes/{0}/NodeContext'
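
The tag handling above unions the caller's tags, configured tags and tags already stored in etcd, then deduplicates via a set round-trip (which does not preserve order). With stand-in values:

caller_tags = ["tendrl/node"]
config_tags = ["tendrl/node", "tendrl/monitor"]  # NS.config.data['tags']
stored_tags = ["detected_cluster/abc"]           # previously written to etcd

tags = caller_tags + config_tags + stored_tags
tags = list(set(tags))
assert sorted(tags) == sorted(
    ["detected_cluster/abc", "tendrl/monitor", "tendrl/node"])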
Example #15
def init(patch_write, patch_refresh, patch_client):
    patch_write.return_value = True
    patch_refresh.return_value = True
    patch_client.return_value = etcd.Client()
    setattr(__builtin__, "NS", maps.NamedDict())
    setattr(NS, "_int", maps.NamedDict())
    NS._int.etcd_kwargs = {
        'port': 1,
        'host': 2,
        'allow_reconnect': True}
    NS._int.client = etcd.Client(**NS._int.etcd_kwargs)
    NS._int.wclient = etcd.Client(**NS._int.etcd_kwargs)
    NS["config"] = maps.NamedDict()
    NS.config["data"] = maps.NamedDict()
    NS.config.data['message_retention_time'] = "infinite"
    NS.node_agent = maps.NamedDict()
    NS.node_agent.objects = importlib.import_module(
        "tendrl.commons.tests.fixtures.cluster_message")
    NS.node_context = maps.NamedDict()
    NS.node_context.node_id = 1
    message = maps.NamedDict()
    message["priority"] = "info"
    message["cluster_id"] = "test_cluster"
    message["message_id"] = "test_id"
    message["timestamp"] = now()
    message["publisher"] = "node_context"
    message["node_id"] = "test_id"
    message["payload"] = {"message": "test_message"}
    message["job_id"] = "test_job_id"
    message["flow_id"] = "test_flow_id"
    message["parent_id"] = "test_parent_id"
    obj_caller = getframeinfo(stack()[1][0])
    obj_caller = {"filename": obj_caller.filename,
                  "line_no": obj_caller.lineno,
                  "function": obj_caller.function}
    message["caller"] = obj_caller
    return message
Example #16
def init(patch_write, patch_refresh, patch_client):
    patch_write.return_value = True
    patch_refresh.return_value = True
    patch_client.return_value = etcd.Client()
    setattr(__builtin__, "NS", maps.NamedDict())
    setattr(NS, "_int", maps.NamedDict())
    NS._int.etcd_kwargs = {
        'port': 1,
        'host': 2,
        'allow_reconnect': True}
    NS._int.client = etcd.Client(**NS._int.etcd_kwargs)
    NS._int.wclient = etcd.Client(**NS._int.etcd_kwargs)
    NS["config"] = maps.NamedDict()
    NS.config["data"] = maps.NamedDict()
    NS.config.data['message_retention_time'] = "infinite"
    NS.node_agent = maps.NamedDict()
    NS.node_agent.objects = importlib.import_module(
        "tendrl.commons.tests.fixtures.cluster_message")
    NS.node_context = maps.NamedDict()
    NS.node_context.node_id = 1
    message = maps.NamedDict()
    message["priority"] = "info"
    message["integration_id"] = "test_cluster"
    message["message_id"] = "test_id"
    message["timestamp"] = now()
    message["publisher"] = "node_context"
    message["node_id"] = "test_id"
    message["payload"] = {"message": "test_message"}
    message["job_id"] = "test_job_id"
    message["flow_id"] = "test_flow_id"
    message["parent_id"] = "test_parent_id"
    obj_caller = getframeinfo(stack()[1][0])
    obj_caller = {"filename": obj_caller.filename,
                  "line_no": obj_caller.lineno,
                  "function": obj_caller.function}
    message["caller"] = obj_caller
    return message
Example #17
    def run(self):
        Event(
            Message(
                priority="info",
                publisher=NS.publisher_id,
                payload={"message": "%s running" % self.__class__.__name__}))

        NS.node_context = NS.node_context.load()
        current_tags = list(NS.node_context.tags)
        current_tags += ["tendrl/node_%s" % NS.node_context.node_id]
        NS.node_context.tags = list(set(current_tags))
        NS.node_context.status = "UP"
        NS.node_context.save()
        # Initialize alert count
        try:
            key = '/nodes/%s/alert_counters' % NS.node_context.node_id
            etcd_utils.read(key)
        except (EtcdException) as ex:
            if type(ex) == EtcdKeyNotFound:
                NodeAlertCounters(node_id=NS.node_context.node_id).save()

        _sleep = 0
        while not self._complete.is_set():
            _sync_ttl = int(NS.config.data.get("sync_interval", 10)) + 100
            if _sleep > 5:
                _sleep = int(NS.config.data.get("sync_interval", 10))
            else:
                _sleep += 1

            NS.node_context = NS.node_context.load()
            NS.node_context.sync_status = "in_progress"
            NS.node_context.status = "UP"
            NS.node_context.save(ttl=_sync_ttl)
            NS.tendrl_context = NS.tendrl_context.load()

            sync_cluster_contexts_thread = threading.Thread(
                target=cluster_contexts_sync.sync, args=(_sync_ttl, ))
            sync_cluster_contexts_thread.daemon = True
            sync_cluster_contexts_thread.start()
            sync_cluster_contexts_thread.join()

            platform_detect_thread = threading.Thread(
                target=platform_detect.sync)
            platform_detect_thread.daemon = True
            platform_detect_thread.start()
            platform_detect_thread.join()
            sds_detect_thread = threading.Thread(target=sds_detect.sync)
            sds_detect_thread.daemon = True
            sds_detect_thread.start()
            sds_detect_thread.join()

            sync_service_and_index_thread = threading.Thread(
                target=services_and_index_sync.sync, args=(_sync_ttl, ))
            sync_service_and_index_thread.daemon = True
            sync_service_and_index_thread.start()
            sync_service_and_index_thread.join()

            try:
                NS.tendrl.objects.Os().save()
                NS.tendrl.objects.Cpu().save()
                NS.tendrl.objects.Memory().save()
            except Exception as ex:
                Event(
                    ExceptionMessage(priority="error",
                                     publisher=NS.publisher_id,
                                     payload={
                                         "message":
                                         "node_sync "
                                         "os/cpu/memory sync failed: " +
                                         ex.message,
                                         "exception":
                                         ex
                                     }))
                NS.node_context = NS.node_context.load()
                NS.node_context.sync_status = "failed"
                NS.node_context.last_sync = str(time_utils.now())
                NS.node_context.status = "UP"
                NS.node_context.save(ttl=_sync_ttl)
                time.sleep(_sleep)

            sync_disks_thread = threading.Thread(target=disk_sync.sync)
            sync_disks_thread.daemon = True
            sync_disks_thread.start()
            sync_disks_thread.join()

            sync_networks_thread = threading.Thread(target=network_sync.sync)
            sync_networks_thread.daemon = True
            sync_networks_thread.start()
            sync_networks_thread.join()

            NS.node_context = NS.node_context.load()
            NS.node_context.sync_status = "done"
            NS.node_context.last_sync = str(time_utils.now())
            NS.node_context.status = "UP"
            NS.node_context.save(ttl=_sync_ttl)

            sync_cluster_contexts_thread = threading.Thread(
                target=cluster_contexts_sync.sync, args=(_sync_ttl, ))
            sync_cluster_contexts_thread.daemon = True
            sync_cluster_contexts_thread.start()
            sync_cluster_contexts_thread.join()

            if "tendrl/monitor" in NS.node_context.tags:
                check_all_managed_node_status_thread = threading.Thread(
                    target=check_all_managed_nodes_status.run)
                check_all_managed_node_status_thread.daemon = True
                check_all_managed_node_status_thread.start()
                check_all_managed_node_status_thread.join()

                check_cluster_status_thread = threading.Thread(
                    target=check_cluster_status.run)
                check_cluster_status_thread.daemon = True
                check_cluster_status_thread.start()
                check_cluster_status_thread.join()

                if not NS.gluster_sds_sync_running:
                    NS.gluster_integrations_sync_thread = \
                        gluster_integrations_sds_sync.\
                        GlusterIntegrtaionsSyncThread()
                    NS.gluster_integrations_sync_thread.start()
                    NS.gluster_sds_sync_running = True

            time.sleep(_sleep)

        Event(
            Message(
                priority="info",
                publisher=NS.publisher_id,
                payload={"message": "%s complete" % self.__class__.__name__}))
Example #18
    def volume_delete(self, event):
        time.sleep(self.sync_interval)
        fetched_volumes = NS.tendrl.objects.GlusterVolume(
            NS.tendrl_context.integration_id).load_all()
        for fetched_volume in fetched_volumes:
            if fetched_volume.name == event['message']['name']:
                fetched_volume.deleted = True
                fetched_volume.deleted_at = time_utils.now()
                fetched_volume.save()
                try:
                    sub_volumes = etcd_utils.read(
                        "/clusters/{0}/Volumes/{1}/Bricks".format(
                            NS.tendrl_context.integration_id,
                            fetched_volume.vol_id))

                    for sub_volume in sub_volumes.leaves:
                        bricks = etcd_utils.read(sub_volume.key)
                        for brick in bricks.leaves:
                            fqdn = brick.key.split('/')[-1].split(':')[0]
                            path = brick.key.split('/')[-1].split(':')[-1][1:]
                            # Delete brick dashboard from grafana
                            brick_obj = NS.tendrl.objects.GlusterBrick(
                                NS.tendrl_context.integration_id, fqdn,
                                path).load()
                            # Delete brick
                            brick_path = "clusters/{0}/Bricks/"\
                                         "all/{1}/{2}".format(
                                             NS.tendrl_context.integration_id,
                                             fqdn,
                                             path
                                         )
                            etcd_utils.delete(brick_path, recursive=True)
                            brick_full_path = fqdn + ":" + brick_obj.\
                                brick_path.split(":")[-1]
                            job_id = monitoring_utils.update_dashboard(
                                "%s|%s" %
                                (event['message']['name'], brick_full_path),
                                RESOURCE_TYPE_BRICK,
                                NS.tendrl_context.integration_id, "delete")
                            logger.log(
                                "debug", NS.publisher_id, {
                                    "message":
                                    "Update dashboard job %s"
                                    " for brick %s "
                                    "in cluster %s created" %
                                    (job_id, brick.key.split('/')[-1],
                                     NS.tendrl_context.integration_id)
                                })
                            # Delete brick from graphite
                            job_id = monitoring_utils.\
                                delete_resource_from_graphite(
                                    "%s|%s" % (
                                        event['message']['name'],
                                        brick_full_path
                                    ),
                                    RESOURCE_TYPE_BRICK,
                                    NS.tendrl_context.integration_id,
                                    "delete"
                                )
                            logger.log(
                                "debug", NS.publisher_id, {
                                    "message":
                                    "Delete resource "
                                    "from graphite job %s "
                                    "for brick %s in cluster %s created" %
                                    (job_id, brick.key.split('/')[-1],
                                     NS.tendrl_context.integration_id)
                                })
                except etcd.EtcdKeyNotFound:
                    pass
        # Delete volume dashboard from grafana
        job_id = monitoring_utils.update_dashboard(
            event['message']['name'], RESOURCE_TYPE_VOLUME,
            NS.tendrl_context.integration_id, "delete")
        logger.log("debug", NS.publisher_id,
                   {"message": "Update dashboard job %s "
                    "created" % job_id})
        # Delete volume details from graphite
        job_id = monitoring_utils.delete_resource_from_graphite(
            event['message']['name'], RESOURCE_TYPE_VOLUME,
            NS.tendrl_context.integration_id, "delete")
        logger.log("debug", NS.publisher_id, {
            "message":
            "Delete resource from graphite job %s "
            "created" % job_id
        })
Example #19
def process_job(jid):
    job = NS.tendrl.objects.Job(job_id=jid).load()
    if job.status in [None, ""]:
        job.status = "new"
        job.save()

    NS.node_context = NS.node_context.load()
    # Check the job is not already "finished", "processing" or "failed"
    try:
        if job.status in ["finished",
                          "processing",
                          "failed"]:
            return
    except etcd.EtcdKeyNotFound:
        pass

    try:
        _timeout = None
        _timeout = job.timeout
        if _timeout:
            _timeout = _timeout.lower()
    except etcd.EtcdKeyNotFound:
        pass

    # A tendrl-node-agent tagged as tendrl/monitor ensures that
    # "new" jobs older than 10 minutes are timed out and marked
    # as "failed" (the parent job of these jobs is also marked
    # as "failed")
    if "tendrl/monitor" in NS.node_context.tags and \
        _timeout == "yes" and job.status == "new":
        _valid_until = job.valid_until

        if _valid_until:
            _now_epoch = (time_utils.now() -
                          datetime.datetime(1970, 1,
                                            1).replace(
                              tzinfo=utc)).total_seconds()
            if int(_now_epoch) >= int(_valid_until):
                # Job has had "new" status for over 10 minutes;
                # mark its status as "failed" and set Job.error =
                # "Timed out"
                try:
                    job = job.load()
                    if job.status == "new":
                        job.status = "failed"
                        job.save()
                except etcd.EtcdCompareFailed:
                    pass
                else:
                    job = NS.tendrl.objects.Job(job_id=jid).load()
                    if job.status == "new":
                        _msg = str("Timed-out (>10min as 'new')")
                        job.errors = _msg
                        job.save()
                        if job.payload.get('parent') is None:
                            integration_id = NS.tendrl_context.integration_id
                            alert_utils.alert_job_status(
                                "failed",
                                "Job timed out (job_id: %s)" % jid,
                                integration_id=integration_id or
                                job.payload['parameters'].get(
                                    'TendrlContext.integration_id'
                                ),
                                cluster_name=NS.tendrl_context.cluster_name or
                                job.payload['parameters'].get(
                                    'TendrlContext.cluster_name'
                                )
                            )
                    return
        else:
            _now_plus_10 = time_utils.now() + datetime.timedelta(minutes=10)
            _epoch_start = datetime.datetime(1970, 1, 1).replace(tzinfo=utc)

            _now_plus_10_epoch = (_now_plus_10 -
                                  _epoch_start).total_seconds()
            time.sleep(7)
            job = job.load()
            if job.status == "new":
                # Avoid the server and storage nodes saving at the same time
                job.valid_until = int(_now_plus_10_epoch)
                job.save()

    job = NS.tendrl.objects.Job(job_id=jid).load()
    if job.payload["type"] == NS.type and \
            job.status == "new":
        # Job routing
        # Flows created by tendrl-api use 'tags' from flow
        # definition to target jobs
        _tag_match = False
        if job.payload.get("tags", []):
            for flow_tag in job.payload['tags']:
                if flow_tag in NS.node_context.tags:
                    _tag_match = True

        if not _tag_match:
            _job_tags = ", ".join(job.payload.get("tags", []))
            _msg = "Node (%s)(type: %s)(tags: %s) will not " \
                   "process job-%s (tags: %s)" % \
                   (NS.node_context.node_id, NS.type,
                    NS.node_context.tags, jid,
                    _job_tags)
            logger.log(
                "debug",
                NS.publisher_id,
                {"message": _msg}
            )
            return

        try:
            try:
                job_status_key = "/queue/%s/status" % job.job_id
                etcd_utils.write(job_status_key,
                                 "processing",
                                 prevValue="new")
            except etcd.EtcdKeyNotFound:
                # if the status watchable attribute is not present,
                # it will be created when the job is saved
                pass
            lock_info = dict(node_id=NS.node_context.node_id,
                             fqdn=NS.node_context.fqdn,
                             type=NS.type)
            job = job.load()
            job.locked_by = lock_info
            job.status = "processing"
            job.save(ttl=DEFAULT_JOB_TTL)
        except etcd.EtcdCompareFailed:
            # job is already being processed by some tendrl
            # agent
            return

        the_flow = None
        try:
            current_ns, flow_name, obj_name = \
                _extract_fqdn(job.payload['run'])

            if obj_name:
                runnable_flow = current_ns.ns.get_obj_flow(
                    obj_name, flow_name)
            else:
                runnable_flow = current_ns.ns.get_flow(flow_name)

            job = job.load()
            lock_info = dict(node_id=NS.node_context.node_id,
                             fqdn=NS.node_context.fqdn,
                             type=NS.type)
            if job.locked_by != lock_info:
                return

            the_flow = runnable_flow(parameters=job.payload[
                'parameters'], job_id=job.job_id)
            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Starting Job %s" %
                            job.job_id},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id']
            )

            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Running %s" %
                            job.payload['run'].split('.')[-1]},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id']
            )

            the_flow.run()

            try:
                job = job.load()
                job.status = "finished"
                job.save()
            except etcd.EtcdCompareFailed:
                # This should not happen!
                _msg = "Cannot mark job as 'finished', " \
                       "current job status invalid"
                raise FlowExecutionFailedError(_msg)

            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Job (%s) for %s finished. "
                            % (
                                job.job_id,
                                job.payload['run'].split('.')[-1])},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id'],
            )
            if job.payload.get('parent') is None:
                alert_utils.alert_job_status(
                    "finished",
                    "%s (job ID: %s) completed successfully " % (
                        job.payload['run'].split('.')[-1],
                        job.job_id),
                    integration_id=NS.tendrl_context.integration_id or
                    job.payload['parameters'].get(
                        'TendrlContext.integration_id'
                    ),
                    cluster_name=NS.tendrl_context.cluster_name or
                    job.payload['parameters'].get(
                        'TendrlContext.cluster_name'
                    )
                )
        except (FlowExecutionFailedError,
                AtomExecutionFailedError,
                Exception) as e:
            _trace = str(traceback.format_exc(e))
            _msg = "Failure in Job %s Flow %s with error:" % \
                   (job.job_id, job.payload['run'])
            Event(
                ExceptionMessage(
                    priority="error",
                    publisher=NS.publisher_id,
                    payload={"message": _msg + _trace,
                             "exception": e
                             }
                )
            )
            if the_flow:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": _msg + "\n" + _trace},
                    job_id=job.job_id,
                    flow_id=the_flow.parameters['flow_id']
                )
            else:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": _msg + "\n" + _trace}
                )

            try:
                job = job.load()
                job.status = "failed"
                job.save()
            except etcd.EtcdCompareFailed:
                # This should not happen!
                _msg = "Cannot mark job as 'failed', current" \
                       "job status invalid"
                raise FlowExecutionFailedError(_msg)
            else:
                job = job.load()
                job.errors = _trace
                if job.payload.get('parent') is None:
                    alert_utils.alert_job_status(
                        "failed",
                        "Job failed (job_id: %s)" % job.job_id,
                        integration_id=NS.tendrl_context.integration_id or
                        job.payload['parameters'].get(
                            'TendrlContext.integration_id'
                        ),
                        cluster_name=NS.tendrl_context.cluster_name or
                        job.payload['parameters'].get(
                            'TendrlContext.cluster_name'
                        )
                    )
                job.save()
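
process_job() drives each job through new -> processing -> finished/failed, relying on etcd compare-and-swap writes (prevValue, or save() raising EtcdCompareFailed) so that only one agent wins each transition. The lifecycle, sketched with an in-memory check standing in for etcd's guarantee:

VALID = {
    "new": {"processing", "failed"},
    "processing": {"finished", "failed"},
}

def transition(job, new_status):
    # etcd enforces this atomically via compare-and-swap; here a
    # plain dict lookup stands in for that guarantee.
    if new_status not in VALID.get(job["status"], set()):
        raise ValueError("invalid transition %s -> %s"
                         % (job["status"], new_status))
    job["status"] = new_status

job = {"status": "new"}
transition(job, "processing")
transition(job, "finished")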
Example #20
 def __init__(self):
     self.time_stamp = now()
     self.alert = None
Example #21
    def volume_delete(self, event):
        time.sleep(self.sync_interval)
        fetched_volumes = NS.tendrl.objects.GlusterVolume(
            NS.tendrl_context.integration_id
        ).load_all()
        for fetched_volume in fetched_volumes:
            if fetched_volume.name == event['message']['name']:
                fetched_volume.deleted = True
                fetched_volume.deleted_at = time_utils.now()
                fetched_volume.save()
                try:
                    sub_volumes = etcd_utils.read(
                        "/clusters/{0}/Volumes/{1}/Bricks".format(
                            NS.tendrl_context.integration_id,
                            fetched_volume.vol_id
                        )
                    )

                    for sub_volume in sub_volumes.leaves:
                        bricks = etcd_utils.read(
                            sub_volume.key
                        )
                        for brick in bricks.leaves:
                            fqdn = brick.key.split('/')[-1].split(':')[0]
                            path = brick.key.split('/')[-1].split(':')[-1][1:]
                            # Delete brick dashboard from grafana
                            brick_obj = NS.tendrl.objects.GlusterBrick(
                                NS.tendrl_context.integration_id,
                                fqdn,
                                path
                            ).load()
                            # Delete brick
                            brick_path = "clusters/{0}/Bricks/"\
                                         "all/{1}/{2}".format(
                                             NS.tendrl_context.integration_id,
                                             fqdn,
                                             path
                                         )
                            etcd_utils.delete(
                                brick_path,
                                recursive=True
                            )
                            brick_full_path = fqdn + ":" + brick_obj.\
                                brick_path.split(":")[-1]
                            job_id = monitoring_utils.update_dashboard(
                                "%s|%s" % (
                                    event['message']['name'],
                                    brick_full_path
                                ),
                                RESOURCE_TYPE_BRICK,
                                NS.tendrl_context.integration_id,
                                "delete"
                            )
                            logger.log(
                                "debug",
                                NS.publisher_id,
                                {
                                    "message": "Update dashboard job %s"
                                    " for brick %s "
                                    "in cluster %s created" % (
                                        job_id,
                                        brick.key.split('/')[-1],
                                        NS.tendrl_context.integration_id
                                    )
                                }
                            )
                            # Delete brick from graphite
                            job_id = monitoring_utils.\
                                delete_resource_from_graphite(
                                    "%s|%s" % (
                                        event['message']['name'],
                                        brick_full_path
                                    ),
                                    RESOURCE_TYPE_BRICK,
                                    NS.tendrl_context.integration_id,
                                    "delete"
                                )
                            logger.log(
                                "debug",
                                NS.publisher_id,
                                {
                                    "message": "Delete resource "
                                    "from graphite job %s "
                                    "for brick %s in cluster %s created" % (
                                        job_id,
                                        brick.key.split('/')[-1],
                                        NS.tendrl_context.integration_id
                                    )
                                }
                            )
                except etcd.EtcdKeyNotFound:
                    pass
        # Delete volume dashboard from grafana
        job_id = monitoring_utils.update_dashboard(
            event['message']['name'],
            RESOURCE_TYPE_VOLUME,
            NS.tendrl_context.integration_id,
            "delete"
        )
        logger.log(
            "debug",
            NS.publisher_id,
            {
                "message": "Update dashboard job %s "
                "created" % job_id
            }
        )
        # Delete volume details from graphite
        job_id = monitoring_utils.delete_resource_from_graphite(
            event['message']['name'],
            RESOURCE_TYPE_VOLUME,
            NS.tendrl_context.integration_id,
            "delete"
        )
        logger.log(
            "debug",
            NS.publisher_id,
            {
                "message": "Delete resource from graphite job %s "
                "created" % job_id
            }
        )
Example #22
def process_job(jid):
    job = NS.tendrl.objects.Job(job_id=jid).load()
    if job.status in [None, ""]:
        job.status = "new"
        job.save()

    NS.node_context = NS.node_context.load()
    # Check the job is not already "finished", "processing" or "failed"
    try:
        if job.status in ["finished", "processing", "failed"]:
            return
    except etcd.EtcdKeyNotFound:
        pass

    try:
        _timeout = None
        _timeout = job.timeout
        if _timeout:
            _timeout = _timeout.lower()
    except etcd.EtcdKeyNotFound:
        pass

    # A tendrl-node-agent tagged as tendrl/monitor ensures that
    # "new" parent jobs older than 10 minutes are timed out and
    # marked as "failed"
    if "tendrl/monitor" in NS.node_context.tags and _timeout == "yes" and \
        job.status == "new" and job.payload.get('parent') is None:
        _valid_until = job.valid_until

        if _valid_until:
            _now_epoch = (time_utils.now() - datetime.datetime(
                1970, 1, 1).replace(tzinfo=utc)).total_seconds()
            if int(_now_epoch) >= int(_valid_until):
                # Job has had "new" status for over 10 minutes;
                # mark its status as "failed" and set Job.error =
                # "Timed out"
                _msg = str("Timed-out (>10min as 'new')")
                job.errors = _msg
                job.status = "failed"
                job.save()
                integration_id = NS.tendrl_context.integration_id
                alert_utils.alert_job_status(
                    "failed",
                    "Job timed out (job_id: %s)" % jid,
                    integration_id=integration_id
                    or job.payload['parameters'].get(
                        'TendrlContext.integration_id'),
                    cluster_name=NS.tendrl_context.cluster_name
                    or job.payload['parameters'].get(
                        'TendrlContext.cluster_name'))
                return
        else:
            _now_plus_10 = time_utils.now() + datetime.timedelta(minutes=10)
            _epoch_start = datetime.datetime(1970, 1, 1).replace(tzinfo=utc)

            _now_plus_10_epoch = (_now_plus_10 - _epoch_start).total_seconds()
            job = NS.tendrl.objects.Job(job_id=jid).load()
            if job.status == "new":
                # Avoid the server and storage nodes saving at the same time
                job.valid_until = int(_now_plus_10_epoch)
                job.save()

    job = NS.tendrl.objects.Job(job_id=jid).load()
    if job.payload["type"] == NS.type and \
            job.status == "new":
        # Job routing
        # Flows created by tendrl-api use 'tags' from flow
        # definition to target jobs
        _tag_match = False
        if job.payload.get("tags", []):
            for flow_tag in job.payload['tags']:
                if flow_tag in NS.node_context.tags:
                    _tag_match = True

        if not _tag_match:
            _job_tags = ", ".join(job.payload.get("tags", []))
            _msg = "Node (%s)(type: %s)(tags: %s) will not " \
                   "process job-%s (tags: %s)" % \
                   (NS.node_context.node_id, NS.type,
                    NS.node_context.tags, jid,
                    _job_tags)
            logger.log("debug", NS.publisher_id, {"message": _msg})
            return

        try:
            try:
                job_status_key = "/queue/%s/status" % job.job_id
                etcd_utils.write(job_status_key, "processing", prevValue="new")
            except etcd.EtcdKeyNotFound:
                # if the status watchable attribute is not present,
                # it will be created when the job is saved
                pass
            lock_info = dict(node_id=NS.node_context.node_id,
                             fqdn=NS.node_context.fqdn,
                             type=NS.type)
            job = NS.tendrl.objects.Job(job_id=jid).load()
            job.locked_by = lock_info
            job.status = "processing"
            job.save(ttl=DEFAULT_JOB_TTL)
        except etcd.EtcdCompareFailed:
            # job is already being processed by some tendrl
            # agent
            return

        the_flow = None
        try:
            current_ns, flow_name, obj_name = \
                _extract_fqdn(job.payload['run'])

            if obj_name:
                runnable_flow = current_ns.ns.get_obj_flow(obj_name, flow_name)
            else:
                runnable_flow = current_ns.ns.get_flow(flow_name)
            time.sleep(2)
            job = NS.tendrl.objects.Job(job_id=jid).load()
            lock_info = dict(node_id=NS.node_context.node_id,
                             fqdn=NS.node_context.fqdn,
                             type=NS.type)
            if job.locked_by != lock_info:
                return

            the_flow = runnable_flow(parameters=job.payload['parameters'],
                                     job_id=job.job_id)
            # Tendrl server does not have fqdn in node_context
            logger.log("info",
                       NS.publisher_id, {
                           "message":
                           "Starting %s Job: %s on %s" %
                           (job.payload['run'].split('.')[-1], job.job_id,
                            NS.node_context.fqdn or "server")
                       },
                       job_id=job.job_id,
                       flow_id=the_flow.parameters['flow_id'])

            logger.log("info",
                       NS.publisher_id, {
                           "message":
                           "Running %s job: %s on %s" %
                           (job.payload['run'].split('.')[-1], job.job_id,
                            NS.node_context.fqdn or "server")
                       },
                       job_id=job.job_id,
                       flow_id=the_flow.parameters['flow_id'])

            the_flow.run()

            try:
                job = NS.tendrl.objects.Job(job_id=jid).load()
                job.status = "finished"
                job.save()
            except etcd.EtcdCompareFailed:
                # This should not happen!
                _msg = "Cannot mark job as 'finished', " \
                       "current job status invalid"
                raise FlowExecutionFailedError(_msg)

            logger.log(
                "info",
                NS.publisher_id,
                {
                    "message":
                    "Job (%s) for %s finished. " %
                    (job.job_id, job.payload['run'].split('.')[-1])
                },
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id'],
            )
            if job.payload.get('parent') is None:
                alert_utils.alert_job_status(
                    "finished",
                    "%s (job ID: %s) completed successfully " %
                    (job.payload['run'].split('.')[-1], job.job_id),
                    integration_id=NS.tendrl_context.integration_id
                    or job.payload['parameters'].get(
                        'TendrlContext.integration_id'),
                    cluster_name=NS.tendrl_context.cluster_name
                    or job.payload['parameters'].get(
                        'TendrlContext.cluster_name'))
        except (FlowExecutionFailedError, AtomExecutionFailedError,
                Exception) as e:
            _trace = str(traceback.format_exc(e))
            _msg = "Failure in Job %s Flow %s with error:" % \
                   (job.job_id, job.payload['run'])
            Event(
                ExceptionMessage(priority="error",
                                 publisher=NS.publisher_id,
                                 payload={
                                     "message": _msg + _trace,
                                     "exception": e
                                 }))
            if the_flow:
                logger.log("error",
                           NS.publisher_id, {"message": _msg + "\n" + _trace},
                           job_id=job.job_id,
                           flow_id=the_flow.parameters['flow_id'])
            else:
                logger.log("error", NS.publisher_id,
                           {"message": _msg + "\n" + _trace})

            try:
                job = NS.tendrl.objects.Job(job_id=jid).load()
                job.status = "failed"
                job.save()
            except etcd.EtcdCompareFailed:
                # This should not happen!
                _msg = "Cannot mark job as 'failed', current" \
                       "job status invalid"
                raise FlowExecutionFailedError(_msg)
            else:
                job = NS.tendrl.objects.Job(job_id=jid).load()
                job.errors = _trace
                if job.payload.get('parent') is None:
                    alert_utils.alert_job_status(
                        "failed",
                        "Job failed (job_id: %s)" % job.job_id,
                        integration_id=NS.tendrl_context.integration_id
                        or job.payload['parameters'].get(
                            'TendrlContext.integration_id'),
                        cluster_name=NS.tendrl_context.cluster_name
                        or job.payload['parameters'].get(
                            'TendrlContext.cluster_name'))
                job.save()
Example #23
    def on_sync_object(self, data):

        assert data['fsid'] == self.fsid

        sync_object = copy.deepcopy(data['data'])

        sync_type = SYNC_OBJECT_STR_TYPE[data['type']]
        new_object = self.inject_sync_object(data['type'], data['version'],
                                             sync_object)
        self._request_coll.on_map(sync_type, new_object)
        if new_object:
            # Check and raise any alerts if required

            # TODO(team) Enable the below if condition when alerting
            # is needed for cluster health, mon status, pool
            # status etc.

            # if sync_type.str == "health":
            #    self._on_health(sync_object)
            # if sync_type.str == "mon_status":
            #    self._on_mon_status(sync_object)
            if sync_type.str == "osd_map":
                # self._on_pool_status(sync_object)
                self._on_osd_map(sync_object)

            NS.ceph.objects.SyncObject(
                updated=now(),
                sync_type=sync_type.str,
                version=new_object.version if isinstance(
                    new_object.version, int) else None,
                when=now(),
                data=data['data']).save(update=False)

            if sync_type.str == "health":
                NS.ceph.objects.GlobalDetails(
                    status=sync_object['overall_status']).save()
            if sync_type.str == "osd_map":
                # Pool out of band deletion handling
                try:
                    pools = NS._int.client.read(
                        "clusters/%s/Pools" % NS.tendrl_context.integration_id)
                    old_pool_ids = []
                    for pool in pools.leaves:
                        old_pool_ids.append(int(pool.key.split("/")[-1]))
                    new_pool_ids = []
                    for raw_pool in sync_object.get('pools', []):
                        new_pool_ids.append(raw_pool['pool'])
                    delete_pool_ids = set(old_pool_ids) - set(new_pool_ids)
                    for pool_id in delete_pool_ids:
                        NS._int.client.delete(
                            "clusters/%s/Pools/%s" %
                            (NS.tendrl_context.integration_id, pool_id),
                            recursive=True)
                except etcd.EtcdKeyNotFound as ex:
                    Event(
                        ExceptionMessage(priority="debug",
                                         publisher=NS.publisher_id,
                                         payload={
                                             "message":
                                             "No pools found \
                                     for ceph cluster %s" %
                                             NS.tendrl_context.integration_id,
                                             "exception":
                                             ex
                                         }))
                for raw_pool in sync_object.get('pools', []):
                    Event(
                        Message(priority="info",
                                publisher=NS.publisher_id,
                                payload={
                                    "message":
                                    "Updating Pool %s" % raw_pool['pool_name']
                                }))
                    pool_type = 'replicated'
                    if 'erasure_code_profile' in raw_pool and \
                        raw_pool['erasure_code_profile'] != "":
                        pool_type = 'erasure_coded'
                    quota_enabled = False
                    if ('quota_max_objects' in raw_pool and
                        raw_pool['quota_max_objects'] > 0) or \
                        ('quota_max_bytes' in raw_pool and
                         raw_pool['quota_max_bytes'] > 0):
                        quota_enabled = True
                    NS.ceph.objects.Pool(
                        pool_id=raw_pool['pool'],
                        pool_name=raw_pool['pool_name'],
                        pg_num=raw_pool['pg_num'],
                        type=pool_type,
                        erasure_code_profile=raw_pool.get(
                            'erasure_code_profile'),
                        min_size=raw_pool['min_size'],
                        size=raw_pool.get('size', None),
                        quota_enabled=quota_enabled,
                        quota_max_objects=raw_pool['quota_max_objects'],
                        quota_max_bytes=raw_pool['quota_max_bytes'],
                    ).save()
                # Osd out of band deletion handling
                try:
                    osds = NS._int.client.read(
                        "clusters/%s/Osds" % NS.tendrl_context.integration_id)
                    old_osds = []
                    for osd in osds.leaves:
                        old_osds.append(str(osd.key.split("/")[-1]))
                    new_osds = []
                    for raw_osd in sync_object.get('osds', []):
                        new_osds.append(raw_osd['uuid'])
                    delete_osds = set(old_osds) - set(new_osds)
                    for osd_id in delete_osds:
                        NS._int.client.delete(
                            "clusters/%s/Osds/%s" %
                            (NS.tendrl_context.integration_id, osd_id),
                            recursive=True)
                except etcd.EtcdKeyNotFound as ex:
                    Event(
                        ExceptionMessage(priority="debug",
                                         publisher=NS.publisher_id,
                                         payload={
                                             "message":
                                             "key not found in etcd",
                                             "exception": ex
                                         }))
                for raw_osd in sync_object.get('osds', []):
                    Event(
                        Message(priority="info",
                                publisher=NS.publisher_id,
                                payload={
                                    "message":
                                    "Updating OSD %s" % raw_osd['osd']
                                }))
                    osd_host = socket.gethostbyaddr(
                        raw_osd['public_addr'].split(':')[0])[0]
                    NS.ceph.objects.Osd(
                        id=raw_osd['osd'],
                        uuid=raw_osd['uuid'],
                        hostname=osd_host,
                        public_addr=raw_osd['public_addr'],
                        cluster_addr=raw_osd['cluster_addr'],
                        heartbeat_front_addr=raw_osd['heartbeat_front_addr'],
                        heartbeat_back_addr=raw_osd['heartbeat_back_addr'],
                        down_at=raw_osd['down_at'],
                        up_from=raw_osd['up_from'],
                        lost_at=raw_osd['lost_at'],
                        osd_up=raw_osd['up'],
                        osd_in=raw_osd['in'],
                        up_thru=raw_osd['up_thru'],
                        weight=str(raw_osd['weight']),
                        primary_affinity=str(raw_osd['primary_affinity']),
                        state=raw_osd['state'],
                        last_clean_begin=raw_osd['last_clean_begin'],
                        last_clean_end=raw_osd['last_clean_end']).save()
        else:
            Event(
                Message(priority="debug",
                        publisher=NS.publisher_id,
                        payload={
                            "message":
                            "ClusterMonitor.on_sync_object: "
                            "stale object received for %s" % data['type']
                        }))
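Both "out of band deletion" blocks in on_sync_object above follow one pattern: diff the IDs already stored under an etcd prefix against the IDs present in the incoming sync object, then recursively delete the leftovers. A minimal sketch of that pattern, assuming python-etcd; the helper name and signature are hypothetical:

import etcd


def delete_stale_keys(client, prefix, current_ids):
    # collect the IDs already stored (last path segment of each leaf)
    try:
        stored = client.read(prefix)
    except etcd.EtcdKeyNotFound:
        return  # nothing stored yet, so nothing can be stale
    old_ids = set(leaf.key.split("/")[-1] for leaf in stored.leaves)
    # anything stored but absent from the incoming sync object is stale
    for stale_id in old_ids - set(str(i) for i in current_ids):
        client.delete("%s/%s" % (prefix, stale_id), recursive=True)

With the objects above this would be called roughly as delete_stale_keys(NS._int.client, "clusters/%s/Pools" % NS.tendrl_context.integration_id, new_pool_ids).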
Exemple #24
0
    def save(self, update=True, ttl=None):
        self.render()
        if "Message" not in self.__class__.__name__:
            # If local object.hash is equal to
            # central_store object.hash, return
            if self.hash_compare_with_central_store(ttl=ttl):
                return
        if update:
            current_obj = self.load()
            for attr, val in vars(self).iteritems():
                if isinstance(val, (types.FunctionType,
                                    types.BuiltinFunctionType,
                                    types.MethodType, types.BuiltinMethodType,
                                    types.UnboundMethodType)) or \
                        attr.startswith("_") or attr in ['value', 'list']:
                    continue

                if val is None and hasattr(current_obj, attr):
                    # if self.attr is None, use attr value from central
                    # store (i.e. current_obj.attr)
                    if getattr(current_obj, attr):
                        setattr(self, attr, getattr(current_obj, attr))

        self.updated_at = str(time_utils.now())
        for item in self.render():
            '''
                Note: Log messages in this file are wrapped in
                try-except blocks so they still run when the
                node_agent has not been started and the namespaces
                are still being created.
            '''
            try:
                logger.log("debug", NS.publisher_id, {
                    "message":
                    "Writing %s to %s" % (item['key'], item['value'])
                })
            except KeyError:
                sys.stdout.write("Writing %s to %s \n" %
                                 (item['key'], item['value']))
            # serialize list/dict attr values to JSON strings per the definitions
            _type = self._defs.get("attrs", {}).get(item['name'],
                                                    {}).get("type")
            if _type:
                if _type.lower() in ['json', 'list']:
                    if item['value']:
                        try:
                            item['value'] = json.dumps(item['value'])
                        except (TypeError, ValueError) as ex:
                            _msg = "Error save() attr %s for object %s" % \
                                   (item['name'], self.__class__.__name__)
                            Event(
                                ExceptionMessage(priority="debug",
                                                 publisher=NS.publisher_id,
                                                 payload={
                                                     "message": _msg,
                                                     "exception": ex
                                                 }))
            try:
                NS._int.wclient.write(item['key'], item['value'], quorum=True)
            except (etcd.EtcdConnectionFailed, etcd.EtcdException):
                NS._int.wreconnect()
                NS._int.wclient.write(item['key'], item['value'], quorum=True)
        if ttl:
            etcd_utils.refresh(self.value, ttl)

        self.watch_attrs()
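The early return in save() above is a write-suppression gate: the object is persisted only when its content hash differs from the hash recorded at the last write. A self-contained sketch of the same idea; the class and its internals are hypothetical, not the Tendrl object API:

import hashlib
import json


class HashGatedStore(object):
    """Skip writes whose serialized payload has not changed."""

    def __init__(self):
        self._last_hash = {}  # key -> digest of the last written value

    def save(self, key, value):
        digest = hashlib.md5(
            json.dumps(value, sort_keys=True).encode("utf-8")).hexdigest()
        if self._last_hash.get(key) == digest:
            return False  # unchanged since the last write, skip it
        # ... write `value` to the central store here ...
        self._last_hash[key] = digest
        return True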
Exemple #25
0
    def run(self):
        logger.log("info", NS.publisher_id,
                   {"message": "%s running" % self.__class__.__name__})
        NS.node_context = NS.node_context.load()
        current_tags = list(NS.node_context.tags)
        current_tags += ["tendrl/node_%s" % NS.node_context.node_id]
        NS.node_context.tags = list(set(current_tags))
        NS.node_context.status = "UP"
        NS.node_context.save()
        _sleep = 0
        msg = "{0} is UP".format(NS.node_context.fqdn)
        event_utils.emit_event("node_status",
                               "UP",
                               msg,
                               "node_{0}".format(NS.node_context.fqdn),
                               "INFO",
                               node_id=NS.node_context.node_id)
        while not self._complete.is_set():
            _sync_ttl = int(NS.config.data.get("sync_interval", 10)) + 100
            if _sleep > 5:
                _sleep = int(NS.config.data.get("sync_interval", 10))
            else:
                _sleep += 1

            NS.node_context = NS.node_context.load()
            NS.node_context.sync_status = "in_progress"

            current_tags = list(NS.node_context.tags)
            current_tags += ["tendrl/node_%s" % NS.node_context.node_id]
            NS.node_context.tags = list(set(current_tags))
            NS.node_context.status = "UP"
            NS.node_context.save(ttl=_sync_ttl)
            NS.tendrl_context = NS.tendrl_context.load()

            sync_service_and_index_thread = threading.Thread(
                target=services_and_index_sync.sync, args=(_sync_ttl, ))
            sync_service_and_index_thread.daemon = True
            sync_service_and_index_thread.start()
            sync_service_and_index_thread.join()

            NS.node_context = NS.node_context.load()
            if "tendrl/monitor" in NS.node_context.tags:
                check_all_managed_node_status_thread = threading.Thread(
                    target=check_all_managed_nodes_status.run)
                check_all_managed_node_status_thread.daemon = True
                check_all_managed_node_status_thread.start()
                check_all_managed_node_status_thread.join()

                check_cluster_status_thread = threading.Thread(
                    target=check_cluster_status.run)
                check_cluster_status_thread.daemon = True
                check_cluster_status_thread.start()
                check_cluster_status_thread.join()

            if "tendrl/monitor" not in NS.node_context.tags:
                sync_cluster_contexts_thread = threading.Thread(
                    target=cluster_contexts_sync.sync, args=(_sync_ttl, ))
                sync_cluster_contexts_thread.daemon = True
                sync_cluster_contexts_thread.start()
                sync_cluster_contexts_thread.join()

            platform_detect_thread = threading.Thread(
                target=platform_detect.sync)
            platform_detect_thread.daemon = True
            platform_detect_thread.start()
            platform_detect_thread.join()

            if "tendrl/monitor" not in NS.node_context.tags:
                sds_detect_thread = threading.Thread(target=sds_detect.sync,
                                                     args=(_sleep, ))

                sds_detect_thread.daemon = True
                sds_detect_thread.start()
                sds_detect_thread.join()

            NS.tendrl_context = NS.tendrl_context.load()

            try:
                NS.tendrl.objects.Os().save()
                NS.tendrl.objects.Cpu().save()
                NS.tendrl.objects.Memory().save()
            except Exception as ex:
                Event(
                    ExceptionMessage(priority="error",
                                     publisher=NS.publisher_id,
                                     payload={
                                         "message":
                                         "node_sync "
                                         "os/cpu/memory sync failed: " +
                                         str(ex),
                                         "exception":
                                         ex
                                     }))
                NS.node_context = NS.node_context.load()
                NS.node_context.sync_status = "failed"
                NS.node_context.last_sync = str(time_utils.now())
                NS.node_context.status = "UP"
                NS.node_context.save(ttl=_sync_ttl)
                time.sleep(_sleep)

            sync_disks_thread = threading.Thread(target=disk_sync.sync)
            sync_disks_thread.daemon = True
            sync_disks_thread.start()
            sync_disks_thread.join()

            sync_networks_thread = threading.Thread(target=network_sync.sync)
            sync_networks_thread.daemon = True
            sync_networks_thread.start()
            sync_networks_thread.join()

            NS.node_context = NS.node_context.load()
            NS.node_context.sync_status = "done"
            NS.node_context.last_sync = str(time_utils.now())
            NS.node_context.status = "UP"
            NS.node_context.save(ttl=_sync_ttl)

            if "tendrl/monitor" not in NS.node_context.tags:
                sync_cluster_contexts_thread = threading.Thread(
                    target=cluster_contexts_sync.sync, args=(_sync_ttl, ))
                sync_cluster_contexts_thread.daemon = True
                sync_cluster_contexts_thread.start()
                sync_cluster_contexts_thread.join()
            # Initialize the node alert count if it does not exist yet
            if not NS.tendrl.objects.ClusterNodeAlertCounters().exists():
                update_cluster_node_alert_count()
            time.sleep(_sleep)
        logger.log("info", NS.publisher_id,
                   {"message": "%s complete" % self.__class__.__name__})
Exemple #26
0
    def save(self, update=True, ttl=None):
        self.render()
        if "Message" not in self.__class__.__name__:
            try:
                # Generate current in memory object hash
                self.hash = self._hash()
                _hash_key = "/{0}/hash".format(self.value)
                _stored_hash = None
                try:
                    _stored_hash = NS._int.client.read(_hash_key).value
                except (etcd.EtcdConnectionFailed, etcd.EtcdException) as ex:
                    if not isinstance(ex, etcd.EtcdKeyNotFound):
                        NS._int.reconnect()
                        _stored_hash = NS._int.client.read(_hash_key).value
                if self.hash == _stored_hash:
                    # No changes in stored object and current object,
                    # dont save current object to central store
                    if ttl:
                        etcd_utils.refresh(self.value, ttl)
                    return
            except TypeError:
                # no hash available for this object, fall through and save it as is
                pass

        if update:
            current_obj = self.load()
            for attr, val in vars(self).iteritems():
                if isinstance(val, (types.FunctionType,
                                    types.BuiltinFunctionType,
                                    types.MethodType, types.BuiltinMethodType,
                                    types.UnboundMethodType)) or \
                        attr.startswith("_") or attr in ['value', 'list']:
                    continue

                if val is None and hasattr(current_obj, attr):
                    # if self.attr is None, use attr value from central
                    # store (i.e. current_obj.attr)
                    if getattr(current_obj, attr):
                        setattr(self, attr, getattr(current_obj, attr))

        self.updated_at = str(time_utils.now())
        for item in self.render():
            '''
                Note: Log messages in this file are wrapped in
                try-except blocks so they still run when the
                node_agent has not been started and the namespaces
                are still being created.
            '''
            try:
                Event(
                    Message(priority="debug",
                            publisher=NS.publisher_id,
                            payload={
                                "message":
                                "Writing %s to %s" %
                                (item['key'], item['value'])
                            }))
            except KeyError:
                sys.stdout.write("Writing %s to %s" %
                                 (item['key'], item['value']))
            # serialize list/dict attr values to JSON strings per the definitions
            _type = self._defs.get("attrs", {}).get(item['name'],
                                                    {}).get("type")
            if _type:
                if _type.lower() in ['json', 'list']:
                    if item['value']:
                        try:
                            item['value'] = json.dumps(item['value'])
                        except (TypeError, ValueError) as ex:
                            _msg = "Error save() attr %s for object %s" % \
                                   (item['name'], self.__class__.__name__)
                            Event(
                                ExceptionMessage(priority="debug",
                                                 publisher=NS.publisher_id,
                                                 payload={
                                                     "message": _msg,
                                                     "exception": ex
                                                 }))
            try:
                NS._int.wclient.write(item['key'], item['value'], quorum=True)
            except (etcd.EtcdConnectionFailed, etcd.EtcdException):
                NS._int.wreconnect()
                NS._int.wclient.write(item['key'], item['value'], quorum=True)
        if ttl:
            etcd_utils.refresh(self.value, ttl)
Exemple #27
0
def test_now():
    date = time_utils.now()
    assert isinstance(date, datetime.datetime)
Exemple #28
0
def process_job(job):
    jid = job.key.split('/')[-1]
    job_status_key = "/queue/%s/status" % jid
    job_lock_key = "/queue/%s/locked_by" % jid
    NS.node_context = NS.node_context.load()
    # Check job not already locked by some agent
    try:
        _locked_by = etcd_utils.read(job_lock_key).value
        if _locked_by:
            return
    except etcd.EtcdKeyNotFound:
        pass

    # Check job not already "finished", or "processing"
    try:
        _status = etcd_utils.read(job_status_key).value
        if _status in ["finished", "processing"]:
            return
    except etcd.EtcdKeyNotFound:
        pass

    try:
        _job_timeout_key = "/queue/%s/timeout" % jid
        _timeout = None
        _timeout = etcd_utils.read(_job_timeout_key).value
        if _timeout:
            _timeout = _timeout.lower()
    except etcd.EtcdKeyNotFound:
        pass

    # tendrl-node-agent tagged as tendrl/monitor will ensure
    # >10 min old "new" jobs are timed out and marked as
    # "failed" (the parent job of these jobs will also be
    # marked as "failed")
    if "tendrl/monitor" in NS.node_context.tags and \
        _timeout == "yes":
        _job_valid_until_key = "/queue/%s/valid_until" % jid
        _valid_until = None
        try:
            _valid_until = etcd_utils.read(
                _job_valid_until_key).value
        except etcd.EtcdKeyNotFound:
            pass

        if _valid_until:
            _now_epoch = (time_utils.now() -
                          datetime.datetime(1970, 1,
                                            1).replace(
                              tzinfo=utc)).total_seconds()
            if int(_now_epoch) >= int(_valid_until):
                # Job has "new" status since 10 minutes,
                # mark status as "failed" and Job.error =
                # "Timed out"
                try:
                    etcd_utils.write(job_status_key,
                                     "failed",
                                     prevValue="new")
                except etcd.EtcdCompareFailed:
                    pass
                else:
                    job = NS.tendrl.objects.Job(job_id=jid).load()
                    _msg = str("Timed-out (>10min as 'new')")
                    job.errors = _msg
                    job.save()
                    if job.payload.get('parent') is None:
                        alert_utils.alert_job_status(
                            "failed",
                            "Job timed out (job_id: %s)" % jid,
                            integration_id=NS.tendrl_context.integration_id or
                            job.payload['parameters'].get(
                                'TendrlContext.integration_id'
                            ),
                            cluster_name=NS.tendrl_context.cluster_name or
                            job.payload['parameters'].get(
                                'TendrlContext.cluster_name'
                            )
                        )
                    return
        else:
            _now_plus_10 = time_utils.now() + datetime.timedelta(minutes=10)
            _epoch_start = datetime.datetime(1970, 1, 1).replace(tzinfo=utc)

            # noinspection PyTypeChecker
            _now_plus_10_epoch = (_now_plus_10 -
                                  _epoch_start).total_seconds()
            etcd_utils.write(_job_valid_until_key,
                             int(_now_plus_10_epoch))

    job = NS.tendrl.objects.Job(job_id=jid).load()
    if job.payload["type"] == NS.type and \
            job.status == "new":
        # Job routing
        # Flows created by tendrl-api use 'tags' from flow
        # definition to target jobs
        _tag_match = False
        if job.payload.get("tags", []):
            for flow_tag in job.payload['tags']:
                if flow_tag in NS.node_context.tags:
                    _tag_match = True

        if not _tag_match:
            _job_tags = ", ".join(job.payload.get("tags", []))
            _msg = "Node (%s)(type: %s)(tags: %s) will not " \
                   "process job-%s (tags: %s)" % \
                   (NS.node_context.node_id, NS.type,
                    NS.node_context.tags, jid,
                    _job_tags)
            logger.log(
                "info",
                NS.publisher_id,
                {"message": _msg}
            )
            return

        job_status_key = "/queue/%s/status" % job.job_id
        job_lock_key = "/queue/%s/locked_by" % job.job_id
        try:
            lock_info = dict(node_id=NS.node_context.node_id,
                             fqdn=NS.node_context.fqdn,
                             tags=NS.node_context.tags,
                             type=NS.type)
            etcd_utils.write(job_status_key, "processing",
                             prevValue="new")
            etcd_utils.write(job_lock_key,
                             json.dumps(lock_info))
        except etcd.EtcdCompareFailed:
            # job is already being processed by some tendrl
            # agent
            return

        the_flow = None
        try:
            current_ns, flow_name, obj_name = \
                _extract_fqdn(job.payload['run'])

            if obj_name:
                runnable_flow = current_ns.ns.get_obj_flow(
                    obj_name, flow_name)
            else:
                runnable_flow = current_ns.ns.get_flow(flow_name)

            the_flow = runnable_flow(parameters=job.payload[
                'parameters'], job_id=job.job_id)
            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Processing Job %s" %
                            job.job_id},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id']
            )

            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Running Flow %s" %
                            job.payload['run']},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id']
            )
            the_flow.run()
            try:
                etcd_utils.write(job_status_key,
                                 "finished",
                                 prevValue="processing")
            except etcd.EtcdCompareFailed:
                # This should not happen!
                _msg = "Cannot mark job as 'finished', " \
                       "current job status invalid"
                raise FlowExecutionFailedError(_msg)

            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Job (%s):  Finished "
                            "Flow %s" % (
                                job.job_id,
                                job.payload['run'])},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id'],
            )
            if job.payload.get('parent') is None:
                alert_utils.alert_job_status(
                    "finished",
                    "Job finished successfully (job_id: %s)" % job.job_id,
                    integration_id=NS.tendrl_context.integration_id or
                    job.payload['parameters'].get(
                        'TendrlContext.integration_id'
                    ),
                    cluster_name=NS.tendrl_context.cluster_name or
                    job.payload['parameters'].get(
                        'TendrlContext.cluster_name'
                    )
                )
        except (FlowExecutionFailedError,
                AtomExecutionFailedError,
                Exception) as e:
            _trace = traceback.format_exc()
            _msg = "Failure in Job %s Flow %s with error:" % \
                   (job.job_id, job.payload['run'])
            Event(
                ExceptionMessage(
                    priority="error",
                    publisher=NS.publisher_id,
                    payload={"message": _msg + _trace,
                             "exception": e
                             }
                )
            )
            if the_flow:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": _msg + "\n" + _trace},
                    job_id=job.job_id,
                    flow_id=the_flow.parameters['flow_id']
                )
            else:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": _msg + "\n" + _trace}
                )

            try:
                etcd_utils.write(job_status_key,
                                 "failed",
                                 prevValue="processing")
            except etcd.EtcdCompareFailed:
                # This should not happen!
                _msg = "Cannot mark job as 'failed', current" \
                       "job status invalid"
                raise FlowExecutionFailedError(_msg)
            else:
                job = job.load()
                job.errors = _trace
                if job.payload.get('parent') is None:
                    alert_utils.alert_job_status(
                        "failed",
                        "Job failed (job_id: %s)" % job.job_id,
                        integration_id=NS.tendrl_context.integration_id or
                        job.payload['parameters'].get(
                            'TendrlContext.integration_id'
                        ),
                        cluster_name=NS.tendrl_context.cluster_name or
                        job.payload['parameters'].get(
                            'TendrlContext.cluster_name'
                        )
                    )
                job.save()
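The timeout handling in process_job is plain epoch arithmetic: on first sight of a "new" job it stamps valid_until = now + 10 minutes next to the job, and later passes mark the job failed once now >= valid_until. A standalone sketch of just that arithmetic, using naive UTC datetimes for brevity where the original uses tz-aware ones; the function names are hypothetical:

import datetime

EPOCH = datetime.datetime(1970, 1, 1)


def to_epoch(dt):
    # seconds since the Unix epoch for a naive UTC datetime
    return (dt - EPOCH).total_seconds()


def deadline_epoch(now, minutes=10):
    # stamp written next to the job the first time it is seen as "new"
    return int(to_epoch(now + datetime.timedelta(minutes=minutes)))


def timed_out(now, deadline):
    # later passes compare the current epoch against the stored stamp
    return int(to_epoch(now)) >= int(deadline)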
Exemple #29
0
def test_now():
    date = time_utils.now()
    assert isinstance(date, datetime.datetime)