def test_counter_agg(self):
        print "*** Running test_counter_agg ***"

        uvevn = MakeUVEVirtualNetwork(None,
                                      "abc-corp:vn-00",
                                      "previous",
                                      in_tpkts=4)

        uvevn2 = MakeUVEVirtualNetwork(uvevn,
                                       "abc-corp:vn-00",
                                       "10.10.10.11",
                                       in_tpkts=7)

        uvevn3 = UVEServer.merge_previous(
            uvevn2, "abc-corp:vn-00", "UVEVirtualNetwork", "in_tpkts",
            uvevn["abc-corp:vn-00"]['UVEVirtualNetwork']['in_tpkts']
            ["previous"])

        pa = ParallelAggregator(uvevn3)
        res = pa.aggregate("abc-corp:vn-00", False)
        print json.dumps(res, indent=4, sort_keys=True)

        uvetest = MakeUVEVirtualNetwork(None,
                                        "abc-corp:vn-00",
                                        "sample",
                                        in_tpkts=15)
        in_tpkts = uvetest["abc-corp:vn-00"]['UVEVirtualNetwork']['in_tpkts'][
            "sample"]

        self.assertEqual(in_tpkts, res['UVEVirtualNetwork']['in_tpkts'])
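
    # Illustrative sketch (not part of the original test): the subscript
    # accesses above suggest MakeUVEVirtualNetwork returns a nested dict keyed
    # by UVE name, struct name, attribute name and source ("previous", a
    # collector IP such as "10.10.10.11", or "sample"), roughly:
    #
    #   {"abc-corp:vn-00": {"UVEVirtualNetwork": {"in_tpkts": {"previous": ...}}}}
    #
    # The exact payload stored under each source may carry extra aggregation
    # metadata. UVEServer.merge_previous() folds the saved "previous" sample
    # into the newer UVE before ParallelAggregator.aggregate() combines the
    # per-source values into the expected result.
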
    def test_append_agg(self):
        print "*** Running test_append_agg ***"

        uvevn = MakeUVEVirtualNetwork(
            None,
            "abc-corp:vn-00",
            "previous",
            in_stats=[("vn-01", "1000"), ("vn-02", "1800")],
        )

        uvevn2 = MakeUVEVirtualNetwork(
            uvevn,
            "abc-corp:vn-00",
            "10.10.10.11",
            in_stats=[("vn-02", "1200"), ("vn-03", "1500")],
        )

        uveprev = MakeUVEVirtualNetwork(
            None,
            "abc-corp:vn-00",
            "10.10.10.10",
            in_stats=[("vn-01", "1000"), ("vn-03", "1700")],
        )

        uvevn3 = UVEServer.merge_previous(
            uvevn2, "abc-corp:vn-00", "UVEVirtualNetwork", "in_stats",
            uveprev["abc-corp:vn-00"]['UVEVirtualNetwork']['in_stats']
            ["10.10.10.10"])

        pa = ParallelAggregator(uvevn3)
        res = pa.aggregate("abc-corp:vn-00", False)
        print json.dumps(res, indent=4, sort_keys=True)

        res['UVEVirtualNetwork']['in_stats']["list"]["VnStats"]= \
            sorted(res['UVEVirtualNetwork']['in_stats']["list"]["VnStats"])

        uvetest = MakeUVEVirtualNetwork(
            None,
            "abc-corp:vn-00",
            "sample",
            in_stats=[("vn-01", "2000"), ("vn-02", "3000"), ("vn-03", "3200")],
        )

        uvetest["abc-corp:vn-00"]["UVEVirtualNetwork"]["in_stats"]["sample"]["list"]["VnStats"] = \
            sorted(uvetest["abc-corp:vn-00"]["UVEVirtualNetwork"]["in_stats"]["sample"]["list"]["VnStats"])

        in_stats = uvetest["abc-corp:vn-00"]["UVEVirtualNetwork"]["in_stats"][
            "sample"]
        self.assertEqual(in_stats, res['UVEVirtualNetwork']['in_stats'])
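
    # Illustrative sketch (inferred from the accesses above, not guaranteed):
    # list attributes such as in_stats appear to wrap their elements per
    # source as {"list": {"VnStats": [...]}}, which is why both the aggregated
    # result and the expected value are sorted on [...]["list"]["VnStats"]
    # before comparison; the aggregation is expected to merge the per-source
    # VnStats entries into a single list.
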
Example 5
    def __init__(self, conf):
        self._conf = conf
        module = Module.ALARM_GENERATOR
        self._moduleid = ModuleNames[module]
        node_type = Module2NodeType[module]
        self._node_type_name = NodeTypeNames[node_type]
        self._hostname = socket.gethostname()
        self._instance_id = self._conf.worker_id()
        sandesh_global.init_generator(self._moduleid, self._hostname,
                                      self._node_type_name, self._instance_id,
                                      self._conf.collectors(), 
                                      self._node_type_name,
                                      self._conf.http_port(),
                                      ['opserver.sandesh', 'sandesh'],
                                      host_ip=self._conf.host_ip())
        sandesh_global.set_logging_params(
            enable_local_log=self._conf.log_local(),
            category=self._conf.log_category(),
            level=self._conf.log_level(),
            file=self._conf.log_file(),
            enable_syslog=self._conf.use_syslog(),
            syslog_facility=self._conf.syslog_facility())
        self._logger = sandesh_global._logger

        # Trace buffer list
        self.trace_buf = [
            {'name':'DiscoveryMsg', 'size':1000}
        ]
        # Create trace buffers 
        for buf in self.trace_buf:
            sandesh_global.trace_buffer_create(name=buf['name'], size=buf['size'])

        tables = [ "ObjectCollectorInfo",
                   "ObjectDatabaseInfo",
                   "ObjectVRouter",
                   "ObjectBgpRouter",
                   "ObjectConfigNode" ] 
        self.mgrs = {}
        self.tab_alarms = {}
        for table in tables:
            self.mgrs[table] = hook.HookManager(
                namespace='contrail.analytics.alarms',
                name=table,
                invoke_on_load=True,
                invoke_args=(),
                on_load_failure_callback=Controller.fail_cb
            )
            
            for extn in self.mgrs[table][table]:
                self._logger.info('Loaded extensions for %s: %s,%s doc %s' % \
                    (table, extn.name, extn.entry_point_target, extn.obj.__doc__))

            self.tab_alarms[table] = {}

        ConnectionState.init(sandesh_global, self._hostname, self._moduleid,
            self._instance_id,
            staticmethod(ConnectionState.get_process_state_cb),
            NodeStatusUVE, NodeStatus)

        self._us = UVEServer(None, self._logger, self._conf.redis_password())

        self._workers = {}

        self.disc = None
        self._libpart_name = self._hostname + ":" + self._instance_id
        self._libpart = None
        self._partset = set()
        if self._conf.discovery()['server']:
            import discoveryclient.client as client 
            data = {
                'ip-address': self._hostname ,
                'port': self._instance_id
            }
            self.disc = client.DiscoveryClient(
                self._conf.discovery()['server'],
                self._conf.discovery()['port'],
                ModuleNames[Module.ALARM_GENERATOR])
            self._logger.info("Disc Publish to %s : %s"
                          % (str(self._conf.discovery()), str(data)))
            self.disc.publish(ALARM_GENERATOR_SERVICE_NAME, data)
        else:
            # If there is no discovery service, use fixed redis_uve list
            redis_uve_list = []
            try:
                for redis_uve in self._conf.redis_uve_list():
                    redis_ip_port = redis_uve.split(':')
                    redis_ip_port = (redis_ip_port[0], int(redis_ip_port[1]))
                    redis_uve_list.append(redis_ip_port)
            except Exception as e:
                self._logger.error('Failed to parse redis_uve_list: %s' % e)
            else:
                self._us.update_redis_uve_list(redis_uve_list)

            # If there is no discovery service, use fixed alarmgen list
            self._libpart = self.start_libpart(self._conf.alarmgen_list())

        PartitionOwnershipReq.handle_request = self.handle_PartitionOwnershipReq
        PartitionStatusReq.handle_request = self.handle_PartitionStatusReq
        UVETableAlarmReq.handle_request = self.handle_UVETableAlarmReq 
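
# Illustrative sketch (not from the original source): the HookManager above
# loads alarm plugins registered under the 'contrail.analytics.alarms'
# entry-point namespace, one entry point per UVE table. Judging by the
# map_method("__call__", uv, uve_data) calls in the fuller Controller example
# below, a plugin is assumed to be a callable returning an alarm name, a
# severity and a list of (rule, value) pairs. The class, struct name and
# registration below are hypothetical.

class UveDataPresentAlarm(object):
    """Example alarm: raises whenever the watched struct is present."""
    def __call__(self, uve_key, uve_data):
        errs = []
        if "VrouterStatsAgg" in uve_data:    # hypothetical struct name
            errs.append(("VrouterStatsAgg present", uve_key))
        return ("UveDataPresentAlarm", 3, errs)

# A matching (hypothetical) setup.py registration would look like:
#     entry_points={'contrail.analytics.alarms': [
#         'ObjectVRouter = mypkg.alarms:UveDataPresentAlarm']}
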
Example 6
class Controller(object):
    
    @staticmethod
    def fail_cb(manager, entrypoint, exception):
        sandesh_global._logger.info("Load failed for %s with exception %s" % \
                                     (str(entrypoint),str(exception)))
        
    def __init__(self, conf):
        self._conf = conf
        module = Module.ALARM_GENERATOR
        self._moduleid = ModuleNames[module]
        node_type = Module2NodeType[module]
        self._node_type_name = NodeTypeNames[node_type]
        self._hostname = socket.gethostname()
        self._instance_id = self._conf.worker_id()
        sandesh_global.init_generator(self._moduleid, self._hostname,
                                      self._node_type_name, self._instance_id,
                                      self._conf.collectors(), 
                                      self._node_type_name,
                                      self._conf.http_port(),
                                      ['opserver.sandesh', 'sandesh'],
                                      host_ip=self._conf.host_ip())
        sandesh_global.set_logging_params(
            enable_local_log=self._conf.log_local(),
            category=self._conf.log_category(),
            level=self._conf.log_level(),
            file=self._conf.log_file(),
            enable_syslog=self._conf.use_syslog(),
            syslog_facility=self._conf.syslog_facility())
        self._logger = sandesh_global._logger

        # Trace buffer list
        self.trace_buf = [
            {'name':'DiscoveryMsg', 'size':1000}
        ]
        # Create trace buffers 
        for buf in self.trace_buf:
            sandesh_global.trace_buffer_create(name=buf['name'], size=buf['size'])

        tables = [ "ObjectCollectorInfo",
                   "ObjectDatabaseInfo",
                   "ObjectVRouter",
                   "ObjectBgpRouter",
                   "ObjectConfigNode" ] 
        self.mgrs = {}
        self.tab_alarms = {}
        for table in tables:
            self.mgrs[table] = hook.HookManager(
                namespace='contrail.analytics.alarms',
                name=table,
                invoke_on_load=True,
                invoke_args=(),
                on_load_failure_callback=Controller.fail_cb
            )
            
            for extn in self.mgrs[table][table]:
                self._logger.info('Loaded extensions for %s: %s,%s doc %s' % \
                    (table, extn.name, extn.entry_point_target, extn.obj.__doc__))

            self.tab_alarms[table] = {}

        ConnectionState.init(sandesh_global, self._hostname, self._moduleid,
            self._instance_id,
            staticmethod(ConnectionState.get_process_state_cb),
            NodeStatusUVE, NodeStatus)

        self._us = UVEServer(None, self._logger, self._conf.redis_password())

        self._workers = {}

        self.disc = None
        self._libpart_name = self._hostname + ":" + self._instance_id
        self._libpart = None
        self._partset = set()
        if self._conf.discovery()['server']:
            import discoveryclient.client as client 
            data = {
                'ip-address': self._hostname ,
                'port': self._instance_id
            }
            self.disc = client.DiscoveryClient(
                self._conf.discovery()['server'],
                self._conf.discovery()['port'],
                ModuleNames[Module.ALARM_GENERATOR])
            self._logger.info("Disc Publish to %s : %s"
                          % (str(self._conf.discovery()), str(data)))
            self.disc.publish(ALARM_GENERATOR_SERVICE_NAME, data)
        else:
            # If there is no discovery service, use fixed redis_uve list
            redis_uve_list = []
            try:
                for redis_uve in self._conf.redis_uve_list():
                    redis_ip_port = redis_uve.split(':')
                    redis_ip_port = (redis_ip_port[0], int(redis_ip_port[1]))
                    redis_uve_list.append(redis_ip_port)
            except Exception as e:
                self._logger.error('Failed to parse redis_uve_list: %s' % e)
            else:
                self._us.update_redis_uve_list(redis_uve_list)

            # If there is no discovery service, use fixed alarmgen list
            self._libpart = self.start_libpart(self._conf.alarmgen_list())

        PartitionOwnershipReq.handle_request = self.handle_PartitionOwnershipReq
        PartitionStatusReq.handle_request = self.handle_PartitionStatusReq
        UVETableAlarmReq.handle_request = self.handle_UVETableAlarmReq 

    def libpart_cb(self, part_list):

        agpi = AlarmgenPartionInfo()
        agpi.instance = self._instance_id
        agpi.partitions = part_list

        agp = AlarmgenPartition()
        agp.name = self._hostname
        agp.inst_parts = [agpi]
       
        agp_trace = AlarmgenPartitionTrace(data=agp)
        agp_trace.send() 

        newset = set(part_list)
        oldset = self._partset
        self._partset = newset

        self._logger.error('Partition List : new %s old %s' % \
            (str(newset),str(oldset)))
        
        for addpart in (newset-oldset):
            self._logger.error('Partition Add : %s' % addpart)
            self.partition_change(addpart, True)
        
        for delpart in (oldset-newset):
            self._logger.error('Partition Del : %s' % delpart)
            self.partition_change(delpart, False)

    def start_libpart(self, ag_list):
        if not self._conf.zk_list():
            self._logger.error('Could not import libpartition: No zookeeper')
            return None
        if not ag_list:
            self._logger.error('Could not import libpartition: No alarmgen list')
            return None
        try:
            from libpartition.libpartition import PartitionClient
            self._logger.error('Starting PC')
            agpi = AlarmgenPartionInfo()
            agpi.instance = self._instance_id
            agpi.partitions = []

            agp = AlarmgenPartition()
            agp.name = self._hostname
            agp.inst_parts = [agpi]
           
            agp_trace = AlarmgenPartitionTrace(data=agp)
            agp_trace.send() 

            pc = PartitionClient("alarmgen",
                    self._libpart_name, ag_list,
                    self._conf.partitions(), self.libpart_cb,
                    ','.join(self._conf.zk_list()))
            self._logger.error('Started PC')
            return pc
        except Exception as e:
            self._logger.error('Could not import libpartition: %s' % str(e))
            return None

    def handle_uve_notif(self, uves, remove = False):
        self._logger.debug("Changed UVEs : %s" % str(uves))
        no_handlers = set()
        for uv in uves:
            tab = uv.split(':',1)[0]
            uve_name = uv.split(':',1)[1]
            if not self.mgrs.has_key(tab):
                no_handlers.add(tab)
                continue
            if remove:
                uve_data = []
            else:
                filters = {'kfilt': [uve_name]}
                itr = self._us.multi_uve_get(tab, True, filters)
                uve_data = itr.next()['value']
            if len(uve_data) == 0:
                self._logger.info("UVE %s deleted" % uv)
                if self.tab_alarms[tab].has_key(uv):
                    del self.tab_alarms[tab][uv]
                    ustruct = UVEAlarms(name = uve_name, deleted = True)
                    alarm_msg = AlarmTrace(data=ustruct, table=tab)
                    self._logger.info('send del alarm: %s' % (alarm_msg.log()))
                    alarm_msg.send()
                continue
            results = self.mgrs[tab].map_method("__call__", uv, uve_data)
            new_uve_alarms = {}
            for res in results:
                nm, sev, errs = res
                self._logger.debug("Alarm[%s] %s: %s" % (tab, nm, str(errs)))
                elems = []
                for ae in errs:
                    rule, val = ae
                    rv = AlarmElement(rule, val)
                    elems.append(rv)
                if len(elems):
                    new_uve_alarms[nm] = UVEAlarmInfo(type = nm, severity = sev,
                                           timestamp = 0,
                                           description = elems, ack = False)
            del_types = []
            if self.tab_alarms[tab].has_key(uv):
                for nm, uai in self.tab_alarms[tab][uv].iteritems():
                    uai2 = copy.deepcopy(uai)
                    uai2.timestamp = 0
                    # This type was present earlier, but is now gone
                    if not new_uve_alarms.has_key(nm):
                        del_types.append(nm)
                    else:
                        # This type has no new information
                        if pprint.pformat(uai2) == \
                                pprint.pformat(new_uve_alarms[nm]):
                            del new_uve_alarms[nm]
            if len(del_types) != 0  or \
                    len(new_uve_alarms) != 0:
                self._logger.debug("Alarm[%s] Deleted %s" % \
                        (tab, str(del_types))) 
                self._logger.debug("Alarm[%s] Updated %s" % \
                        (tab, str(new_uve_alarms))) 
                # These alarm types are new or updated
                for nm, uai2 in new_uve_alarms.iteritems():
                    uai = copy.deepcopy(uai2)
                    uai.timestamp = UTCTimestampUsec()
                    if not self.tab_alarms[tab].has_key(uv):
                        self.tab_alarms[tab][uv] = {}
                    self.tab_alarms[tab][uv][nm] = uai
                # These alarm types are now gone
                for dnm in del_types:
                    del self.tab_alarms[tab][uv][dnm]
                    
                ustruct = None
                if len(self.tab_alarms[tab][uv]) == 0:
                    ustruct = UVEAlarms(name = uve_name,
                            deleted = True)
                    del self.tab_alarms[tab][uv]
                else:
                    ustruct = UVEAlarms(name = uve_name,
                            alarms = self.tab_alarms[tab][uv].values(),
                            deleted = False)
                alarm_msg = AlarmTrace(data=ustruct, table=tab)
                self._logger.info('send alarm: %s' % (alarm_msg.log()))
                alarm_msg.send()
            
        if len(no_handlers):
            self._logger.debug('No Alarm Handlers for %s' % str(no_handlers))

    def handle_UVETableAlarmReq(self, req):
        status = False
        if req.table == "all":
            parts = self.tab_alarms.keys()
        else:
            parts = [req.table]
        self._logger.info("Got UVETableAlarmReq : %s" % str(parts))
        np = 1
        for pt in parts:
            resp = UVETableAlarmResp(table = pt)
            uves = []
            for uk,uv in self.tab_alarms[pt].iteritems():
                alms = []
                for ak,av in uv.iteritems():
                    alms.append(av)
                uves.append(UVEAlarms(name = uk, alarms = alms))
            resp.uves = uves 
            if np == len(parts):
                mr = False
            else:
                mr = True
            resp.response(req.context(), mr)
            np = np + 1

    
    def partition_change(self, partno, enl):
        """
        Call this function when getting or giving up
        ownership of a partition
        Args:
            partno : Partition Number
            enl    : True for acquiring, False for giving up
        Returns: 
            status of operation (True for success)
        """
        status = False
        if enl:
            if self._workers.has_key(partno):
                self._logger.info("Dup partition %d" % partno)
            else:
                #uvedb = self._us.get_part(partno)
                ph = UveStreamProc(','.join(self._conf.kafka_broker_list()),
                                   partno, "uve-" + str(partno),
                                   self._logger, self._us.get_part,
                                   self.handle_uve_notif)
                ph.start()
                self._workers[partno] = ph
                status = True
        else:
            if self._workers.has_key(partno):
                ph = self._workers[partno]
                gevent.kill(ph)
                res,db = ph.get()
                print "Returned " + str(res)
                print "State :"
                for k,v in db.iteritems():
                    print "%s -> %s" % (k,str(v)) 
                del self._workers[partno]
                status = True
            else:
                self._logger.info("No partition %d" % partno)

        return status
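
    # Hypothetical usage sketch (not in the original source): acquiring and
    # later releasing partition 5 would look like
    #   self.partition_change(5, True)     # spawn a UveStreamProc worker
    #   self.partition_change(5, False)    # kill the worker and dump its state
    # with each call returning True on success, as the docstring above states.
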
    
    def handle_PartitionOwnershipReq(self, req):
        self._logger.info("Got PartitionOwnershipReq: %s" % str(req))
        status = self.partition_change(req.partition, req.ownership)

        resp = PartitionOwnershipResp()
        resp.status = status
        resp.response(req.context())
               
    def process_stats(self):
        ''' Go through the UVEKey-Count stats collected over
            the previous time period over all partitions
            and send them out
        '''
        s_partitions = set()
        s_keys = set()
        n_updates = 0
        for pk,pc in self._workers.iteritems():
            s_partitions.add(pk)
            din, dout = pc.stats()
            for ktab,tab in dout.iteritems():
                au_keys = []
                for uk,uc in tab.iteritems():
                    s_keys.add(uk)
                    n_updates += uc
                    ukc = UVEKeyInfo()
                    ukc.key = uk
                    ukc.count = uc
                    au_keys.append(ukc)
                au_obj = AlarmgenUpdate(name=sandesh_global._source + ':' + \
                        sandesh_global._node_type + ':' + \
                        sandesh_global._module + ':' + \
                        sandesh_global._instance_id,
                        partition = pk,
                        table = ktab,
                        keys = au_keys,
                        notifs = None)
                self._logger.debug('send key stats: %s' % (au_obj.log()))
                au_obj.send()

            for ktab,tab in din.iteritems():
                au_notifs = []
                for kcoll,coll in tab.iteritems():
                    for kgen,gen in coll.iteritems():
                        for tk,tc in gen.iteritems():
                            tkc = UVETypeInfo()
                            tkc.type= tk
                            tkc.count = tc
                            tkc.generator = kgen
                            tkc.collector = kcoll
                            au_notifs.append(tkc)
                au_obj = AlarmgenUpdate(name=sandesh_global._source + ':' + \
                        sandesh_global._node_type + ':' + \
                        sandesh_global._module + ':' + \
                        sandesh_global._instance_id,
                        partition = pk,
                        table = ktab,
                        keys = None,
                        notifs = au_notifs)
                self._logger.debug('send notif stats: %s' % (au_obj.log()))
                au_obj.send()

        au = AlarmgenStatus()
        au.name = self._hostname
        au.counters = []
        au.alarmgens = []
        ags = AlarmgenStats()
        ags.instance =  self._instance_id
        ags.partitions = len(s_partitions)
        ags.keys = len(s_keys)
        ags.updates = n_updates
        au.counters.append(ags)

        agname = sandesh_global._source + ':' + \
                        sandesh_global._node_type + ':' + \
                        sandesh_global._module + ':' + \
                        sandesh_global._instance_id
        au.alarmgens.append(agname)
 
        atrace = AlarmgenStatusTrace(data = au)
        self._logger.debug('send alarmgen status : %s' % (atrace.log()))
        atrace.send()
         
    def handle_PartitionStatusReq(self, req):
        ''' Return the entire contents of the UVE DB for the 
            requested partitions
        '''
        if req.partition == -1:
            parts = self._workers.keys()
        else:
            parts = [req.partition]
        
        self._logger.info("Got PartitionStatusReq: %s" % str(parts))
        np = 1
        for pt in parts:
            resp = PartitionStatusResp()
            resp.partition = pt
            if self._workers.has_key(pt):
                resp.enabled = True
                resp.uves = []
                for kcoll,coll in self._workers[pt].contents().iteritems():
                    uci = UVECollInfo()
                    uci.collector = kcoll
                    uci.uves = []
                    for kgen,gen in coll.iteritems():
                        ugi = UVEGenInfo()
                        ugi.generator = kgen
                        ugi.uves = []
                        for uk,uc in gen.iteritems():
                            ukc = UVEKeyInfo()
                            ukc.key = uk
                            ukc.count = uc
                            ugi.uves.append(ukc)
                        uci.uves.append(ugi)
                    resp.uves.append(uci)
            else:
                resp.enabled = False
            if np == len(parts):
                mr = False
            else:
                mr = True
            resp.response(req.context(), mr)
            np = np + 1

    def disc_cb_coll(self, clist):
        '''
        Analytics node may be brought up/down any time. For UVE aggregation,
        alarmgen needs to know the list of all Analytics nodes (redis-uves).
        Periodically poll the Collector list [in lieu of
        redis-uve nodes] from the discovery service.
        '''
        newlist = []
        for elem in clist:
            (ipaddr,port) = elem
            newlist.append((ipaddr, self._conf.redis_server_port()))
        self._us.update_redis_uve_list(newlist)
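
    # Illustrative sketch (addresses and port hypothetical): a discovered
    # collector list such as [("10.84.14.38", "8086"), ("10.84.14.39", "8086")]
    # is translated into [("10.84.14.38", <redis_server_port>),
    # ("10.84.14.39", <redis_server_port>)] before being handed to UVEServer,
    # i.e. the collector IPs are paired with the locally configured redis port.
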

    def disc_cb_ag(self, alist):
        '''
        Analytics node may be brought up/down any time. For partitioning,
        alarmgen needs to know the list of all Analytics nodes (alarmgens).
        Periodically poll the alarmgen list from the discovery service
        '''
        newlist = []
        for elem in alist:
            (ipaddr, inst) = elem
            newlist.append(ipaddr + ":" + inst)

        # We should always include ourselves in the list of members
        newset = set(newlist)
        newset.add(self._libpart_name)
        newlist = list(newset)
        if not self._libpart:
            self._libpart = self.start_libpart(newlist)
        else:
            self._libpart.update_cluster_list(newlist)
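
    # Illustrative sketch (values hypothetical): an alarmgen list such as
    # [("10.84.14.38", "0"), ("10.84.14.39", "0")] becomes
    # ["10.84.14.38:0", "10.84.14.39:0"]; our own libpart name is always added
    # to the set, and the result either seeds a new PartitionClient or updates
    # the membership of the existing one.
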

    def run(self):
        alarmgen_cpu_info = CpuInfoData()
        while True:
            before = time.time()
            mod_cpu_info = ModuleCpuInfo()
            mod_cpu_info.module_id = self._moduleid
            mod_cpu_info.instance_id = self._instance_id
            mod_cpu_info.cpu_info = alarmgen_cpu_info.get_cpu_info(
                system=False)
            mod_cpu_state = ModuleCpuState()
            mod_cpu_state.name = self._hostname

            mod_cpu_state.module_cpu_info = [mod_cpu_info]

            alarmgen_cpu_state_trace = ModuleCpuStateTrace(data=mod_cpu_state)
            alarmgen_cpu_state_trace.send()

            aly_cpu_state = AnalyticsCpuState()
            aly_cpu_state.name = self._hostname

            aly_cpu_info = ProcessCpuInfo()
            aly_cpu_info.module_id= self._moduleid
            aly_cpu_info.inst_id = self._instance_id
            aly_cpu_info.cpu_share = mod_cpu_info.cpu_info.cpu_share
            aly_cpu_info.mem_virt = mod_cpu_info.cpu_info.meminfo.virt
            aly_cpu_info.mem_res = mod_cpu_info.cpu_info.meminfo.res
            aly_cpu_state.cpu_info = [aly_cpu_info]

            aly_cpu_state_trace = AnalyticsCpuStateTrace(data=aly_cpu_state)
            aly_cpu_state_trace.send()

            # Send out the UVEKey-Count stats for this time period
            self.process_stats()

            duration = time.time() - before
            if duration < 60:
                gevent.sleep(60 - duration)
            else:
                self._logger.error("Periodic collection took %s sec" % duration)
Example 7
class Controller(object):
    @staticmethod
    def fail_cb(manager, entrypoint, exception):
        sandesh_global._logger.info("Load failed for %s with exception %s" % \
                                     (str(entrypoint),str(exception)))

    def __init__(self, conf):
        self._conf = conf
        module = Module.ALARM_GENERATOR
        self._moduleid = ModuleNames[module]
        node_type = Module2NodeType[module]
        self._node_type_name = NodeTypeNames[node_type]
        self._hostname = socket.gethostname()
        self._instance_id = '0'
        sandesh_global.init_generator(self._moduleid, self._hostname,
                                      self._node_type_name, self._instance_id,
                                      self._conf.collectors(),
                                      self._node_type_name,
                                      self._conf.http_port(),
                                      ['opserver.sandesh', 'sandesh'])
        sandesh_global.set_logging_params(
            enable_local_log=self._conf.log_local(),
            category=self._conf.log_category(),
            level=self._conf.log_level(),
            file=self._conf.log_file(),
            enable_syslog=self._conf.use_syslog(),
            syslog_facility=self._conf.syslog_facility())
        self._logger = sandesh_global._logger

        # Trace buffer list
        self.trace_buf = [{'name': 'DiscoveryMsg', 'size': 1000}]
        # Create trace buffers
        for buf in self.trace_buf:
            sandesh_global.trace_buffer_create(name=buf['name'],
                                               size=buf['size'])

        tables = [
            "ObjectCollectorInfo", "ObjectDatabaseInfo", "ObjectVRouter",
            "ObjectBgpRouter", "ObjectConfigNode"
        ]
        self.mgrs = {}
        self.tab_alarms = {}
        for table in tables:
            self.mgrs[table] = hook.HookManager(
                namespace='contrail.analytics.alarms',
                name=table,
                invoke_on_load=True,
                invoke_args=(),
                on_load_failure_callback=Controller.fail_cb)

            for extn in self.mgrs[table][table]:
                self._logger.info('Loaded extensions for %s: %s,%s' % \
                    (table, extn.name, extn.entry_point_target))

            self.tab_alarms[table] = {}

        ConnectionState.init(
            sandesh_global, self._hostname, self._moduleid, self._instance_id,
            staticmethod(ConnectionState.get_process_state_cb), NodeStatusUVE,
            NodeStatus)

        self._us = UVEServer(None, self._logger, self._conf.redis_password())

        self.disc = None
        if self._conf.discovery()['server']:
            import discoveryclient.client as client
            data = {'ip-address': self._hostname, 'port': self._instance_id}
            self.disc = client.DiscoveryClient(
                self._conf.discovery()['server'],
                self._conf.discovery()['port'],
                ModuleNames[Module.ALARM_GENERATOR])
            self._logger.info("Disc Publish to %s : %s" %
                              (str(self._conf.discovery()), str(data)))
            self.disc.publish(ALARM_GENERATOR_SERVICE_NAME, data)
        else:
            redis_uve_list = []
            try:
                for redis_uve in self._conf.redis_uve_list():
                    redis_ip_port = redis_uve.split(':')
                    redis_ip_port = (redis_ip_port[0], int(redis_ip_port[1]))
                    redis_uve_list.append(redis_ip_port)
            except Exception as e:
                self._logger.error('Failed to parse redis_uve_list: %s' % e)
            else:
                self._us.update_redis_uve_list(redis_uve_list)

        PartitionOwnershipReq.handle_request = self.handle_PartitionOwnershipReq
        PartitionStatusReq.handle_request = self.handle_PartitionStatusReq
        UVETableAlarmReq.handle_request = self.handle_UVETableAlarmReq

        self._workers = {}

    def handle_uve_notif(self, uves):
        self._logger.debug("Changed UVEs : %s" % str(uves))
        no_handlers = set()
        for uv in uves:
            tab = uv.split(':', 1)[0]
            if not self.mgrs.has_key(tab):
                no_handlers.add(tab)
                continue
            itr = self._us.multi_uve_get(uv, True, None, None, None, None)
            uve_data = itr.next()['value']
            if len(uve_data) == 0:
                del self.tab_alarms[tab][uv]
                self._logger.info("UVE %s deleted" % uv)
                continue
            results = self.mgrs[tab].map_method("__call__", uv, uve_data)
            new_uve_alarms = {}
            for res in results:
                nm, errs = res
                self._logger.info("Alarm[%s] %s: %s" % (tab, nm, str(errs)))
                elems = []
                for ae in errs:
                    rule, val = ae
                    rv = AlarmElement(rule, val)
                    elems.append(rv)
                if len(elems):
                    new_uve_alarms[nm] = UVEAlarmInfo(type=nm,
                                                      description=elems,
                                                      ack=False)
            self.tab_alarms[tab][uv] = new_uve_alarms

        if len(no_handlers):
            self._logger.info('No Alarm Handlers for %s' % str(no_handlers))

    def handle_UVETableAlarmReq(self, req):
        status = False
        if req.table == "all":
            parts = self.tab_alarms.keys()
        else:
            parts = [req.table]
        self._logger.info("Got UVETableAlarmReq : %s" % str(parts))
        np = 1
        for pt in parts:
            resp = UVETableAlarmResp(table=pt)
            uves = []
            for uk, uv in self.tab_alarms[pt].iteritems():
                alms = []
                for ak, av in uv.iteritems():
                    alms.append(av)
                uves.append(UVEAlarms(name=uk, alarms=alms))
            resp.uves = uves
            if np == len(parts):
                mr = False
            else:
                mr = True
            resp.response(req.context(), mr)
            np = np + 1

    def handle_PartitionOwnershipReq(self, req):
        self._logger.info("Got PartitionOwnershipReq: %s" % str(req))
        status = False
        if req.ownership:
            if self._workers.has_key(req.partition):
                self._logger.info("Dup partition %d" % req.partition)
            else:
                uvedb = self._us.get_part(req.partition)
                ph = UveStreamProc(','.join(self._conf.kafka_broker_list()),
                                   req.partition, "uve-" + str(req.partition),
                                   self._logger, uvedb, self.handle_uve_notif)
                ph.start()
                self._workers[req.partition] = ph
                status = True
        else:
            #import pdb; pdb.set_trace()
            if self._workers.has_key(req.partition):
                ph = self._workers[req.partition]
                gevent.kill(ph)
                res, db = ph.get()
                print "Returned " + str(res)
                print "State :"
                for k, v in db.iteritems():
                    print "%s -> %s" % (k, str(v))
                del self._workers[req.partition]
                status = True
            else:
                self._logger.info("No partition %d" % req.partition)

        resp = PartitionOwnershipResp()
        resp.status = status
        resp.response(req.context())

    def handle_PartitionStatusReq(self, req):

        if req.partition == -1:
            parts = self._workers.keys()
        else:
            parts = [req.partition]

        self._logger.info("Got PartitionStatusReq: %s" % str(parts))
        np = 1
        for pt in parts:
            resp = PartitionStatusResp()
            resp.partition = pt
            if self._workers.has_key(pt):
                resp.enabled = True
                resp.uves = []
                for kcoll, coll in self._workers[pt].contents().iteritems():
                    uci = UVECollInfo()
                    uci.collector = kcoll
                    uci.uves = []
                    for kgen, gen in coll.iteritems():
                        ugi = UVEGenInfo()
                        ugi.generator = kgen
                        ugi.uves = list(gen)
                        uci.uves.append(ugi)
                    resp.uves.append(uci)
            else:
                resp.enabled = False
            if np == len(parts):
                mr = False
            else:
                mr = True
            resp.response(req.context(), mr)
            np = np + 1

    def disc_cb_coll(self, clist):
        '''
        Analytics node may be brought up/down any time. For UVE aggregation,
        alarmgen needs to know the list of all Analytics nodes (redis-uves).
        Periodically poll the Collector list [in lieu of
        redis-uve nodes] from the discovery service.
        '''
        newlist = []
        for elem in clist:
            (ipaddr, port) = elem
            newlist.append((ipaddr, self._conf.redis_server_port()))
        self._us.update_redis_uve_list(newlist)

    def disc_cb_ag(self, alist):
        '''
        Analytics node may be brought up/down any time. For partitioning,
        alarmgen needs to know the list of all Analytics nodes (alarmgens).
        Periodically poll the alarmgen list from the discovery service
        '''
        # TODO : Hookup with partitioning library
        pass

    def run(self):
        while True:
            gevent.sleep(60)
Example 8
class Controller(object):
    @staticmethod
    def token(sandesh, timestamp):
        token = {
            'host_ip': sandesh.host_ip(),
            'http_port': sandesh._http_server.get_port(),
            'timestamp': timestamp
        }
        return base64.b64encode(json.dumps(token))
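
    # Illustrative note: the token is simply a base64-encoded JSON blob, so a
    # consumer could recover it with json.loads(base64.b64decode(tok)),
    # yielding something like (values hypothetical):
    #   {"host_ip": "10.84.14.38", "http_port": 5995, "timestamp": 1428000000000000}
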

    @staticmethod
    def alarm_encode(alarms):
        res = {}
        res["UVEAlarms"] = {}
        res["UVEAlarms"]["alarms"] = []
        for k, elem in alarms.iteritems():
            elem_dict = {}
            elem_dict["type"] = elem.type
            elem_dict["ack"] = elem.ack
            elem_dict["timestamp"] = elem.timestamp
            elem_dict["token"] = elem.token
            elem_dict["severity"] = elem.severity
            elem_dict["description"] = []
            for desc in elem.description:
                desc_dict = {}
                desc_dict["value"] = desc.value
                desc_dict["rule"] = desc.rule
                elem_dict["description"].append(desc_dict)
            res["UVEAlarms"]["alarms"].append(elem_dict)
        return res
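
    # Illustrative sketch of the returned shape (field values hypothetical):
    #   {"UVEAlarms": {"alarms": [
    #       {"type": "...", "ack": False, "timestamp": 0, "token": "...",
    #        "severity": 3,
    #        "description": [{"rule": "...", "value": "..."}]}]}}
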

    def fail_cb(self, manager, entrypoint, exception):
        self._sandesh._logger.info("Load failed for %s with exception %s" % \
                                     (str(entrypoint),str(exception)))

    def __init__(self, conf, test_logger=None):
        self._conf = conf
        module = Module.ALARM_GENERATOR
        self._moduleid = ModuleNames[module]
        node_type = Module2NodeType[module]
        self._node_type_name = NodeTypeNames[node_type]
        self._hostname = socket.gethostname()
        self._instance_id = self._conf.worker_id()
        is_collector = True
        if test_logger is not None:
            is_collector = False
        self._sandesh = Sandesh()
        self._sandesh.init_generator(self._moduleid,
                                     self._hostname,
                                     self._node_type_name,
                                     self._instance_id,
                                     self._conf.collectors(),
                                     self._node_type_name,
                                     self._conf.http_port(),
                                     ['opserver.sandesh', 'sandesh'],
                                     host_ip=self._conf.host_ip(),
                                     connect_to_collector=is_collector)
        if test_logger is not None:
            self._logger = test_logger
        else:
            self._sandesh.set_logging_params(
                enable_local_log=self._conf.log_local(),
                category=self._conf.log_category(),
                level=self._conf.log_level(),
                file=self._conf.log_file(),
                enable_syslog=self._conf.use_syslog(),
                syslog_facility=self._conf.syslog_facility())
            self._logger = self._sandesh._logger
        # Trace buffer list
        self.trace_buf = [{'name': 'DiscoveryMsg', 'size': 1000}]
        # Create trace buffers
        for buf in self.trace_buf:
            self._sandesh.trace_buffer_create(name=buf['name'],
                                              size=buf['size'])

        tables = [
            "ObjectCollectorInfo", "ObjectDatabaseInfo", "ObjectVRouter",
            "ObjectBgpRouter", "ObjectConfigNode"
        ]
        self.mgrs = {}
        self.tab_alarms = {}
        self.ptab_info = {}
        self.tab_perf = {}
        self.tab_perf_prev = {}
        for table in tables:
            self.mgrs[table] = hook.HookManager(
                namespace='contrail.analytics.alarms',
                name=table,
                invoke_on_load=True,
                invoke_args=(),
                on_load_failure_callback=self.fail_cb)

            for extn in self.mgrs[table][table]:
                self._logger.info('Loaded extensions for %s: %s,%s doc %s' % \
                    (table, extn.name, extn.entry_point_target, extn.obj.__doc__))

            self.tab_alarms[table] = {}
            self.tab_perf[table] = AGTabStats()

        ConnectionState.init(
            self._sandesh, self._hostname, self._moduleid, self._instance_id,
            staticmethod(ConnectionState.get_process_state_cb), NodeStatusUVE,
            NodeStatus)

        self._us = UVEServer(None, self._logger, self._conf.redis_password())

        self._workers = {}
        self._uveq = {}
        self._uveqf = {}

        self.disc = None
        self._libpart_name = self._hostname + ":" + self._instance_id
        self._libpart = None
        self._partset = set()
        if self._conf.discovery()['server']:
            data = {'ip-address': self._hostname, 'port': self._instance_id}
            self.disc = client.DiscoveryClient(
                self._conf.discovery()['server'],
                self._conf.discovery()['port'],
                ModuleNames[Module.ALARM_GENERATOR])
            self._logger.info("Disc Publish to %s : %s" %
                              (str(self._conf.discovery()), str(data)))
            self.disc.publish(ALARM_GENERATOR_SERVICE_NAME, data)
        else:
            # If there is no discovery service, use fixed redis_uve list
            redis_uve_list = []
            try:
                for redis_uve in self._conf.redis_uve_list():
                    redis_ip_port = redis_uve.split(':')
                    redis_elem = (redis_ip_port[0], int(redis_ip_port[1]), 0)
                    redis_uve_list.append(redis_elem)
            except Exception as e:
                self._logger.error('Failed to parse redis_uve_list: %s' % e)
            else:
                self._us.update_redis_uve_list(redis_uve_list)

            # If there is no discovery service, use fixed alarmgen list
            self._libpart = self.start_libpart(self._conf.alarmgen_list())

        PartitionOwnershipReq.handle_request = self.handle_PartitionOwnershipReq
        PartitionStatusReq.handle_request = self.handle_PartitionStatusReq
        UVETableAlarmReq.handle_request = self.handle_UVETableAlarmReq
        UVETableInfoReq.handle_request = self.handle_UVETableInfoReq
        UVETablePerfReq.handle_request = self.handle_UVETablePerfReq

    def libpart_cb(self, part_list):

        agpi = AlarmgenPartionInfo()
        agpi.instance = self._instance_id
        agpi.partitions = part_list

        agp = AlarmgenPartition()
        agp.name = self._hostname
        agp.inst_parts = [agpi]

        agp_trace = AlarmgenPartitionTrace(data=agp, sandesh=self._sandesh)
        agp_trace.send(sandesh=self._sandesh)

        newset = set(part_list)
        oldset = self._partset
        self._partset = newset

        self._logger.error('Partition List : new %s old %s' % \
            (str(newset),str(oldset)))

        for addpart in (newset - oldset):
            self._logger.error('Partition Add : %s' % addpart)
            self.partition_change(addpart, True)

        for delpart in (oldset - newset):
            self._logger.error('Partition Del : %s' % delpart)
            self.partition_change(delpart, False)

        self._logger.error('Partition List done : new %s old %s' % \
            (str(newset),str(oldset)))

    def start_libpart(self, ag_list):
        if not self._conf.zk_list():
            self._logger.error('Could not import libpartition: No zookeeper')
            return None
        if not ag_list:
            self._logger.error(
                'Could not import libpartition: No alarmgen list')
            return None
        try:
            self._logger.error('Starting PC')
            agpi = AlarmgenPartionInfo()
            agpi.instance = self._instance_id
            agpi.partitions = []

            agp = AlarmgenPartition()
            agp.name = self._hostname
            agp.inst_parts = [agpi]

            agp_trace = AlarmgenPartitionTrace(data=agp, sandesh=self._sandesh)
            agp_trace.send(sandesh=self._sandesh)

            pc = PartitionClient("alarmgen", self._libpart_name, ag_list,
                                 self._conf.partitions(), self.libpart_cb,
                                 ','.join(self._conf.zk_list()))
            self._logger.error('Started PC')
            return pc
        except Exception as e:
            self._logger.error('Could not import libpartition: %s' % str(e))
            return None

    def handle_uve_notifq(self, part, uves):
        """
        uves : 
          This is a dict of UVEs that have changed, as per the following scheme:
          <UVE-Key> : None               # Any of the types may have changed
                                         # Used during stop_partition and GenDelete
          <UVE-Key> : { <Struct>: {} }   # The given struct may have changed
          <UVE-Key> : { <Struct>: None } # The given struct may have gone
          Our treatment of the 2nd and 3rd case above is the same
        """
        if part not in self._uveq:
            self._uveq[part] = {}
            self._logger.error('Created uveQ for part %s' % str(part))
        for uv, types in uves.iteritems():
            if types is None:
                self._uveq[part][uv] = None
            else:
                if uv in self._uveq[part]:
                    if self._uveq[part][uv] is not None:
                        for kk in types.keys():
                            self._uveq[part][uv][kk] = {}
                else:
                    self._uveq[part][uv] = {}
                    for kk in types.keys():
                        self._uveq[part][uv][kk] = {}
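
    # Illustrative sketch (UVE names hypothetical) of the three forms the
    # docstring above describes:
    #   {"ObjectVRouter:host-a": None,                   # any type may have changed
    #    "ObjectVRouter:host-b": {"VrouterAgent": {}},   # this struct may have changed
    #    "ObjectVRouter:host-c": {"VrouterAgent": None}} # this struct may have gone
    # Because _uveq[part] is keyed by UVE name, repeated notifications for the
    # same key collapse into a single pending entry rather than piling up.
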

    def handle_resource_check(self, part, current_inst, msgs):
        """
        This function compares the set of synced redis instances
        against the set now being reported by UVEServer
       
        It returns:
        - The updated set of redis instances
        - A set of collectors to be removed
        - A dict with the collectors to be added, along with their contents
        """
        us_redis_inst = self._us.redis_instances()
        disc_instances = copy.deepcopy(us_redis_inst)

        r_added = disc_instances - current_inst
        r_deleted = current_inst - disc_instances

        coll_delete = set()
        for r_inst in r_deleted:
            ipaddr = r_inst[0]
            port = r_inst[1]
            coll_delete.add(ipaddr + ":" + str(port))

        chg_res = {}
        for r_inst in r_added:
            coll, res = self._us.get_part(part, r_inst)
            chg_res[coll] = res

        return disc_instances, coll_delete, chg_res
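
    # Illustrative sketch (addresses hypothetical): if current_inst were
    # {("10.0.0.1", 6379)} and UVEServer now reports
    # {("10.0.0.1", 6379), ("10.0.0.2", 6379)}, this returns the new instance
    # set, an empty coll_delete set, and chg_res keyed by the collector of the
    # newly added instance with the partition contents fetched via
    # self._us.get_part(part, ("10.0.0.2", 6379)).
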

    @staticmethod
    def send_agg_uve(redish, inst, part, acq_time, rows):
        """ 
        This function writes aggregated UVEs to redis

        Each row has a UVE key, one of its struct type names and that struct's value.
        If the type is None, it means that the entire UVE is being removed.
        If the value is None, it means that the given struct of the UVE is being removed.

        The key and typename information is also published on a redis channel
        """
        old_acq_time = redish.hget("AGPARTS:%s" % inst, part)
        if old_acq_time is None:
            redish.hset("AGPARTS:%s" % inst, part, acq_time)
        else:
            # Is there stale information for this partition?
            if int(old_acq_time) != acq_time:
                ppe2 = redish.pipeline()
                ppe2.hdel("AGPARTS:%s" % inst, part)
                ppe2.smembers("AGPARTKEYS:%s:%d" % (inst, part))
                pperes2 = ppe2.execute()
                ppe3 = redish.pipeline()
                # Remove all contents for this AG-Partition
                for elem in pperes2[-1]:
                    ppe3.delete("AGPARTVALUES:%s:%d:%s" % (inst, part, elem))
                ppe3.delete("AGPARTKEYS:%s:%d" % (inst, part))
                ppe3.hset("AGPARTS:%s" % inst, part, acq_time)
                pperes3 = ppe3.execute()

        pub_list = []
        ppe = redish.pipeline()
        check_keys = set()
        for row in rows:
            vjson = json.dumps(row.val)
            typ = row.typ
            key = row.key
            pub_list.append({"key": key, "type": typ})
            if typ is None:
                # The entire contents of the UVE should be removed
                ppe.srem("AGPARTKEYS:%s:%d" % (inst, part), key)
                ppe.delete("AGPARTVALUES:%s:%d:%s" % (inst, part, key))
            else:
                if row.val is None:
                    # Remove the given struct from the UVE
                    ppe.hdel("AGPARTVALUES:%s:%d:%s" % (inst, part, key), typ)
                    check_keys.add(key)
                else:
                    ppe.sadd("AGPARTKEYS:%s:%d" % (inst, part), key)
                    ppe.hset("AGPARTVALUES:%s:%d:%s" % (inst, part, key), typ,
                             vjson)
        ppe.execute()

        # Find the keys that have no content (all structs have been deleted)
        ppe4 = redish.pipeline()
        check_keys_list = list(check_keys)
        for kk in check_keys_list:
            ppe4.exists("AGPARTVALUES:%s:%d:%s" % (inst, part, kk))
        pperes4 = ppe4.execute()

        # From the index, remove keys for which there are now no contents
        ppe5 = redish.pipeline()
        idx = 0
        for res in pperes4:
            if not res:
                ppe5.srem("AGPARTKEYS:%s:%d" % (inst, part),
                          check_keys_list[idx])
                # TODO: alarmgen should have already figured out if all structs of
                #       the UVE are gone, and should have sent a UVE delete
                #       We should not need to figure this out again
                assert ()
            idx += 1
        ppe5.execute()

        redish.publish('AGPARTPUB:%s:%d' % (inst, part), json.dumps(pub_list))
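
    # Illustrative sketch of the redis layout maintained above for an alarmgen
    # instance <inst> and partition <part> (key names from the code, values
    # hypothetical):
    #   AGPARTS:<inst>                    hash   { <part>: <acq_time> }
    #   AGPARTKEYS:<inst>:<part>          set    { <uve-key>, ... }
    #   AGPARTVALUES:<inst>:<part>:<key>  hash   { <struct-type>: <json-value> }
    #   AGPARTPUB:<inst>:<part>           pub/sub channel carrying the list of
    #                                     {"key": ..., "type": ...} rows written
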

    def run_uve_processing(self):
        """
        This function runs in its own gevent, and provides state compression
        for UVEs.
        Kafka worker (PartitionHandler) threads detect which UVEs have changed
        and accumulate them onto a set. When this gevent runs, it processes
        all UVEs of the set. Even if this gevent cannot run for a while, the
        set should not grow in an unbounded manner (like a queue can)
        """

        if self.disc:
            max_out_rows = 20
        else:
            max_out_rows = 2
        lredis = None
        while True:
            for part in self._uveqf.keys():
                self._logger.error("Stop UVE processing for %d" % part)
                self.stop_uve_partition(part)
                del self._uveqf[part]
                if part in self._uveq:
                    del self._uveq[part]
            prev = time.time()
            gevs = {}
            pendingset = {}
            for part in self._uveq.keys():
                if not len(self._uveq[part]):
                    continue
                self._logger.info("UVE Process for %d" % part)

                # Allow the partition handlers to queue new UVEs without
                # interfering with the work of processing the current UVEs
                pendingset[part] = copy.deepcopy(self._uveq[part])
                self._uveq[part] = {}

                gevs[part] = gevent.spawn(self.handle_uve_notif,part,\
                    pendingset[part])
            if len(gevs):
                gevent.joinall(gevs.values())
                for part in gevs.keys():
                    # If UVE processing failed, requeue the working set
                    outp = gevs[part].get()
                    if outp is None:
                        self._logger.error("UVE Process failed for %d" % part)
                        self.handle_uve_notifq(part, pendingset[part])
                    else:
                        try:
                            if lredis is None:
                                lredis = redis.StrictRedis(
                                    host="127.0.0.1",
                                    port=self._conf.redis_server_port(),
                                    password=self._conf.redis_password(),
                                    db=2)

                            if len(outp):
                                rows = []
                                for ku, vu in outp.iteritems():
                                    if vu is None:
                                        # This message has no type!
                                        # It's used to indicate a delete of the entire UVE
                                        rows.append(
                                            OutputRow(key=ku,
                                                      typ=None,
                                                      val=None))
                                        if len(rows) >= max_out_rows:
                                            Controller.send_agg_uve(
                                                lredis, self._instance_id,
                                                part,
                                                self._workers[part].acq_time(),
                                                rows)
                                            rows[:] = []
                                        continue
                                    for kt, vt in vu.iteritems():
                                        rows.append(
                                            OutputRow(key=ku, typ=kt, val=vt))
                                        if len(rows) >= max_out_rows:
                                            Controller.send_agg_uve(
                                                lredis, self._instance_id,
                                                part,
                                                self._workers[part].acq_time(),
                                                rows)
                                            rows[:] = []
                                # Flush all remaining rows
                                if len(rows):
                                    Controller.send_agg_uve(
                                        lredis, self._instance_id, part,
                                        self._workers[part].acq_time(), rows)
                                    rows[:] = []

                        except Exception as ex:
                            template = "Exception {0} in uve proc. Arguments:\n{1!r}"
                            messag = template.format(
                                type(ex).__name__, ex.args)
                            self._logger.error("%s : traceback %s" % \
                                              (messag, traceback.format_exc()))
                            lredis = None
                            # We need to requeue
                            self.handle_uve_notifq(part, pendingset[part])
                            gevent.sleep(1)

            curr = time.time()
            if (curr - prev) < 0.5:
                gevent.sleep(0.5 - (curr - prev))
            else:
                self._logger.info("UVE Process saturated")
                gevent.sleep(0)

    def stop_uve_partition(self, part):
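        # Drop all cached UVE state for this partition and withdraw any
        # alarms that were raised from it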
        for tk in self.ptab_info[part].keys():
            for rkey in self.ptab_info[part][tk].keys():
                uk = tk + ":" + rkey
                if tk in self.tab_alarms:
                    if uk in self.tab_alarms[tk]:
                        del self.tab_alarms[tk][uk]
                        ustruct = UVEAlarms(name=rkey, deleted=True)
                        alarm_msg = AlarmTrace(data=ustruct, \
                                table=tk, sandesh=self._sandesh)
                        self._logger.error('send del alarm for stop: %s' % \
                                (alarm_msg.log()))
                        alarm_msg.send(sandesh=self._sandesh)
                del self.ptab_info[part][tk][rkey]
                self._logger.error("UVE %s deleted in stop" % (uk))
            del self.ptab_info[part][tk]
        del self.ptab_info[part]

    def handle_uve_notif(self, part, uves):
        """
        Call this function when a UVE has changed. This can also
        happen when taking ownership of a partition, or when a
        generator is deleted.
        Args:
            part   : Partition Number
            uves   : dict, where the key is the UVE Name.
                     The value is either a dict of UVE structs, or "None",
                     which means that all UVE structs should be processed.

        Returns: 
            status of operation (True for success)
        """
        self._logger.debug("Changed part %d UVEs : %s" % (part, str(uves)))
        success = True
        output = {}
        for uv, types in uves.iteritems():
            tab = uv.split(':', 1)[0]
            if tab not in self.tab_perf:
                self.tab_perf[tab] = AGTabStats()

            uve_name = uv.split(':', 1)[1]
            prevt = UTCTimestampUsec()
            filters = {}
            if types:
                filters["cfilt"] = {}
                for typ in types.keys():
                    filters["cfilt"][typ] = set()

            failures, uve_data = self._us.get_uve(uv, True, filters)

            if failures:
                success = False
            self.tab_perf[tab].record_get(UTCTimestampUsec() - prevt)
            # Handling Agg UVEs
            if not part in self.ptab_info:
                self._logger.error("Creating UVE table for part %s" %
                                   str(part))
                self.ptab_info[part] = {}

            if not tab in self.ptab_info[part]:
                self.ptab_info[part][tab] = {}

            if uve_name not in self.ptab_info[part][tab]:
                self.ptab_info[part][tab][uve_name] = AGKeyInfo(part)
            prevt = UTCTimestampUsec()
            output[uv] = {}
            touched = False
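            # Without a type list, refresh the whole UVE and diff it against
            # the cached copy; otherwise update only the notified types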
            if not types:
                self.ptab_info[part][tab][uve_name].update(uve_data)
                if len(self.ptab_info[part][tab][uve_name].removed()):
                    touched = True
                    self._logger.info("UVE %s removed structs %s" % (uve_name, \
                            self.ptab_info[part][tab][uve_name].removed()))
                    for rems in self.ptab_info[part][tab][uve_name].removed():
                        output[uv][rems] = None
                if len(self.ptab_info[part][tab][uve_name].changed()):
                    touched = True
                    self._logger.debug("UVE %s changed structs %s" % (uve_name, \
                            self.ptab_info[part][tab][uve_name].changed()))
                    for chgs in self.ptab_info[part][tab][uve_name].changed():
                        output[uv][chgs] = \
                                self.ptab_info[part][tab][uve_name].values()[chgs]
                if len(self.ptab_info[part][tab][uve_name].added()):
                    touched = True
                    self._logger.debug("UVE %s added structs %s" % (uve_name, \
                            self.ptab_info[part][tab][uve_name].added()))
                    for adds in self.ptab_info[part][tab][uve_name].added():
                        output[uv][adds] = \
                                self.ptab_info[part][tab][uve_name].values()[adds]
            else:
                for typ in types:
                    val = None
                    if typ in uve_data:
                        val = uve_data[typ]
                    self.ptab_info[part][tab][uve_name].update_single(typ, val)
                    if len(self.ptab_info[part][tab][uve_name].removed()):
                        touched = True
                        self._logger.info("UVE %s removed structs %s" % (uve_name, \
                                self.ptab_info[part][tab][uve_name].removed()))
                        for rems in self.ptab_info[part][tab][
                                uve_name].removed():
                            output[uv][rems] = None
                    if len(self.ptab_info[part][tab][uve_name].changed()):
                        touched = True
                        self._logger.debug("UVE %s changed structs %s" % (uve_name, \
                                self.ptab_info[part][tab][uve_name].changed()))
                        for chgs in self.ptab_info[part][tab][
                                uve_name].changed():
                            output[uv][chgs] = \
                                    self.ptab_info[part][tab][uve_name].values()[chgs]
                    if len(self.ptab_info[part][tab][uve_name].added()):
                        touched = True
                        self._logger.debug("UVE %s added structs %s" % (uve_name, \
                                self.ptab_info[part][tab][uve_name].added()))
                        for adds in self.ptab_info[part][tab][uve_name].added(
                        ):
                            output[uv][adds] = \
                                    self.ptab_info[part][tab][uve_name].values()[adds]
            if not touched:
                del output[uv]
            local_uve = self.ptab_info[part][tab][uve_name].values()

            self.tab_perf[tab].record_pub(UTCTimestampUsec() - prevt)

            if len(local_uve.keys()) == 0:
                self._logger.info("UVE %s deleted in proc" % (uv))
                del self.ptab_info[part][tab][uve_name]
                output[uv] = None

                # Both alarm and non-alarm contents are gone.
                # We do not need to do alarm evaluation
                continue

            # Withdraw the alarm if the UVE has no non-alarm structs
            if len(local_uve.keys()) == 1 and "UVEAlarms" in local_uve:
                if tab in self.tab_alarms:
                    if uv in self.tab_alarms[tab]:
                        del self.tab_alarms[tab][uv]
                        ustruct = UVEAlarms(name=uve_name, deleted=True)
                        alarm_msg = AlarmTrace(data=ustruct, table=tab, \
                                sandesh=self._sandesh)
                        self._logger.info('send del alarm: %s' %
                                          (alarm_msg.log()))
                        alarm_msg.send(sandesh=self._sandesh)
                continue

            # Handling Alarms
            if not self.mgrs.has_key(tab):
                continue
            prevt = UTCTimestampUsec()

            #TODO: We may need to remove alarm from local_uve before
            #      alarm evaluation
            # if "UVEAlarms" in uve_data:
            #     del uve_data["UVEAlarms"]
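            # Evaluate every alarm rule plugin registered for this UVE table;
            # each result is a (name, severity, errors) tuple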

            results = self.mgrs[tab].map_method("__call__", uv, local_uve)
            self.tab_perf[tab].record_call(UTCTimestampUsec() - prevt)
            new_uve_alarms = {}
            for res in results:
                nm, sev, errs = res
                self._logger.debug("Alarm[%s] %s: %s" % (tab, nm, str(errs)))
                elems = []
                for ae in errs:
                    rule, val = ae
                    rv = AlarmElement(rule, val)
                    elems.append(rv)
                if len(elems):
                    new_uve_alarms[nm] = UVEAlarmInfo(type=nm,
                                                      severity=sev,
                                                      timestamp=0,
                                                      token="",
                                                      description=elems,
                                                      ack=False)
            del_types = []
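            # Compare against the currently raised alarms, ignoring the
            # volatile timestamp and token fields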
            if self.tab_alarms[tab].has_key(uv):
                for nm, uai in self.tab_alarms[tab][uv].iteritems():
                    uai2 = copy.deepcopy(uai)
                    uai2.timestamp = 0
                    uai2.token = ""
                    # This type was present earlier, but is now gone
                    if not new_uve_alarms.has_key(nm):
                        del_types.append(nm)
                    else:
                        # This type has no new information
                        if uai2 == new_uve_alarms[nm]:
                            del new_uve_alarms[nm]
            if len(del_types) != 0  or \
                    len(new_uve_alarms) != 0:
                self._logger.debug("Alarm[%s] Deleted %s" % \
                        (tab, str(del_types)))
                self._logger.debug("Alarm[%s] Updated %s" % \
                        (tab, str(new_uve_alarms)))
                # These alarm types are new or updated
                for nm, uai2 in new_uve_alarms.iteritems():
                    uai = copy.deepcopy(uai2)
                    uai.timestamp = UTCTimestampUsec()
                    uai.token = Controller.token(self._sandesh, uai.timestamp)
                    if not self.tab_alarms[tab].has_key(uv):
                        self.tab_alarms[tab][uv] = {}
                    self.tab_alarms[tab][uv][nm] = uai
                # These alarm types are now gone
                for dnm in del_types:
                    del self.tab_alarms[tab][uv][dnm]

                ustruct = None
                if len(self.tab_alarms[tab][uv]) == 0:
                    ustruct = UVEAlarms(name=uve_name, deleted=True)
                    del self.tab_alarms[tab][uv]
                else:
                    alm_copy = copy.deepcopy(self.tab_alarms[tab][uv])
                    ustruct = UVEAlarms(name=uve_name,
                                        alarms=alm_copy.values(),
                                        deleted=False)
                alarm_msg = AlarmTrace(data=ustruct, table=tab, \
                        sandesh=self._sandesh)
                self._logger.info('send alarm: %s' % (alarm_msg.log()))
                alarm_msg.send(sandesh=self._sandesh)
        if success:
            return output
        else:
            return None

    def handle_UVETableInfoReq(self, req):
        if req.partition == -1:
            parts = self.ptab_info.keys()
        else:
            parts = [req.partition]

        self._logger.info("Got UVETableInfoReq : %s" % str(parts))
        np = 1
        for part in parts:
            if part not in self.ptab_info:
                continue
            tables = []
            for tab in self.ptab_info[part].keys():
                uvel = []
                for uk, uv in self.ptab_info[part][tab].iteritems():
                    types = []
                    for tk, tv in uv.values().iteritems():
                        types.append(
                            UVEStructInfo(type=tk, content=json.dumps(tv)))
                    uvel.append(UVEObjectInfo(name=uk, structs=types))
                tables.append(UVETableInfo(table=tab, uves=uvel))
            resp = UVETableInfoResp(partition=part)
            resp.tables = tables

            if np == len(parts):
                mr = False
            else:
                mr = True
            resp.response(req.context(), mr)
            np = np + 1

    def handle_UVETableAlarmReq(self, req):
        status = False
        if req.table == "all":
            parts = self.tab_alarms.keys()
        else:
            parts = [req.table]
        self._logger.info("Got UVETableAlarmReq : %s" % str(parts))
        np = 1
        for pt in parts:
            resp = UVETableAlarmResp(table=pt)
            uves = []
            for uk, uv in self.tab_alarms[pt].iteritems():
                alms = []
                for ak, av in uv.iteritems():
                    alms.append(av)
                uves.append(UVEAlarms(name=uk, alarms=alms))
            resp.uves = uves
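            # mr ("more") is True for every response except the last one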
            if np == len(parts):
                mr = False
            else:
                mr = True
            resp.response(req.context(), mr)
            np = np + 1

    def handle_UVETablePerfReq(self, req):
        status = False
        if req.table == "all":
            parts = self.tab_perf_prev.keys()
        else:
            parts = [req.table]
        self._logger.info("Got UVETablePerfReq : %s" % str(parts))
        np = 1
        for pt in parts:
            resp = UVETablePerfResp(table=pt)
            resp.call_time = self.tab_perf_prev[pt].call_result()
            resp.get_time = self.tab_perf_prev[pt].get_result()
            resp.pub_time = self.tab_perf_prev[pt].pub_result()
            resp.updates = self.tab_perf_prev[pt].get_n

            if np == len(parts):
                mr = False
            else:
                mr = True
            resp.response(req.context(), mr)
            np = np + 1

    def partition_change(self, partno, enl):
        """
        Call this function when getting or giving up
        ownership of a partition
        Args:
            partno : Partition Number
            enl    : True for acquiring, False for giving up
        Returns: 
            status of operation (True for success)
        """
        status = False
        if enl:
            if partno in self._workers:
                self._logger.info("Dup partition %d" % partno)
            else:
                cdisc = None
                if self.disc:
                    cdisc = client.DiscoveryClient(
                        self._conf.discovery()['server'],
                        self._conf.discovery()['port'],
                        ModuleNames[Module.ALARM_GENERATOR], '%s-%s-%d' %
                        (self._hostname, self._instance_id, partno))
                ph = UveStreamProc(','.join(self._conf.kafka_broker_list()),
                                   partno, "uve-" + str(partno),
                                   self._logger, self.handle_uve_notifq,
                                   self._conf.host_ip(),
                                   self.handle_resource_check,
                                   self._instance_id,
                                   self._conf.redis_server_port(), cdisc)
                ph.start()
                self._workers[partno] = ph
                tout = 600
                idx = 0
                while idx < tout:
                    # When this partition starts,
                    # uveq will get created
                    if partno not in self._uveq:
                        gevent.sleep(.1)
                    else:
                        break
                    idx += 1
                if partno in self._uveq:
                    status = True
                else:
                    # TODO: The partition has not started yet,
                    #       but it still might start later.
                    #       We possibly need to exit
                    status = False
                    self._logger.error("Unable to start partition %d" % partno)
        else:
            if partno in self._workers:
                ph = self._workers[partno]
                self._logger.error("Kill part %s" % str(partno))
                ph.kill()
                res, db = ph.get(False)
                self._logger.error("Returned " + str(res))
                del self._workers[partno]
                self._uveqf[partno] = True

                tout = 600
                idx = 0
                while idx < tout:
                    # When this partition stops,
                    # uveq will get destroyed
                    if partno in self._uveq:
                        gevent.sleep(.1)
                    else:
                        break
                    idx += 1
                if partno not in self._uveq:
                    status = True
                else:
                    # TODO: The partition has not stopped yet
                    #       but it still might stop later.
                    #       We possibly need to exit
                    status = False
                    self._logger.error("Unable to stop partition %d" % partno)
            else:
                self._logger.info("No partition %d" % partno)

        return status

    def handle_PartitionOwnershipReq(self, req):
        self._logger.info("Got PartitionOwnershipReq: %s" % str(req))
        status = self.partition_change(req.partition, req.ownership)

        resp = PartitionOwnershipResp()
        resp.status = status
        resp.response(req.context())

    def process_stats(self):
        ''' Go through the UVEKey-Count stats collected over
            the previous time period across all partitions
            and send them out
        '''
        self.tab_perf_prev = copy.deepcopy(self.tab_perf)
        for kt in self.tab_perf.keys():
            #self.tab_perf_prev[kt] = copy.deepcopy(self.tab_perf[kt])
            self.tab_perf[kt].reset()

        s_partitions = set()
        s_keys = set()
        n_updates = 0
        for pk, pc in self._workers.iteritems():
            s_partitions.add(pk)
            din, dout = pc.stats()
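            # dout: per-table UVE key update counts
            # din : per-collector, per-generator struct-type notification counts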
            for ktab, tab in dout.iteritems():
                au_keys = []
                for uk, uc in tab.iteritems():
                    s_keys.add(uk)
                    n_updates += uc
                    ukc = UVEKeyInfo()
                    ukc.key = uk
                    ukc.count = uc
                    au_keys.append(ukc)
                au_obj = AlarmgenUpdate(name=self._sandesh._source + ':' + \
                        self._sandesh._node_type + ':' + \
                        self._sandesh._module + ':' + \
                        self._sandesh._instance_id,
                        partition = pk,
                        table = ktab,
                        keys = au_keys,
                        notifs = None,
                        sandesh=self._sandesh)
                self._logger.debug('send key stats: %s' % (au_obj.log()))
                au_obj.send(sandesh=self._sandesh)

            for ktab, tab in din.iteritems():
                au_notifs = []
                for kcoll, coll in tab.iteritems():
                    for kgen, gen in coll.iteritems():
                        for tk, tc in gen.iteritems():
                            tkc = UVETypeInfo()
                            tkc.type = tk
                            tkc.count = tc
                            tkc.generator = kgen
                            tkc.collector = kcoll
                            au_notifs.append(tkc)
                au_obj = AlarmgenUpdate(name=self._sandesh._source + ':' + \
                        self._sandesh._node_type + ':' + \
                        self._sandesh._module + ':' + \
                        self._sandesh._instance_id,
                        partition = pk,
                        table = ktab,
                        keys = None,
                        notifs = au_notifs,
                        sandesh=self._sandesh)
                self._logger.debug('send notif stats: %s' % (au_obj.log()))
                au_obj.send(sandesh=self._sandesh)

        au = AlarmgenStatus()
        au.name = self._hostname
        au.counters = []
        au.alarmgens = []
        ags = AlarmgenStats()
        ags.instance = self._instance_id
        ags.partitions = len(s_partitions)
        ags.keys = len(s_keys)
        ags.updates = n_updates
        au.counters.append(ags)

        agname = self._sandesh._source + ':' + \
                        self._sandesh._node_type + ':' + \
                        self._sandesh._module + ':' + \
                        self._sandesh._instance_id
        au.alarmgens.append(agname)

        atrace = AlarmgenStatusTrace(data=au, sandesh=self._sandesh)
        self._logger.debug('send alarmgen status : %s' % (atrace.log()))
        atrace.send(sandesh=self._sandesh)

    def handle_PartitionStatusReq(self, req):
        ''' Return the entire contents of the UVE DB for the 
            requested partitions
        '''
        if req.partition == -1:
            parts = self._workers.keys()
        else:
            parts = [req.partition]

        self._logger.info("Got PartitionStatusReq: %s" % str(parts))
        np = 1
        for pt in parts:
            resp = PartitionStatusResp()
            resp.partition = pt
            if self._workers.has_key(pt):
                resp.enabled = True
                resp.offset = self._workers[pt]._partoffset
                resp.uves = []
                for kcoll, coll in self._workers[pt].contents().iteritems():
                    uci = UVECollInfo()
                    uci.collector = kcoll
                    uci.uves = []
                    for kgen, gen in coll.iteritems():
                        ugi = UVEGenInfo()
                        ugi.generator = kgen
                        ugi.uves = []
                        for tabk, tabc in gen.iteritems():
                            for uk, uc in tabc.iteritems():
                                ukc = UVEKeyInfo()
                                ukc.key = tabk + ":" + uk
                                ukc.types = []
                                for tk, tc in uc.iteritems():
                                    uvtc = UVETypeCount()
                                    uvtc.type = tk
                                    uvtc.count = tc["c"]
                                    uvtc.agg_uuid = str(tc["u"])
                                    ukc.types.append(uvtc)
                                ugi.uves.append(ukc)
                        uci.uves.append(ugi)
                    resp.uves.append(uci)
            else:
                resp.enabled = False
            if np == len(parts):
                mr = False
            else:
                mr = True
            resp.response(req.context(), mr)
            np = np + 1

    def disc_cb_coll(self, clist):
        '''
        Analytics nodes may be brought up/down at any time. For UVE aggregation,
        alarmgen needs to know the list of all Analytics nodes (redis-uves).
        Periodically poll the Collector list [in lieu of
        redis-uve nodes] from the discovery service.
        '''
        self._logger.error("Discovery Collector callback : %s" % str(clist))
        newlist = []
        for elem in clist:
            ipaddr = elem["ip-address"]
            cpid = 0
            if "pid" in elem:
                cpid = int(elem["pid"])
            newlist.append((ipaddr, self._conf.redis_server_port(), cpid))
        self._us.update_redis_uve_list(newlist)

    def disc_cb_ag(self, alist):
        '''
        Analytics nodes may be brought up/down at any time. For partitioning,
        alarmgen needs to know the list of all Analytics nodes (alarmgens).
        Periodically poll the alarmgen list from the discovery service
        '''
        self._logger.error("Discovery AG callback : %s" % str(alist))
        newlist = []
        for elem in alist:
            ipaddr = elem["ip-address"]
            inst = elem["port"]
            newlist.append(ipaddr + ":" + inst)

        # We should always include ourselves in the list of members
        newset = set(newlist)
        newset.add(self._libpart_name)
        newlist = list(newset)
        if not self._libpart:
            self._libpart = self.start_libpart(newlist)
        else:
            self._libpart.update_cluster_list(newlist)

    def run(self):
        alarmgen_cpu_info = CpuInfoData()
        while True:
            before = time.time()
            mod_cpu_info = ModuleCpuInfo()
            mod_cpu_info.module_id = self._moduleid
            mod_cpu_info.instance_id = self._instance_id
            mod_cpu_info.cpu_info = alarmgen_cpu_info.get_cpu_info(
                system=False)
            mod_cpu_state = ModuleCpuState()
            mod_cpu_state.name = self._hostname

            mod_cpu_state.module_cpu_info = [mod_cpu_info]

            alarmgen_cpu_state_trace = ModuleCpuStateTrace(\
                    data=mod_cpu_state, sandesh = self._sandesh)
            alarmgen_cpu_state_trace.send(sandesh=self._sandesh)

            aly_cpu_state = AnalyticsCpuState()
            aly_cpu_state.name = self._hostname

            aly_cpu_info = ProcessCpuInfo()
            aly_cpu_info.module_id = self._moduleid
            aly_cpu_info.inst_id = self._instance_id
            aly_cpu_info.cpu_share = mod_cpu_info.cpu_info.cpu_share
            aly_cpu_info.mem_virt = mod_cpu_info.cpu_info.meminfo.virt
            aly_cpu_info.mem_res = mod_cpu_info.cpu_info.meminfo.res
            aly_cpu_state.cpu_info = [aly_cpu_info]

            aly_cpu_state_trace = AnalyticsCpuStateTrace(\
                    data=aly_cpu_state, sandesh = self._sandesh)
            aly_cpu_state_trace.send(sandesh=self._sandesh)

            # Send out the UVEKey-Count stats for this time period
            self.process_stats()

            duration = time.time() - before
            if duration < 60:
                gevent.sleep(60 - duration)
            else:
                self._logger.error("Periodic collection took %s sec" %
                                   duration)
Example 9
class Controller(object):
    
    @staticmethod
    def fail_cb(manager, entrypoint, exception):
        sandesh_global._logger.info("Load failed for %s with exception %s" % \
                                     (str(entrypoint),str(exception)))
        
    def __init__(self, conf):
        self._conf = conf
        module = Module.ALARM_GENERATOR
        self._moduleid = ModuleNames[module]
        node_type = Module2NodeType[module]
        self._node_type_name = NodeTypeNames[node_type]
        self._hostname = socket.gethostname()
        self._instance_id = self._conf.worker_id()
        sandesh_global.init_generator(self._moduleid, self._hostname,
                                      self._node_type_name, self._instance_id,
                                      self._conf.collectors(), 
                                      self._node_type_name,
                                      self._conf.http_port(),
                                      ['opserver.sandesh', 'sandesh'],
                                      host_ip=self._conf.host_ip())
        sandesh_global.set_logging_params(
            enable_local_log=self._conf.log_local(),
            category=self._conf.log_category(),
            level=self._conf.log_level(),
            file=self._conf.log_file(),
            enable_syslog=self._conf.use_syslog(),
            syslog_facility=self._conf.syslog_facility())
        self._logger = sandesh_global._logger
        # Trace buffer list
        self.trace_buf = [
            {'name':'DiscoveryMsg', 'size':1000}
        ]
        # Create trace buffers 
        for buf in self.trace_buf:
            sandesh_global.trace_buffer_create(name=buf['name'], size=buf['size'])

        tables = [ "ObjectCollectorInfo",
                   "ObjectDatabaseInfo",
                   "ObjectVRouter",
                   "ObjectBgpRouter",
                   "ObjectConfigNode" ] 
        self.mgrs = {}
        self.tab_alarms = {}
        self.ptab_info = {}
        self.tab_perf = {}
        self.tab_perf_prev = {}
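        # Load the alarm rule plugins for each supported UVE table from the
        # 'contrail.analytics.alarms' namespace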
        for table in tables:
            self.mgrs[table] = hook.HookManager(
                namespace='contrail.analytics.alarms',
                name=table,
                invoke_on_load=True,
                invoke_args=(),
                on_load_failure_callback=Controller.fail_cb
            )
            
            for extn in self.mgrs[table][table]:
                self._logger.info('Loaded extensions for %s: %s,%s doc %s' % \
                    (table, extn.name, extn.entry_point_target, extn.obj.__doc__))

            self.tab_alarms[table] = {}
            self.tab_perf[table] = AGTabStats()

        ConnectionState.init(sandesh_global, self._hostname, self._moduleid,
            self._instance_id,
            staticmethod(ConnectionState.get_process_state_cb),
            NodeStatusUVE, NodeStatus)

        self._us = UVEServer(None, self._logger, self._conf.redis_password())

        self._workers = {}
        self._uveq = {}
        self._uveqf = {}

        self.disc = None
        self._libpart_name = self._hostname + ":" + self._instance_id
        self._libpart = None
        self._partset = set()
        if self._conf.discovery()['server']:
            import discoveryclient.client as client 
            data = {
                'ip-address': self._hostname ,
                'port': self._instance_id
            }
            self.disc = client.DiscoveryClient(
                self._conf.discovery()['server'],
                self._conf.discovery()['port'],
                ModuleNames[Module.ALARM_GENERATOR])
            self._logger.info("Disc Publish to %s : %s"
                          % (str(self._conf.discovery()), str(data)))
            self.disc.publish(ALARM_GENERATOR_SERVICE_NAME, data)
        else:
            # If there is no discovery service, use fixed redis_uve list
            redis_uve_list = []
            try:
                for redis_uve in self._conf.redis_uve_list():
                    redis_ip_port = redis_uve.split(':')
                    redis_ip_port = (redis_ip_port[0], int(redis_ip_port[1]))
                    redis_uve_list.append(redis_ip_port)
            except Exception as e:
                self._logger.error('Failed to parse redis_uve_list: %s' % e)
            else:
                self._us.update_redis_uve_list(redis_uve_list)

            # If there is no discovery service, use fixed alarmgen list
            self._libpart = self.start_libpart(self._conf.alarmgen_list())

        PartitionOwnershipReq.handle_request = self.handle_PartitionOwnershipReq
        PartitionStatusReq.handle_request = self.handle_PartitionStatusReq
        UVETableAlarmReq.handle_request = self.handle_UVETableAlarmReq 
        UVETableInfoReq.handle_request = self.handle_UVETableInfoReq
        UVETablePerfReq.handle_request = self.handle_UVETablePerfReq

    def libpart_cb(self, part_list):

        agpi = AlarmgenPartionInfo()
        agpi.instance = self._instance_id
        agpi.partitions = part_list

        agp = AlarmgenPartition()
        agp.name = self._hostname
        agp.inst_parts = [agpi]
       
        agp_trace = AlarmgenPartitionTrace(data=agp)
        agp_trace.send() 

        newset = set(part_list)
        oldset = self._partset
        self._partset = newset

        self._logger.error('Partition List : new %s old %s' % \
            (str(newset),str(oldset)))
        
        for addpart in (newset-oldset):
            self._logger.error('Partition Add : %s' % addpart)
            self.partition_change(addpart, True)
        
        for delpart in (oldset-newset):
            self._logger.error('Partition Del : %s' % delpart)
            self.partition_change(delpart, False)

        self._logger.error('Partition List done : new %s old %s' % \
            (str(newset),str(oldset)))

    def start_libpart(self, ag_list):
        if not self._conf.zk_list():
            self._logger.error('Could not import libpartition: No zookeeper')
            return None
        if not ag_list:
            self._logger.error('Could not import libpartition: No alarmgen list')
            return None
        try:
            from libpartition.libpartition import PartitionClient
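            # PartitionClient spreads the configured partitions across the
            # alarmgen instances (coordinated via zookeeper) and invokes
            # libpart_cb whenever this instance's assignment changes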
            self._logger.error('Starting PC')
            agpi = AlarmgenPartionInfo()
            agpi.instance = self._instance_id
            agpi.partitions = []

            agp = AlarmgenPartition()
            agp.name = self._hostname
            agp.inst_parts = [agpi]
           
            agp_trace = AlarmgenPartitionTrace(data=agp)
            agp_trace.send() 

            pc = PartitionClient("alarmgen",
                    self._libpart_name, ag_list,
                    self._conf.partitions(), self.libpart_cb,
                    ','.join(self._conf.zk_list()))
            self._logger.error('Started PC')
            return pc
        except Exception as e:
            self._logger.error('Could not import libpartition: %s' % str(e))
            return None

    def handle_uve_notifq(self, part, uves):
        if part not in self._uveq:
            self._uveq[part] = {}
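        # None means all structs of this UVE changed; once recorded it
        # absorbs any later per-type updates for the same UVE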
        for uv,types in uves.iteritems():
            if types is None:
                self._uveq[part][uv] = None
            else:
                if uv in self._uveq[part]:
                    if self._uveq[part][uv] is not None:
                        self._uveq[part][uv].update(types)
                else:
                    self._uveq[part][uv] = set()
                    self._uveq[part][uv].update(types)

    def run_uve_processing(self):
        """
        This function runs in its own gevent, and provides state compression
        for UVEs.
        Kafka worker (PartitionHandler) threads detect which UVEs have changed
        and accumulate them in a set. When this gevent runs, it processes
        all UVEs of the set. Even if this gevent cannot run for a while, the
        set does not grow in an unbounded manner (unlike a queue, which can)
        """

        while True:
            for part in self._uveqf.keys():
                self._logger.error("Stop UVE processing for %d" % part)
                self.stop_uve_partition(part)
                del self._uveqf[part]
                if part in self._uveq:
                    del self._uveq[part]
            prev = time.time()
            gevs = {}
            workingsets = {}
            for part in self._uveq.keys():
                if not len(self._uveq[part]):
                    continue
                self._logger.info("UVE Process for %d" % part)

                # Allow the partition handlers to queue new UVEs without
                # interfering with the work of processing the current UVEs
                workingsets[part] = copy.deepcopy(self._uveq[part])
                self._uveq[part] = {}

                gevs[part] = gevent.spawn(self.handle_uve_notif, part,
                                          workingsets[part])
            if len(gevs):
                gevent.joinall(gevs.values())
                for part in gevs.keys():
                    # If UVE processing failed, requeue that partition's
                    # working set
                    if not gevs[part].get():
                        self._logger.error("UVE Process failed for %d" % part)
                        self.handle_uve_notifq(part, workingsets[part])
            curr = time.time()
            if (curr - prev) < 0.1:
                gevent.sleep(0.1 - (curr - prev))
            else:
                self._logger.info("UVE Process saturated")
                gevent.sleep(0)
             
    def stop_uve_partition(self, part):
        for tk in self.ptab_info[part].keys():
            for uk in self.ptab_info[part][tk].keys():
                if tk in self.tab_alarms:
                    if uk in self.tab_alarms[tk]:
                        del self.tab_alarms[tk][uk]
                        ustruct = UVEAlarms(name = uk, deleted = True)
                        alarm_msg = AlarmTrace(data=ustruct, table=tk)
                        self._logger.info('send del alarm: %s' % (alarm_msg.log()))
                        alarm_msg.send()
                del self.ptab_info[part][tk][uk]
                self._logger.info("UVE %s deleted" % (uk))
            del self.ptab_info[part][tk]
        del self.ptab_info[part]

    def handle_uve_notif(self, part, uves):
        """
        Call this function when a UVE has changed. This can also
        happen when taking ownership of a partition, or when a
        generator is deleted.
        Args:
            part   : Partition Number
            uves   : dict, where the key is the UVE Name.
                     The value is either a set of UVE structs, or "None",
                     which means that all UVE structs should be processed

        Returns: 
            status of operation (True for success)
        """
        self._logger.debug("Changed part %d UVEs : %s" % (part, str(uves)))
        success = True
        for uv,types in uves.iteritems():
            tab = uv.split(':',1)[0]
            if tab not in self.tab_perf:
                self.tab_perf[tab] = AGTabStats()

            uve_name = uv.split(':',1)[1]
            prevt = UTCTimestampUsec() 
            filters = {}
            if types:
                filters["cfilt"] = {}
                for typ in types:
                    filters["cfilt"][typ] = set()
            failures, uve_data = self._us.get_uve(uv, True, filters)
            if failures:
                success = False
            self.tab_perf[tab].record_get(UTCTimestampUsec() - prevt)
            # Handling Agg UVEs
            if not part in self.ptab_info:
                self.ptab_info[part] = {}

            if not tab in self.ptab_info[part]:
                self.ptab_info[part][tab] = {}

            if uve_name not in self.ptab_info[part][tab]:
                self.ptab_info[part][tab][uve_name] = AGKeyInfo(part)

            prevt = UTCTimestampUsec()       
            if not types:
                self.ptab_info[part][tab][uve_name].update(uve_data)
                if len(self.ptab_info[part][tab][uve_name].removed()):
                    self._logger.info("UVE %s removed structs %s" % (uve_name, \
                            self.ptab_info[part][tab][uve_name].removed()))
                if len(self.ptab_info[part][tab][uve_name].changed()):
                    self._logger.debug("UVE %s changed structs %s" % (uve_name, \
                            self.ptab_info[part][tab][uve_name].changed()))
                if len(self.ptab_info[part][tab][uve_name].added()):
                    self._logger.debug("UVE %s added structs %s" % (uve_name, \
                            self.ptab_info[part][tab][uve_name].added()))
            else:
                for typ in types:
                    val = None
                    if typ in uve_data:
                        val = uve_data[typ]
                    self.ptab_info[part][tab][uve_name].update_single(typ, val)
                    if len(self.ptab_info[part][tab][uve_name].removed()):
                        self._logger.info("UVE %s removed structs %s" % (uve_name, \
                                self.ptab_info[part][tab][uve_name].removed()))
                    if len(self.ptab_info[part][tab][uve_name].changed()):
                        self._logger.debug("UVE %s changed structs %s" % (uve_name, \
                                self.ptab_info[part][tab][uve_name].changed()))
                    if len(self.ptab_info[part][tab][uve_name].added()):
                        self._logger.debug("UVE %s added structs %s" % (uve_name, \
                                self.ptab_info[part][tab][uve_name].added()))

            local_uve = self.ptab_info[part][tab][uve_name].values()
            
            if len(local_uve.keys()) == 0:
                self._logger.info("UVE %s deleted" % (uve_name))
                del self.ptab_info[part][tab][uve_name]

            self.tab_perf[tab].record_pub(UTCTimestampUsec() - prevt)

            # Withdraw the alarm if the UVE has no non-alarm structs
            if len(local_uve.keys()) == 0 or \
                    (len(local_uve.keys()) == 1 and "UVEAlarms" in local_uve):
                if tab in self.tab_alarms:
                    if uv in self.tab_alarms[tab]:
                        del self.tab_alarms[tab][uv]
                        ustruct = UVEAlarms(name = uve_name, deleted = True)
                        alarm_msg = AlarmTrace(data=ustruct, table=tab)
                        self._logger.info('send del alarm: %s' % (alarm_msg.log()))
                        alarm_msg.send()
                        continue

            # Handling Alarms
            if not self.mgrs.has_key(tab):
                continue
            prevt = UTCTimestampUsec()
            results = self.mgrs[tab].map_method("__call__", uv, local_uve)
            self.tab_perf[tab].record_call(UTCTimestampUsec() - prevt)
            new_uve_alarms = {}
            for res in results:
                nm, sev, errs = res
                self._logger.debug("Alarm[%s] %s: %s" % (tab, nm, str(errs)))
                elems = []
                for ae in errs:
                    rule, val = ae
                    rv = AlarmElement(rule, val)
                    elems.append(rv)
                if len(elems):
                    new_uve_alarms[nm] = UVEAlarmInfo(type = nm, severity = sev,
                                           timestamp = 0,
                                           description = elems, ack = False)
            del_types = []
            if self.tab_alarms[tab].has_key(uv):
                for nm, uai in self.tab_alarms[tab][uv].iteritems():
                    uai2 = copy.deepcopy(uai)
                    uai2.timestamp = 0
                    # This type was present earlier, but is now gone
                    if not new_uve_alarms.has_key(nm):
                        del_types.append(nm)
                    else:
                        # This type has no new information
                        if pprint.pformat(uai2) == \
                                pprint.pformat(new_uve_alarms[nm]):
                            del new_uve_alarms[nm]
            if len(del_types) != 0  or \
                    len(new_uve_alarms) != 0:
                self._logger.debug("Alarm[%s] Deleted %s" % \
                        (tab, str(del_types))) 
                self._logger.debug("Alarm[%s] Updated %s" % \
                        (tab, str(new_uve_alarms))) 
                # These alarm types are new or updated
                for nm, uai2 in new_uve_alarms.iteritems():
                    uai = copy.deepcopy(uai2)
                    uai.timestamp = UTCTimestampUsec()
                    if not self.tab_alarms[tab].has_key(uv):
                        self.tab_alarms[tab][uv] = {}
                    self.tab_alarms[tab][uv][nm] = uai
                # These alarm types are now gone
                for dnm in del_types:
                    del self.tab_alarms[tab][uv][dnm]
                    
                ustruct = None
                if len(self.tab_alarms[tab][uv]) == 0:
                    ustruct = UVEAlarms(name = uve_name,
                            deleted = True)
                    del self.tab_alarms[tab][uv]
                else:
                    ustruct = UVEAlarms(name = uve_name,
                            alarms = self.tab_alarms[tab][uv].values(),
                            deleted = False)
                alarm_msg = AlarmTrace(data=ustruct, table=tab)
                self._logger.info('send alarm: %s' % (alarm_msg.log()))
                alarm_msg.send()
        return success
 
    def handle_UVETableInfoReq(self, req):
        if req.partition == -1:
            parts = self.ptab_info.keys()
        else:
            parts = [req.partition]
        
        self._logger.info("Got UVETableInfoReq : %s" % str(parts))
        np = 1
        for part in parts:
            if part not in self.ptab_info:
                continue
            tables = []
            for tab in self.ptab_info[part].keys():
                uvel = []
                for uk,uv in self.ptab_info[part][tab].iteritems():
                    types = []
                    for tk,tv in uv.values().iteritems():
                        types.append(UVEStructInfo(type = tk,
                                content = json.dumps(tv)))
                    uvel.append(UVEObjectInfo(
                            name = uk, structs = types))
                tables.append(UVETableInfo(table = tab, uves = uvel))
            resp = UVETableInfoResp(partition = part)
            resp.tables = tables

            if np == len(parts):
                mr = False
            else:
                mr = True
            resp.response(req.context(), mr)
            np = np + 1

    def handle_UVETableAlarmReq(self, req):
        status = False
        if req.table == "all":
            parts = self.tab_alarms.keys()
        else:
            parts = [req.table]
        self._logger.info("Got UVETableAlarmReq : %s" % str(parts))
        np = 1
        for pt in parts:
            resp = UVETableAlarmResp(table = pt)
            uves = []
            for uk,uv in self.tab_alarms[pt].iteritems():
                alms = []
                for ak,av in uv.iteritems():
                    alms.append(av)
                uves.append(UVEAlarms(name = uk, alarms = alms))
            resp.uves = uves 
            if np == len(parts):
                mr = False
            else:
                mr = True
            resp.response(req.context(), mr)
            np = np + 1

    def handle_UVETablePerfReq(self, req):
        status = False
        if req.table == "all":
            parts = self.tab_perf_prev.keys()
        else:
            parts = [req.table]
        self._logger.info("Got UVETablePerfReq : %s" % str(parts))
        np = 1
        for pt in parts:
            resp = UVETablePerfResp(table = pt)
            resp.call_time = self.tab_perf_prev[pt].call_result()
            resp.get_time = self.tab_perf_prev[pt].get_result()
            resp.pub_time = self.tab_perf_prev[pt].pub_result()
            resp.updates = self.tab_perf_prev[pt].get_n

            if np == len(parts):
                mr = False
            else:
                mr = True
            resp.response(req.context(), mr)
            np = np + 1
    
    def partition_change(self, partno, enl):
        """
        Call this function when getting or giving up
        ownership of a partition
        Args:
            partno : Partition Number
            enl    : True for acquiring, False for giving up
        Returns: 
            status of operation (True for success)
        """
        status = False
        if enl:
            if partno in self._workers:
                self._logger.info("Dup partition %d" % partno)
            else:
                ph = UveStreamProc(','.join(self._conf.kafka_broker_list()),
                                   partno, "uve-" + str(partno),
                                   self._logger, self._us.get_part,
                                   self.handle_uve_notifq)
                ph.start()
                self._workers[partno] = ph
                tout = 600
                idx = 0
                while idx < tout:
                    # When this partition starts,
                    # uveq will get created
                    if partno not in self._uveq:
                        gevent.sleep(.1)
                    else:
                        break
                    idx += 1
                if partno in self._uveq:
                    status = True 
                else:
                    # TODO: The partition has not started yet,
                    #       but it still might start later.
                    #       We possibly need to exit
                    status = False
                    self._logger.error("Unable to start partition %d" % partno)
        else:
            if partno in self._workers:
                ph = self._workers[partno]
                gevent.kill(ph)
                res,db = ph.get()
                print "Returned " + str(res)
                print "State :"
                for k,v in db.iteritems():
                    print "%s -> %s" % (k,str(v)) 
                del self._workers[partno]
                self._uveqf[partno] = True

                tout = 600
                idx = 0
                while idx < tout:
                    # When this partition stops,
                    # uveq will get destroyed
                    if partno in self._uveq:
                        gevent.sleep(.1)
                    else:
                        break
                    idx += 1
                if partno not in self._uveq:
                    status = True 
                else:
                    # TODO: The partition has not stopped yet
                    #       but it still might stop later.
                    #       We possibly need to exit
                    status = False
                    self._logger.error("Unable to stop partition %d" % partno)
            else:
                self._logger.info("No partition %d" % partno)

        return status
    
    def handle_PartitionOwnershipReq(self, req):
        self._logger.info("Got PartitionOwnershipReq: %s" % str(req))
        status = self.partition_change(req.partition, req.ownership)

        resp = PartitionOwnershipResp()
        resp.status = status
        resp.response(req.context())
               
    def process_stats(self):
        ''' Go through the UVEKey-Count stats collected over
            the previous time period across all partitions
            and send them out
        '''
        self.tab_perf_prev = copy.deepcopy(self.tab_perf)
        for kt in self.tab_perf.keys():
            #self.tab_perf_prev[kt] = copy.deepcopy(self.tab_perf[kt])
            self.tab_perf[kt].reset()

        s_partitions = set()
        s_keys = set()
        n_updates = 0
        for pk,pc in self._workers.iteritems():
            s_partitions.add(pk)
            din, dout = pc.stats()
            for ktab,tab in dout.iteritems():
                au_keys = []
                for uk,uc in tab.iteritems():
                    s_keys.add(uk)
                    n_updates += uc
                    ukc = UVEKeyInfo()
                    ukc.key = uk
                    ukc.count = uc
                    au_keys.append(ukc)
                au_obj = AlarmgenUpdate(name=sandesh_global._source + ':' + \
                        sandesh_global._node_type + ':' + \
                        sandesh_global._module + ':' + \
                        sandesh_global._instance_id,
                        partition = pk,
                        table = ktab,
                        keys = au_keys,
                        notifs = None)
                self._logger.debug('send key stats: %s' % (au_obj.log()))
                au_obj.send()

            for ktab,tab in din.iteritems():
                au_notifs = []
                for kcoll,coll in tab.iteritems():
                    for kgen,gen in coll.iteritems():
                        for tk,tc in gen.iteritems():
                            tkc = UVETypeInfo()
                            tkc.type= tk
                            tkc.count = tc
                            tkc.generator = kgen
                            tkc.collector = kcoll
                            au_notifs.append(tkc)
                au_obj = AlarmgenUpdate(name=sandesh_global._source + ':' + \
                        sandesh_global._node_type + ':' + \
                        sandesh_global._module + ':' + \
                        sandesh_global._instance_id,
                        partition = pk,
                        table = ktab,
                        keys = None,
                        notifs = au_notifs)
                self._logger.debug('send notif stats: %s' % (au_obj.log()))
                au_obj.send()

        au = AlarmgenStatus()
        au.name = self._hostname
        au.counters = []
        au.alarmgens = []
        ags = AlarmgenStats()
        ags.instance =  self._instance_id
        ags.partitions = len(s_partitions)
        ags.keys = len(s_keys)
        ags.updates = n_updates
        au.counters.append(ags)

        agname = sandesh_global._source + ':' + \
                        sandesh_global._node_type + ':' + \
                        sandesh_global._module + ':' + \
                        sandesh_global._instance_id
        au.alarmgens.append(agname)
 
        atrace = AlarmgenStatusTrace(data = au)
        self._logger.debug('send alarmgen status : %s' % (atrace.log()))
        atrace.send()
         
    def handle_PartitionStatusReq(self, req):
        ''' Return the entire contents of the UVE DB for the 
            requested partitions
        '''
        if req.partition == -1:
            parts = self._workers.keys()
        else:
            parts = [req.partition]
        
        self._logger.info("Got PartitionStatusReq: %s" % str(parts))
        np = 1
        for pt in parts:
            resp = PartitionStatusResp()
            resp.partition = pt
            if self._workers.has_key(pt):
                resp.enabled = True
                resp.offset = self._workers[pt]._partoffset
                resp.uves = []
                for kcoll,coll in self._workers[pt].contents().iteritems():
                    uci = UVECollInfo()
                    uci.collector = kcoll
                    uci.uves = []
                    for kgen,gen in coll.iteritems():
                        ugi = UVEGenInfo()
                        ugi.generator = kgen
                        ugi.uves = []
                        for uk,uc in gen.iteritems():
                            ukc = UVEKeyInfo()
                            ukc.key = uk
                            ukc.count = uc
                            ugi.uves.append(ukc)
                        uci.uves.append(ugi)
                    resp.uves.append(uci)
            else:
                resp.enabled = False
            if np == len(parts):
                mr = False
            else:
                mr = True
            resp.response(req.context(), mr)
            np = np + 1

    def disc_cb_coll(self, clist):
        '''
        Analytics nodes may be brought up/down at any time. For UVE aggregation,
        alarmgen needs to know the list of all Analytics nodes (redis-uves).
        Periodically poll the Collector list [in lieu of
        redis-uve nodes] from the discovery service.
        '''
        newlist = []
        for elem in clist:
            (ipaddr,port) = elem
            newlist.append((ipaddr, self._conf.redis_server_port()))
        self._us.update_redis_uve_list(newlist)

    def disc_cb_ag(self, alist):
        '''
        Analytics nodes may be brought up/down at any time. For partitioning,
        alarmgen needs to know the list of all Analytics nodes (alarmgens).
        Periodically poll the alarmgen list from the discovery service
        '''
        newlist = []
        for elem in alist:
            (ipaddr, inst) = elem
            newlist.append(ipaddr + ":" + inst)

        # We should always include ourselves in the list of members
        newset = set(newlist)
        newset.add(self._libpart_name)
        newlist = list(newset)
        if not self._libpart:
            self._libpart = self.start_libpart(newlist)
        else:
            self._libpart.update_cluster_list(newlist)

    def run(self):
        alarmgen_cpu_info = CpuInfoData()
        while True:
            before = time.time()
            mod_cpu_info = ModuleCpuInfo()
            mod_cpu_info.module_id = self._moduleid
            mod_cpu_info.instance_id = self._instance_id
            mod_cpu_info.cpu_info = alarmgen_cpu_info.get_cpu_info(
                system=False)
            mod_cpu_state = ModuleCpuState()
            mod_cpu_state.name = self._hostname

            mod_cpu_state.module_cpu_info = [mod_cpu_info]

            alarmgen_cpu_state_trace = ModuleCpuStateTrace(data=mod_cpu_state)
            alarmgen_cpu_state_trace.send()

            aly_cpu_state = AnalyticsCpuState()
            aly_cpu_state.name = self._hostname

            aly_cpu_info = ProcessCpuInfo()
            aly_cpu_info.module_id= self._moduleid
            aly_cpu_info.inst_id = self._instance_id
            aly_cpu_info.cpu_share = mod_cpu_info.cpu_info.cpu_share
            aly_cpu_info.mem_virt = mod_cpu_info.cpu_info.meminfo.virt
            aly_cpu_info.mem_res = mod_cpu_info.cpu_info.meminfo.res
            aly_cpu_state.cpu_info = [aly_cpu_info]

            aly_cpu_state_trace = AnalyticsCpuStateTrace(data=aly_cpu_state)
            aly_cpu_state_trace.send()

            # Send out the UVEKey-Count stats for this time period
            self.process_stats()

            duration = time.time() - before
            if duration < 60:
                gevent.sleep(60 - duration)
            else:
                self._logger.error("Periodic collection took %s sec" % duration)
Example No. 10
class Controller(object):
    def fail_cb(self, manager, entrypoint, exception):
        self._sandesh._logger.info("Load failed for %s with exception %s" % \
                                     (str(entrypoint),str(exception)))

    def __init__(self, conf, test_logger=None):
        self._conf = conf
        module = Module.ALARM_GENERATOR
        self._moduleid = ModuleNames[module]
        node_type = Module2NodeType[module]
        self._node_type_name = NodeTypeNames[node_type]
        self._hostname = socket.gethostname()
        self._instance_id = self._conf.worker_id()
        is_collector = True
        if test_logger is not None:
            is_collector = False
        self._sandesh = Sandesh()
        self._sandesh.init_generator(self._moduleid,
                                     self._hostname,
                                     self._node_type_name,
                                     self._instance_id,
                                     self._conf.collectors(),
                                     self._node_type_name,
                                     self._conf.http_port(),
                                     ['opserver.sandesh', 'sandesh'],
                                     host_ip=self._conf.host_ip(),
                                     connect_to_collector=is_collector)
        if test_logger is not None:
            self._logger = test_logger
        else:
            self._sandesh.set_logging_params(
                enable_local_log=self._conf.log_local(),
                category=self._conf.log_category(),
                level=self._conf.log_level(),
                file=self._conf.log_file(),
                enable_syslog=self._conf.use_syslog(),
                syslog_facility=self._conf.syslog_facility())
            self._logger = self._sandesh._logger
        # Trace buffer list
        self.trace_buf = [{'name': 'DiscoveryMsg', 'size': 1000}]
        # Create trace buffers
        for buf in self.trace_buf:
            self._sandesh.trace_buffer_create(name=buf['name'],
                                              size=buf['size'])

        tables = [
            "ObjectCollectorInfo", "ObjectDatabaseInfo", "ObjectVRouter",
            "ObjectBgpRouter", "ObjectConfigNode"
        ]
        self.mgrs = {}
        self.tab_alarms = {}
        self.ptab_info = {}
        self.tab_perf = {}
        self.tab_perf_prev = {}
        for table in tables:
            self.mgrs[table] = hook.HookManager(
                namespace='contrail.analytics.alarms',
                name=table,
                invoke_on_load=True,
                invoke_args=(),
                on_load_failure_callback=self.fail_cb)

            for extn in self.mgrs[table][table]:
                self._logger.info('Loaded extensions for %s: %s,%s doc %s' % \
                    (table, extn.name, extn.entry_point_target, extn.obj.__doc__))

            self.tab_alarms[table] = {}
            self.tab_perf[table] = AGTabStats()

        ConnectionState.init(
            self._sandesh, self._hostname, self._moduleid, self._instance_id,
            staticmethod(ConnectionState.get_process_state_cb), NodeStatusUVE,
            NodeStatus)

        self._us = UVEServer(None, self._logger, self._conf.redis_password())

        self._workers = {}
        self._uveq = {}
        self._uveqf = {}

        self.disc = None
        self._libpart_name = self._hostname + ":" + self._instance_id
        self._libpart = None
        self._partset = set()
        if self._conf.discovery()['server']:
            data = {'ip-address': self._hostname, 'port': self._instance_id}
            self.disc = client.DiscoveryClient(
                self._conf.discovery()['server'],
                self._conf.discovery()['port'],
                ModuleNames[Module.ALARM_GENERATOR])
            self._logger.info("Disc Publish to %s : %s" %
                              (str(self._conf.discovery()), str(data)))
            self.disc.publish(ALARM_GENERATOR_SERVICE_NAME, data)
        else:
            # If there is no discovery service, use fixed redis_uve list
            redis_uve_list = []
            try:
                for redis_uve in self._conf.redis_uve_list():
                    redis_ip_port = redis_uve.split(':')
                    redis_elem = (redis_ip_port[0], int(redis_ip_port[1]), 0)
                    redis_uve_list.append(redis_elem)
            except Exception as e:
                self._logger.error('Failed to parse redis_uve_list: %s' % e)
            else:
                self._us.update_redis_uve_list(redis_uve_list)

            # If there is no discovery service, use fixed alarmgen list
            self._libpart = self.start_libpart(self._conf.alarmgen_list())

        PartitionOwnershipReq.handle_request = self.handle_PartitionOwnershipReq
        PartitionStatusReq.handle_request = self.handle_PartitionStatusReq
        UVETableAlarmReq.handle_request = self.handle_UVETableAlarmReq
        UVETableInfoReq.handle_request = self.handle_UVETableInfoReq
        UVETablePerfReq.handle_request = self.handle_UVETablePerfReq

    def libpart_cb(self, part_list):

        agpi = AlarmgenPartionInfo()
        agpi.instance = self._instance_id
        agpi.partitions = part_list

        agp = AlarmgenPartition()
        agp.name = self._hostname
        agp.inst_parts = [agpi]

        agp_trace = AlarmgenPartitionTrace(data=agp)
        agp_trace.send(sandesh=self._sandesh)

        newset = set(part_list)
        oldset = self._partset
        self._partset = newset

        self._logger.error('Partition List : new %s old %s' % \
            (str(newset),str(oldset)))

        for addpart in (newset - oldset):
            self._logger.error('Partition Add : %s' % addpart)
            self.partition_change(addpart, True)

        for delpart in (oldset - newset):
            self._logger.error('Partition Del : %s' % delpart)
            self.partition_change(delpart, False)

        self._logger.error('Partition List done : new %s old %s' % \
            (str(newset),str(oldset)))

    def start_libpart(self, ag_list):
        if not self._conf.zk_list():
            self._logger.error('Could not import libpartition: No zookeeper')
            return None
        if not ag_list:
            self._logger.error(
                'Could not import libpartition: No alarmgen list')
            return None
        try:
            self._logger.error('Starting PC')
            agpi = AlarmgenPartionInfo()
            agpi.instance = self._instance_id
            agpi.partitions = []

            agp = AlarmgenPartition()
            agp.name = self._hostname
            agp.inst_parts = [agpi]

            agp_trace = AlarmgenPartitionTrace(data=agp)
            agp_trace.send(sandesh=self._sandesh)

            pc = PartitionClient("alarmgen", self._libpart_name, ag_list,
                                 self._conf.partitions(), self.libpart_cb,
                                 ','.join(self._conf.zk_list()))
            self._logger.error('Started PC')
            return pc
        except Exception as e:
            self._logger.error('Could not import libpartition: %s' % str(e))
            return None

    def handle_uve_notifq(self, part, uves):
        if part not in self._uveq:
            self._uveq[part] = {}
            self._logger.error('Created uveQ for part %s' % str(part))
        for uv, types in uves.iteritems():
            if types is None:
                self._uveq[part][uv] = None
            else:
                if uv in self._uveq[part]:
                    if self._uveq[part][uv] is not None:
                        self._uveq[part][uv].update(types)
                else:
                    self._uveq[part][uv] = set()
                    self._uveq[part][uv].update(types)

    def run_uve_processing(self):
        """
        This function runs in its own gevent, and provides state compression
        for UVEs.
        Kafka worker (PartitionHandler) threads detect which UVEs have changed
        and accumulate them onto a set. When this gevent runs, it processes
        all UVEs of the set. Even if this gevent cannot run for a while, the
        set should not grow in an unbounded manner (like a queue can)
        """

        while True:
            for part in self._uveqf.keys():
                self._logger.error("Stop UVE processing for %d" % part)
                self.stop_uve_partition(part)
                del self._uveqf[part]
                if part in self._uveq:
                    del self._uveq[part]
            prev = time.time()
            gevs = {}
            workingsets = {}
            for part in self._uveq.keys():
                if not len(self._uveq[part]):
                    continue
                self._logger.info("UVE Process for %d" % part)

                # Allow the partition handlers to queue new UVEs without
                # interfering with the work of processing the current UVEs
                workingsets[part] = copy.deepcopy(self._uveq[part])
                self._uveq[part] = {}

                gevs[part] = gevent.spawn(self.handle_uve_notif, part,
                                          workingsets[part])
            if len(gevs):
                gevent.joinall(gevs.values())
                for part in gevs.keys():
                    # If UVE processing failed, requeue the working set
                    if not gevs[part].get():
                        self._logger.error("UVE Process failed for %d" % part)
                        self.handle_uve_notifq(part, workingsets[part])
            curr = time.time()
            if (curr - prev) < 0.2:
                gevent.sleep(0.2 - (curr - prev))
            else:
                self._logger.info("UVE Process saturated")
                gevent.sleep(0)

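# A minimal, standalone sketch of the state compression described in the
# run_uve_processing docstring above: changed UVEs accumulate per partition in
# a dict of sets, so repeated notifications for the same UVE collapse into one
# entry instead of growing a queue. The helper name and UVE keys below are
# illustrative only, not part of the Controller API.
def notifq_merge(uveq, uves):
    for uv, types in uves.items():
        if types is None:
            # None means "reprocess every struct of this UVE"
            uveq[uv] = None
        elif uveq.get(uv, set()) is not None:
            uveq.setdefault(uv, set()).update(types)

q = {}
notifq_merge(q, {'ObjectVRouter:a6s1': set(['VrouterAgent'])})
notifq_merge(q, {'ObjectVRouter:a6s1': set(['VrouterStatsAgent'])})
notifq_merge(q, {'ObjectVRouter:a6s2': None})
# Two notifications for a6s1 collapse into a single entry with both types
assert q['ObjectVRouter:a6s1'] == set(['VrouterAgent', 'VrouterStatsAgent'])
assert q['ObjectVRouter:a6s2'] is None
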
    def stop_uve_partition(self, part):
        for tk in self.ptab_info[part].keys():
            for rkey in self.ptab_info[part][tk].keys():
                uk = tk + ":" + rkey
                if tk in self.tab_alarms:
                    if uk in self.tab_alarms[tk]:
                        del self.tab_alarms[tk][uk]
                        ustruct = UVEAlarms(name=rkey, deleted=True)
                        alarm_msg = AlarmTrace(data=ustruct, table=tk)
                        self._logger.error('send del alarm for stop: %s' % \
                                (alarm_msg.log()))
                        alarm_msg.send(sandesh=self._sandesh)
                del self.ptab_info[part][tk][rkey]
                self._logger.error("UVE %s deleted in stop" % (uk))
            del self.ptab_info[part][tk]
        del self.ptab_info[part]

    def handle_uve_notif(self, part, uves):
        """
        Call this function when a UVE has changed. This can also
        happen when taking ownership of a partition, or when a
        generator is deleted.
        Args:
            part   : Partition Number
            uves   : dict, where the key is the UVE Name.
                     The value is either a set of UVE structs, or "None",
                     which means that all UVE structs should be processed

        Returns: 
            status of operation (True for success)
        """
        self._logger.debug("Changed part %d UVEs : %s" % (part, str(uves)))
        success = True
        for uv, types in uves.iteritems():
            tab = uv.split(':', 1)[0]
            if tab not in self.tab_perf:
                self.tab_perf[tab] = AGTabStats()

            uve_name = uv.split(':', 1)[1]
            prevt = UTCTimestampUsec()
            filters = {}
            if types:
                filters["cfilt"] = {}
                for typ in types:
                    filters["cfilt"][typ] = set()

            failures, uve_data = self._us.get_uve(uv, True, filters)

            # Do not store alarms in the UVE Cache
            if "UVEAlarms" in uve_data:
                del uve_data["UVEAlarms"]

            if failures:
                success = False
            self.tab_perf[tab].record_get(UTCTimestampUsec() - prevt)
            # Handling Agg UVEs
            if not part in self.ptab_info:
                self._logger.error("Creating UVE table for part %s" %
                                   str(part))
                self.ptab_info[part] = {}

            if not tab in self.ptab_info[part]:
                self.ptab_info[part][tab] = {}

            if uve_name not in self.ptab_info[part][tab]:
                self.ptab_info[part][tab][uve_name] = AGKeyInfo(part)
            prevt = UTCTimestampUsec()
            if not types:
                self.ptab_info[part][tab][uve_name].update(uve_data)
                if len(self.ptab_info[part][tab][uve_name].removed()):
                    self._logger.info("UVE %s removed structs %s" % (uve_name, \
                            self.ptab_info[part][tab][uve_name].removed()))
                if len(self.ptab_info[part][tab][uve_name].changed()):
                    self._logger.debug("UVE %s changed structs %s" % (uve_name, \
                            self.ptab_info[part][tab][uve_name].changed()))
                if len(self.ptab_info[part][tab][uve_name].added()):
                    self._logger.debug("UVE %s added structs %s" % (uve_name, \
                            self.ptab_info[part][tab][uve_name].added()))
            else:
                for typ in types:
                    val = None
                    if typ in uve_data:
                        val = uve_data[typ]
                    self.ptab_info[part][tab][uve_name].update_single(typ, val)
                    if len(self.ptab_info[part][tab][uve_name].removed()):
                        self._logger.info("UVE %s removed structs %s" % (uve_name, \
                                self.ptab_info[part][tab][uve_name].removed()))
                    if len(self.ptab_info[part][tab][uve_name].changed()):
                        self._logger.debug("UVE %s changed structs %s" % (uve_name, \
                                self.ptab_info[part][tab][uve_name].changed()))
                    if len(self.ptab_info[part][tab][uve_name].added()):
                        self._logger.debug("UVE %s added structs %s" % (uve_name, \
                                self.ptab_info[part][tab][uve_name].added()))

            local_uve = self.ptab_info[part][tab][uve_name].values()

            if len(local_uve.keys()) == 0:
                self._logger.info("UVE %s deleted in proc" % (uv))
                del self.ptab_info[part][tab][uve_name]

            self.tab_perf[tab].record_pub(UTCTimestampUsec() - prevt)

            # Withdraw the alarm if the UVE has no non-alarm structs
            if len(local_uve.keys()) == 0:
                if tab in self.tab_alarms:
                    if uv in self.tab_alarms[tab]:
                        del self.tab_alarms[tab][uv]
                        ustruct = UVEAlarms(name=uve_name, deleted=True)
                        alarm_msg = AlarmTrace(data=ustruct, table=tab)
                        self._logger.info('send del alarm: %s' %
                                          (alarm_msg.log()))
                        alarm_msg.send(sandesh=self._sandesh)
                continue

            # Handing Alarms
            if not self.mgrs.has_key(tab):
                continue
            prevt = UTCTimestampUsec()
            results = self.mgrs[tab].map_method("__call__", uv, local_uve)
            self.tab_perf[tab].record_call(UTCTimestampUsec() - prevt)
            new_uve_alarms = {}
            for res in results:
                nm, sev, errs = res
                self._logger.debug("Alarm[%s] %s: %s" % (tab, nm, str(errs)))
                elems = []
                for ae in errs:
                    rule, val = ae
                    rv = AlarmElement(rule, val)
                    elems.append(rv)
                if len(elems):
                    new_uve_alarms[nm] = UVEAlarmInfo(type=nm,
                                                      severity=sev,
                                                      timestamp=0,
                                                      description=elems,
                                                      ack=False)
            del_types = []
            if self.tab_alarms[tab].has_key(uv):
                for nm, uai in self.tab_alarms[tab][uv].iteritems():
                    uai2 = copy.deepcopy(uai)
                    uai2.timestamp = 0
                    # This type was present earlier, but is now gone
                    if not new_uve_alarms.has_key(nm):
                        del_types.append(nm)
                    else:
                        # This type has no new information
                        if pprint.pformat(uai2) == \
                                pprint.pformat(new_uve_alarms[nm]):
                            del new_uve_alarms[nm]
            if len(del_types) != 0  or \
                    len(new_uve_alarms) != 0:
                self._logger.debug("Alarm[%s] Deleted %s" % \
                        (tab, str(del_types)))
                self._logger.debug("Alarm[%s] Updated %s" % \
                        (tab, str(new_uve_alarms)))
                # These alarm types are new or updated
                for nm, uai2 in new_uve_alarms.iteritems():
                    uai = copy.deepcopy(uai2)
                    uai.timestamp = UTCTimestampUsec()
                    if not self.tab_alarms[tab].has_key(uv):
                        self.tab_alarms[tab][uv] = {}
                    self.tab_alarms[tab][uv][nm] = uai
                # These alarm types are now gone
                for dnm in del_types:
                    del self.tab_alarms[tab][uv][dnm]

                ustruct = None
                if len(self.tab_alarms[tab][uv]) == 0:
                    ustruct = UVEAlarms(name=uve_name, deleted=True)
                    del self.tab_alarms[tab][uv]
                else:
                    ustruct = UVEAlarms(
                        name=uve_name,
                        alarms=self.tab_alarms[tab][uv].values(),
                        deleted=False)
                alarm_msg = AlarmTrace(data=ustruct, table=tab)
                self._logger.info('send alarm: %s' % (alarm_msg.log()))
                alarm_msg.send(sandesh=self._sandesh)
        return success

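# A small standalone sketch of the alarm comparison in handle_uve_notif above:
# freshly computed alarms are compared against the stored ones with the
# timestamp zeroed out, so unchanged alarms are not re-announced. SimpleAlarm
# and diff_alarms are illustrative stand-ins, not the real UVEAlarmInfo type
# or the Controller API; the real code compares pprint.pformat of the sandesh
# objects directly.
import copy
import pprint

class SimpleAlarm(object):
    def __init__(self, alarm_type, severity, description, timestamp=0):
        self.type = alarm_type
        self.severity = severity
        self.description = description
        self.timestamp = timestamp

def diff_alarms(current, fresh):
    # current: {type: SimpleAlarm} already announced (non-zero timestamps)
    # fresh:   {type: SimpleAlarm} just computed (timestamp 0)
    deleted = []
    updated = dict(fresh)
    for nm, uai in current.items():
        uai2 = copy.deepcopy(uai)
        uai2.timestamp = 0
        if nm not in fresh:
            deleted.append(nm)                    # alarm type went away
        elif pprint.pformat(uai2.__dict__) == pprint.pformat(fresh[nm].__dict__):
            del updated[nm]                       # identical content: suppress
    return deleted, updated

cur = {'process-status': SimpleAlarm('process-status', 3, ['down'], timestamp=1453)}
new = {'process-status': SimpleAlarm('process-status', 3, ['down'])}
assert diff_alarms(cur, new) == ([], {})          # unchanged alarm, nothing to send
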
    def handle_UVETableInfoReq(self, req):
        if req.partition == -1:
            parts = self.ptab_info.keys()
        else:
            parts = [req.partition]

        self._logger.info("Got UVETableInfoReq : %s" % str(parts))
        np = 1
        for part in parts:
            if part not in self.ptab_info:
                continue
            tables = []
            for tab in self.ptab_info[part].keys():
                uvel = []
                for uk, uv in self.ptab_info[part][tab].iteritems():
                    types = []
                    for tk, tv in uv.values().iteritems():
                        types.append(
                            UVEStructInfo(type=tk, content=json.dumps(tv)))
                    uvel.append(UVEObjectInfo(name=uk, structs=types))
                tables.append(UVETableInfo(table=tab, uves=uvel))
            resp = UVETableInfoResp(partition=part)
            resp.tables = tables

            if np == len(parts):
                mr = False
            else:
                mr = True
            resp.response(req.context(), mr)
            np = np + 1

    def handle_UVETableAlarmReq(self, req):
        status = False
        if req.table == "all":
            parts = self.tab_alarms.keys()
        else:
            parts = [req.table]
        self._logger.info("Got UVETableAlarmReq : %s" % str(parts))
        np = 1
        for pt in parts:
            resp = UVETableAlarmResp(table=pt)
            uves = []
            for uk, uv in self.tab_alarms[pt].iteritems():
                alms = []
                for ak, av in uv.iteritems():
                    alms.append(av)
                uves.append(UVEAlarms(name=uk, alarms=alms))
            resp.uves = uves
            if np == len(parts):
                mr = False
            else:
                mr = True
            resp.response(req.context(), mr)
            np = np + 1

    def handle_UVETablePerfReq(self, req):
        status = False
        if req.table == "all":
            parts = self.tab_perf_prev.keys()
        else:
            parts = [req.table]
        self._logger.info("Got UVETablePerfReq : %s" % str(parts))
        np = 1
        for pt in parts:
            resp = UVETablePerfResp(table=pt)
            resp.call_time = self.tab_perf_prev[pt].call_result()
            resp.get_time = self.tab_perf_prev[pt].get_result()
            resp.pub_time = self.tab_perf_prev[pt].pub_result()
            resp.updates = self.tab_perf_prev[pt].get_n

            if np == len(parts):
                mr = False
            else:
                mr = True
            resp.response(req.context(), mr)
            np = np + 1

    def partition_change(self, partno, enl):
        """
        Call this function when getting or giving up
        ownership of a partition
        Args:
            partno : Partition Number
            enl    : True for acquiring, False for giving up
        Returns: 
            status of operation (True for success)
        """
        status = False
        if enl:
            if partno in self._workers:
                self._logger.info("Dup partition %d" % partno)
            else:
                ph = UveStreamProc(','.join(self._conf.kafka_broker_list()),
                                   partno, "uve-" + str(partno),
                                   self._logger, self.handle_uve_notifq,
                                   self._conf.host_ip(), self._us)
                ph.start()
                self._workers[partno] = ph
                tout = 600
                idx = 0
                while idx < tout:
                    # When this partition starts,
                    # uveq will get created
                    if partno not in self._uveq:
                        gevent.sleep(.1)
                    else:
                        break
                    idx += 1
                if partno in self._uveq:
                    status = True
                else:
                    # TODO: The partition has not started yet,
                    #       but it still might start later.
                    #       We possibly need to exit
                    status = False
                    self._logger.error("Unable to start partition %d" % partno)
        else:
            if partno in self._workers:
                ph = self._workers[partno]
                self._logger.error("Kill part %s" % str(partno))
                ph.kill()
                res, db = ph.get(False)
                self._logger.error("Returned " + str(res))
                del self._workers[partno]
                self._uveqf[partno] = True

                tout = 600
                idx = 0
                while idx < tout:
                    # When this partition stops,
                    # uveq will get destroyed
                    if partno in self._uveq:
                        gevent.sleep(.1)
                    else:
                        break
                    idx += 1
                if partno not in self._uveq:
                    status = True
                else:
                    # TODO: The partition has not stopped yet
                    #       but it still might stop later.
                    #       We possibly need to exit
                    status = False
                    self._logger.error("Unable to stop partition %d" % partno)
            else:
                self._logger.info("No partition %d" % partno)

        return status

    def handle_PartitionOwnershipReq(self, req):
        self._logger.info("Got PartitionOwnershipReq: %s" % str(req))
        status = self.partition_change(req.partition, req.ownership)

        resp = PartitionOwnershipResp()
        resp.status = status
        resp.response(req.context())

    def process_stats(self):
        ''' Go through the UVEKey-Count stats collected across all
            partitions over the previous time period
            and send them out
        '''
        self.tab_perf_prev = copy.deepcopy(self.tab_perf)
        for kt in self.tab_perf.keys():
            #self.tab_perf_prev[kt] = copy.deepcopy(self.tab_perf[kt])
            self.tab_perf[kt].reset()

        s_partitions = set()
        s_keys = set()
        n_updates = 0
        for pk, pc in self._workers.iteritems():
            s_partitions.add(pk)
            din, dout = pc.stats()
            for ktab, tab in dout.iteritems():
                au_keys = []
                for uk, uc in tab.iteritems():
                    s_keys.add(uk)
                    n_updates += uc
                    ukc = UVEKeyInfo()
                    ukc.key = uk
                    ukc.count = uc
                    au_keys.append(ukc)
                au_obj = AlarmgenUpdate(name=self._sandesh._source + ':' + \
                        self._sandesh._node_type + ':' + \
                        self._sandesh._module + ':' + \
                        self._sandesh._instance_id,
                        partition = pk,
                        table = ktab,
                        keys = au_keys,
                        notifs = None)
                self._logger.debug('send key stats: %s' % (au_obj.log()))
                au_obj.send(sandesh=self._sandesh)

            for ktab, tab in din.iteritems():
                au_notifs = []
                for kcoll, coll in tab.iteritems():
                    for kgen, gen in coll.iteritems():
                        for tk, tc in gen.iteritems():
                            tkc = UVETypeInfo()
                            tkc.type = tk
                            tkc.count = tc
                            tkc.generator = kgen
                            tkc.collector = kcoll
                            au_notifs.append(tkc)
                au_obj = AlarmgenUpdate(name=self._sandesh._source + ':' + \
                        self._sandesh._node_type + ':' + \
                        self._sandesh._module + ':' + \
                        self._sandesh._instance_id,
                        partition = pk,
                        table = ktab,
                        keys = None,
                        notifs = au_notifs)
                self._logger.debug('send notif stats: %s' % (au_obj.log()))
                au_obj.send(sandesh=self._sandesh)

        au = AlarmgenStatus()
        au.name = self._hostname
        au.counters = []
        au.alarmgens = []
        ags = AlarmgenStats()
        ags.instance = self._instance_id
        ags.partitions = len(s_partitions)
        ags.keys = len(s_keys)
        ags.updates = n_updates
        au.counters.append(ags)

        agname = self._sandesh._source + ':' + \
                        self._sandesh._node_type + ':' + \
                        self._sandesh._module + ':' + \
                        self._sandesh._instance_id
        au.alarmgens.append(agname)

        atrace = AlarmgenStatusTrace(data=au)
        self._logger.debug('send alarmgen status : %s' % (atrace.log()))
        atrace.send(sandesh=self._sandesh)

    def handle_PartitionStatusReq(self, req):
        ''' Return the entire contents of the UVE DB for the 
            requested partitions
        '''
        if req.partition == -1:
            parts = self._workers.keys()
        else:
            parts = [req.partition]

        self._logger.info("Got PartitionStatusReq: %s" % str(parts))
        np = 1
        for pt in parts:
            resp = PartitionStatusResp()
            resp.partition = pt
            if self._workers.has_key(pt):
                resp.enabled = True
                resp.offset = self._workers[pt]._partoffset
                resp.uves = []
                for kcoll, coll in self._workers[pt].contents().iteritems():
                    uci = UVECollInfo()
                    uci.collector = kcoll
                    uci.uves = []
                    for kgen, gen in coll.iteritems():
                        ugi = UVEGenInfo()
                        ugi.generator = kgen
                        ugi.uves = []
                        for tabk, tabc in gen.iteritems():
                            for uk, uc in tabc.iteritems():
                                ukc = UVEKeyInfo()
                                ukc.key = tabk + ":" + uk
                                ukc.types = []
                                for tk, tc in uc.iteritems():
                                    uvtc = UVETypeCount()
                                    uvtc.type = tk
                                    uvtc.count = tc["c"]
                                    uvtc.agg_uuid = str(tc["u"])
                                    ukc.types.append(uvtc)
                                ugi.uves.append(ukc)
                        uci.uves.append(ugi)
                    resp.uves.append(uci)
            else:
                resp.enabled = False
            if np == len(parts):
                mr = False
            else:
                mr = True
            resp.response(req.context(), mr)
            np = np + 1

    def disc_cb_coll(self, clist):
        '''
        An Analytics node may be brought up or down at any time. For UVE
        aggregation, alarmgen needs to know the list of all Analytics nodes
        (redis-uves). Periodically poll the Collector list [in lieu of
        redis-uve nodes] from the discovery service.
        '''
        self._logger.error("Discovery Collector callback : %s" % str(clist))
        newlist = []
        for elem in clist:
            ipaddr = elem["ip-address"]
            cpid = 0
            if "pid" in elem:
                cpid = int(elem["pid"])
            newlist.append((ipaddr, self._conf.redis_server_port(), cpid))
        self._us.update_redis_uve_list(newlist)

    def disc_cb_ag(self, alist):
        '''
        An Analytics node may be brought up or down at any time. For
        partitioning, alarmgen needs to know the list of all Analytics nodes
        (alarmgens). Periodically poll the alarmgen list from the discovery
        service.
        '''
        self._logger.error("Discovery AG callback : %s" % str(alist))
        newlist = []
        for elem in alist:
            ipaddr = elem["ip-address"]
            inst = elem["port"]
            newlist.append(ipaddr + ":" + inst)

        # We should always include ourselves in the list of members
        newset = set(newlist)
        newset.add(self._libpart_name)
        newlist = list(newset)
        if not self._libpart:
            self._libpart = self.start_libpart(newlist)
        else:
            self._libpart.update_cluster_list(newlist)

    def run(self):
        alarmgen_cpu_info = CpuInfoData()
        while True:
            before = time.time()
            mod_cpu_info = ModuleCpuInfo()
            mod_cpu_info.module_id = self._moduleid
            mod_cpu_info.instance_id = self._instance_id
            mod_cpu_info.cpu_info = alarmgen_cpu_info.get_cpu_info(
                system=False)
            mod_cpu_state = ModuleCpuState()
            mod_cpu_state.name = self._hostname

            mod_cpu_state.module_cpu_info = [mod_cpu_info]

            alarmgen_cpu_state_trace = ModuleCpuStateTrace(data=mod_cpu_state)
            alarmgen_cpu_state_trace.send(sandesh=self._sandesh)

            aly_cpu_state = AnalyticsCpuState()
            aly_cpu_state.name = self._hostname

            aly_cpu_info = ProcessCpuInfo()
            aly_cpu_info.module_id = self._moduleid
            aly_cpu_info.inst_id = self._instance_id
            aly_cpu_info.cpu_share = mod_cpu_info.cpu_info.cpu_share
            aly_cpu_info.mem_virt = mod_cpu_info.cpu_info.meminfo.virt
            aly_cpu_info.mem_res = mod_cpu_info.cpu_info.meminfo.res
            aly_cpu_state.cpu_info = [aly_cpu_info]

            aly_cpu_state_trace = AnalyticsCpuStateTrace(data=aly_cpu_state)
            aly_cpu_state_trace.send(sandesh=self._sandesh)

            # Send out the UVEKey-Count stats for this time period
            self.process_stats()

            duration = time.time() - before
            if duration < 60:
                gevent.sleep(60 - duration)
            else:
                self._logger.error("Periodic collection took %s sec" %
                                   duration)
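
# A minimal sketch of the pacing used in run() above: measure how long one
# collection round took and sleep only for the remainder of the 60-second
# period, logging when a round overruns. gevent is assumed to be available,
# as in the surrounding code; do_collection is a placeholder for the real
# per-period work.
import time
import gevent

def periodic(do_collection, period=60, rounds=1):
    for _ in range(rounds):
        before = time.time()
        do_collection()
        duration = time.time() - before
        if duration < period:
            gevent.sleep(period - duration)
        else:
            print("Periodic collection took %s sec" % duration)

# Example: periodic(lambda: None, period=1, rounds=2) sleeps roughly 1s per round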
Example No. 11
    def __init__(self, conf, test_logger=None):
        self._conf = conf
        module = Module.ALARM_GENERATOR
        self._moduleid = ModuleNames[module]
        node_type = Module2NodeType[module]
        self._node_type_name = NodeTypeNames[node_type]
        self._hostname = socket.gethostname()
        self._instance_id = self._conf.worker_id()
        is_collector = True
        if test_logger is not None:
            is_collector = False
        self._sandesh = Sandesh()
        # Reset the sandesh send rate limit value
        if self._conf.sandesh_send_rate_limit() is not None:
            SandeshSystem.set_sandesh_send_rate_limit(self._conf.sandesh_send_rate_limit())
        self._sandesh.init_generator(
            self._moduleid,
            self._hostname,
            self._node_type_name,
            self._instance_id,
            self._conf.collectors(),
            self._node_type_name,
            self._conf.http_port(),
            ["opserver.sandesh", "sandesh"],
            host_ip=self._conf.host_ip(),
            connect_to_collector=is_collector,
        )
        if test_logger is not None:
            self._logger = test_logger
        else:
            self._sandesh.set_logging_params(
                enable_local_log=self._conf.log_local(),
                category=self._conf.log_category(),
                level=self._conf.log_level(),
                file=self._conf.log_file(),
                enable_syslog=self._conf.use_syslog(),
                syslog_facility=self._conf.syslog_facility(),
            )
            self._logger = self._sandesh._logger
        # Trace buffer list
        self.trace_buf = [{"name": "DiscoveryMsg", "size": 1000}]
        # Create trace buffers
        for buf in self.trace_buf:
            self._sandesh.trace_buffer_create(name=buf["name"], size=buf["size"])

        tables = ["ObjectCollectorInfo", "ObjectDatabaseInfo", "ObjectVRouter", "ObjectBgpRouter", "ObjectConfigNode"]
        self.mgrs = {}
        self.tab_alarms = {}
        self.ptab_info = {}
        self.tab_perf = {}
        self.tab_perf_prev = {}
        for table in tables:
            self.mgrs[table] = hook.HookManager(
                namespace="contrail.analytics.alarms",
                name=table,
                invoke_on_load=True,
                invoke_args=(),
                on_load_failure_callback=self.fail_cb,
            )

            for extn in self.mgrs[table][table]:
                self._logger.info(
                    "Loaded extensions for %s: %s,%s doc %s"
                    % (table, extn.name, extn.entry_point_target, extn.obj.__doc__)
                )

            self.tab_alarms[table] = {}
            self.tab_perf[table] = AGTabStats()

        ConnectionState.init(
            self._sandesh,
            self._hostname,
            self._moduleid,
            self._instance_id,
            staticmethod(ConnectionState.get_process_state_cb),
            NodeStatusUVE,
            NodeStatus,
        )

        self._us = UVEServer(None, self._logger, self._conf.redis_password())

        self._workers = {}
        self._uvestats = {}
        self._uveq = {}
        self._uveqf = {}

        self.disc = None
        self._libpart_name = self._hostname + ":" + self._instance_id
        self._libpart = None
        self._partset = set()
        if self._conf.discovery()["server"]:
            data = {"ip-address": self._hostname, "port": self._instance_id}
            self.disc = client.DiscoveryClient(
                self._conf.discovery()["server"], self._conf.discovery()["port"], ModuleNames[Module.ALARM_GENERATOR]
            )
            self._logger.info("Disc Publish to %s : %s" % (str(self._conf.discovery()), str(data)))
            self.disc.publish(ALARM_GENERATOR_SERVICE_NAME, data)
        else:
            # If there is no discovery service, use fixed redis_uve list
            redis_uve_list = []
            try:
                for redis_uve in self._conf.redis_uve_list():
                    redis_ip_port = redis_uve.split(":")
                    redis_elem = (redis_ip_port[0], int(redis_ip_port[1]), 0)
                    redis_uve_list.append(redis_elem)
            except Exception as e:
                self._logger.error("Failed to parse redis_uve_list: %s" % e)
            else:
                self._us.update_redis_uve_list(redis_uve_list)

            # If there is no discovery service, use fixed alarmgen list
            self._libpart = self.start_libpart(self._conf.alarmgen_list())

        PartitionOwnershipReq.handle_request = self.handle_PartitionOwnershipReq
        PartitionStatusReq.handle_request = self.handle_PartitionStatusReq
        UVETableAlarmReq.handle_request = self.handle_UVETableAlarmReq
        UVETableInfoReq.handle_request = self.handle_UVETableInfoReq
        UVETablePerfReq.handle_request = self.handle_UVETablePerfReq
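
# A minimal sketch of the fixed redis_uve_list parsing done in __init__ above
# when no discovery service is configured: each "host:port" entry becomes a
# (host, port, pid) tuple with pid fixed to 0. The addresses below are
# illustrative only.
def parse_redis_uve_list(conf_list):
    redis_uve_list = []
    for redis_uve in conf_list:
        redis_ip_port = redis_uve.split(":")
        redis_uve_list.append((redis_ip_port[0], int(redis_ip_port[1]), 0))
    return redis_uve_list

assert parse_redis_uve_list(["127.0.0.1:6379", "10.84.13.45:6380"]) == \
    [("127.0.0.1", 6379, 0), ("10.84.13.45", 6380, 0)]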
Example No. 12
class Controller(object):
    @staticmethod
    def token(sandesh, timestamp):
        token = {"host_ip": sandesh.host_ip(), "http_port": sandesh._http_server.get_port(), "timestamp": timestamp}
        return base64.b64encode(json.dumps(token))

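# A minimal sketch of the token built by Controller.token above: a small JSON
# object (host_ip, http_port, timestamp) that is base64-encoded. The values
# below are made up for illustration; the explicit encode() keeps this sketch
# runnable on Python 3 as well, whereas the surrounding code is Python 2.
import base64
import json

token = {"host_ip": "10.84.13.45", "http_port": 8089,
         "timestamp": 1453189000000000}
encoded = base64.b64encode(json.dumps(token).encode("utf-8"))
assert json.loads(base64.b64decode(encoded))["http_port"] == 8089
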
    @staticmethod
    def alarm_encode(alarms):
        res = {}
        res["UVEAlarms"] = {}
        res["UVEAlarms"]["alarms"] = []
        for k, elem in alarms.iteritems():
            elem_dict = {}
            elem_dict["type"] = elem.type
            elem_dict["ack"] = elem.ack
            elem_dict["timestamp"] = elem.timestamp
            elem_dict["token"] = elem.token
            elem_dict["severity"] = elem.severity
            elem_dict["description"] = []
            for desc in elem.description:
                desc_dict = {}
                desc_dict["value"] = desc.value
                desc_dict["rule"] = desc.rule
                elem_dict["description"].append(desc_dict)
            res["UVEAlarms"]["alarms"].append(elem_dict)
        return res

    def fail_cb(self, manager, entrypoint, exception):
        self._sandesh._logger.info("Load failed for %s with exception %s" % (str(entrypoint), str(exception)))

    def __init__(self, conf, test_logger=None):
        self._conf = conf
        module = Module.ALARM_GENERATOR
        self._moduleid = ModuleNames[module]
        node_type = Module2NodeType[module]
        self._node_type_name = NodeTypeNames[node_type]
        self._hostname = socket.gethostname()
        self._instance_id = self._conf.worker_id()
        is_collector = True
        if test_logger is not None:
            is_collector = False
        self._sandesh = Sandesh()
        # Reset the sandesh send rate limit value
        if self._conf.sandesh_send_rate_limit() is not None:
            SandeshSystem.set_sandesh_send_rate_limit(self._conf.sandesh_send_rate_limit())
        self._sandesh.init_generator(
            self._moduleid,
            self._hostname,
            self._node_type_name,
            self._instance_id,
            self._conf.collectors(),
            self._node_type_name,
            self._conf.http_port(),
            ["opserver.sandesh", "sandesh"],
            host_ip=self._conf.host_ip(),
            connect_to_collector=is_collector,
        )
        if test_logger is not None:
            self._logger = test_logger
        else:
            self._sandesh.set_logging_params(
                enable_local_log=self._conf.log_local(),
                category=self._conf.log_category(),
                level=self._conf.log_level(),
                file=self._conf.log_file(),
                enable_syslog=self._conf.use_syslog(),
                syslog_facility=self._conf.syslog_facility(),
            )
            self._logger = self._sandesh._logger
        # Trace buffer list
        self.trace_buf = [{"name": "DiscoveryMsg", "size": 1000}]
        # Create trace buffers
        for buf in self.trace_buf:
            self._sandesh.trace_buffer_create(name=buf["name"], size=buf["size"])

        tables = ["ObjectCollectorInfo", "ObjectDatabaseInfo", "ObjectVRouter", "ObjectBgpRouter", "ObjectConfigNode"]
        self.mgrs = {}
        self.tab_alarms = {}
        self.ptab_info = {}
        self.tab_perf = {}
        self.tab_perf_prev = {}
        for table in tables:
            self.mgrs[table] = hook.HookManager(
                namespace="contrail.analytics.alarms",
                name=table,
                invoke_on_load=True,
                invoke_args=(),
                on_load_failure_callback=self.fail_cb,
            )

            for extn in self.mgrs[table][table]:
                self._logger.info(
                    "Loaded extensions for %s: %s,%s doc %s"
                    % (table, extn.name, extn.entry_point_target, extn.obj.__doc__)
                )

            self.tab_alarms[table] = {}
            self.tab_perf[table] = AGTabStats()

        ConnectionState.init(
            self._sandesh,
            self._hostname,
            self._moduleid,
            self._instance_id,
            staticmethod(ConnectionState.get_process_state_cb),
            NodeStatusUVE,
            NodeStatus,
        )

        self._us = UVEServer(None, self._logger, self._conf.redis_password())

        self._workers = {}
        self._uvestats = {}
        self._uveq = {}
        self._uveqf = {}

        self.disc = None
        self._libpart_name = self._hostname + ":" + self._instance_id
        self._libpart = None
        self._partset = set()
        if self._conf.discovery()["server"]:
            data = {"ip-address": self._hostname, "port": self._instance_id}
            self.disc = client.DiscoveryClient(
                self._conf.discovery()["server"], self._conf.discovery()["port"], ModuleNames[Module.ALARM_GENERATOR]
            )
            self._logger.info("Disc Publish to %s : %s" % (str(self._conf.discovery()), str(data)))
            self.disc.publish(ALARM_GENERATOR_SERVICE_NAME, data)
        else:
            # If there is no discovery service, use fixed redis_uve list
            redis_uve_list = []
            try:
                for redis_uve in self._conf.redis_uve_list():
                    redis_ip_port = redis_uve.split(":")
                    redis_elem = (redis_ip_port[0], int(redis_ip_port[1]), 0)
                    redis_uve_list.append(redis_elem)
            except Exception as e:
                self._logger.error("Failed to parse redis_uve_list: %s" % e)
            else:
                self._us.update_redis_uve_list(redis_uve_list)

            # If there is no discovery service, use fixed alarmgen list
            self._libpart = self.start_libpart(self._conf.alarmgen_list())

        PartitionOwnershipReq.handle_request = self.handle_PartitionOwnershipReq
        PartitionStatusReq.handle_request = self.handle_PartitionStatusReq
        UVETableAlarmReq.handle_request = self.handle_UVETableAlarmReq
        UVETableInfoReq.handle_request = self.handle_UVETableInfoReq
        UVETablePerfReq.handle_request = self.handle_UVETablePerfReq

    def libpart_cb(self, part_list):

        agpi = AlarmgenPartionInfo()
        agpi.instance = self._instance_id
        agpi.partitions = part_list

        agp = AlarmgenPartition()
        agp.name = self._hostname
        agp.inst_parts = [agpi]

        agp_trace = AlarmgenPartitionTrace(data=agp, sandesh=self._sandesh)
        agp_trace.send(sandesh=self._sandesh)

        newset = set(part_list)
        oldset = self._partset
        self._partset = newset

        self._logger.error("Partition List : new %s old %s" % (str(newset), str(oldset)))

        for addpart in newset - oldset:
            self._logger.error("Partition Add : %s" % addpart)
            self.partition_change(addpart, True)

        for delpart in oldset - newset:
            self._logger.error("Partition Del : %s" % delpart)
            self.partition_change(delpart, False)

        self._logger.error("Partition List done : new %s old %s" % (str(newset), str(oldset)))

    def start_libpart(self, ag_list):
        if not self._conf.zk_list():
            self._logger.error("Could not import libpartition: No zookeeper")
            return None
        if not ag_list:
            self._logger.error("Could not import libpartition: No alarmgen list")
            return None
        try:
            self._logger.error("Starting PC")
            agpi = AlarmgenPartionInfo()
            agpi.instance = self._instance_id
            agpi.partitions = []

            agp = AlarmgenPartition()
            agp.name = self._hostname
            agp.inst_parts = [agpi]

            agp_trace = AlarmgenPartitionTrace(data=agp, sandesh=self._sandesh)
            agp_trace.send(sandesh=self._sandesh)

            pc = PartitionClient(
                "alarmgen",
                self._libpart_name,
                ag_list,
                self._conf.partitions(),
                self.libpart_cb,
                ",".join(self._conf.zk_list()),
            )
            self._logger.error("Started PC")
            return pc
        except Exception as e:
            self._logger.error("Could not import libpartition: %s" % str(e))
            return None

    def handle_uve_notifq(self, part, uves):
        """
        uves : 
          This is a dict of UVEs that have changed, as per the following scheme:
          <UVE-Key> : None               # Any of the types may have changed
                                         # Used during stop_partition and GenDelete
          <UVE-Key> : { <Struct>: {} }   # The given struct may have changed
          <UVE-Key> : { <Struct>: None } # The given struct may have gone
          Our treatment of the 2nd and 3rd case above is the same
        """
        if part not in self._uveq:
            self._uveq[part] = {}
            self._logger.error("Created uveQ for part %s" % str(part))
        for uv, types in uves.iteritems():
            if types is None:
                self._uveq[part][uv] = None
            else:
                if uv in self._uveq[part]:
                    if self._uveq[part][uv] is not None:
                        for kk in types.keys():
                            self._uveq[part][uv][kk] = {}
                else:
                    self._uveq[part][uv] = {}
                    for kk in types.keys():
                        self._uveq[part][uv][kk] = {}

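# A minimal sketch of the notification shapes documented in handle_uve_notifq
# above and how they merge into the per-partition queue dict: None means the
# whole UVE changed, while {<struct>: {}} and {<struct>: None} (changed vs.
# removed struct) are treated identically. The UVE and struct names are
# illustrative only.
def merge_notif(uveq_part, uves):
    for uv, types in uves.items():
        if types is None:
            uveq_part[uv] = None                  # reprocess the whole UVE
        elif uveq_part.get(uv, {}) is not None:
            entry = uveq_part.setdefault(uv, {})
            for kk in types.keys():
                entry[kk] = {}                    # changed and removed structs look alike
        # if a whole-UVE (None) entry is already queued, nothing more to record

q = {}
merge_notif(q, {"ObjectVRouter:a6s1": {"VrouterAgent": {}}})    # struct changed
merge_notif(q, {"ObjectVRouter:a6s1": {"VrouterAgent": None}})  # struct removed
merge_notif(q, {"ObjectVRouter:a6s2": None})                    # whole UVE changed
assert q == {"ObjectVRouter:a6s1": {"VrouterAgent": {}},
             "ObjectVRouter:a6s2": None}
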
    def handle_resource_check(self, part, current_inst, msgs):
        """
        This function compares the set of synced redis instances
        against the set now being reported by UVEServer
       
        It returns:
        - The updated set of redis instances
        - A set of collectors to be removed
        - A dict of the collectors to be added, along with their contents
        """
        us_redis_inst = self._us.redis_instances()
        disc_instances = copy.deepcopy(us_redis_inst)

        r_added = disc_instances - current_inst
        r_deleted = current_inst - disc_instances

        coll_delete = set()
        for r_inst in r_deleted:
            ipaddr = r_inst[0]
            port = r_inst[1]
            coll_delete.add(ipaddr + ":" + str(port))

        chg_res = {}
        for r_inst in r_added:
            coll, res = self._us.get_part(part, r_inst)
            chg_res[coll] = res

        return disc_instances, coll_delete, chg_res

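# A minimal sketch of the set arithmetic in handle_resource_check above: redis
# instances are tracked as (ip, port) tuples, and comparing the synced set
# against the freshly discovered set yields the instances to start syncing and
# the collectors to drop. The addresses below are illustrative only.
current_inst = set([("10.84.13.45", 6379), ("10.84.13.46", 6379)])
discovered = set([("10.84.13.46", 6379), ("10.84.13.47", 6379)])

r_added = discovered - current_inst          # new redis instances to sync
r_deleted = current_inst - discovered        # stale redis instances to drop

coll_delete = set("%s:%d" % (ip, port) for (ip, port) in r_deleted)
assert r_added == set([("10.84.13.47", 6379)])
assert coll_delete == set(["10.84.13.45:6379"])
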
    @staticmethod
    def send_agg_uve(redish, inst, part, acq_time, rows):
        """ 
        This function writes aggregated UVEs to redis

        Each row has a UVE key, one of its struct type names, and that struct's value.
        If the type is None, the UVE is being removed.
        If the value is None, that struct of the UVE is being removed.

        The key and typename information is also published on a redis channel
        """
        old_acq_time = redish.hget("AGPARTS:%s" % inst, part)
        if old_acq_time is None:
            redish.hset("AGPARTS:%s" % inst, part, acq_time)
        else:
            # Is there stale information for this partition?
            if int(old_acq_time) != acq_time:
                ppe2 = redish.pipeline()
                ppe2.hdel("AGPARTS:%s" % inst, part)
                ppe2.smembers("AGPARTKEYS:%s:%d" % (inst, part))
                pperes2 = ppe2.execute()
                ppe3 = redish.pipeline()
                # Remove all contents for this AG-Partition
                for elem in pperes2[-1]:
                    ppe3.delete("AGPARTVALUES:%s:%d:%s" % (inst, part, elem))
                ppe3.delete("AGPARTKEYS:%s:%d" % (inst, part))
                ppe3.hset("AGPARTS:%s" % inst, part, acq_time)
                pperes3 = ppe3.execute()

        pub_list = []
        ppe = redish.pipeline()
        check_keys = set()
        for row in rows:
            vjson = json.dumps(row.val)
            typ = row.typ
            key = row.key
            pub_list.append({"key": key, "type": typ})
            if typ is None:
                # The entire contents of the UVE should be removed
                ppe.srem("AGPARTKEYS:%s:%d" % (inst, part), key)
                ppe.delete("AGPARTVALUES:%s:%d:%s" % (inst, part, key))
            else:
                if row.val is None:
                    # Remove the given struct from the UVE
                    ppe.hdel("AGPARTVALUES:%s:%d:%s" % (inst, part, key), typ)
                    check_keys.add(key)
                else:
                    ppe.sadd("AGPARTKEYS:%s:%d" % (inst, part), key)
                    ppe.hset("AGPARTVALUES:%s:%d:%s" % (inst, part, key), typ, vjson)
        ppe.execute()

        # Find the keys that have no content (all structs have been deleted)
        ppe4 = redish.pipeline()
        check_keys_list = list(check_keys)
        for kk in check_keys_list:
            ppe4.exists("AGPARTVALUES:%s:%d:%s" % (inst, part, kk))
        pperes4 = ppe4.execute()

        # From the index, removes keys for which there are now no contents
        ppe5 = redish.pipeline()
        idx = 0
        for res in pperes4:
            if not res:
                ppe5.srem("AGPARTKEYS:%s:%d" % (inst, part), check_keys_list[idx])
                # TODO: alarmgen should have already figured out if all structs of
                #       the UVE are gone, and should have sent a UVE delete
                #       We should not need to figure this out again
                assert ()
            idx += 1
        ppe5.execute()

        redish.publish("AGPARTPUB:%s:%d" % (inst, part), json.dumps(pub_list))

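# A minimal sketch of the redis key layout written by send_agg_uve above,
# using a plain dict instead of a live redis connection. Only the key naming
# scheme is mirrored; the instance, partition and UVE values are illustrative.
inst, part, key, typ = "0", 5, "ObjectVRouter:a6s1", "VrouterAgent"

agg_layout = {
    # partition -> acquire time for this alarmgen instance
    "AGPARTS:%s" % inst: {part: 1453189000},
    # set of UVE keys owned by this instance/partition
    "AGPARTKEYS:%s:%d" % (inst, part): set([key]),
    # per-UVE hash: struct type name -> JSON-encoded struct value
    "AGPARTVALUES:%s:%d:%s" % (inst, part, key): {typ: '{"in_tpkts": 4}'},
}
# key/type changes are also announced on this pub/sub channel
assert "AGPARTPUB:%s:%d" % (inst, part) == "AGPARTPUB:0:5"
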
    def run_uve_processing(self):
        """
        This function runs in its own gevent, and provides state compression
        for UVEs.
        Kafka worker (PartitionHandler) threads detect which UVEs have changed
        and accumulate them onto a set. When this gevent runs, it processes
        all UVEs of the set. Even if this gevent cannot run for a while, the
        set should not grow in an unbounded manner (like a queue can)
        """

        if self.disc:
            max_out_rows = 20
        else:
            max_out_rows = 2
        lredis = None
        while True:
            for part in self._uveqf.keys():
                self._logger.error("Stop UVE processing for %d" % part)
                self.stop_uve_partition(part)
                del self._uveqf[part]
                if part in self._uveq:
                    del self._uveq[part]
            prev = time.time()
            gevs = {}
            pendingset = {}
            for part in self._uveq.keys():
                if not len(self._uveq[part]):
                    continue
                self._logger.info("UVE Process for %d" % part)

                # Allow the partition handlers to queue new UVEs without
                # interfering with the work of processing the current UVEs
                pendingset[part] = copy.deepcopy(self._uveq[part])
                self._uveq[part] = {}

                gevs[part] = gevent.spawn(self.handle_uve_notif, part, pendingset[part])
            if len(gevs):
                gevent.joinall(gevs.values())
                for part in gevs.keys():
                    # If UVE processing failed, requeue the working set
                    outp = gevs[part].get()
                    if outp is None:
                        self._logger.error("UVE Process failed for %d" % part)
                        self.handle_uve_notifq(part, pendingset[part])
                    else:
                        try:
                            if lredis is None:
                                lredis = redis.StrictRedis(
                                    host="127.0.0.1",
                                    port=self._conf.redis_server_port(),
                                    password=self._conf.redis_password(),
                                    db=2,
                                )

                            if len(outp):
                                rows = []
                                for ku, vu in outp.iteritems():
                                    if vu is None:
                                        # This message has no type!
                                        # It's used to indicate a delete of the entire UVE
                                        rows.append(OutputRow(key=ku, typ=None, val=None))
                                        if len(rows) >= max_out_rows:
                                            Controller.send_agg_uve(
                                                lredis, self._instance_id, part, self._workers[part].acq_time(), rows
                                            )
                                            rows[:] = []
                                        continue
                                    for kt, vt in vu.iteritems():
                                        rows.append(OutputRow(key=ku, typ=kt, val=vt))
                                        if len(rows) >= max_out_rows:
                                            Controller.send_agg_uve(
                                                lredis, self._instance_id, part, self._workers[part].acq_time(), rows
                                            )
                                            rows[:] = []
                                # Flush all remaining rows
                                if len(rows):
                                    Controller.send_agg_uve(
                                        lredis, self._instance_id, part, self._workers[part].acq_time(), rows
                                    )
                                    rows[:] = []

                        except Exception as ex:
                            template = "Exception {0} in uve proc. Arguments:\n{1!r}"
                            messag = template.format(type(ex).__name__, ex.args)
                            self._logger.error("%s : traceback %s" % (messag, traceback.format_exc()))
                            lredis = None
                            # We need to requeue
                            self.handle_uve_notifq(part, pendingset[part])
                            gevent.sleep(1)

            curr = time.time()
            if (curr - prev) < 0.5:
                gevent.sleep(0.5 - (curr - prev))
            else:
                self._logger.info("UVE Process saturated")
                gevent.sleep(0)
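
    # A minimal sketch (not part of the original class) of the row-batching
    # pattern used in run_uve_processing above: OutputRow entries are flushed
    # whenever max_out_rows is reached, with a final flush for the remainder.
    # The helper name and the generic 'sender' callback are illustrative
    # assumptions; the real code calls Controller.send_agg_uve directly.
    @staticmethod
    def _sketch_batch_rows(rows, max_out_rows, sender):
        batch = []
        for row in rows:
            batch.append(row)
            if len(batch) >= max_out_rows:
                sender(batch)
                batch = []
        # Flush all remaining rows
        if len(batch):
            sender(batch)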

    def stop_uve_partition(self, part):
        for tk in self.ptab_info[part].keys():
            for rkey in self.ptab_info[part][tk].keys():
                uk = tk + ":" + rkey
                if tk in self.tab_alarms:
                    if uk in self.tab_alarms[tk]:
                        del self.tab_alarms[tk][uk]
                        ustruct = UVEAlarms(name=rkey, deleted=True)
                        alarm_msg = AlarmTrace(data=ustruct, table=tk, sandesh=self._sandesh)
                        self._logger.error("send del alarm for stop: %s" % (alarm_msg.log()))
                        alarm_msg.send(sandesh=self._sandesh)
                del self.ptab_info[part][tk][rkey]
                self._logger.error("UVE %s deleted in stop" % (uk))
            del self.ptab_info[part][tk]
        del self.ptab_info[part]

    def handle_uve_notif(self, part, uves):
        """
        Call this function when a UVE has changed. This can also
        happen when taking ownership of a partition, or when a
        generator is deleted.
        Args:
            part   : Partition Number
            uves   : dict, where the key is the UVE Name.
                     The value is either a dict of UVE structs, or "None",
                     which means that all UVE structs should be processed.

        Returns:
            dict of changed UVE structs per UVE on success, or None on failure
        """
        self._logger.debug("Changed part %d UVEs : %s" % (part, str(uves)))
        success = True
        output = {}
        for uv, types in uves.iteritems():
            tab = uv.split(":", 1)[0]
            if tab not in self.tab_perf:
                self.tab_perf[tab] = AGTabStats()

            if part in self._uvestats:
                # Record stats on UVE Keys being processed
                if not tab in self._uvestats[part]:
                    self._uvestats[part][tab] = {}
                if uv in self._uvestats[part][tab]:
                    self._uvestats[part][tab][uv] += 1
                else:
                    self._uvestats[part][tab][uv] = 1

            uve_name = uv.split(":", 1)[1]
            prevt = UTCTimestampUsec()
            filters = {}
            if types:
                filters["cfilt"] = {}
                for typ in types.keys():
                    filters["cfilt"][typ] = set()

            failures, uve_data = self._us.get_uve(uv, True, filters)

            if failures:
                success = False
            self.tab_perf[tab].record_get(UTCTimestampUsec() - prevt)
            # Handling Agg UVEs
            if not part in self.ptab_info:
                self._logger.error("Creating UVE table for part %s" % str(part))
                self.ptab_info[part] = {}

            if not tab in self.ptab_info[part]:
                self.ptab_info[part][tab] = {}

            if uve_name not in self.ptab_info[part][tab]:
                self.ptab_info[part][tab][uve_name] = AGKeyInfo(part)
            prevt = UTCTimestampUsec()
            output[uv] = {}
            touched = False
            if not types:
                self.ptab_info[part][tab][uve_name].update(uve_data)
                if len(self.ptab_info[part][tab][uve_name].removed()):
                    touched = True
                    self._logger.info(
                        "UVE %s removed structs %s" % (uve_name, self.ptab_info[part][tab][uve_name].removed())
                    )
                    for rems in self.ptab_info[part][tab][uve_name].removed():
                        output[uv][rems] = None
                if len(self.ptab_info[part][tab][uve_name].changed()):
                    touched = True
                    self._logger.debug(
                        "UVE %s changed structs %s" % (uve_name, self.ptab_info[part][tab][uve_name].changed())
                    )
                    for chgs in self.ptab_info[part][tab][uve_name].changed():
                        output[uv][chgs] = self.ptab_info[part][tab][uve_name].values()[chgs]
                if len(self.ptab_info[part][tab][uve_name].added()):
                    touched = True
                    self._logger.debug(
                        "UVE %s added structs %s" % (uve_name, self.ptab_info[part][tab][uve_name].added())
                    )
                    for adds in self.ptab_info[part][tab][uve_name].added():
                        output[uv][adds] = self.ptab_info[part][tab][uve_name].values()[adds]
            else:
                for typ in types:
                    val = None
                    if typ in uve_data:
                        val = uve_data[typ]
                    self.ptab_info[part][tab][uve_name].update_single(typ, val)
                    if len(self.ptab_info[part][tab][uve_name].removed()):
                        touched = True
                        self._logger.info(
                            "UVE %s removed structs %s" % (uve_name, self.ptab_info[part][tab][uve_name].removed())
                        )
                        for rems in self.ptab_info[part][tab][uve_name].removed():
                            output[uv][rems] = None
                    if len(self.ptab_info[part][tab][uve_name].changed()):
                        touched = True
                        self._logger.debug(
                            "UVE %s changed structs %s" % (uve_name, self.ptab_info[part][tab][uve_name].changed())
                        )
                        for chgs in self.ptab_info[part][tab][uve_name].changed():
                            output[uv][chgs] = self.ptab_info[part][tab][uve_name].values()[chgs]
                    if len(self.ptab_info[part][tab][uve_name].added()):
                        touched = True
                        self._logger.debug(
                            "UVE %s added structs %s" % (uve_name, self.ptab_info[part][tab][uve_name].added())
                        )
                        for adds in self.ptab_info[part][tab][uve_name].added():
                            output[uv][adds] = self.ptab_info[part][tab][uve_name].values()[adds]
            if not touched:
                del output[uv]
            local_uve = self.ptab_info[part][tab][uve_name].values()

            self.tab_perf[tab].record_pub(UTCTimestampUsec() - prevt)

            if len(local_uve.keys()) == 0:
                self._logger.info("UVE %s deleted in proc" % (uv))
                del self.ptab_info[part][tab][uve_name]
                output[uv] = None

                # Both alarm and non-alarm contents are gone.
                # We do not need to do alarm evaluation
                continue

            # Withdraw the alarm if the UVE has no non-alarm structs
            if len(local_uve.keys()) == 1 and "UVEAlarms" in local_uve:
                if tab in self.tab_alarms:
                    if uv in self.tab_alarms[tab]:
                        del self.tab_alarms[tab][uv]
                        ustruct = UVEAlarms(name=uve_name, deleted=True)
                        alarm_msg = AlarmTrace(data=ustruct, table=tab, sandesh=self._sandesh)
                        self._logger.info("send del alarm: %s" % (alarm_msg.log()))
                        alarm_msg.send(sandesh=self._sandesh)
                continue

            # Handling Alarms
            if not self.mgrs.has_key(tab):
                continue
            prevt = UTCTimestampUsec()

            # TODO: We may need to remove alarm from local_uve before
            #      alarm evaluation
            # if "UVEAlarms" in uve_data:
            #     del uve_data["UVEAlarms"]

            results = self.mgrs[tab].map_method("__call__", uv, local_uve)
            self.tab_perf[tab].record_call(UTCTimestampUsec() - prevt)
            new_uve_alarms = {}
            for res in results:
                nm, sev, errs = res
                self._logger.debug("Alarm[%s] %s: %s" % (tab, nm, str(errs)))
                elems = []
                for ae in errs:
                    rule, val = ae
                    rv = AlarmElement(rule, val)
                    elems.append(rv)
                if len(elems):
                    new_uve_alarms[nm] = UVEAlarmInfo(
                        type=nm, severity=sev, timestamp=0, token="", description=elems, ack=False
                    )
            del_types = []
            if self.tab_alarms[tab].has_key(uv):
                for nm, uai in self.tab_alarms[tab][uv].iteritems():
                    uai2 = copy.deepcopy(uai)
                    uai2.timestamp = 0
                    uai2.token = ""
                    # This type was present earlier, but is now gone
                    if not new_uve_alarms.has_key(nm):
                        del_types.append(nm)
                    else:
                        # This type has no new information
                        if uai2 == new_uve_alarms[nm]:
                            del new_uve_alarms[nm]
            if len(del_types) != 0 or len(new_uve_alarms) != 0:
                self._logger.debug("Alarm[%s] Deleted %s" % (tab, str(del_types)))
                self._logger.debug("Alarm[%s] Updated %s" % (tab, str(new_uve_alarms)))
                # These alarm types are new or updated
                for nm, uai2 in new_uve_alarms.iteritems():
                    uai = copy.deepcopy(uai2)
                    uai.timestamp = UTCTimestampUsec()
                    uai.token = Controller.token(self._sandesh, uai.timestamp)
                    if not self.tab_alarms[tab].has_key(uv):
                        self.tab_alarms[tab][uv] = {}
                    self.tab_alarms[tab][uv][nm] = uai
                # These alarm types are now gone
                for dnm in del_types:
                    del self.tab_alarms[tab][uv][dnm]

                ustruct = None
                if len(self.tab_alarms[tab][uv]) == 0:
                    ustruct = UVEAlarms(name=uve_name, deleted=True)
                    del self.tab_alarms[tab][uv]
                else:
                    alm_copy = copy.deepcopy(self.tab_alarms[tab][uv])
                    ustruct = UVEAlarms(name=uve_name, alarms=alm_copy.values(), deleted=False)
                alarm_msg = AlarmTrace(data=ustruct, table=tab, sandesh=self._sandesh)
                self._logger.info("send alarm: %s" % (alarm_msg.log()))
                alarm_msg.send(sandesh=self._sandesh)
        if success:
            return output
        else:
            return None
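
    # Illustrative sketch (all names and values assumed) of the dict shapes
    # handle_uve_notif works with, per its docstring: the argument maps UVE
    # names to either a dict of changed struct types or None (process all
    # structs); the return value maps UVE names to {struct type: new value or
    # None}, or to None when the entire UVE was deleted.
    @staticmethod
    def _sketch_uve_notif_shapes():
        uves_arg = {
            "ObjectVNTable:default-domain:admin:vn1": {"UVEVirtualNetwork": {}},
            "ObjectVNTable:default-domain:admin:vn2": None,
        }
        output = {
            "ObjectVNTable:default-domain:admin:vn1":
                {"UVEVirtualNetwork": {"in_tpkts": 4}},
            "ObjectVNTable:default-domain:admin:vn2": None,
        }
        return uves_arg, output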

    def handle_UVETableInfoReq(self, req):
        if req.partition == -1:
            parts = self.ptab_info.keys()
        else:
            parts = [req.partition]

        self._logger.info("Got UVETableInfoReq : %s" % str(parts))
        np = 1
        for part in parts:
            if part not in self.ptab_info:
                continue
            tables = []
            for tab in self.ptab_info[part].keys():
                uvel = []
                for uk, uv in self.ptab_info[part][tab].iteritems():
                    types = []
                    for tk, tv in uv.values().iteritems():
                        types.append(UVEStructInfo(type=tk, content=json.dumps(tv)))
                    uvel.append(UVEObjectInfo(name=uk, structs=types))
                tables.append(UVETableInfo(table=tab, uves=uvel))
            resp = UVETableInfoResp(partition=part)
            resp.tables = tables

            if np == len(parts):
                mr = False
            else:
                mr = True
            resp.response(req.context(), mr)
            np = np + 1

    def handle_UVETableAlarmReq(self, req):
        status = False
        if req.table == "all":
            parts = self.tab_alarms.keys()
        else:
            parts = [req.table]
        self._logger.info("Got UVETableAlarmReq : %s" % str(parts))
        np = 1
        for pt in parts:
            resp = UVETableAlarmResp(table=pt)
            uves = []
            for uk, uv in self.tab_alarms[pt].iteritems():
                alms = []
                for ak, av in uv.iteritems():
                    alms.append(av)
                uves.append(UVEAlarms(name=uk, alarms=alms))
            resp.uves = uves
            if np == len(parts):
                mr = False
            else:
                mr = True
            resp.response(req.context(), mr)
            np = np + 1

    def handle_UVETablePerfReq(self, req):
        status = False
        if req.table == "all":
            parts = self.tab_perf_prev.keys()
        else:
            parts = [req.table]
        self._logger.info("Got UVETablePerfReq : %s" % str(parts))
        np = 1
        for pt in parts:
            resp = UVETablePerfResp(table=pt)
            resp.call_time = self.tab_perf_prev[pt].call_result()
            resp.get_time = self.tab_perf_prev[pt].get_result()
            resp.pub_time = self.tab_perf_prev[pt].pub_result()
            resp.updates = self.tab_perf_prev[pt].get_n

            if np == len(parts):
                mr = False
            else:
                mr = True
            resp.response(req.context(), mr)
            np = np + 1

    def partition_change(self, partno, enl):
        """
        Call this function when getting or giving up
        ownership of a partition
        Args:
            partno : Partition Number
            enl    : True for acquiring, False for giving up
        Returns: 
            status of operation (True for success)
        """
        status = False
        if enl:
            if partno in self._workers:
                self._logger.info("Dup partition %d" % partno)
            else:
                cdisc = None
                if self.disc:
                    cdisc = client.DiscoveryClient(
                        self._conf.discovery()["server"],
                        self._conf.discovery()["port"],
                        ModuleNames[Module.ALARM_GENERATOR],
                        "%s-%s-%d" % (self._hostname, self._instance_id, partno),
                    )
                ph = UveStreamProc(
                    ",".join(self._conf.kafka_broker_list()),
                    partno,
                    "uve-" + str(partno),
                    self._logger,
                    self.handle_uve_notifq,
                    self._conf.host_ip(),
                    self.handle_resource_check,
                    self._instance_id,
                    self._conf.redis_server_port(),
                    cdisc,
                )
                ph.start()
                self._workers[partno] = ph
                self._uvestats[partno] = {}
                tout = 600
                idx = 0
                while idx < tout:
                    # When this partition starts,
                    # uveq will get created
                    if partno not in self._uveq:
                        gevent.sleep(0.1)
                    else:
                        break
                    idx += 1
                if partno in self._uveq:
                    status = True
                else:
                    # TODO: The partition has not started yet,
                    #       but it still might start later.
                    #       We possibly need to exit
                    status = False
                    self._logger.error("Unable to start partition %d" % partno)
        else:
            if partno in self._workers:
                ph = self._workers[partno]
                self._logger.error("Kill part %s" % str(partno))
                ph.kill()
                res, db = ph.get(False)
                self._logger.error("Returned " + str(res))
                del self._workers[partno]
                del self._uvestats[partno]
                self._uveqf[partno] = True

                tout = 600
                idx = 0
                while idx < tout:
                    # When this partition stops,
                    # uveq will get destroyed
                    if partno in self._uveq:
                        gevent.sleep(0.1)
                    else:
                        break
                    idx += 1
                if partno not in self._uveq:
                    status = True
                else:
                    # TODO: The partition has not stopped yet
                    #       but it still might stop later.
                    #       We possibly need to exit
                    status = False
                    self._logger.error("Unable to stop partition %d" % partno)
            else:
                self._logger.info("No partition %d" % partno)

        return status
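
    # Rough sketch (helper name assumed, not in the original) of the wait loop
    # partition_change uses: after starting or stopping a UveStreamProc it
    # polls for up to tout iterations of `interval` seconds to confirm that the
    # partition's UVE queue appeared (acquire) or disappeared (release).
    @staticmethod
    def _sketch_wait_for(cond, tout=600, interval=0.1):
        idx = 0
        while idx < tout:
            if cond():   # e.g. lambda: partno in self._uveq
                return True
            gevent.sleep(interval)
            idx += 1
        return False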

    def handle_PartitionOwnershipReq(self, req):
        self._logger.info("Got PartitionOwnershipReq: %s" % str(req))
        status = self.partition_change(req.partition, req.ownership)

        resp = PartitionOwnershipResp()
        resp.status = status
        resp.response(req.context())

    def process_stats(self):
        """ Go through the UVEKey-Count stats collected over 
            the previous time period over all partitions
            and send it out
        """
        self.tab_perf_prev = copy.deepcopy(self.tab_perf)
        for kt in self.tab_perf.keys():
            # self.tab_perf_prev[kt] = copy.deepcopy(self.tab_perf[kt])
            self.tab_perf[kt].reset()

        s_partitions = set()
        s_keys = set()
        n_updates = 0
        for pk, pc in self._workers.iteritems():
            s_partitions.add(pk)
            din = pc.stats()
            dout = copy.deepcopy(self._uvestats[pk])
            self._uvestats[pk] = {}
            for ktab, tab in dout.iteritems():
                au_keys = []
                for uk, uc in tab.iteritems():
                    s_keys.add(uk)
                    n_updates += uc
                    ukc = UVEKeyCount()
                    ukc.key = uk
                    ukc.count = uc
                    au_keys.append(ukc)
                au_obj = AlarmgenUpdate(
                    name=self._sandesh._source
                    + ":"
                    + self._sandesh._node_type
                    + ":"
                    + self._sandesh._module
                    + ":"
                    + self._sandesh._instance_id,
                    partition=pk,
                    table=ktab,
                    o=au_keys,
                    i=None,
                    sandesh=self._sandesh,
                )
                self._logger.debug("send output stats: %s" % (au_obj.log()))
                au_obj.send(sandesh=self._sandesh)

            for ktab, tab in din.iteritems():
                au_notifs = []
                for kcoll, coll in tab.iteritems():
                    for kgen, gen in coll.iteritems():
                        for tk, tc in gen.iteritems():
                            tkc = UVETypeInfo()
                            tkc.type = tk
                            tkc.count = tc
                            tkc.generator = kgen
                            tkc.collector = kcoll
                            au_notifs.append(tkc)
                au_obj = AlarmgenUpdate(
                    name=self._sandesh._source
                    + ":"
                    + self._sandesh._node_type
                    + ":"
                    + self._sandesh._module
                    + ":"
                    + self._sandesh._instance_id,
                    partition=pk,
                    table=ktab,
                    o=None,
                    i=au_notifs,
                    sandesh=self._sandesh,
                )
                self._logger.debug("send input stats: %s" % (au_obj.log()))
                au_obj.send(sandesh=self._sandesh)

        au = AlarmgenStatus()
        au.name = self._hostname
        au.counters = []
        au.alarmgens = []
        ags = AlarmgenStats()
        ags.instance = self._instance_id
        ags.partitions = len(s_partitions)
        ags.keys = len(s_keys)
        ags.updates = n_updates
        au.counters.append(ags)

        agname = (
            self._sandesh._source
            + ":"
            + self._sandesh._node_type
            + ":"
            + self._sandesh._module
            + ":"
            + self._sandesh._instance_id
        )
        au.alarmgens.append(agname)

        atrace = AlarmgenStatusTrace(data=au, sandesh=self._sandesh)
        self._logger.debug("send alarmgen status : %s" % (atrace.log()))
        atrace.send(sandesh=self._sandesh)
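
    # Illustrative sketch (helper name assumed) of the per-UVE-key aggregation
    # that process_stats reports: the per-partition counters collected during
    # UVE processing are drained into (table, key, count) records before being
    # packed into AlarmgenUpdate messages.
    @staticmethod
    def _sketch_key_counts(uvestats_for_part):
        # uvestats_for_part has the shape {table: {uve_key: update_count}}
        records = []
        for tab, keys in uvestats_for_part.iteritems():
            for uk, uc in keys.iteritems():
                records.append((tab, uk, uc))
        return records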

    def handle_PartitionStatusReq(self, req):
        """ Return the entire contents of the UVE DB for the 
            requested partitions
        """
        if req.partition == -1:
            parts = self._workers.keys()
        else:
            parts = [req.partition]

        self._logger.info("Got PartitionStatusReq: %s" % str(parts))
        np = 1
        for pt in parts:
            resp = PartitionStatusResp()
            resp.partition = pt
            if self._workers.has_key(pt):
                resp.enabled = True
                resp.offset = self._workers[pt]._partoffset
                resp.uves = []
                for kcoll, coll in self._workers[pt].contents().iteritems():
                    uci = UVECollInfo()
                    uci.collector = kcoll
                    uci.uves = []
                    for kgen, gen in coll.iteritems():
                        ugi = UVEGenInfo()
                        ugi.generator = kgen
                        ugi.uves = []
                        for tabk, tabc in gen.iteritems():
                            for uk, uc in tabc.iteritems():
                                ukc = UVEKeyInfo()
                                ukc.key = tabk + ":" + uk
                                ukc.types = []
                                for tk, tc in uc.iteritems():
                                    uvtc = UVETypeCount()
                                    uvtc.type = tk
                                    uvtc.count = tc["c"]
                                    uvtc.agg_uuid = str(tc["u"])
                                    ukc.types.append(uvtc)
                                ugi.uves.append(ukc)
                        uci.uves.append(ugi)
                    resp.uves.append(uci)
            else:
                resp.enabled = False
            if np == len(parts):
                mr = False
            else:
                mr = True
            resp.response(req.context(), mr)
            np = np + 1

    def disc_cb_coll(self, clist):
        """
        Analytics node may be brought up/down any time. For UVE aggregation,
        alarmgen needs to know the list of all Analytics nodes (redis-uves).
        Periodically poll the Collector list [in lieu of
        redis-uve nodes] from the discovery service.
        """
        self._logger.error("Discovery Collector callback : %s" % str(clist))
        newlist = []
        for elem in clist:
            ipaddr = elem["ip-address"]
            cpid = 0
            if "pid" in elem:
                cpid = int(elem["pid"])
            newlist.append((ipaddr, self._conf.redis_server_port(), cpid))
        self._us.update_redis_uve_list(newlist)
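
    # Small example (assumed values) of the mapping disc_cb_coll performs:
    # each discovered collector entry contributes an
    # (ip-address, local redis-uve port, collector pid or 0) tuple.
    @staticmethod
    def _sketch_disc_coll_mapping():
        clist = [{"ip-address": "10.0.0.1", "pid": "1234"},
                 {"ip-address": "10.0.0.2"}]
        redis_port = 6379  # assumed; the real code uses conf.redis_server_port()
        return [(e["ip-address"], redis_port, int(e.get("pid", 0)))
                for e in clist]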

    def disc_cb_ag(self, alist):
        """
        Analytics node may be brought up/down any time. For partitioning,
        alarmgen needs to know the list of all Analytics nodes (alarmgens).
        Periodically poll the alarmgen list from the discovery service
        """
        self._logger.error("Discovery AG callback : %s" % str(alist))
        newlist = []
        for elem in alist:
            ipaddr = elem["ip-address"]
            inst = elem["port"]
            newlist.append(ipaddr + ":" + inst)

        # We should always include ourselves in the list of members
        newset = set(newlist)
        newset.add(self._libpart_name)
        newlist = list(newset)
        if not self._libpart:
            self._libpart = self.start_libpart(newlist)
        else:
            self._libpart.update_cluster_list(newlist)
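
    # Small sketch (helper name assumed) of the membership list built by
    # disc_cb_ag: "ip:port" strings from discovery, deduplicated, always
    # including this alarmgen instance itself.
    @staticmethod
    def _sketch_ag_members(alist, self_name):
        members = set("%s:%s" % (e["ip-address"], e["port"]) for e in alist)
        members.add(self_name)
        return list(members)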

    def run_cpu_mon(self):
        alarmgen_cpu_info = CpuInfoData()
        while True:
            before = time.time()
            mod_cpu_info = ModuleCpuInfo()
            mod_cpu_info.module_id = self._moduleid
            mod_cpu_info.instance_id = self._instance_id
            mod_cpu_info.cpu_info = alarmgen_cpu_info.get_cpu_info(system=False)
            mod_cpu_state = ModuleCpuState()
            mod_cpu_state.name = self._hostname

            mod_cpu_state.module_cpu_info = [mod_cpu_info]

            alarmgen_cpu_state_trace = ModuleCpuStateTrace(data=mod_cpu_state, sandesh=self._sandesh)
            alarmgen_cpu_state_trace.send(sandesh=self._sandesh)

            aly_cpu_state = AnalyticsCpuState()
            aly_cpu_state.name = self._hostname

            aly_cpu_info = ProcessCpuInfo()
            aly_cpu_info.module_id = self._moduleid
            aly_cpu_info.inst_id = self._instance_id
            aly_cpu_info.cpu_share = mod_cpu_info.cpu_info.cpu_share
            aly_cpu_info.mem_virt = mod_cpu_info.cpu_info.meminfo.virt
            aly_cpu_info.mem_res = mod_cpu_info.cpu_info.meminfo.res
            aly_cpu_state.cpu_info = [aly_cpu_info]

            aly_cpu_state_trace = AnalyticsCpuStateTrace(data=aly_cpu_state, sandesh=self._sandesh)
            aly_cpu_state_trace.send(sandesh=self._sandesh)

            # Send out the UVEKey-Count stats for this time period
            self.process_stats()

            duration = time.time() - before
            if duration < 60:
                gevent.sleep(60 - duration)
            else:
                self._logger.error("Periodic collection took %s sec" % duration)

    def run(self):
        self.gevs = [gevent.spawn(self.run_cpu_mon), gevent.spawn(self.run_uve_processing)]

        if self.disc:
            sp1 = ServicePoller(
                self._logger,
                CollectorTrace,
                self.disc,
                COLLECTOR_DISCOVERY_SERVICE_NAME,
                self.disc_cb_coll,
                self._sandesh,
            )

            sp1.start()
            self.gevs.append(sp1)

            sp2 = ServicePoller(
                self._logger, AlarmgenTrace, self.disc, ALARM_GENERATOR_SERVICE_NAME, self.disc_cb_ag, self._sandesh
            )
            sp2.start()
            self.gevs.append(sp2)

        try:
            gevent.joinall(self.gevs)
        except KeyboardInterrupt:
            print "Exiting on ^C"
        except:
            raise
        finally:
            self.stop()

    def stop(self):
        self._sandesh._client._connection.set_admin_state(down=True)
        self._sandesh.uninit()
        gevent.killall(self.gevs)

    def sigterm_handler(self):
        self.stop()
        exit()
    def setUp(self):
        self._oss = UVEServer(0, 0)
Example #14
class Controller(object):
    
    @staticmethod
    def fail_cb(manager, entrypoint, exception):
        sandesh_global._logger.info("Load failed for %s with exception %s" % \
                                     (str(entrypoint),str(exception)))
        
    def __init__(self, conf):
        self._conf = conf
        module = Module.ALARM_GENERATOR
        self._moduleid = ModuleNames[module]
        node_type = Module2NodeType[module]
        self._node_type_name = NodeTypeNames[node_type]
        self._hostname = socket.gethostname()
        self._instance_id = self._conf.worker_id()
        sandesh_global.init_generator(self._moduleid, self._hostname,
                                      self._node_type_name, self._instance_id,
                                      self._conf.collectors(), 
                                      self._node_type_name,
                                      self._conf.http_port(),
                                      ['opserver.sandesh', 'sandesh'])
        sandesh_global.set_logging_params(
            enable_local_log=self._conf.log_local(),
            category=self._conf.log_category(),
            level=self._conf.log_level(),
            file=self._conf.log_file(),
            enable_syslog=self._conf.use_syslog(),
            syslog_facility=self._conf.syslog_facility())
        self._logger = sandesh_global._logger

        # Trace buffer list
        self.trace_buf = [
            {'name':'DiscoveryMsg', 'size':1000}
        ]
        # Create trace buffers 
        for buf in self.trace_buf:
            sandesh_global.trace_buffer_create(name=buf['name'], size=buf['size'])

        tables = [ "ObjectCollectorInfo",
                   "ObjectDatabaseInfo",
                   "ObjectVRouter",
                   "ObjectBgpRouter",
                   "ObjectConfigNode" ] 
        self.mgrs = {}
        self.tab_alarms = {}
        for table in tables:
            self.mgrs[table] = hook.HookManager(
                namespace='contrail.analytics.alarms',
                name=table,
                invoke_on_load=True,
                invoke_args=(),
                on_load_failure_callback=Controller.fail_cb
            )
            
            for extn in self.mgrs[table][table]:
                self._logger.info('Loaded extensions for %s: %s,%s' % \
                    (table, extn.name, extn.entry_point_target))

            self.tab_alarms[table] = {}

        ConnectionState.init(sandesh_global, self._hostname, self._moduleid,
            self._instance_id,
            staticmethod(ConnectionState.get_process_state_cb),
            NodeStatusUVE, NodeStatus)

        self._us = UVEServer(None, self._logger, self._conf.redis_password())

        self._workers = {}

        self.disc = None
        self._libpart_name = self._hostname + ":" + self._instance_id
        self._libpart = None
        self._partset = set()
        if self._conf.discovery()['server']:
            import discoveryclient.client as client 
            data = {
                'ip-address': self._hostname ,
                'port': self._instance_id
            }
            self.disc = client.DiscoveryClient(
                self._conf.discovery()['server'],
                self._conf.discovery()['port'],
                ModuleNames[Module.ALARM_GENERATOR])
            self._logger.info("Disc Publish to %s : %s"
                          % (str(self._conf.discovery()), str(data)))
            self.disc.publish(ALARM_GENERATOR_SERVICE_NAME, data)
        else:
            # If there is no discovery service, use fixed redis_uve list
            redis_uve_list = []
            try:
                for redis_uve in self._conf.redis_uve_list():
                    redis_ip_port = redis_uve.split(':')
                    redis_ip_port = (redis_ip_port[0], int(redis_ip_port[1]))
                    redis_uve_list.append(redis_ip_port)
            except Exception as e:
                self._logger.error('Failed to parse redis_uve_list: %s' % e)
            else:
                self._us.update_redis_uve_list(redis_uve_list)

            # If there is no discovery service, use fixed alarmgen list
            self._libpart = self.start_libpart(self._conf.alarmgen_list())

        PartitionOwnershipReq.handle_request = self.handle_PartitionOwnershipReq
        PartitionStatusReq.handle_request = self.handle_PartitionStatusReq
        UVETableAlarmReq.handle_request = self.handle_UVETableAlarmReq 

    def libpart_cb(self, part_list):

        newset = set(part_list)
        oldset = self._partset
        self._partset = newset

        self._logger.info('Partition List : new %s old %s' % \
            (str(newset),str(oldset)))
        
        for addpart in (newset-oldset):
            self._logger.info('Partition Add : %s' % addpart)
            self.partition_change(addpart, True)
        
        for delpart in (oldset-newset):
            self._logger.info('Partition Del : %s' % delpart)
            self.partition_change(delpart, False)

    def start_libpart(self, ag_list):
        if not self._conf.zk_list():
            self._logger.error('Could not import libpartition: No zookeeper')
            return None
        if not ag_list:
            self._logger.error('Could not import libpartition: No alarmgen list')
            return None
        try:
            from libpartition.libpartition import PartitionClient
            self._logger.error('Starting PC')
            pc = PartitionClient("alarmgen",
                    self._libpart_name, ag_list,
                    self._conf.partitions(), self.libpart_cb,
                    ','.join(self._conf.zk_list()))
            self._logger.error('Started PC')
            return pc
        except Exception as e:
            self._logger.error('Could not import libpartition: %s' % str(e))
            return None

    def handle_uve_notif(self, uves, remove = False):
        self._logger.debug("Changed UVEs : %s" % str(uves))
        no_handlers = set()
        for uv in uves:
            tab = uv.split(':',1)[0]
            uve_name = uv.split(':',1)[1]
            if not self.mgrs.has_key(tab):
                no_handlers.add(tab)
                continue
            if remove:
                uve_data = []
            else:
                filters = {'kfilt': [uve_name]}
                itr = self._us.multi_uve_get(tab, True, filters)
                uve_data = itr.next()['value']
            if len(uve_data) == 0:
                self._logger.info("UVE %s deleted" % uv)
                if self.tab_alarms[tab].has_key(uv):
                    del self.tab_alarms[tab][uv]
                    uname = uv.split(":",1)[1]
                    ustruct = UVEAlarms(name = uname, deleted = True)
                    alarm_msg = AlarmTrace(data=ustruct, table=tab)
                    self._logger.info('send del alarm: %s' % (alarm_msg.log()))
                    alarm_msg.send()
                continue
            results = self.mgrs[tab].map_method("__call__", uv, uve_data)
            new_uve_alarms = {}
            for res in results:
                nm, errs = res
                self._logger.debug("Alarm[%s] %s: %s" % (tab, nm, str(errs)))
                elems = []
                for ae in errs:
                    rule, val = ae
                    rv = AlarmElement(rule, val)
                    elems.append(rv)
                if len(elems):
                    new_uve_alarms[nm] = UVEAlarmInfo(type = nm,
                                           description = elems, ack = False)
            if (not self.tab_alarms[tab].has_key(uv)) or \
                       pprint.pformat(self.tab_alarms[tab][uv]) != \
                       pprint.pformat(new_uve_alarms):
                uname = uv.split(":")[1]
                ustruct = UVEAlarms(name = uname, alarms = new_uve_alarms.values(),
                                    deleted = False)
                alarm_msg = AlarmTrace(data=ustruct, table=tab)
                self._logger.info('send alarm: %s' % (alarm_msg.log()))
                alarm_msg.send()
            self.tab_alarms[tab][uv] = new_uve_alarms
            
        if len(no_handlers):
            self._logger.debug('No Alarm Handlers for %s' % str(no_handlers))

    def handle_UVETableAlarmReq(self, req):
        status = False
        if req.table == "all":
            parts = self.tab_alarms.keys()
        else:
            parts = [req.table]
        self._logger.info("Got UVETableAlarmReq : %s" % str(parts))
        np = 1
        for pt in parts:
            resp = UVETableAlarmResp(table = pt)
            uves = []
            for uk,uv in self.tab_alarms[pt].iteritems():
                alms = []
                for ak,av in uv.iteritems():
                    alms.append(av)
                uves.append(UVEAlarms(name = uk, alarms = alms))
            resp.uves = uves 
            if np == len(parts):
                mr = False
            else:
                mr = True
            resp.response(req.context(), mr)
            np = np + 1

    
    def partition_change(self, partno, enl):
        """
        Call this function when getting or giving up
        ownership of a partition
        Args:
            partno : Partition Number
            enl    : True for acquiring, False for giving up
        Returns: 
            status of operation (True for success)
        """
        status = False
        if enl:
            if self._workers.has_key(partno):
                self._logger.info("Dup partition %d" % partno)
            else:
                #uvedb = self._us.get_part(partno)
                ph = UveStreamProc(','.join(self._conf.kafka_broker_list()),
                                   partno, "uve-" + str(partno),
                                   self._logger, self._us.get_part,
                                   self.handle_uve_notif)
                ph.start()
                self._workers[partno] = ph
                status = True
        else:
            if self._workers.has_key(partno):
                ph = self._workers[partno]
                gevent.kill(ph)
                res,db = ph.get()
                print "Returned " + str(res)
                print "State :"
                for k,v in db.iteritems():
                    print "%s -> %s" % (k,str(v)) 
                del self._workers[partno]
                status = True
            else:
                self._logger.info("No partition %d" % partno)

        return status
    
    def handle_PartitionOwnershipReq(self, req):
        self._logger.info("Got PartitionOwnershipReq: %s" % str(req))
        status = self.partition_change(req.partition, req.ownership)

        resp = PartitionOwnershipResp()
        resp.status = status
        resp.response(req.context())
               
    def process_stats(self):
        ''' Go through the UVEKey-Count stats collected over 
            the previous time period over all partitions
            and send it out
        '''
        for pk,pc in self._workers.iteritems():
            din, dout = pc.stats()
            for ktab,tab in dout.iteritems():
                au = AlarmgenUpdate()
                au.name = self._hostname
                au.instance =  self._instance_id
                au.table = ktab
                au.partition = pk
                au.keys = []
                for uk,uc in tab.iteritems():
                    ukc = UVEKeyInfo()
                    ukc.key = uk
                    ukc.count = uc
                    au.keys.append(ukc)
                au_trace = AlarmgenUpdateTrace(data=au)
                self._logger.debug('send key stats: %s' % (au_trace.log()))
                au_trace.send()

            for ktab,tab in din.iteritems():
                au = AlarmgenUpdate()
                au.name = self._hostname
                au.instance =  self._instance_id
                au.table = ktab
                au.partition = pk
                au.notifs = []
                for kcoll,coll in tab.iteritems():
                    for kgen,gen in coll.iteritems():
                        for tk,tc in gen.iteritems():
                            tkc = UVETypeInfo()
                            tkc.type= tk
                            tkc.count = tc
                            tkc.generator = kgen
                            tkc.collector = kcoll
                            au.notifs.append(tkc)
                au_trace = AlarmgenUpdateTrace(data=au)
                self._logger.debug('send notif stats: %s' % (au_trace.log()))
                au_trace.send()
         
    def handle_PartitionStatusReq(self, req):
        ''' Return the entire contents of the UVE DB for the 
            requested partitions
        '''
        if req.partition == -1:
            parts = self._workers.keys()
        else:
            parts = [req.partition]
        
        self._logger.info("Got PartitionStatusReq: %s" % str(parts))
        np = 1
        for pt in parts:
            resp = PartitionStatusResp()
            resp.partition = pt
            if self._workers.has_key(pt):
                resp.enabled = True
                resp.uves = []
                for kcoll,coll in self._workers[pt].contents().iteritems():
                    uci = UVECollInfo()
                    uci.collector = kcoll
                    uci.uves = []
                    for kgen,gen in coll.iteritems():
                        ugi = UVEGenInfo()
                        ugi.generator = kgen
                        ugi.uves = []
                        for uk,uc in gen.iteritems():
                            ukc = UVEKeyInfo()
                            ukc.key = uk
                            ukc.count = uc
                            ugi.uves.append(ukc)
                        uci.uves.append(ugi)
                    resp.uves.append(uci)
            else:
                resp.enabled = False
            if np == len(parts):
                mr = False
            else:
                mr = True
            resp.response(req.context(), mr)
            np = np + 1

    def disc_cb_coll(self, clist):
        '''
        Analytics node may be brought up/down any time. For UVE aggregation,
        alarmgen needs to know the list of all Analytics nodes (redis-uves).
        Periodically poll the Collector list [in lieu of
        redis-uve nodes] from the discovery service.
        '''
        newlist = []
        for elem in clist:
            (ipaddr,port) = elem
            newlist.append((ipaddr, self._conf.redis_server_port()))
        self._us.update_redis_uve_list(newlist)

    def disc_cb_ag(self, alist):
        '''
        Analytics node may be brought up/down any time. For partitioning,
        alarmgen needs to know the list of all Analytics nodes (alarmgens).
        Periodically poll the alarmgen list from the discovery service
        '''
        newlist = []
        for elem in alist:
            (ipaddr, inst) = elem
            newlist.append(ipaddr + ":" + inst)

        # We should always include ourselves in the list of members
        newset = set(newlist)
        newset.add(self._libpart_name)
        newlist = list(newset)
        if not self._libpart:
            self._libpart = self.start_libpart(newlist)
        else:
            self._libpart.update_cluster_list(newlist)

    def run(self):
        alarmgen_cpu_info = CpuInfoData()
        while True:
            before = time.time()
            mod_cpu_info = ModuleCpuInfo()
            mod_cpu_info.module_id = self._moduleid
            mod_cpu_info.instance_id = self._instance_id
            mod_cpu_info.cpu_info = alarmgen_cpu_info.get_cpu_info(
                system=False)
            mod_cpu_state = ModuleCpuState()
            mod_cpu_state.name = self._hostname

            mod_cpu_state.module_cpu_info = [mod_cpu_info]

            alarmgen_cpu_state_trace = ModuleCpuStateTrace(data=mod_cpu_state)
            alarmgen_cpu_state_trace.send()

            aly_cpu_state = AnalyticsCpuState()
            aly_cpu_state.name = self._hostname

            aly_cpu_info = ProcessCpuInfo()
            aly_cpu_info.module_id= self._moduleid
            aly_cpu_info.inst_id = self._instance_id
            aly_cpu_info.cpu_share = mod_cpu_info.cpu_info.cpu_share
            aly_cpu_info.mem_virt = mod_cpu_info.cpu_info.meminfo.virt
            aly_cpu_info.mem_res = mod_cpu_info.cpu_info.meminfo.res
            aly_cpu_state.cpu_info = [aly_cpu_info]

            aly_cpu_state_trace = AnalyticsCpuStateTrace(data=aly_cpu_state)
            aly_cpu_state_trace.send()

            # Send out the UVEKey-Count stats for this time period
            self.process_stats()

            duration = time.time() - before
            if duration < 60:
                gevent.sleep(60 - duration)
            else:
                self._logger.error("Periodic collection took %s sec" % duration)
Example #15
class Controller(object):
    
    @staticmethod
    def fail_cb(manager, entrypoint, exception):
        sandesh_global._logger.info("Load failed for %s with exception %s" % \
                                     (str(entrypoint),str(exception)))
        
    def __init__(self, conf):
        self._conf = conf
        module = Module.ALARM_GENERATOR
        self._moduleid = ModuleNames[module]
        node_type = Module2NodeType[module]
        self._node_type_name = NodeTypeNames[node_type]
        self._hostname = socket.gethostname()
        self._instance_id = self._conf.worker_id()
        sandesh_global.init_generator(self._moduleid, self._hostname,
                                      self._node_type_name, self._instance_id,
                                      self._conf.collectors(), 
                                      self._node_type_name,
                                      self._conf.http_port(),
                                      ['opserver.sandesh', 'sandesh'])
        sandesh_global.set_logging_params(
            enable_local_log=self._conf.log_local(),
            category=self._conf.log_category(),
            level=self._conf.log_level(),
            file=self._conf.log_file(),
            enable_syslog=self._conf.use_syslog(),
            syslog_facility=self._conf.syslog_facility())
        self._logger = sandesh_global._logger

        # Trace buffer list
        self.trace_buf = [
            {'name':'DiscoveryMsg', 'size':1000}
        ]
        # Create trace buffers 
        for buf in self.trace_buf:
            sandesh_global.trace_buffer_create(name=buf['name'], size=buf['size'])

        tables = [ "ObjectCollectorInfo",
                   "ObjectDatabaseInfo",
                   "ObjectVRouter",
                   "ObjectBgpRouter",
                   "ObjectConfigNode" ] 
        self.mgrs = {}
        self.tab_alarms = {}
        for table in tables:
            self.mgrs[table] = hook.HookManager(
                namespace='contrail.analytics.alarms',
                name=table,
                invoke_on_load=True,
                invoke_args=(),
                on_load_failure_callback=Controller.fail_cb
            )
            
            for extn in self.mgrs[table][table]:
                self._logger.info('Loaded extensions for %s: %s,%s' % \
                    (table, extn.name, extn.entry_point_target))

            self.tab_alarms[table] = {}

        ConnectionState.init(sandesh_global, self._hostname, self._moduleid,
            self._instance_id,
            staticmethod(ConnectionState.get_process_state_cb),
            NodeStatusUVE, NodeStatus)

        self._us = UVEServer(None, self._logger, self._conf.redis_password())

        self._workers = {}

        self.disc = None
        self._libpart_name = self._hostname + ":" + self._instance_id
        self._libpart = None
        self._partset = set()
        if self._conf.discovery()['server']:
            import discoveryclient.client as client 
            data = {
                'ip-address': self._hostname ,
                'port': self._instance_id
            }
            self.disc = client.DiscoveryClient(
                self._conf.discovery()['server'],
                self._conf.discovery()['port'],
                ModuleNames[Module.ALARM_GENERATOR])
            self._logger.info("Disc Publish to %s : %s"
                          % (str(self._conf.discovery()), str(data)))
            self.disc.publish(ALARM_GENERATOR_SERVICE_NAME, data)
        else:
            # If there is no discovery service, use fixed redis_uve list
            redis_uve_list = []
            try:
                for redis_uve in self._conf.redis_uve_list():
                    redis_ip_port = redis_uve.split(':')
                    redis_ip_port = (redis_ip_port[0], int(redis_ip_port[1]))
                    redis_uve_list.append(redis_ip_port)
            except Exception as e:
                self._logger.error('Failed to parse redis_uve_list: %s' % e)
            else:
                self._us.update_redis_uve_list(redis_uve_list)

            # If there is no discovery service, use fixed alarmgen list
            self._libpart = self.start_libpart(self._conf.alarmgen_list())

        PartitionOwnershipReq.handle_request = self.handle_PartitionOwnershipReq
        PartitionStatusReq.handle_request = self.handle_PartitionStatusReq
        UVETableAlarmReq.handle_request = self.handle_UVETableAlarmReq 


    def libpart_cb(self, part_list):

        newset = set(part_list)
        oldset = self._partset
        self._partset = newset

        self._logger.info('Partition List : new %s old %s' % \
            (str(newset),str(oldset)))
        
        for addpart in (newset-oldset):
            self._logger.info('Partition Add : %s' % addpart)
            self.partition_change(addpart, True)
        
        for delpart in (oldset-newset):
            self._logger.info('Partition Del : %s' % delpart)
            self.partition_change(delpart, False)

    def start_libpart(self, ag_list):
        if not self._conf.zk_list():
            self._logger.error('Could not import libpartition: No zookeeper')
            return None
        if not ag_list:
            self._logger.error('Could not import libpartition: No alarmgen list')
            return None
        try:
            from libpartition.libpartition import PartitionClient
            self._logger.error('Starting PC')

            pc = PartitionClient("alarmgen",
                    self._libpart_name, ag_list,
                    self._conf.partitions(), self.libpart_cb,
                    ','.join(self._conf.zk_list()))
            self._logger.error('Started PC')
            return pc
        except Exception as e:
            self._logger.error('Could not import libpartition: %s' % str(e))
            return None

    def handle_uve_notif(self, uves):
        self._logger.debug("Changed UVEs : %s" % str(uves))
        no_handlers = set()
        for uv in uves:
            tab = uv.split(':',1)[0]
            if not self.mgrs.has_key(tab):
                no_handlers.add(tab)
                continue
            itr = self._us.multi_uve_get(uv, True, None, None, None, None)
            uve_data = itr.next()['value']
            if len(uve_data) == 0:
                self._logger.info("UVE %s deleted" % uv)
                if self.tab_alarms[tab].has_key(uv):
                    del self.tab_alarms[tab][uv]
                    ustruct = UVEAlarms(name = uv, deleted = True)
                    alarm_msg = AlarmTrace(data=ustruct, table=tab)
                    self._logger.info('send del alarm: %s' % (alarm_msg.log()))
                    alarm_msg.send()
                continue
            results = self.mgrs[tab].map_method("__call__", uv, uve_data)
            new_uve_alarms = {}
            for res in results:
                nm, errs = res
                self._logger.info("Alarm[%s] %s: %s" % (tab, nm, str(errs)))
                elems = []
                for ae in errs:
                    rule, val = ae
                    rv = AlarmElement(rule, val)
                    elems.append(rv)
                if len(elems):
                    new_uve_alarms[nm] = UVEAlarmInfo(type = nm,
                                           description = elems, ack = False)
            if (not self.tab_alarms[tab].has_key(uv)) or \
                       pprint.pformat(self.tab_alarms[tab][uv]) != \
                       pprint.pformat(new_uve_alarms):
                ustruct = UVEAlarms(name = uv, alarms = new_uve_alarms.values(),
                                    deleted = False)
                alarm_msg = AlarmTrace(data=ustruct, table=tab)
                self._logger.info('send alarm: %s' % (alarm_msg.log()))
                alarm_msg.send()
            self.tab_alarms[tab][uv] = new_uve_alarms
            
        if len(no_handlers):
            self._logger.info('No Alarm Handlers for %s' % str(no_handlers))

    def handle_UVETableAlarmReq(self, req):
        status = False
        if req.table == "all":
            parts = self.tab_alarms.keys()
        else:
            parts = [req.table]
        self._logger.info("Got UVETableAlarmReq : %s" % str(parts))
        np = 1
        for pt in parts:
            resp = UVETableAlarmResp(table = pt)
            uves = []
            for uk,uv in self.tab_alarms[pt].iteritems():
                alms = []
                for ak,av in uv.iteritems():
                    alms.append(av)
                uves.append(UVEAlarms(name = uk, alarms = alms))
            resp.uves = uves 
            if np == len(parts):
                mr = False
            else:
                mr = True
            resp.response(req.context(), mr)
            np = np + 1

    
    def partition_change(self, partno, enl):
        """
        Call this function when getting or giving up
        ownership of a partition
        Args:
            partno : Partition Number
            enl    : True for acquiring, False for giving up
        Returns: 
            status of operation (True for success)
        """
        status = False
        if enl:
            if self._workers.has_key(partno):
                self._logger.info("Dup partition %d" % partno)
            else:
                uvedb = self._us.get_part(partno)
                ph = UveStreamProc(','.join(self._conf.kafka_broker_list()),
                                   partno, "uve-" + str(partno),
                                   self._logger, uvedb,
                                   self.handle_uve_notif)
                ph.start()
                self._workers[partno] = ph
                status = True
        else:
            if self._workers.has_key(partno):
                ph = self._workers[partno]
                gevent.kill(ph)
                res,db = ph.get()
                print "Returned " + str(res)
                print "State :"
                for k,v in db.iteritems():
                    print "%s -> %s" % (k,str(v)) 
                del self._workers[partno]
                status = True
            else:
                self._logger.info("No partition %d" % partno)

        return status
    
    def handle_PartitionOwnershipReq(self, req):
        self._logger.info("Got PartitionOwnershipReq: %s" % str(req))
        status = self.partition_change(req.partition, req.ownership)

        resp = PartitionOwnershipResp()
        resp.status = status
        resp.response(req.context())
                
    def handle_PartitionStatusReq(self, req):
        
        if req.partition == -1:
            parts = self._workers.keys()
        else:
            parts = [req.partition]
        
        self._logger.info("Got PartitionStatusReq: %s" % str(parts))
        np = 1
        for pt in parts:
            resp = PartitionStatusResp()
            resp.partition = pt
            if self._workers.has_key(pt):
                resp.enabled = True
                resp.uves = []
                for kcoll,coll in self._workers[pt].contents().iteritems():
                    uci = UVECollInfo()
                    uci.collector = kcoll
                    uci.uves = []
                    for kgen,gen in coll.iteritems():
                        ugi = UVEGenInfo()
                        ugi.generator = kgen
                        ugi.uves = []
                        for uk,uc in gen.iteritems():
                            ukc = UVEKeyInfo()
                            ukc.key = uk
                            ukc.count = uc
                            ugi.uves.append(ukc)
                        uci.uves.append(ugi)
                    resp.uves.append(uci)
            else:
                resp.enabled = False
            if np == len(parts):
                mr = False
            else:
                mr = True
            resp.response(req.context(), mr)
            np = np + 1

    def disc_cb_coll(self, clist):
        '''
        Analytics node may be brought up/down any time. For UVE aggregation,
        alarmgen needs to know the list of all Analytics nodes (redis-uves).
        Periodically poll the Collector list (in lieu of
        redis-uve nodes) from the discovery service.
        '''
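        # Keep each collector's IP but substitute the configured redis-uve
        # port, then hand the resulting list to UVEServer for aggregation.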
        newlist = []
        for elem in clist:
            (ipaddr,port) = elem
            newlist.append((ipaddr, self._conf.redis_server_port()))
        self._us.update_redis_uve_list(newlist)

    def disc_cb_ag(self, alist):
        '''
        Analytics node may be brought up/down any time. For partitioning,
        alarmgen needs to know the list of all Analytics nodes (alarmgens).
        Periodically poll the alarmgen list from the discovery service
        '''
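        # Rebuild the membership list, always including this alarmgen
        # instance, and either start the partition library or update its
        # cluster list.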
        newlist = []
        for elem in alist:
            (ipaddr, inst) = elem
            newlist.append(ipaddr + ":" + inst)

        # We should always include ourselves in the list of members
        newset = set(newlist)
        newset.add(self._libpart_name)
        newlist = list(newset)
        if not self._libpart:
            self._libpart = self.start_libpart(newlist)
        else:
            self._libpart.update_cluster_list(newlist)

    def run(self):
        while True:
            gevent.sleep(60)
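
# A minimal usage sketch (hypothetical, not part of the original snippet):
# once a Controller has been constructed with a parsed configuration, its
# event loop is typically driven under gevent, e.g.
#
#     controller = Controller(conf)
#     gevent.joinall([gevent.spawn(controller.run)])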
# Example 16
    def __init__(self, conf):
        self._conf = conf
        module = Module.ALARM_GENERATOR
        self._moduleid = ModuleNames[module]
        node_type = Module2NodeType[module]
        self._node_type_name = NodeTypeNames[node_type]
        self._hostname = socket.gethostname()
        self._instance_id = self._conf.worker_id()
        sandesh_global.init_generator(self._moduleid, self._hostname,
                                      self._node_type_name, self._instance_id,
                                      self._conf.collectors(), 
                                      self._node_type_name,
                                      self._conf.http_port(),
                                      ['opserver.sandesh', 'sandesh'])
        sandesh_global.set_logging_params(
            enable_local_log=self._conf.log_local(),
            category=self._conf.log_category(),
            level=self._conf.log_level(),
            file=self._conf.log_file(),
            enable_syslog=self._conf.use_syslog(),
            syslog_facility=self._conf.syslog_facility())
        self._logger = sandesh_global._logger

        # Trace buffer list
        self.trace_buf = [
            {'name':'DiscoveryMsg', 'size':1000}
        ]
        # Create trace buffers 
        for buf in self.trace_buf:
            sandesh_global.trace_buffer_create(name=buf['name'], size=buf['size'])

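        # UVE tables for which alarm extensions are loaded: one
        # hook.HookManager per table, with plugins registered under the
        # 'contrail.analytics.alarms' namespace.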
        tables = [ "ObjectCollectorInfo",
                   "ObjectDatabaseInfo",
                   "ObjectVRouter",
                   "ObjectBgpRouter",
                   "ObjectConfigNode" ] 
        self.mgrs = {}
        self.tab_alarms = {}
        for table in tables:
            self.mgrs[table] = hook.HookManager(
                namespace='contrail.analytics.alarms',
                name=table,
                invoke_on_load=True,
                invoke_args=(),
                on_load_failure_callback=Controller.fail_cb
            )
            
            for extn in self.mgrs[table][table]:
                self._logger.info('Loaded extensions for %s: %s,%s' % \
                    (table, extn.name, extn.entry_point_target))

            self.tab_alarms[table] = {}

        ConnectionState.init(sandesh_global, self._hostname, self._moduleid,
            self._instance_id,
            staticmethod(ConnectionState.get_process_state_cb),
            NodeStatusUVE, NodeStatus)

        self._us = UVEServer(None, self._logger, self._conf.redis_password())

        self._workers = {}

        self.disc = None
        self._libpart_name = self._hostname + ":" + self._instance_id
        self._libpart = None
        self._partset = set()
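        # With a discovery server, publish this alarmgen and rely on the
        # discovery callbacks for the redis-uve and alarmgen lists; otherwise
        # fall back to the fixed lists from the configuration.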
        if self._conf.discovery()['server']:
            import discoveryclient.client as client 
            data = {
                'ip-address': self._hostname,
                'port': self._instance_id
            }
            self.disc = client.DiscoveryClient(
                self._conf.discovery()['server'],
                self._conf.discovery()['port'],
                ModuleNames[Module.ALARM_GENERATOR])
            self._logger.info("Disc Publish to %s : %s"
                          % (str(self._conf.discovery()), str(data)))
            self.disc.publish(ALARM_GENERATOR_SERVICE_NAME, data)
        else:
            # If there is no discovery service, use fixed redis_uve list
            redis_uve_list = []
            try:
                for redis_uve in self._conf.redis_uve_list():
                    redis_ip_port = redis_uve.split(':')
                    redis_ip_port = (redis_ip_port[0], int(redis_ip_port[1]))
                    redis_uve_list.append(redis_ip_port)
            except Exception as e:
                self._logger.error('Failed to parse redis_uve_list: %s' % e)
            else:
                self._us.update_redis_uve_list(redis_uve_list)

            # If there is no discovery service, use fixed alarmgen list
            self._libpart = self.start_libpart(self._conf.alarmgen_list())

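        # Bind the sandesh introspect request handlers to this instance.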
        PartitionOwnershipReq.handle_request = self.handle_PartitionOwnershipReq
        PartitionStatusReq.handle_request = self.handle_PartitionStatusReq
        UVETableAlarmReq.handle_request = self.handle_UVETableAlarmReq 
# Example 17
class Controller(object):
    
    @staticmethod
    def fail_cb(manager, entrypoint, exception):
        sandesh_global._logger.info("Load failed for %s with exception %s" % \
                                     (str(entrypoint),str(exception)))
        
    def __init__(self, conf):
        self._conf = conf
        module = Module.ALARM_GENERATOR
        self._moduleid = ModuleNames[module]
        node_type = Module2NodeType[module]
        self._node_type_name = NodeTypeNames[node_type]
        self._hostname = socket.gethostname()
        self._instance_id = '0'
        sandesh_global.init_generator(self._moduleid, self._hostname,
                                      self._node_type_name, self._instance_id,
                                      self._conf.collectors(), 
                                      self._node_type_name,
                                      self._conf.http_port(),
                                      ['opserver.sandesh', 'sandesh'])
        sandesh_global.set_logging_params(
            enable_local_log=self._conf.log_local(),
            category=self._conf.log_category(),
            level=self._conf.log_level(),
            file=self._conf.log_file(),
            enable_syslog=self._conf.use_syslog(),
            syslog_facility=self._conf.syslog_facility())
        self._logger = sandesh_global._logger

        # Trace buffer list
        self.trace_buf = [
            {'name':'DiscoveryMsg', 'size':1000}
        ]
        # Create trace buffers 
        for buf in self.trace_buf:
            sandesh_global.trace_buffer_create(name=buf['name'], size=buf['size'])

        tables = [ "ObjectCollectorInfo",
                   "ObjectDatabaseInfo",
                   "ObjectVRouter",
                   "ObjectBgpRouter",
                   "ObjectConfigNode" ] 
        self.mgrs = {}
        self.tab_alarms = {}
        for table in tables:
            self.mgrs[table] = hook.HookManager(
                namespace='contrail.analytics.alarms',
                name=table,
                invoke_on_load=True,
                invoke_args=(),
                on_load_failure_callback=Controller.fail_cb
            )
            
            for extn in self.mgrs[table][table]:
                self._logger.info('Loaded extensions for %s: %s,%s' % \
                    (table, extn.name, extn.entry_point_target))

            self.tab_alarms[table] = {}

        ConnectionState.init(sandesh_global, self._hostname, self._moduleid,
            self._instance_id,
            staticmethod(ConnectionState.get_process_state_cb),
            NodeStatusUVE, NodeStatus)

        self._us = UVEServer(None, self._logger, self._conf.redis_password())

        self.disc = None
        if self._conf.discovery()['server']:
            import discoveryclient.client as client 
            data = {
                'ip-address': self._hostname,
                'port': self._instance_id
            }
            self.disc = client.DiscoveryClient(
                self._conf.discovery()['server'],
                self._conf.discovery()['port'],
                ModuleNames[Module.ALARM_GENERATOR])
            self._logger.info("Disc Publish to %s : %s"
                          % (str(self._conf.discovery()), str(data)))
            self.disc.publish(ALARM_GENERATOR_SERVICE_NAME, data)
        else:
            redis_uve_list = []
            try:
                for redis_uve in self._conf.redis_uve_list():
                    redis_ip_port = redis_uve.split(':')
                    redis_ip_port = (redis_ip_port[0], int(redis_ip_port[1]))
                    redis_uve_list.append(redis_ip_port)
            except Exception as e:
                self._logger.error('Failed to parse redis_uve_list: %s' % e)
            else:
                self._us.update_redis_uve_list(redis_uve_list)

        PartitionOwnershipReq.handle_request = self.handle_PartitionOwnershipReq
        PartitionStatusReq.handle_request = self.handle_PartitionStatusReq
        UVETableAlarmReq.handle_request = self.handle_UVETableAlarmReq 

        self._workers = {}

    def handle_uve_notif(self, uves):
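        """
        Callback invoked by UveStreamProc with the list of changed UVE keys.
        For each key, re-read the UVE, run every alarm extension registered
        for its table, and record the resulting alarms in tab_alarms (or drop
        the entry if the UVE has been deleted).
        """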
        self._logger.debug("Changed UVEs : %s" % str(uves))
        no_handlers = set()
        for uv in uves:
            tab = uv.split(':',1)[0]
            if not self.mgrs.has_key(tab):
                no_handlers.add(tab)
                continue
            itr = self._us.multi_uve_get(uv, True, None, None, None, None)
            uve_data = itr.next()['value']
            if len(uve_data) == 0:
                del self.tab_alarms[tab][uv]
                self._logger.info("UVE %s deleted" % uv)
                continue
            results = self.mgrs[tab].map_method("__call__", uv, uve_data)
            new_uve_alarms = {}
            for res in results:
                nm, errs = res
                self._logger.info("Alarm[%s] %s: %s" % (tab, nm, str(errs)))
                elems = []
                for ae in errs:
                    rule, val = ae
                    rv = AlarmElement(rule, val)
                    elems.append(rv)
                if len(elems):
                    new_uve_alarms[nm] = UVEAlarmInfo(type = nm,
                                           description = elems, ack = False)
            self.tab_alarms[tab][uv] = new_uve_alarms
            
        if len(no_handlers):
            self._logger.info('No Alarm Handlers for %s' % str(no_handlers))

    def handle_UVETableAlarmReq(self, req):
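        """
        Introspect handler: report the currently raised alarms, either for a
        single UVE table or for all tables when req.table is "all".
        """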
        status = False
        if req.table == "all":
            parts = self.tab_alarms.keys()
        else:
            parts = [req.table]
        self._logger.info("Got UVETableAlarmReq : %s" % str(parts))
        np = 1
        for pt in parts:
            resp = UVETableAlarmResp(table = pt)
            uves = []
            for uk,uv in self.tab_alarms[pt].iteritems():
                alms = []
                for ak,av in uv.iteritems():
                    alms.append(av)
                uves.append(UVEAlarms(name = uk, alarms = alms))
            resp.uves = uves 
            if np == len(parts):
                mr = False
            else:
                mr = True
            resp.response(req.context(), mr)
            np = np + 1
        
    def handle_PartitionOwnershipReq(self, req):
        self._logger.info("Got PartitionOwnershipReq: %s" % str(req))
        status = False
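        # Acquire: spawn a UveStreamProc greenlet consuming the kafka topic
        # for this partition. Give up: kill the greenlet and dump the state
        # it returns.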
        if req.ownership:
            if self._workers.has_key(req.partition):
                self._logger.info("Dup partition %d" % req.partition)
            else:
                uvedb = self._us.get_part(req.partition)
                ph = UveStreamProc(','.join(self._conf.kafka_broker_list()),
                                   req.partition, "uve-" + str(req.partition),
                                   self._logger, uvedb,
                                   self.handle_uve_notif)
                ph.start()
                self._workers[req.partition] = ph
                status = True
        else:
            #import pdb; pdb.set_trace()
            if self._workers.has_key(req.partition):
                ph = self._workers[req.partition]
                gevent.kill(ph)
                res,db = ph.get()
                print "Returned " + str(res)
                print "State :"
                for k,v in db.iteritems():
                    print "%s -> %s" % (k,str(v)) 
                del self._workers[req.partition]
                status = True
            else:
                self._logger.info("No partition %d" % req.partition)

        resp = PartitionOwnershipResp()
        resp.status = status
        resp.response(req.context())
                
    def handle_PartitionStatusReq(self, req):
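        """
        Introspect handler for partition status; unlike the variant above,
        each generator entry carries only the list of UVE keys (no per-key
        counts).
        """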
        
        if req.partition == -1:
            parts = self._workers.keys()
        else:
            parts = [req.partition]
        
        self._logger.info("Got PartitionStatusReq: %s" % str(parts))
        np = 1
        for pt in parts:
            resp = PartitionStatusResp()
            resp.partition = pt
            if self._workers.has_key(pt):
                resp.enabled = True
                resp.uves = []
                for kcoll,coll in self._workers[pt].contents().iteritems():
                    uci = UVECollInfo()
                    uci.collector = kcoll
                    uci.uves = []
                    for kgen,gen in coll.iteritems():
                        ugi = UVEGenInfo()
                        ugi.generator = kgen
                        ugi.uves = list(gen)
                        uci.uves.append(ugi)
                    resp.uves.append(uci)
            else:
                resp.enabled = False
            if np == len(parts):
                mr = False
            else:
                mr = True
            resp.response(req.context(), mr)
            np = np + 1

    def disc_cb_coll(self, clist):
        '''
        Analytics node may be brought up/down any time. For UVE aggregation,
        alarmgen needs to know the list of all Analytics nodes (redis-uves).
        Periodically poll the Collector list (in lieu of
        redis-uve nodes) from the discovery service.
        '''
        newlist = []
        for elem in clist:
            (ipaddr,port) = elem
            newlist.append((ipaddr, self._conf.redis_server_port()))
        self._us.update_redis_uve_list(newlist)

    def disc_cb_ag(self, alist):
        '''
        Analytics node may be brought up/down any time. For partitioning,
        alarmgen needs to know the list of all Analytics nodes (alarmgens).
        Periodically poll the alarmgen list from the discovery service
        '''
        # TODO : Hook up with the partitioning library
        pass

    def run(self):
        while True:
            gevent.sleep(60)