Example #1
0
    def check(self, data_node_info_list):
        """Record how many data nodes are currently started.

        The started-node list is read through ``Scheduler_ZkOpers`` and its
        length is compared with the length of ``data_node_info_list``.  The
        resulting counts, the alarm level and the list of online addresses
        are passed to the parent class's ``write_status`` and
        ``write_status_to_es`` under monitor type "node" / key "started".
        """
        zk_client = Scheduler_ZkOpers()
        started = zk_client.retrieve_started_nodes()

        # The error record carries its own copy of the online addresses.
        online_ips = [node_ip for node_ip in started]
        record = {"online_ip": online_ips}

        total = len(data_node_info_list)
        online = len(started)
        offline = total - online

        kind, key = "node", "started"
        level = self.retrieve_alarm_level(total, online, offline)

        parent = super(Check_Node_Active, self)
        parent.write_status(total, online, offline, level, record, kind, key)
        parent.write_status_to_es(total, online, offline, level, record, kind,
                                  key)
Example #2
0
    def _get_check_user_list(self):
        """Collect database-user data from MySQL and from ZooKeeper.

        Returns:
            A 2-tuple ``(user_mysql_src_dict, user_zk_src_list)``:

            * ``user_mysql_src_dict`` maps the string
              ``"<row[1]>|<row[0]>"`` to the list
              ``[row[-3], row[-4], row[-5], row[-6]]`` for every row returned
              by ``dba_opers.get_db_users``.  When two rows produce the same
              key, the first one is kept.
            * ``user_zk_src_list`` contains one ``[db_user, prop]`` pair for
              every user of every database returned by
              ``retrieve_db_list``.
        """
        user_mysql_src_dict, user_zk_src_list = {}, []

        conn = self.dba_opers.get_mysql_connection()
        try:
            user_tuple = self.dba_opers.get_db_users(conn)
            for t in user_tuple:
                # Convert the row once instead of on every field access.
                row = list(t)
                dict_key_str = row[1] + "|" + row[0]
                inner_value_list = [row[-3], row[-4], row[-5], row[-6]]
                # setdefault keeps the first row seen for a given key.
                user_mysql_src_dict.setdefault(dict_key_str, inner_value_list)
        finally:
            # Release the connection even when the query or the row handling
            # fails; previously it was never closed.
            if conn is not None:
                conn.close()

        zkOper = Scheduler_ZkOpers()
        for db_name in zkOper.retrieve_db_list():
            db_user_list = zkOper.retrieve_db_user_list(db_name)
            logging.info("dbName: %s db_user_list : %s", db_name,
                         str(db_user_list))
            for db_user in db_user_list:
                prop = zkOper.get_db_user_prop(db_name, db_user)
                user_zk_src_list.append([db_user, prop])
        return user_mysql_src_dict, user_zk_src_list
Example #3
0
    def write_status(self,
                     total_count,
                     success_count,
                     failed_count,
                     alarm_level,
                     error_record_dict,
                     monitor_type,
                     monitor_key,
                     timeout_num_threshold=3):
        """Write a monitor status record, tolerating a few 'HTTP 599:' errors.

        When the ``'msg'`` entry of ``error_record_dict`` contains the text
        ``HTTP 599:``, the ``timeout_num`` previously stored for
        ``monitor_type``/``monitor_key`` is read back and incremented.  While
        that counter does not exceed ``timeout_num_threshold`` the check is
        reported as fully successful (alarm "nothing", empty error record)
        and the counter is stored with the record.  In every other case the
        values passed in are written unchanged and the stored counter is
        reset to 0.
        """
        now = datetime.datetime.now()

        # How many times the timeout marker occurs in the error message.
        timeout_hits = 0
        if error_record_dict != {}:
            message_text = str(error_record_dict.get('msg'))
            timeout_hits = len(re.findall(r'HTTP 599:', message_text))

        zk_client = Scheduler_ZkOpers()

        timeout_streak = 0
        if timeout_hits > 0:
            previous = zk_client.retrieve_monitor_status_value(
                monitor_type, monitor_key)
            stored = previous.get("timeout_num")
            if stored is not None:
                timeout_streak = stored
            timeout_streak += 1

        if timeout_streak <= timeout_num_threshold and timeout_hits > 0:
            # Still within the tolerated number of timeouts: report success.
            success_count, failed_count = total_count, 0
            alarm_level = "nothing"
            error_record_dict = {}
        else:
            timeout_streak = 0

        summary = "total=%s, success count=%s, failed count=%s" % (
            total_count, success_count, failed_count)
        status = {
            "message": summary,
            "alarm": alarm_level,
            "error_record": error_record_dict,
            "ctime": now.strftime('%Y-%m-%d %H:%M:%S'),
            "timeout_num": timeout_streak,
        }
        zk_client.write_monitor_status(monitor_type, monitor_key, status)
Example #4
0
    def check_status(self, data_node_info_list, url_post, monitor_type,
                     monitor_key):
        """Run the cluster-level check, then the per-node check.

        The cluster check receives the number of entries in
        ``data_node_info_list``; the node check receives all four arguments
        unchanged.
        """
        # The instance is not used below.  It is still created because
        # constructing it may have side effects -- NOTE(review): confirm
        # before removing.
        zk_client = Scheduler_ZkOpers()

        node_total = len(data_node_info_list)
        self._check_cluster_status(node_total)
        self._check_node_status(data_node_info_list, url_post, monitor_type,
                                monitor_key)
Example #5
0
    def check(self, data_node_info_list):
        """Check for anti-items in the database and record the result.

        Does nothing unless ``is_monitoring`` reports that this host is
        monitoring.  Otherwise:

        * The failed count of the previous run is parsed from the stored
          status message (``"... failed count=<n>"``); it defaults to 0 when
          there is no stored status or the message has no such field.
        * If no MySQL connection is available, the failed count is
          incremented, and once it exceeds 4 the anti-item count is set to
          500 with a "no way to connect to db" message (presumably to force
          an alarm -- confirm against ``retrieve_alarm_level``).
        * Otherwise ``_anti_item_check`` is run, the connection is closed,
          and any findings are placed in the error record.

        The outcome is passed to the parent class's ``write_status`` and
        ``write_status_to_es``.

        ``data_node_info_list`` is accepted for interface compatibility and
        is not used here.
        """
        zkOper = Scheduler_ZkOpers()
        if not is_monitoring(get_localhost_ip(), zkOper):
            return

        monitor_type, monitor_key = "db", "existed_db_anti_item"
        error_record = {}
        anti_item_count, msg, failed_count = 0, "", 0

        _path_value = zkOper.retrieve_monitor_status_value(
            monitor_type, monitor_key)
        if _path_value != {}:
            # \d+ (not \d) so that counts of 10 or more are read back in
            # full instead of being truncated to their first digit.
            matched = re.search(r'failed count=(\d+)',
                                str(_path_value.get('message', '')))
            if matched is not None:
                failed_count = int(matched.group(1))

        # Acquire the connection only after the ZooKeeper read so that a
        # failure there cannot leave an open connection behind.
        conn = self.dba_opers.get_mysql_connection()
        if conn is None:
            failed_count += 1
            if failed_count > 4:
                anti_item_count = 500
                error_record.setdefault("msg", "no way to connect to db")
        else:
            # NOTE(review): failed_count is not reset when the connection
            # succeeds; confirm whether that is intended.
            try:
                anti_item_count, msg, anti_item_detail = self._anti_item_check(
                    conn)
            finally:
                conn.close()
            if anti_item_count > 0:
                error_record.setdefault(
                    "msg",
                    "mcluster existed on %s please check which db right now." %
                    (msg))
                error_record.setdefault("detail", anti_item_detail)
                logging.info(error_record)

        alarm_level = self.retrieve_alarm_level(anti_item_count, 0, 0)
        logging.info("existed anti_item alarm_level :%s", alarm_level)

        parent = super(Check_DB_Anti_Item, self)
        parent.write_status(anti_item_count, 0, failed_count, alarm_level,
                            error_record, monitor_type, monitor_key)
        parent.write_status_to_es(anti_item_count, 0, failed_count,
                                  alarm_level, error_record, monitor_type,
                                  monitor_key)
Example #6
0
    def run(self):
        """Run one asynchronous monitor pass for ``self.monitor_type``.

        The pass only proceeds when the ZooKeeper lock
        ``async_monitor/<monitor_type>`` is obtained; if the lock is not
        granted or acquiring it times out, the method returns without doing
        anything.  With the lock, the data-node list is fetched and passed to
        the method named ``_async_<monitor_type>``.  Afterwards the thread
        sleeps for whatever remains of ``self.timeout - self.time_constant``.

        Any exception raised during the pass is not propagated; its
        ``sys.exc_info()`` is put on ``self.threading_exception_queue``.

        Original author's note: without the logic below, the singleton
        Scheduler_ZkOpers may have no ``self.zk`` object.
        """
        begin_time = time.time()
        lock_name = 'async_monitor/' + self.monitor_type

        zkOper = Scheduler_ZkOpers()
        logging.info('check zk is connected :%s', str(zkOper.is_connected()))

        try:
            # NOTE(review): ``lock`` is never released in this method --
            # confirm that it is released elsewhere or expires on its own.
            isLock, lock = zkOper.lock_async_monitor_action(lock_name)
            if not isLock:
                return
        except kazoo.exceptions.LockTimeout:
            logging.info("a thread is running the monitor async, give up this oper on this machine!")
            return

        try:
            data_node_info_list = zkOper.retrieve_data_node_list()
            getattr(self, '_async_' + self.monitor_type)(data_node_info_list)

            end_time = time.time()
            monitor_exc_time = int(end_time - begin_time)

            # Leave time_constant seconds of the timeout unused and sleep
            # through the rest.
            real_time_out = self.timeout - self.time_constant

            if monitor_exc_time < real_time_out:
                time.sleep(real_time_out - monitor_exc_time)

            logging.info("%s task has finished", self.monitor_type)

        except Exception:
            # Hand the failure to whoever consumes the queue instead of
            # letting it end the thread unnoticed.
            self.threading_exception_queue.put(sys.exc_info())
Example #7
0
    def check(self, data_node_info_list):
        """Compare the database users known to MySQL with those in ZooKeeper.

        Does nothing unless ``is_monitoring`` reports that this host is
        monitoring.  The two user collections come from
        ``_get_check_user_list``.  When both are empty, a fixed message and
        an ``"Empty"`` marker are recorded with all counts at 0; otherwise
        ``compare_center`` fills in the differences and the success/failed
        counts, and the total is their sum.  The result is passed to the
        parent class's ``write_status`` and ``write_status_to_es`` under
        monitor type "db" / key "dbuser".

        ``data_node_info_list`` is not used by this check.
        """
        zk_client = Scheduler_ZkOpers()
        if not is_monitoring(get_localhost_ip(), zk_client):
            return

        kind, key = "db", "dbuser"
        mysql_users, zk_users = self._get_check_user_list()

        record = {}
        differences = {}
        counts = {"total": 0, "failed": 0, "success": 0}

        if not (zk_users or mysql_users):
            record["msg"] = "no database users in zk neither in mysql"
            differences["Empty"] = ""
        else:
            self.compare_center(mysql_users, zk_users, differences, counts)
            counts["total"] = counts["success"] + counts["failed"]

        total = counts["total"]
        succeeded = counts["success"]
        failed = counts["failed"]
        level = self.retrieve_alarm_level(total, succeeded, failed)

        if differences:
            record["dif"] = differences

        parent = super(Check_Database_User, self)
        parent.write_status(total, succeeded, failed, level, record, kind, key)
        parent.write_status_to_es(total, succeeded, failed, level, record,
                                  kind, key)
Example #8
0
    def _check_cluster_status(self, zk_data_node_count):
        """Recompute the cluster status and write it back to ZooKeeper.

        The status is recomputed only when the stored status is empty
        (``{}``) or has a ``_status`` other than ``'initializing'``:

        * ``'running'``    -- every data node is started;
        * ``'sub-health'`` -- more than half, but not all, are started;
        * ``'failed'``     -- anything else.

        Args:
            zk_data_node_count: number of data nodes to compare the started
                nodes against.
        """
        zkOper = Scheduler_ZkOpers()
        pre_stat = zkOper.retrieveClusterStatus()

        # Two cases lead to a rewrite:
        #   1. there is no stored cluster status yet (pre_stat == {}), so it
        #      is created below;
        #   2. a status is stored and it is not 'initializing', so it is
        #      re-evaluated.
        has_settled_status = ('_status' in pre_stat
                              and pre_stat['_status'] != 'initializing')
        if not (has_settled_status or pre_stat == {}):
            return

        online_num = len(zkOper.retrieve_started_nodes())
        # Floor division keeps the majority threshold an integer regardless
        # of the interpreter's division semantics.
        majority = zk_data_node_count // 2 + 1

        result = {}
        if zk_data_node_count == online_num:
            result['_status'] = 'running'
        elif majority <= online_num < zk_data_node_count:
            result['_status'] = 'sub-health'
        else:
            result['_status'] = 'failed'
        zkOper.writeClusterStatus(result)