def get(self):
    """Report whether MySQL connection usage is under 70% of max_connections.

    Finishes with "true" when this node is not monitored or usage is healthy,
    "false" when the server is unreachable or usage is at/over the threshold.
    """
    zkOper = self.retrieve_zkOper()
    # Nodes excluded from monitoring always report healthy.
    if not is_monitoring(get_localhost_ip(), zkOper):
        self.finish("true")
        return

    conn = self.dba_opers.get_mysql_connection()
    if conn is None:
        self.finish("false")
        return

    try:
        current_connections_rows = self.dba_opers.show_processlist(conn)
        max_connections_rows = self.dba_opers.show_variables(
            conn, 'max_connections')
    finally:
        conn.close()

    current_connections_count = len(current_connections_rows)
    max_connections_rows_dict = dict(max_connections_rows)
    max_connections = max_connections_rows_dict.get("max_connections")

    # BUG FIX: under Python 2, int / int is floor division, so the ratio was
    # always 0 ("true") until connections actually reached the maximum.
    # Force float division so the 70% threshold works as intended.
    if float(current_connections_count) / int(max_connections) < 0.7:
        self.finish("true")
        return
    self.finish("false")
def sync_info_from_zk(self, node_ip_addr):
    """Pull cluster and data-node properties from ZooKeeper and persist
    them into the local configuration files.

    The incoming ``node_ip_addr`` is overwritten with the local host IP
    before the zk lookup (kept for interface compatibility).
    """
    zk = ZkOpers()
    try:
        # Nothing to sync when no cluster has been registered yet.
        if not zk.existCluster():
            return
        cluster_uuid = zk.getClusterUUID()
        raw_cluster_data, _ = zk.retrieveClusterProp(cluster_uuid)
        node_ip_addr = get_localhost_ip()
        assert node_ip_addr
        node_info = zk.retrieve_data_node_info(node_ip_addr)
        # Cluster prop is stored with single quotes; normalize to JSON.
        cluster_dict = json.loads(raw_cluster_data.replace("'", "\""))
        if type(node_info) is dict and type(cluster_dict) is dict:
            conf = ConfigFileOpers()
            conf.setValue(options.data_node_property, node_info)
            conf.setValue(options.cluster_property, cluster_dict)
            logging.debug(
                "program has re-written zk data into configuration file"
            )
        else:
            logging.info("write data into configuration failed")
    finally:
        zk.stop()
def _send_monitor_email(self, anti_item_content):
    """Mail detected anti-items to the admins, tagged with this host's IP.

    Does nothing unless the send-email switch is enabled in options.
    """
    host_ip = get_localhost_ip()
    # send email
    subject = "[%s] Auti-Item existed in MySQL according to Galera way" % options.sitename
    message = anti_item_content + "\nip:" + host_ip
    if options.send_email_switch:
        send_email(options.admins, subject, message)
def _send_log_info_email(self, subject, content):
    """Mail *content* under *subject*, tagged with this host's IP.

    Does nothing unless the send-email switch is enabled in options.
    """
    host_ip = get_localhost_ip()
    # send email
    message = content + "\nip:" + host_ip
    if options.send_email_switch:
        send_email(options.admins, subject, message)
def run(self):
    """Run one backup cycle guarded by the ZooKeeper backup lock.

    Verifies the node is 'Synced' and the /data partition exists, then
    performs the backup and transfers the file; progress and failures are
    recorded both to a local file and to ZooKeeper.

    NOTE(review): the acquired ``lock`` is never released in this method —
    presumably handled elsewhere or by zk session expiry; confirm.
    NOTE(review): the MySQL ``conn`` is never closed here — confirm.
    """
    isLock, lock = self.zkOpers.lock_backup_action()
    if not isLock:
        # Another node already holds the backup lock; nothing to do.
        logging.info('zk is not lock')
        return
    try:
        _password = retrieve_monitor_password()
        conn = self.dba_opers.get_mysql_connection(user="******",
                                                   passwd=_password)
        if None == conn:
            raise UserVisiableException("Can\'t connect to mysql server")
        db_status = self.dba_opers.show_status(conn)
        # Index -14 is assumed to hold the galera sync state; backups are
        # only safe on a 'Synced' node.  TODO(review): the fixed -14 offset
        # is fragile — verify against the actual SHOW STATUS layout.
        if 'Synced' != db_status[-14][1]:
            self.backup_record[
                'error: '] = 'Mcluster is not start %s' % datetime.datetime.now(
                ).strftime(TIME_FORMAT)
            self.backupOpers._write_info_to_local(
                self.backupOpers.path, self.backupOpers.file_name,
                self.backup_record)
            self.zkOpers.write_backup_backup_info(self.backup_record)
            return
        # CHECK_DMP_DATA_CMD yields '0' when the /data partition is absent.
        if '0' == self.__run_comm(CHECK_DMP_DATA_CMD):
            self.backup_record[
                'error: '] = 'No have /data partition %s' % datetime.datetime.now(
                ).strftime(TIME_FORMAT)
            self.backupOpers._write_info_to_local(
                self.backupOpers.path, self.backupOpers.file_name,
                self.backup_record)
            self.zkOpers.write_backup_backup_info(self.backup_record)
            return
        self.backupOpers.create_backup_directory()
        self.backupOpers.remove_expired_backup_file()
        self.backupOpers.backup_action(self.zkOpers)
        self.backupOpers.trans_backup_file(self.zkOpers)
        # Success record.  The keys (including trailing ': ') are written
        # to zk as-is — downstream readers may depend on them.
        record = {
            "recently_backup_ip: ": str(get_localhost_ip()),
            'time: ': datetime.datetime.now().strftime(TIME_FORMAT),
            'backup_type: ': self._backup_mode
        }
        self.zkOpers.write_backup_backup_info(record)
    except Exception, e:
        # Any failure is reported to zk with a generic error record
        # (note 'time:' here vs 'time: ' in the success record — kept as-is).
        record = {
            "error: ": 'backup is wrong, please check it!',
            'time:': datetime.datetime.now().strftime(TIME_FORMAT),
            'backup_type: ': self._backup_mode
        }
        self.zkOpers.write_backup_backup_info(record)
        logging.error(e, exc_info=True)
def _send_error_email(self, exception):
    """Mail an internal-server-error report (exception text, app version,
    host IP) to the admins; failures while sending are logged, never raised."""
    try:
        # send email
        subject = "[%s]Internal Server Error" % options.sitename
        node_ip = get_localhost_ip()
        app_version = '{0}-{1}'.format(__app__, __version__)
        report = exception + "\n" + app_version + "\nhost ip :" + node_ip
        if options.send_email_switch:
            send_email(options.admins, subject, report)
    except Exception:
        logging.error(traceback.format_exc())
def _send_error_email(self, exception):
    """Mail an internal-server-error report to the admins; any failure
    while composing or sending is logged and swallowed."""
    try:
        node_ip = get_localhost_ip()
        app_version = '{0}-{1}'.format(__app__, __version__)
        logging.info("version_str :" + str(app_version))
        # send email
        subject = "[%s]Internal Server Error " % options.sitename
        report = "{0}\n{1}\nip:{2}".format(exception, app_version, node_ip)
        if options.send_email_switch:
            send_email(options.admins, subject, report)
    except Exception:
        logging.error(traceback.format_exc())
def trans_backup_file(self, zkOpers):
    """Rsync the incremental backup for ``self.time`` to the remote dir,
    recording progress locally and in ZooKeeper, then prune old logs."""
    now_time = datetime.datetime.now()
    record = '%s == cp incr_backup_file is starting == ' % now_time.strftime(
        TIME_FORMAT)
    self.status['cp_incr_file_status:'] = Status.backup_transmit_starting
    self.status['cp_incr_file_start_time:'] = now_time.strftime(
        TIME_FORMAT)
    self._write_info_to_local(self.path, self.file_name, record)
    zkOpers.write_backup_innerbackup_info(self.status)
    rsync_cmd = RSYNC % (self.time, BACKUP_CONFIG.INCR_LOCAL_DIR,
                         BACKUP_CONFIG.INCR_REMOTE_DIR)
    # os.system returns the shell exit status; 0 means the rsync succeeded.
    run_rsync_relust = os.system(rsync_cmd)
    now_time = datetime.datetime.now()
    if 0 == run_rsync_relust:
        # Transfer succeeded: remove the local copy and record success.
        in_backup_rs_path_cmd = 'rm -rf ' + BACKUP_CONFIG.INCR_LOCAL_DIR + \
            '/incre_backup-' + self.time
        self._run_comm_call(in_backup_rs_path_cmd)
        record = '%s == cp incr_backup_file ok == ' % now_time.strftime(
            TIME_FORMAT)
        self.status[
            'cp_incr_file_status:'] = Status.backup_transmit_succeed
        self.status['incr_backup_ip:'] = str(get_localhost_ip())
        self.status['cp_incr_file_finish_time:'] = now_time.strftime(
            TIME_FORMAT)
        self._write_info_to_local(self.path, self.file_name, record)
        zkOpers.write_backup_innerbackup_info(self.status)
    else:
        # NOTE(review): this branch writes 'cp_incr_status:' and
        # 'cp_incr_finish_time:' while the success branch uses the
        # 'cp_incr_file_*' keys — looks inconsistent; confirm intended
        # before unifying (zk readers may depend on either spelling).
        record = '%s == incr_backup_file is not cp /data == ' % now_time.strftime(
            TIME_FORMAT)
        self.status['cp_incr_status:'] = Status.backup_transmit_faild
        self.status['cp_incr_finish_time:'] = now_time.strftime(
            TIME_FORMAT)
        self._write_info_to_local(self.path, self.file_name, record)
        zkOpers.write_backup_innerbackup_info(self.status)
        return
    record = '%s == the incr backup is completed == ' % now_time.strftime(
        TIME_FORMAT)
    self._write_info_to_local(self.path, self.file_name, record)
    # Drop incremental rsync logs older than 8 days.
    self._delete_file(BACKUP_CONFIG.LOG_FILE_PATH + '/incr', days_count=8)
def get(self):
    """Report galera wsrep health for this node.

    Finishes "true" when the node is unmonitored or the wsrep status check
    passes, "false" when it fails; raises HTTPAPIErrorException (417) when
    the status query itself errors out.
    """
    zkOper = self.retrieve_zkOper()
    if not is_monitoring(get_localhost_ip(), zkOper):
        self.finish("true")
        return
    try:
        check_result = self.dba_opers.retrieve_wsrep_status()
        logging.info("check_wsrepstatus : %s" % (check_result))
    # BUG FIX: the original bare ``except:`` also swallowed SystemExit and
    # KeyboardInterrupt; catch only real errors from the status query.
    except Exception:
        error_message = "connection break down"
        raise HTTPAPIErrorException(error_message, status_code=417)
    if not check_result:
        self.finish("false")
        return
    self.finish("true")
def trans_backup_file(self, ZkOpers):
    """Rsync the full backup for ``self.time`` to the remote dir, update
    the backup index on success, record status locally and in ZooKeeper,
    then prune old logs.

    NOTE(review): the parameter is named ``ZkOpers`` (PascalCase) but
    receives an instance — it likely shadows the ZkOpers class in this
    module; consider renaming (would need call-site review).
    """
    now_time = datetime.datetime.now()
    record = '%s == cp backup_file is starting == ' % now_time.strftime(
        TIME_FORMAT)
    self.status['cp_file_status:'] = Status.backup_transmit_starting
    self.status['cp_file_start_time:'] = now_time.strftime(TIME_FORMAT)
    self._write_info_to_local(self.path, self.file_name, record)
    ZkOpers.write_backup_fullbackup_info(self.status)
    rsync_cmd = RSYNC % (self.time, BACKUP_CONFIG.FULL_LOCAL_DIR,
                         BACKUP_CONFIG.FULL_REMOTE_DIR)
    # os.system returns the shell exit status; 0 means the rsync succeeded.
    run_rsync_result = os.system(rsync_cmd)
    now_time = datetime.datetime.now()
    if 0 == run_rsync_result:
        # Success: publish the new backup in the index and record it.
        self._fb_update_index('/full_backup-' + self.time)
        record = '%s == Cp backup_file ok == ' % now_time.strftime(
            TIME_FORMAT)
        self.status['cp_status:'] = Status.backup_transmit_succeed
        self.status['full_backup_ip:'] = str(get_localhost_ip())
        self.status['cp_finish_time:'] = now_time.strftime(TIME_FORMAT)
        self._write_info_to_local(self.path, self.file_name, record)
        ZkOpers.write_backup_fullbackup_info(self.status)
    else:
        # Failure: mark transmit failed ('faild' spelling comes from the
        # Status enum) and clear the backup ip.
        record = '%s == backup_file is not cp /data == ' % now_time.strftime(
            TIME_FORMAT)
        self.status['cp_status:'] = Status.backup_transmit_faild
        self.status['backup_ip:'] = None
        self.status['cp_finish_time:'] = now_time.strftime(TIME_FORMAT)
        self._write_info_to_local(self.path, self.file_name, record)
        ZkOpers.write_backup_fullbackup_info(self.status)
        return
    record = '%s == the full backup is completed == ' % now_time.strftime(
        TIME_FORMAT)
    self._write_info_to_local(self.path, self.file_name, record)
    self._delete_file(BACKUP_CONFIG.LOG_FILE_PATH)
def check(self, data_node_info_list):
    """Scan the database for galera anti-items and publish the result as
    monitoring status under db/existed_db_anti_item (zk and ES).

    ``data_node_info_list`` is accepted for interface compatibility but is
    not used in this method.
    """
    zkOper = Scheduler_ZkOpers()
    if not is_monitoring(get_localhost_ip(), zkOper):
        return
    conn = self.dba_opers.get_mysql_connection()
    monitor_type, monitor_key = "db", "existed_db_anti_item"
    error_record = {}
    anti_item_count, msg, failed_count = 0, "", 0
    # Carry over the consecutive-connection-failure counter from the
    # previous run's stored status message.
    _path_value = zkOper.retrieve_monitor_status_value(
        monitor_type, monitor_key)
    if _path_value != {}:
        # NOTE(review): (\d) captures a single digit, so a counter past 9
        # would be read back truncated — confirm this is acceptable.
        failed_count = int(
            re.findall(r'failed count=(\d)', _path_value['message'])[0])
    if conn == None:
        failed_count += 1
        if failed_count > 4:
            # Five consecutive failures: force a high anti-item count so
            # the alarm escalates.
            anti_item_count = 500
            error_record.setdefault("msg", "no way to connect to db")
    else:
        try:
            anti_item_count, msg, anti_item_detail = self._anti_item_check(
                conn)
        finally:
            conn.close()
        if anti_item_count > 0:
            error_record.setdefault(
                "msg",
                "mcluster existed on %s please check which db right now." %
                (msg))
            error_record.setdefault("detail", anti_item_detail)
    logging.info(error_record)
    alarm_level = self.retrieve_alarm_level(anti_item_count, 0, 0)
    logging.info("existed anti_item alarm_level :%s" % (alarm_level))
    super(Check_DB_Anti_Item, self).write_status(anti_item_count, 0,
                                                 failed_count, alarm_level,
                                                 error_record, monitor_type,
                                                 monitor_key)
    super(Check_DB_Anti_Item, self).write_status_to_es(anti_item_count, 0,
                                                       failed_count,
                                                       alarm_level,
                                                       error_record,
                                                       monitor_type,
                                                       monitor_key)
def bin_log_node_stat(self):
    """Return binlog status plus the other started cluster nodes.

    Returns:
        dict with 'node_list' (started nodes, local IP removed) and
        'stat_log_bin' (value of the MySQL 'log_bin' variable).

    Raises:
        UserVisiableException: when no MySQL connection can be made.
    """
    conn = self.dba_opers.get_mysql_connection()
    if conn is None:
        raise UserVisiableException("Can\'t connect to mysql server")
    try:
        cursor = conn.cursor()
        try:
            cursor.execute("show variables like 'log_bin'")
            rows_stat_log_bin = cursor.fetchall()
            stat_log_bin = rows_stat_log_bin[0][1]
        finally:
            # FIX: close the cursor explicitly instead of leaking it until
            # the connection is torn down.
            cursor.close()
    finally:
        conn.close()
    zkOper = self.retrieve_zkOper()
    started_node_list = zkOper.retrieve_started_nodes()
    local_ip = get_localhost_ip()
    # Report only the *other* started nodes.
    if local_ip in started_node_list:
        started_node_list.remove(local_ip)
    result = {}
    result.setdefault('node_list', started_node_list)
    result.setdefault('stat_log_bin', stat_log_bin)
    return result
def get(self):
    """Report whether every db user is below 80% of its connection limit.

    Finishes "true" when the node is unmonitored or all users are within
    bounds, "false" when the DB is unreachable or any user exceeds 80%.
    """
    zkOper = self.retrieve_zkOper()
    if not is_monitoring(get_localhost_ip(), zkOper):
        self.finish("true")
        return
    conn = self.dba_opers.get_mysql_connection()
    if conn is None:
        self.finish("false")
        return
    # BUG FIX: the zk lookups below used to run between acquiring the
    # connection and the try/finally, so any exception there leaked the
    # connection.  Everything after acquisition is now inside the finally's
    # protection.
    try:
        '''
        @todo: dbs[0] need to refactor
        '''
        clusterUUID = zkOper.getClusterUUID()
        db_name = None
        dbs = zkOper.retrieve_db_list()
        if dbs:
            db_name = dbs[0]
        user_prop_dict = {}
        if db_name is not None:
            user_prop_dict = zkOper.retrieve_db_user_prop(clusterUUID,
                                                          db_name)
        for user_prop in user_prop_dict:
            max_user_connections_rows = self.dba_opers.show_user_max_conn(
                conn, user_prop, user_prop_dict[user_prop])
            current_user_connections_rows = self.dba_opers.show_user_current_conn(
                conn, user_prop, user_prop_dict[user_prop])
            if int(current_user_connections_rows
                   ) > int(max_user_connections_rows) * 0.8:
                self.finish("false")
                return
    finally:
        conn.close()
    self.finish("true")
def check(self, data_node_info_list):
    """Compare db users recorded in MySQL against those in zk and publish
    the outcome as monitoring status under db/dbuser (zk and ES).

    ``data_node_info_list`` is accepted for interface compatibility but is
    not used in this method.
    """
    #url_post = "/dbuser/inner/check"
    zkOper = Scheduler_ZkOpers()
    if not is_monitoring(get_localhost_ip(), zkOper):
        return
    monitor_type, monitor_key = "db", "dbuser"
    mysql_users, zk_users = self._get_check_user_list()
    error_record, differ_dict_set = {}, {}
    counts = dict(total=0, failed=0, success=0)
    if len(zk_users) == 0 and len(mysql_users) == 0:
        # Neither side has users at all.
        error_record.setdefault(
            "msg", "no database users in zk neither in mysql")
        differ_dict_set.setdefault("Empty", "")
    else:
        self.compare_center(mysql_users, zk_users, differ_dict_set, counts)
        counts["total"] = counts["success"] + counts["failed"]
    alarm_level = self.retrieve_alarm_level(counts["total"],
                                            counts["success"],
                                            counts["failed"])
    total_count = counts["total"]
    failed_count = counts["failed"]
    success_count = counts["success"]
    if differ_dict_set:
        error_record.setdefault("dif", differ_dict_set)
    super(Check_Database_User, self).write_status(
        total_count, success_count, failed_count, alarm_level,
        error_record, monitor_type, monitor_key)
    super(Check_Database_User, self).write_status_to_es(
        total_count, success_count, failed_count, alarm_level,
        error_record, monitor_type, monitor_key)
def get(self):
    """Heartbeat handler: register this node as started in zk and insert a
    timestamp row into a per-host rotating monitor table.

    Four tables tem<host>_0..3 rotate every six hours (offset = hour / 6,
    Python 2 floor division); the table two slots ahead is occasionally
    purged.  On success nothing is finished here — NOTE(review):
    presumably the response is completed elsewhere; confirm.  On any
    exception the handler finishes with 'false'.

    NOTE(review): ``conn`` is never closed in this handler — potential
    connection leak; confirm pooling/GC behaviour.
    """
    conn = self.dba_opers.get_mysql_connection()
    try:
        dataNodeProKeyValue = self.confOpers.getValue(
            options.data_node_property, ['dataNodeIp'])
        data_node_ip = dataNodeProKeyValue['dataNodeIp']
        zkOper = self.retrieve_zkOper()
        started_ip_list = zkOper.retrieve_started_nodes()
        identifier = socket.gethostname()
        '''
        @todo: review the comment code for arbitrator way
        '''
        # ret_dict = self.confOpers.getValue(options.data_node_property, ['dataNodeName','dataNodeIp'])
        # node_name = ret_dict['dataNodeName']
        # obj = re.search("-n-2", node_name)
        # if obj != None:
        #     self.finish("true")
        #     return
        if conn is None:
            # MySQL unreachable: de-register this node and stop innotop.
            if data_node_ip in started_ip_list:
                zkOper.remove_started_node(data_node_ip)
            self.invokeCommand.run_check_shell(options.kill_innotop)
            self.finish("false")
            return
        zkOper.write_started_node(data_node_ip)
        if not is_monitoring(get_localhost_ip(), zkOper):
            self.finish("true")
            return
        dbName = 'monitor'
        n_time = datetime.datetime.now()
        h = n_time.hour
        min = n_time.minute  # NOTE(review): shadows the builtin min()
        offset = h / 6  # current 6-hour table slot (Py2 floor division)
        tbName = ''
        prefix_tb_name = 'tem'
        mid_tb_name = str(identifier)
        # Hostname dashes are illegal in table names; map to underscores.
        mid_tb_name_rps = mid_tb_name.replace("-", "_")
        pre_tbname = prefix_tb_name + mid_tb_name_rps
        # Make sure all four rotating tables exist.
        for i in range(4):
            tbName = pre_tbname + "_" + str(i)
            self.dba_opers.check_create_table(conn, tbName, dbName)
        tbName = pre_tbname + "_" + str(offset)
        del_tbName = ''
        ft = float(time.time())
        # Around each 6-hour boundary, purge the table two slots ahead.
        # NOTE(review): the (1000000 * ft) % 10 == 0 float test makes this
        # purge probabilistic rather than exact — confirm intended.
        if h % 6 == 0 and min <= 59 and (1000000 * ft) % 10 == 0:
            int_tbName = (offset + 2) % 4
            del_tbName = "%s_%s" % (pre_tbname, int_tbName)
            self.dba_opers.delete_tb_contents(conn, del_tbName, dbName)
            logging.info(
                'delete the contents in database (%s) before 12 hours success!'
                % (del_tbName))
        str_time = n_time.strftime(TIME_FORMAT)
        self.dba_opers.insert_record_time(conn, str_time, identifier,
                                          tbName, dbName)
        logging.info('Insert time %s into table %s ' % (str_time, tbName))
        # NOTE(review): record_time is queried but never used or returned.
        record_time = self.dba_opers.query_record_time(
            conn, identifier, tbName, dbName)
    except Exception, e:
        return_flag = 'false'
        logging.error(e)
        self.finish(return_flag)
        return