    def update_storages(self, resource_pool_uuid, data):
        if resource_pool_uuid:
            nodes = db_api.get_node_with_all(
                {'resource_pool_uuid': resource_pool_uuid})
        else:
            nodes = db_api.get_node_with_all({})
        # Every node must have the storage path before it can be configured
        node_uuid_list = []
        for node in nodes:
            node_uuid_list.append(node.uuid)
            storages = db_api.get_node_storage_all({'node_uuid': node.uuid})
            for path, role in data.items():
                for store in storages:
                    if path == store.path:
                        break
                else:
                    return build_result("NodeStorageNotExist",
                                        node=node.name, path=path)
        # Assign the new roles to the storage paths, and clear the roles
        # previously assigned to any other path
        storages = db_api.get_node_storage_all({})
        for store in storages:
            if store.node_uuid in node_uuid_list:
                for path, role in data.items():
                    if path == store.path:
                        store.role = role
                        break
                else:
                    store.role = ''
                store.soft_update()
        return build_result("Success")

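    # A minimal, self-contained sketch of the role-assignment logic above,
    # using plain dicts instead of DB rows (all names and paths here are
    # illustrative only): each storage record takes the role from `data` if
    # its path matches, otherwise its role is cleared -- the same for/else
    # pattern as update_storages.
    #
    # def _assign_roles_sketch(storages, data):
    #     for store in storages:
    #         for path, role in data.items():
    #             if path == store['path']:
    #                 store['role'] = role
    #                 break
    #         else:
    #             store['role'] = ''
    #     return storages
    #
    # assert _assign_roles_sketch(
    #     [{'path': '/opt/fast', 'role': ''},
    #      {'path': '/opt/old', 'role': 'data'}],
    #     {'/opt/fast': 'system'},
    # ) == [{'path': '/opt/fast', 'role': 'system'},
    #       {'path': '/opt/old', 'role': ''}]
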
def update_node_performance():
    nodes = db_api.get_node_with_all({'deleted': False})
    for node in nodes:
        try:
            if node.status == constants.STATUS_ACTIVE:
                ret = monitor_post(
                    node.ip, 'api/v1/monitor/resource_perf_for_database',
                    {'statis_period': 30})
                if ret.get('code') == 0:
                    ret_data = ret.get("data", {})
                    node_utc = ret_data.get("utc", 0)
                    node_datetime = dt.datetime.fromtimestamp(node_utc)
                    insert_data = {
                        "node_uuid": node.uuid,
                        "node_datetime": node_datetime,
                        "monitor_info": json.dumps(ret_data)
                    }
                    logger.debug(
                        "insert monitor performance data, node_ip: {}, "
                        "data: {}".format(node.ip, ret))
                    db_api.add_monitor_half_min(insert_data)
                else:
                    logger.error(
                        "monitor server error, node_ip: {}, ret: {}".format(
                            node.ip, ret))
        except Exception as e:
            logger.error("get node performance error: %s", e, exc_info=True)

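# monitor_post is the project-internal HTTP helper used throughout this
# module. Its implementation is not shown here; judging from the call sites
# it POSTs JSON to the node's monitor service and returns the decoded body,
# roughly like this hedged sketch (URL scheme, port and timeout are
# assumptions, not taken from the source):
#
#   import requests
#
#   def monitor_post(ip, url, data, port=50000, timeout=10):
#       # Returns {"code": 0, "data": {...}} on success; a non-zero "code"
#       # (or a raised exception) signals failure to the callers above.
#       resp = requests.post(
#           "http://%s:%s/%s" % (ip, port, url.lstrip('/')),
#           json=data, timeout=timeout)
#       return resp.json()
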
    def rollback_publish(self, package_id, package_path):
        logger.info("rollback publish upgrade package on compute nodes")
        nodes = db_api.get_node_with_all({})
        tasks = list()
        failed_nodes = list()
        # Delete the package from every node in parallel
        with ThreadPoolExecutor(max_workers=constants.MAX_THREADS) as executor:
            for node in nodes:
                task = executor.submit(self._sync_delete_package, node.ip,
                                       package_id, package_path)
                tasks.append(task)
            for future in as_completed(tasks):
                res = future.result()
                if res.get("code") != 0:
                    logger.error(
                        "node: %s rollback publish upgrade package failed: %s",
                        res.get("ipaddr", ""), res.get("msg", ""))
                    failed_nodes.append({
                        "ipaddr": res.get("ipaddr", ""),
                        "msg": res.get("msg", "")
                    })
        if failed_nodes:
            return get_error_result("UploadPackageSyncError",
                                    {"failed_nodes": failed_nodes})
        return get_error_result("Success")

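    # The fan-out above relies on a simple worker contract, inferred from the
    # res.get(...) calls (the helper bodies are not shown here):
    # _sync_delete_package, like _sync_download_package in publish later in
    # this module, must not raise and must return a dict shaped like:
    #
    #   {"code": 0, "ipaddr": "192.168.1.10", "msg": ""}         # success
    #   {"code": 1, "ipaddr": "192.168.1.11", "msg": "timeout"}  # failure
    #
    # so that failures from all nodes can be aggregated into failed_nodes.
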
    def check_node_status(self):
        nodes = db_api.get_node_with_all({})
        master = None
        slaves = list()
        for node in nodes:
            if not icmp_ping(node.ip) or \
                    node.status == constants.STATUS_SHUTDOWN:
                return get_error_result("NodeIPConnetFail")
            if node.type in [constants.ROLE_MASTER_AND_COMPUTE,
                             constants.ROLE_MASTER]:
                master = node.ip
            else:
                slaves.append(node.ip)
        return {"master": master, "slaves": slaves}

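    # icmp_ping is another project-internal helper; a plausible minimal
    # sketch, assuming it shells out to the system ping on Linux (the real
    # implementation may differ):
    #
    #   import subprocess
    #
    #   def icmp_ping(ip, count=1, timeout=2):
    #       """Return True if the host answers ICMP echo within the timeout."""
    #       ret = subprocess.call(
    #           ['ping', '-c', str(count), '-W', str(timeout), ip],
    #           stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    #       return ret == 0
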
    def publish(self, package_id, package_path, md5_value=None):
        logger.info("sync the upgrade package to compute nodes")
        controller_image = db_api.get_controller_image()
        nodes = db_api.get_node_with_all({})
        tasks = list()
        failed_nodes = list()
        bind = SERVER_CONF.addresses.get_by_default('upgrade_bind', '')
        if bind:
            port = bind.split(':')[-1]
        else:
            port = constants.UPGRADE_DEFAULT_PORT
        with ThreadPoolExecutor(max_workers=constants.MAX_THREADS) as executor:
            for node in nodes:
                # The controller already holds the package, so skip it
                if node.type in [constants.ROLE_MASTER_AND_COMPUTE,
                                 constants.ROLE_MASTER]:
                    continue
                task = executor.submit(
                    self._sync_download_package,
                    "http://%s:%s" % (controller_image.ip, port),
                    node.ip, package_id, package_path, md5_value)
                tasks.append(task)
            for future in as_completed(tasks):
                res = future.result()
                if res.get("code") != 0:
                    logger.error("node: %s sync upgrade package failed: %s",
                                 res.get("ipaddr", ""), res.get("msg", ""))
                    failed_nodes.append({
                        "ipaddr": res.get("ipaddr", ""),
                        "msg": res.get("msg", "")
                    })
        if failed_nodes:
            return get_error_result("UploadPackageSyncError",
                                    {"failed_nodes": failed_nodes})
        return get_error_result("Success")

def update_node_status():
    # After HA is enabled, the type of the master/backup controller nodes is
    # dynamic, so refresh the HA info first to make sure each node type is
    # correct.
    update_ha_master()
    nodes = db_api.get_node_with_all({'deleted': False})
    for node in nodes:
        is_restart = False
        is_shutdowning = False
        if node.status == constants.STATUS_DELETING:
            continue
        if node.status == constants.STATUS_SHUTDOWNING:
            is_shutdowning = True
            # continue
        logger.debug("node %s updating", node.name)
        status = constants.STATUS_ACTIVE
        if node.status == constants.STATUS_RESTARTING:
            # Give a restarting node a 120-second grace period before an
            # unreachable IP is treated as a real failure. (The original
            # parsed the seconds field out of str(timedelta), which wraps
            # at 60; total_seconds() is the intended comparison.)
            restart_time = node.updated_at
            now = datetime.datetime.utcnow()
            if (now - restart_time).total_seconds() <= 120:
                if not icmp_ping(node.ip, count=2):
                    continue
                is_restart = True
        if not icmp_ping(node.ip, count=3):
            if not is_restart:
                status = constants.STATUS_SHUTDOWN
                # rep_json = check_node_status(node.ip)
                # if rep_json.get('code') != 0:
                #     status = constants.STATUS_SHUTDOWN
        else:
            try:
                ret = monitor_post(node.ip, 'api/v1/monitor/memory', {})
                if ret.get('code') == 0:
                    mem_info = ret['data']
                    node.running_mem = \
                        mem_info["available"] / 1024 / 1024 / 1024
                    node.total_mem = mem_info['total'] / 1024 / 1024 / 1024
                    node.mem_utilization = mem_info["utilization"]
                    ret = monitor_post(node.ip, 'api/v1/monitor/cpu', {})
                    cpu_ratio = 0
                    if ret.get('code') == 0:
                        cpu_info = ret['data']
                        cpu_ratio = cpu_info["utilization"]
                        node.cpu_utilization = cpu_info["utilization"]
                    node.soft_update()
                    if cpu_ratio >= 95:
                        status = constants.STATUS_ERROR
                    ret = monitor_post(node.ip, 'api/v1/monitor/service', {})
                    if ret.get('code') == 0:
                        services = ret['data']
                        not_running_services = list(
                            filter(
                                lambda service: services[service] != 'running',
                                services.keys()))
                        if node.type in [constants.ROLE_MASTER_AND_COMPUTE,
                                         constants.ROLE_MASTER]:
                            node_services = constants.MASTER_SERVICE
                        elif node.type in [constants.ROLE_SLAVE_AND_COMPUTE,
                                           constants.ROLE_COMPUTE]:
                            node_services = constants.COMPUTE_SERVICE
                        else:
                            node_services = []
                        update_service_status(node, services, node_services)
                        for service in not_running_services:
                            if service in node_services:
                                logger.error("service %s is not running",
                                             service)
                                status = constants.STATUS_ERROR
                                break
                    else:
                        status = constants.STATUS_ERROR
                else:
                    status = constants.STATUS_ERROR
            except Exception as e:
                logger.error("get service status error: %s", e, exc_info=True)
                status = constants.STATUS_ERROR
        if node.status != status:
            # While a node is restarting or shutting down, ignore transient
            # errors on nodes whose type is not 1 or 3
            if status == constants.STATUS_ERROR and is_restart \
                    and node.type not in [1, 3]:
                continue
            elif status == constants.STATUS_ERROR and is_shutdowning \
                    and node.type not in [1, 3]:
                continue
            logger.info("node %s status change from %s to %s",
                        node.ip, node.status, status)
            node.status = status
            node.soft_update()
        # As long as the node is not shut down, the monitor service can be
        # queried for disk usage info
        if status != constants.STATUS_SHUTDOWN:
            update_node_storage(node.ip, node.uuid)

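# A self-contained illustration of the service filter used above: the
# monitor endpoint returns {service_name: state}, and only tracked services
# that are not 'running' flip the node into the error state. Service names
# here are illustrative.
#
# def _not_running_sketch(services, tracked):
#     down = [name for name, state in services.items()
#             if state != 'running']
#     return [name for name in down if name in tracked]
#
# assert _not_running_sketch(
#     {'nginx': 'running', 'mariadb': 'stopped', 'extra': 'stopped'},
#     tracked=['nginx', 'mariadb'],
# ) == ['mariadb']
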
    def get_top_data(self, statis_period):
        logger.debug("get nodes resource_statis top5 data, period {}".format(
            statis_period))
        url = "/api/v1/monitor/resource_statis"
        request_data = {"statis_period": statis_period}
        all_node_cpu_info = []
        all_node_memory_info = []
        all_node_disk_info = []
        all_node_nic_info = []
        # Fan the statistics request out to every active node in parallel
        try:
            nodes = db_api.get_node_with_all(
                {'status': constants.STATUS_ACTIVE})
            workers = len(nodes) if len(nodes) > 0 else 1
            all_task = []
            with ThreadPoolExecutor(max_workers=workers) as executor:
                for node in nodes:
                    # Get the node's system resource perf statistics. Build
                    # a fresh dict per task: reusing one mutable dict across
                    # submits would race with the worker threads.
                    node_request = dict(request_data,
                                        node_name=node.name,
                                        node_uuid=node.uuid,
                                        node_ip=node.ip)
                    future = executor.submit(monitor_post, node.ip, url,
                                             node_request)
                    all_task.append(future)
                for future in as_completed(all_task):
                    rep_json = future.result()
                    logger.debug("rep:%s", rep_json)
                    if rep_json["code"] != 0:
                        logger.error(
                            "get node resource_statis info fail: {}".format(
                                rep_json))
                        continue
                    node_name = rep_json.get("data").get("node_name", "")
                    node_uuid = rep_json.get("data").get("node_uuid", "")
                    node_ip = rep_json.get("data").get("node_ip", "")
                    all_node_cpu_info.append(
                        (node_name, rep_json["data"]["cpu_util"]))
                    all_node_memory_info.append(
                        (node_name, rep_json["data"]["memory_util"]))
                    # Sum usage over all of the node's SSD disk paths
                    storages = db_api.get_node_storage_all(
                        {'node_uuid': node_uuid})
                    disk_ssd = [0, 0]  # [total, used]
                    for storage in storages:
                        # storage.type: 1-ssd, 2-sata
                        if storage.type == 1 and storage.path in rep_json[
                                "data"]["disk_util"].keys():
                            logger.debug(storage.path)
                            disk_ssd[0] += rep_json["data"]["disk_util"][
                                storage.path]["total"]
                            disk_ssd[1] += rep_json["data"]["disk_util"][
                                storage.path]["used"]
                    all_node_disk_info.append(
                        (node_name,
                         '%0.2f' % (disk_ssd[1] / disk_ssd[0] * 100)
                         if disk_ssd[0] else 0,
                         disk_ssd[0], disk_ssd[1]))
                    # Only the management network NIC is reported, looked up
                    # via yzy_node_network_info / yzy_interface_ip is_manage
                    manage_network_name = db_api.get_node_manage_nic_name(
                        node_uuid)
                    logger.debug(manage_network_name)
                    if manage_network_name and manage_network_name in \
                            rep_json["data"]["nic_util"].keys():
                        all_node_nic_info.append(
                            (node_name,
                             rep_json["data"]["nic_util"]
                             [manage_network_name]["sum_bytes_avg"],
                             rep_json["data"]["nic_util"]
                             [manage_network_name]["sum_bytes_max"]))
            resp = get_error_result("Success")
            resp["data"] = {}
            resp["data"]['utc'] = int(
                (dt.datetime.utcnow() -
                 dt.datetime.utcfromtimestamp(0)).total_seconds())
            # Sort each metric in descending order and keep the top 5 nodes
            all_node_cpu_info.sort(key=lambda x: float(x[1]), reverse=True)
            resp["data"]["cpu_util"] = all_node_cpu_info[0:5]
            all_node_memory_info.sort(key=lambda x: float(x[1]), reverse=True)
            resp["data"]["memory_util"] = all_node_memory_info[0:5]
            all_node_disk_info.sort(key=lambda x: float(x[1]), reverse=True)
            resp["data"]["disk_util"] = all_node_disk_info[0:5]
            all_node_nic_info.sort(key=lambda x: float(x[1]), reverse=True)
            resp["data"]["nic_util"] = all_node_nic_info[0:5]
            return resp
        except Exception as err:
            logger.error("err {}".format(err))
            logger.error(''.join(traceback.format_exc()))
            return get_error_result("OtherError")

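    # Design note: each metric list above is fully sorted just to keep five
    # entries. An equivalent (and slightly cheaper for large clusters)
    # alternative is heapq.nlargest, shown as a sketch:
    #
    #   import heapq
    #
    #   top5_cpu = heapq.nlargest(5, all_node_cpu_info,
    #                             key=lambda x: float(x[1]))
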