class RemoveContainerHandler(APIHandler):
    """POST handler that destroys the container named by ``containerName``."""

    container_opers = Container_Opers()

    @asynchronous
    def post(self):
        """Destroy the requested container, or report that it does not exist.

        Reads ``containerName`` from the request arguments.

        Raises:
            HTTPAPIError: status 400 when ``containerName`` is missing/empty.
        """
        request_args = self.get_all_arguments()
        logging.info('all_arguments: %s' % str(request_args))

        container_name = request_args.get('containerName')
        if not container_name:
            raise HTTPAPIError(
                status_code=400,
                error_detail="no container_name argument!",
                notification="direct",
                log_message="no container_name argument!",
                response="please check params!")

        if self.container_opers.check_container_exists(container_name):
            # The response only acknowledges that destroy() was called.
            self.container_opers.destroy(container_name)
            self.finish({
                "message": "remove container has been done but need some time, please wait a moment and check the result!",
            })
            return

        # Nothing to remove: answer with a "not exist" status instead of
        # raising an error.
        self.finish({
            "status": "not exist",
            "message": "no need this operation, there is no such a container!",
        })
class StopContainerHandler(APIHandler):
    """POST handler that stops the container named by ``containerName``."""

    container_opers = Container_Opers()

    @asynchronous
    def post(self):
        """Stop the requested container unless it is already stopped.

        Raises:
            HTTPAPIError: status 417 when ``containerName`` is missing/empty
                or when no such container exists.
        """
        request_args = self.get_all_arguments()
        logging.info('all_arguments: %s' % str(request_args))

        container_name = request_args.get('containerName')
        if not container_name:
            raise HTTPAPIError(
                status_code=417,
                error_detail="no container_name argument!",
                notification="direct",
                log_message="no container_name argument!",
                response="please check params!")

        if not self.container_opers.check_container_exists(container_name):
            raise HTTPAPIError(
                status_code=417,
                error_detail="container %s not exist!" % container_name,
                notification="direct",
                log_message="container %s not exist!" % container_name,
                response="please check!")

        current_stat = self.container_opers.get_container_stat(container_name)
        if current_stat != Status.stopped:
            # The response only acknowledges that stop() was called.
            self.container_opers.stop(container_name)
            self.finish({
                "message": "due to stop a container need a little time, please wait and check the result~",
            })
            return

        # Already stopped: report the current status and do nothing else.
        self.finish({
            "status": current_stat,
            "message": "no need this operation, the container has been stopped!",
        })
class Containers_Oom_Worker(Abstract_Async_Thread):
    """Worker thread that records container OOM-related values to ZooKeeper."""

    container_opers = Container_Opers()

    def __init__(self, timeout=55):
        """Store ``timeout`` on the instance, then initialise the base thread."""
        self.timeout = timeout
        super(Containers_Oom_Worker, self).__init__()

    def run(self):
        """Record the OOM values when at least one cluster exists.

        Any ``Exception`` is pushed onto ``threading_exception_queue`` as a
        ``sys.exc_info()`` tuple rather than propagated.
        """
        try:
            if Scheduler_ZkOpers().retrieve_cluster_list():
                self.__action_record_containers_resource()
            else:
                logging.info('no cluster is created, no need to do this!')
        except Exception:
            self.threading_exception_queue.put(sys.exc_info())

    def __action_record_containers_resource(self):
        """Fetch each OOM resource item and write it to ZooKeeper."""
        logging.info('record containers under_oom, oom_kill_disable value')
        for item_name in ('under_oom', 'oom_kill_disable'):
            item_values = self.container_opers.get_containers_resource(
                item_name)
            self.container_opers.write_containers_resource_to_zk(
                item_name, item_values)
class ContainerCluster_create_Action(Base_ContainerCluster_create_Action):
    """Thread action that creates a container cluster from request args."""

    component_container_cluster_config_factory = ComponentContainerClusterConfigFactory()
    container_opers = Container_Opers()

    def __init__(self, args):
        """Keep the request ``args`` and pass them on to the base action."""
        super(ContainerCluster_create_Action, self).__init__(args)
        self.args = args

    def run(self):
        """Run :meth:`create` and always report the outcome afterwards.

        The outcome starts as ``Status.failed`` and is replaced only when
        ``create`` returns. Any exception is pushed onto
        ``threading_exception_queue``; the error message passed to
        ``update_zk_info_when_process_complete`` is always the empty string.
        """
        outcome = Status.failed
        error_text = ''
        cluster_name = self.args.get('containerClusterName')
        try:
            logging.debug('begin create')
            outcome = self.create(self.args)
        except:
            # Bare except: everything raised in this thread is queued.
            self.threading_exception_queue.put(sys.exc_info())
        finally:
            self.update_zk_info_when_process_complete(
                cluster_name, outcome, error_text)

    def create(self, args):
        """Build the cluster config, record it in ZooKeeper, then delegate.

        The resolved config is stored under ``args['component_config']``
        (only if that key is not already present) before the base class
        ``create`` is called.

        Returns:
            Whatever the base class ``create(args)`` returns.
        """
        logging.info('args:%s' % str(args))
        component_type = args.get('componentType')
        network_mode = args.get('networkMode')
        cluster_name = self.args.get('containerClusterName')

        cluster_config = self.component_container_cluster_config_factory.retrieve_config(args)
        wanted_nodes = cluster_config.nodeCount
        cluster_config.sum_count = wanted_nodes
        cluster_config.container_names = self.container_opers.generate_container_names(
            component_type, wanted_nodes, cluster_name)

        args.setdefault('component_config', cluster_config)
        self.__create_cluser_info_to_zk(
            network_mode, component_type, cluster_config)
        return super(ContainerCluster_create_Action, self).create(args)

    def __create_cluser_info_to_zk(self, network_mode, component_type,
                                   component_container_cluster_config):
        """Write the cluster summary record to ZooKeeper.

        ``isUseIp`` is true for every network mode other than ``'bridge'``.
        """
        cluster_summary = {
            'containerCount': component_container_cluster_config.nodeCount,
            'containerClusterName': component_container_cluster_config.container_cluster_name,
            'type': component_type,
            'isUseIp': network_mode != 'bridge',
        }
        Container_ZkOpers().write_container_cluster_info(cluster_summary)
class SetContainerCpusharesHandler(APIHandler):
    """POST handler that forwards the request to ``set_container_cpushares``."""

    container_opers = Container_Opers()

    def post(self):
        """Apply the cpushares request and finish with the operation result."""
        request_args = self.get_all_arguments()
        self.finish(self.container_opers.set_container_cpushares(request_args))
class CheckContainerStatusHandler(APIHandler):
    """GET handler that reports the check result for one container."""

    container_opers = Container_Opers()

    @asynchronous
    def get(self, container_name):
        """Finish the request with ``container_opers.check(container_name)``."""
        self.finish(self.container_opers.check(container_name))
class SetContainerCpusetHandler(APIHandler):
    """POST handler that forwards the request to ``set_container_cpuset``."""

    container_opers = Container_Opers()

    def post(self):
        """Apply the cpuset request and finish with ``{"message": <result>}``.

        The operation result is reported under the ``"message"`` key. Using
        ``dict.setdefault(ret)`` with a single argument would instead make
        the result a dictionary *key* mapped to ``None`` (and raise
        ``TypeError`` for an unhashable result such as a dict).
        """
        args = self.get_all_arguments()
        ret = self.container_opers.set_container_cpuset(args)
        self.finish({"message": ret})
class CheckContainerStatusHandler(BaseContainerHandler):
    """GET handler that reports the check result for one container.

    The check itself is wrapped by ``run_on_executor``/``run_callback`` and
    awaited from :meth:`get` via ``yield``.
    """

    container_opers = Container_Opers()

    @asynchronous
    @engine
    def get(self, container_name):
        """Wait for :meth:`do` and finish the request with its result."""
        check_result = yield self.do(container_name)
        self.finish(check_result)

    @run_on_executor()
    @run_callback
    def do(self, container_name):
        """Return ``container_opers.check(container_name)``."""
        checker = self.container_opers
        return checker.check(container_name)
class ContainerResourceHandler(object):
    """Base class for container resource collectors.

    Provides the shared pieces (which containers to collect from, writing a
    document to the index); subclasses must implement :meth:`gather`.
    """

    con_op = Container_Opers()
    con_cache = ContainerCache()

    # Class-level dicts, shared by every instance. They are not touched in
    # this class -- presumably filled by subclasses; verify before relying
    # on them.
    containers_diskio = {}
    containers_networkio = {}
    containers_cpuratio = {}

    def check_container_node_condition(self, container_node_detail):
        """Return True when resources may be collected for this container.

        Args:
            container_node_detail: object exposing ``cluster_name`` and
                ``container_name``; may be ``None`` or otherwise falsy.

        Returns:
            bool: True only if the detail is truthy, its cluster is started
            and its container name is legal.
        """
        # Guard first: a missing detail has no attributes to inspect and
        # would otherwise fail with AttributeError.
        if not container_node_detail:
            return False
        is_cluster_start = self.con_op.cluster_start(
            container_node_detail.cluster_name)
        is_container_name_legal = self.con_op.check_container_name_legal(
            container_node_detail.container_name)
        return bool(is_cluster_start and is_container_name_legal)

    def get_container_nodes(self):
        """Yield the details of the containers to collect resources from.

        This is a generator rather than a list builder: the code was once
        suspected of exhausting memory, so the former local
        ``container_nodes = []`` accumulator was replaced by ``yield``.
        """
        # Iterate over a copy of the current id collection.
        for con_id in self.con_cache.current_ids.copy():
            detail = self.con_cache.find_detail_by_id(con_id)
            # Only ids that were absent from the previous cache round are
            # checked; ids seen last round were already checked then, which
            # saves ZooKeeper connections and similar costs.
            if con_id not in self.con_cache.old_ids:
                if not self.check_container_node_condition(detail):
                    # Pre-collection check failed: skip this container.
                    continue
            if detail is not None:
                yield detail

    def write_to_es(self, resource_type, doc):
        """Index ``doc`` into the per-day index for ``resource_type``.

        The index name is
        ``monitor_container_resource_<resource_type>_<YYYYMMDD>`` based on
        the current UTC date. ``doc`` is modified in place: a ``timestamp``
        key holding the current UTC datetime is added or overwritten.
        """
        _now = datetime.utcnow()
        _index = "monitor_container_resource_{0}_{1}".format(
            resource_type, _now.strftime('%Y%m%d'))
        doc.update({'timestamp': _now})
        ServerRes.index(index=_index, doc_type=resource_type, body=doc)

    def gather(self):
        """Collect resources; must be overridden by subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        # ``NotImplemented`` is a non-callable constant meant for rich
        # comparisons; the exception class is ``NotImplementedError``.
        raise NotImplementedError("this gather method should be implemented")
class ContainerHandler(APIHandler):
    """POST handler that creates a container from the request arguments."""

    container_opers = Container_Opers()
    component_docker_model_factory = ComponentDockerModelFactory()

    # @asynchronous
    def post(self):
        """Build a docker model from the arguments and create the container."""
        request_args = self.get_all_arguments()
        self.container_opers.create(self.__create_docker_module(request_args))
        self.finish({"message": "Success Create Container"})

    def __create_docker_module(self, arg_dict):
        """Log the incoming arguments and return the factory-built model."""
        logging.info('get create container args : %s, type:%s'
                     % (str(arg_dict), type(arg_dict)))
        return self.component_docker_model_factory.create(arg_dict)
class GatherClusterResourceHandler(APIHandler):
    """GET handler returning one resource type for every container of a
    cluster, in the shape the web portal needs.
    """

    container_opers = Container_Opers()

    def cluster_resoure(self, cluster, resource_type):
        """Collect ``resource_type`` for each container node of ``cluster``.

        Returns:
            list of dicts with keys ``value``, ``hostIp`` and
            ``containerName``, one per container node.

        Raises:
            HTTPAPIError: status 417 when the cluster does not exist.
        """
        zk_client = Requests_ZkOpers()
        if not zk_client.check_containerCluster_exists(cluster):
            error_message = 'container cluster %s not exist, please check your cluster name' % cluster
            raise HTTPAPIError(status_code=417,
                               error_detail=error_message,
                               notification="direct",
                               log_message=error_message,
                               response=error_message)

        collected = []
        for node in zk_client.retrieve_container_list(cluster):
            # Values are evaluated in the written order: resource value,
            # host ip, then container name.
            collected.append({
                'value': zk_client.retrieve_container_resource(
                    cluster, node, resource_type),
                'hostIp': self.container_opers.get_host_ip_from_zk(
                    cluster, node),
                'containerName': self.container_opers.get_container_name_from_zk(
                    cluster, node),
            })
        return collected

    @asynchronous
    @engine
    def get(self, cluster, resource_type):
        """Wait for :meth:`do` and finish with ``{'data': <result>}``."""
        payload = yield self.do(cluster, resource_type)
        self.finish({'data': payload})

    @run_on_executor()
    @run_callback
    def do(self, cluster, resource_type):
        """Executor-side entry point; delegates to :meth:`cluster_resoure`."""
        return self.cluster_resoure(cluster, resource_type)
class BaseContainerHandler(APIHandler):
    """Shared GET handler logic for per-container resource queries."""

    container_opers = Container_Opers()

    def check_container_name(self, container_name):
        """Ensure ``container_name`` refers to an existing container.

        Raises:
            HTTPAPIError: status 417 when the container does not exist.
        """
        if self.container_opers.check_container_exists(container_name):
            return
        error_message = 'container %s not exist, please check your container name' % container_name
        raise HTTPAPIError(status_code=417,
                           error_detail=error_message,
                           notification="direct",
                           log_message=error_message,
                           response=error_message)

    def get_container_resource(self, container_name, resource_type):
        """Look up one resource value of a container in ZooKeeper.

        Returns:
            dict with keys ``value`` and ``containerName``.
        """
        zk_client = Requests_ZkOpers()
        owning_cluster = get_containerClusterName_from_containerName(
            container_name)
        node = self.container_opers.get_container_node_from_container_name(
            owning_cluster, container_name)
        return {
            'value': zk_client.retrieve_container_resource(
                owning_cluster, node, resource_type),
            'containerName': container_name,
        }

    @asynchronous
    @engine
    def get(self, container_name, resource_type):
        """Wait for :meth:`do` and finish the request with its result."""
        payload = yield self.do(container_name, resource_type)
        self.finish(payload)

    @run_on_executor()
    @run_callback
    def do(self, container_name, resource_type):
        """Validate the container name, then fetch the requested resource."""
        self.check_container_name(container_name)
        return self.get_container_resource(container_name, resource_type)
class ManagerStatusHandler(APIHandler):
    """POST handler that validates the manager status of a container."""

    container_opers = Container_Opers()

    @asynchronous
    def post(self):
        """Validate manager status and finish with ``{"message": <result>}``.

        eg. curl --user root:root -d "containerName=d-mcl-zz2-n-3&componentType=mcluster" /container/manager/status

        Raises:
            HTTPAPIError: status 417 when ``containerName`` or
                ``componentType`` is missing/empty.
        """
        request_args = self.get_all_arguments()
        container_name = request_args.get('containerName')
        component_type = request_args.get('componentType')

        if not container_name or not component_type:
            raise HTTPAPIError(
                status_code=417,
                error_detail="no containerName or componentType argument!",
                notification="direct",
                log_message="no containerName or componentType argument!",
                response="please check params!")

        validation = self.container_opers.manager_status_validate(
            component_type, container_name)
        self.finish({"message": validation})
class Base_ContainerCluster_Action(Abstract_Async_Thread):
    """Thread that issues an action to the containers of a cluster.

    If param "containers" is not given, the action is about the whole
    cluster; if "containers" are given, the action is about those
    containers only.
    """

    container_opers = Container_Opers()

    def __init__(self, containerClusterName, action, containers=None):
        """Remember the target cluster, the action and optional containers.

        Args:
            containerClusterName: name of the cluster to act on.
            action: used as the last path segment of the request URL,
                ``/container/<action>``; ``'remove'`` triggers extra
                clean-up once the containers are destroyed.
            containers: optional iterable of container names restricting
                the action to these containers.
        """
        super(Base_ContainerCluster_Action, self).__init__()
        self.cluster = containerClusterName
        self.action = action
        self.containers = containers

    def run(self):
        """Issue the action; queue any exception instead of raising it."""
        try:
            self.__issue_action()
        except:
            # Bare except: everything raised in this thread is queued.
            self.threading_exception_queue.put(sys.exc_info())

    def __issue_action(self):
        """POST the action to every target container on its host.

        For the ``'remove'`` action, once all dispatched containers are
        reported destroyed, :meth:`do_when_remove_cluster` is called.
        """
        params = self.__get_params()
        adminUser, adminPasswd = _retrieve_userName_passwd()
        logging.info('params: %s' % str(params))

        # Every container name a request was issued for, across ALL hosts.
        # The destroyed-check below must cover all of them, not only the
        # containers of whichever host happened to be iterated last.
        dispatched_names = []

        async_client = AsyncHTTPClient()
        try:
            for host_ip, container_name_list in params.items():
                logging.info('container_name_list %s in host %s ' %
                             (str(container_name_list), host_ip))
                dispatched_names.extend(container_name_list)
                for container_name in container_name_list:
                    args = {'containerName': container_name}
                    request_uri = 'http://%s:%s/container/%s' % (
                        host_ip, options.port, self.action)
                    logging.info('post----- url: %s, \n body: %s' %
                                 (request_uri, str(args)))
                    async_http_post(async_client,
                                    request_uri,
                                    body=args,
                                    auth_username=adminUser,
                                    auth_password=adminPasswd)
        finally:
            async_client.close()

        # Nothing dispatched means nothing to wait for or clean up; this
        # also covers the empty-params case, where no per-host list exists.
        if (self.action == 'remove' and dispatched_names and
                self._check_is_cluster_destroyed(dispatched_names)):
            self.do_when_remove_cluster()

    def _check_is_cluster_destroyed(self, container_name_list):
        """Poll until every listed container is reported destroyed.

        Polls up to 30 times, sleeping 2 seconds after each unsuccessful
        round. A container counts as destroyed when its status record is
        empty/missing or its ``status`` equals ``Status.destroyed``.

        Returns:
            bool: True as soon as all containers are destroyed in one
            round; False if that never happens within the polling budget.
        """
        timeout = 30
        for _ in range(timeout):
            if all(self.__is_destroyed(name) for name in container_name_list):
                return True
            time.sleep(2)
        return False

    def __is_destroyed(self, container_name):
        """Return True when a single container is reported destroyed."""
        stats = self.container_opers.retrieve_container_status_from_containerName(
            container_name)
        return not (stats and stats.get('status') != Status.destroyed)

    def do_when_remove_cluster(self):
        """Return the cluster's container entries to the IP pool.

        Only done when the cluster info has a truthy ``isUseIp``.
        """
        zkOper = Container_ZkOpers()
        cluster_info = zkOper.retrieve_container_cluster_info(self.cluster)
        if cluster_info.get('isUseIp'):
            container_ip_list = zkOper.retrieve_container_list(self.cluster)
            logging.info('container_ip_list:%s' % str(container_ip_list))
            zkOper.recover_ips_to_pool(container_ip_list)

    def __get_params(self):
        """Group the target container names by host ip.

        Two containers may share one host ip, so each ip maps to a list of
        names. Also stores the resolved node list in ``self.container_nodes``.

        Returns:
            dict: ``{host_ip: [container_name, ...]}``; within a host the
            most recently processed container comes first.
        """
        params = {}
        zkOper = Container_ZkOpers()

        if self.containers:
            container_nodes = [
                self.container_opers.get_container_node_from_container_name(
                    self.cluster, container)
                for container in self.containers
            ]
        else:
            container_nodes = zkOper.retrieve_container_list(self.cluster)
        self.container_nodes = container_nodes

        for container_node in self.container_nodes:
            container_info = zkOper.retrieve_container_node_value(
                self.cluster, container_node)
            container_name = container_info.get('containerName')
            host_ip = container_info.get('hostIp')
            # Prepend, so the newest name is first in the host's list.
            params[host_ip] = [container_name] + params.get(host_ip, [])
        return params
class Server_Res_Opers():
    """Reads host-level resource figures (memory, disk, load, CPU).

    TODO: separate resource collection from writing, plan the interfaces
    properly, and refactor?

    NOTE(review): ``__init__`` calls ``get_top_cmd_ret`` and
    ``get_container_id_pid_dict``, which are not defined in the visible
    part of this class -- confirm where they come from.
    """

    _logger = logging.getLogger("process_info")
    _logger.setLevel(logging.INFO)

    docker_opers = Docker_Opers()
    container_opers = Container_Opers()
    _server_cpu_ratio = CPURatio()

    def __init__(self, container_name=""):
        """Remember the container name; load per-container data if given.

        When ``container_name`` is the empty string, ``matrix_list`` and
        ``id_pid_dict`` are not set on the instance.
        """
        self.name = container_name
        if self.name != "":
            self.matrix_list = self.get_top_cmd_ret()
            self.id_pid_dict = self.get_container_id_pid_dict(self.name)

    def container_count(self):
        """Return the number of containers reported by ``container_opers``."""
        return len(self.container_opers.get_all_containers())

    def memory_stat(self):
        """Return host memory figures parsed from ``/proc/meminfo``.

        Each meminfo value is multiplied by 1024. Buffers and page cache
        are counted as free memory.

        Returns:
            dict with integer values for ``total``, ``used`` and ``free``.
        """
        mem = {}
        # ``with`` guarantees the handle is closed even if reading fails.
        with open("/proc/meminfo", "r") as f:
            for line in f:
                name, _, rest = line.partition(':')
                fields = rest.split()
                if not fields:
                    # Blank line or a line with no value: nothing to record.
                    continue
                # int() handles arbitrarily large values, so the
                # Python-2-only long() is not needed.
                mem[name] = int(fields[0]) * 1024.0

        reclaimable = mem['MemFree'] + mem['Buffers'] + mem['Cached']
        return {
            'total': int(mem['MemTotal']),
            'used': int(mem['MemTotal'] - mem['MemFree'] - mem['Buffers'] -
                        mem['Cached']),
            'free': int(reclaimable),
        }

    def disk_iops(self):
        """Return ``diskio.iops`` for the docker vfs and ``/srv`` mounts."""
        mountpoints = ('/srv/docker/vfs', '/srv')
        return diskio.iops(mountpoints)

    def srv_disk_stat(self):
        """Return ``disk_stat`` for ``/srv/docker/vfs``.

        @todo: monitor all disks and partitions
        """
        return disk_stat('/srv/docker/vfs')

    def disk_loadavg(self):
        """Return the first five fields of ``/proc/loadavg`` as strings.

        Returns:
            dict with keys ``lavg_1``, ``lavg_5``, ``lavg_15``, ``nr`` and
            ``last_pid``, taken positionally from the file.
        """
        with open("/proc/loadavg", "r") as f:
            con = f.read().split()

        loadavg = {
            'lavg_1': con[0],
            'lavg_5': con[1],
            'lavg_15': con[2],
            'nr': con[3],
            'last_pid': con[4],
        }
        self._logger.info("disk io information: " + str(loadavg))
        return loadavg

    def cpu_ratio(self):
        """Return the current result of the shared CPU ratio collector."""
        return self._server_cpu_ratio.get_result()

    @property
    def server_cpu_ratio(self):
        """The shared ``CPURatio`` collector object itself."""
        return self._server_cpu_ratio