class AllocationConfiguration:
    # Default value for cpu.cpu_period [ms] (used as denominator).
    cpu_quota_period: Numeric(1000, 1000000) = 1000

    # Multiplier of AllocationType.CPU_SHARES allocation value.
    # E.g. setting 'CPU_SHARES' to 2.0 will effectively set 2000 shares
    # in the cgroup cpu controller.
    cpu_shares_unit: Numeric(1000, 1000000) = 1000

    # Default resource allocation for last level cache (L3) and memory bandwidth
    # for the root RDT group.
    # The root RDT group is used as the default group for all tasks, unless explicitly
    # reconfigured by an allocator.
    # `None` (the default value) means no limit (effectively set to the maximum available value).
    default_rdt_l3: Str = None
    default_rdt_mb: Str = None
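A minimal sketch (not part of the original module; the helper name is hypothetical) of how `cpu_shares_unit` is meant to scale a normalized CPU_SHARES allocation into a raw cgroup `cpu.shares` value:

# Hypothetical helper, not from the original source.
def normalized_shares_to_cgroup_shares(
        normalized_shares: float,
        config: AllocationConfiguration) -> int:
    # E.g. normalized_shares=2.0 with the default cpu_shares_unit=1000
    # yields 2000, the value that would be written to the cpu.shares file.
    return int(normalized_shares * config.cpu_shares_unit)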
def __init__(
        self,
        node: nodes.Node,
        metrics_storage: storage.Storage = DEFAULT_STORAGE,
        action_delay: Numeric(0, 60) = 1.,  # [s]
        rdt_enabled: Optional[bool] = None,  # Default (None) means auto configuration.
        extra_labels: Dict[Str, Str] = None,
        event_names: List[str] = None,
        enable_derived_metrics: bool = False,
        _allocation_configuration: Optional[AllocationConfiguration] = None,
):
    self._node = node
    self._metrics_storage = metrics_storage
    self._action_delay = action_delay
    self._rdt_enabled = rdt_enabled
    # Disabled by default, to be overridden by subclasses.
    self._rdt_mb_control_required = False
    # Disabled by default, to be overridden by subclasses.
    self._rdt_cache_control_required = False
    self._extra_labels = extra_labels or dict()
    self._finish = False  # Guard to stop iterations.
    self._last_iteration = time.time()  # Used internally by the wait function.
    self._allocation_configuration = _allocation_configuration
    self._event_names = event_names or DEFAULT_EVENTS
    self._enable_derived_metrics = enable_derived_metrics
class Prometheus:
    host: str
    port: int
    timeout: Optional[Numeric(1, 60)] = 1.0
    ssl: Optional[SSL] = None
    time: Optional[str] = None  # Evaluation timestamp.

    def do_query(self, query: str, use_time: bool = True):
        """Implements:
        https://prometheus.io/docs/prometheus/2.16/querying/api/#instant-queries"""
        url = URL_TPL.format(
            prometheus_ip='{}:{}'.format(self.host, str(self.port)),
            path=QUERY_PATH,
            name=query)
        if self.time and use_time:
            url += '&time={}'.format(self.time)
        try:
            if self.ssl:
                s = requests.Session()
                s.mount('https://{}:{}'.format(self.host, self.port), HTTPSAdapter())
                response = s.get(url,
                                 timeout=self.timeout,
                                 verify=self.ssl.server_verify,
                                 cert=self.ssl.get_client_certs())
            else:
                response = requests.get(url, timeout=self.timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            raise PrometheusDataProviderException(e)

        return response.json()['data']['result']
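A minimal usage sketch (not from the original source; assumes a Prometheus instance reachable on localhost and URL_TPL/QUERY_PATH configured as in the module above). An instant query returns a list of samples, each with 'metric' and 'value' entries:

# Hypothetical usage; assumes a reachable Prometheus at 127.0.0.1:9090.
prometheus = Prometheus(host='127.0.0.1', port=9090)
for sample in prometheus.do_query('up'):
    print(sample['metric'], sample['value'])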
class Kubeapi:
    host: Str = None
    port: Str = None  # Because !Env is String and another type cast might be problematic.
    client_token_path: Optional[Path(absolute=True, mode=os.R_OK)] = SERVICE_TOKEN_FILENAME
    server_cert_ca_path: Optional[Path(absolute=True, mode=os.R_OK)] = SERVICE_CERT_FILENAME
    timeout: Numeric(1, 60) = 5  # [s]
    monitored_namespaces: List[Str] = field(default_factory=lambda: ["default"])

    def __post_init__(self):
        self.endpoint = "https://{}:{}".format(self.host, self.port)
        log.debug("Created kubeapi endpoint %s", self.endpoint)

        with pathlib.Path(self.client_token_path).open() as f:
            self.service_token = f.read()

    def request_kubeapi(self, target):
        full_url = urljoin(self.endpoint, target)
        r = requests.get(
            full_url,
            headers={"Authorization": "Bearer {}".format(self.service_token)},
            timeout=self.timeout,
            verify=self.server_cert_ca_path)

        if not r.ok:
            log.error('An unexpected error occurred for target "%s": %i %s - %s',
                      target, r.status_code, r.reason, r.raw)
        r.raise_for_status()

        return r.json()

    def delete(self, target):
        full_url = urljoin(self.endpoint, target)
        r = requests.delete(
            full_url,
            headers={"Authorization": "Bearer {}".format(self.service_token)},
            timeout=self.timeout,
            verify=self.server_cert_ca_path)

        if not r.ok:
            log.error('An unexpected error occurred for target "%s": %i %s - %s',
                      target, r.status_code, r.reason, r.raw)
        r.raise_for_status()

        return r.json()
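A minimal usage sketch (not from the original source; endpoint values are made up) showing how request_kubeapi wraps an authenticated GET against an arbitrary kubeapi path:

# Hypothetical values; requires a valid service token at client_token_path.
kubeapi = Kubeapi(host='10.0.0.1', port='6443')
pods = kubeapi.request_kubeapi('/api/v1/namespaces/default/pods')
for pod in pods.get('items', []):
    print(pod['metadata']['name'])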
def __init__(
        self,
        node: nodes.Node,
        allocator: Allocator,
        metrics_storage: storage.Storage = DEFAULT_STORAGE,
        anomalies_storage: storage.Storage = DEFAULT_STORAGE,
        allocations_storage: storage.Storage = DEFAULT_STORAGE,
        action_delay: Numeric(0, 60) = 1.,  # [s]
        rdt_enabled: Optional[bool] = None,  # Default (None) means auto configuration.
        rdt_mb_control_required: bool = False,
        rdt_cache_control_required: bool = False,
        extra_labels: Dict[Str, Str] = None,
        allocation_configuration: Optional[AllocationConfiguration] = None,
        remove_all_resctrl_groups: bool = False,
        event_names: Optional[List[str]] = None,
        enable_derived_metrics: bool = False,
        task_label_generators: Dict[str, TaskLabelGenerator] = None,
):
    self._allocation_configuration = (allocation_configuration or
                                      AllocationConfiguration())

    super().__init__(node, metrics_storage, action_delay, rdt_enabled,
                     extra_labels,
                     _allocation_configuration=self._allocation_configuration,
                     event_names=event_names,
                     enable_derived_metrics=enable_derived_metrics,
                     task_label_generators=task_label_generators)

    # Allocation specific.
    self._allocator = allocator
    self._allocations_storage = allocations_storage
    # Override False from the superclass.
    self._rdt_mb_control_required = rdt_mb_control_required
    self._rdt_cache_control_required = rdt_cache_control_required

    # Anomaly.
    self._anomalies_storage = anomalies_storage
    self._anomalies_statistics = AnomalyStatistics()

    # Internal allocation statistics.
    self._allocations_counter = 0
    self._allocations_errors = 0

    self._remove_all_resctrl_groups = remove_all_resctrl_groups
def __init__(
        self,
        node: nodes.Node,
        metrics_storage: storage.Storage = DEFAULT_STORAGE,
        action_delay: Numeric(0, 60) = 1.,  # [s]
        rdt_enabled: Optional[bool] = None,  # Default (None) means auto configuration.
        extra_labels: Dict[Str, Str] = None,
        event_names: List[str] = DEFAULT_EVENTS,
        enable_derived_metrics: bool = False,
        task_label_generators: Dict[str, TaskLabelGenerator] = None,
        _allocation_configuration: Optional[AllocationConfiguration] = None,
):
    self._node = node
    self._metrics_storage = metrics_storage
    self._action_delay = action_delay
    self._rdt_enabled = rdt_enabled
    # Disabled by default, to be overridden by subclasses.
    self._rdt_mb_control_required = False
    # Disabled by default, to be overridden by subclasses.
    self._rdt_cache_control_required = False
    self._extra_labels = extra_labels or dict()
    self._finish = False  # Guard to stop iterations.
    self._last_iteration = time.time()  # Used internally by the wait function.
    self._allocation_configuration = _allocation_configuration
    self._event_names = event_names
    self._enable_derived_metrics = enable_derived_metrics

    # Default value for task_label_generators.
    if task_label_generators is None:
        self._task_label_generators = {
            'application':
                TaskLabelRegexGenerator('$', '', 'task_name'),
            'application_version_name':
                TaskLabelRegexGenerator('.*$', '', 'task_name'),
        }
    else:
        self._task_label_generators = task_label_generators

    # Generate a label value with the initial CPU assignment, to simplify
    # management of the distributed model system for the prm plugin:
    # https://github.com/intel/platform-resource-manager/tree/master/prm
    #
    # To not risk subtle bugs in 1.0.x it is not added to _task_label_generators
    # as a default, but hardcoded here, possibly to be removed later.
    self._task_label_generators['initial_task_cpu_assignment'] = \
        TaskLabelResourceGenerator('cpus')
def __init__(
        self,
        node: nodes.Node,
        detector: detectors.AnomalyDetector,
        metrics_storage: storage.Storage = DEFAULT_STORAGE,
        anomalies_storage: storage.Storage = DEFAULT_STORAGE,
        action_delay: Numeric(0, 60) = 1.,
        rdt_enabled: Optional[bool] = None,
        extra_labels: Dict[Str, Str] = None,
        event_names: Optional[List[str]] = None,
        enable_derived_metrics: bool = False,
):
    super().__init__(node, metrics_storage, action_delay, rdt_enabled,
                     extra_labels, event_names, enable_derived_metrics)
    self._detector = detector

    # Anomaly.
    self._anomalies_storage = anomalies_storage
    self._anomalies_statistics = AnomalyStatistics()
class MesosNode(Node):
    mesos_agent_endpoint: Url = 'https://127.0.0.1:5051'

    # Timeout to access mesos agent.
    timeout: Numeric(1, 60) = 5.  # [s]

    # https://github.com/kennethreitz/requests/blob/5c1f72e80a7d7ac129631ea5b0c34c7876bc6ed7/requests/api.py#L41
    ssl: Optional[SSL] = None

    METHOD = 'GET_STATE'
    api_path = '/api/v1'

    def get_tasks(self):
        """Returns only running tasks."""
        full_url = urllib.parse.urljoin(self.mesos_agent_endpoint, self.api_path)
        if self.ssl:
            r = requests.post(full_url,
                              json=dict(type=self.METHOD),
                              timeout=self.timeout,
                              verify=self.ssl.server_verify,
                              cert=self.ssl.get_client_certs())
        else:
            r = requests.post(full_url,
                              json=dict(type=self.METHOD),
                              timeout=self.timeout)
        r.raise_for_status()
        state = r.json()

        tasks = []

        # Fast return path if there are no launched tasks.
        if 'launched_tasks' not in state['get_state']['get_tasks']:
            return []

        for launched_task in state['get_state']['get_tasks']['launched_tasks']:
            if 'statuses' not in launched_task or not len(launched_task['statuses']):
                continue

            statuses = launched_task['statuses']
            last_status = statuses[-1]  # Assume the last one is the latest state.  # TODO: confirm
            if last_status['state'] != MESOS_TASK_STATE_RUNNING:
                continue

            if 'executor_pid' not in last_status['container_status']:
                log.warning("'executor_pid' not found in container status for task %s on agent %s",
                            last_status['task_id']['value'],
                            last_status['agent_id']['value'])
                continue

            executor_pid = last_status['container_status']['executor_pid']

            try:
                cgroup_path = find_cgroup(executor_pid)
            except MesosCgroupNotFoundException:
                log.warning('Cannot find pid/cgroup mesos path for %r. '
                            'Ignoring task (inconsistent state returned from Mesos).',
                            executor_pid)
                continue

            labels = {label['key']: label['value']
                      for label in launched_task['labels']['labels']}

            # Extract scalar resources.
            resources = dict()
            for resource in launched_task['resources']:
                if resource['type'] == 'SCALAR':
                    resources[resource['name']] = float(resource['scalar']['value'])

            tasks.append(
                MesosTask(
                    name=launched_task['name'],
                    executor_pid=executor_pid,
                    cgroup_path=cgroup_path,
                    subcgroups_paths=[],
                    container_id=last_status['container_status']['container_id']['value'],
                    task_id=last_status['task_id']['value'],
                    agent_id=last_status['agent_id']['value'],
                    executor_id=last_status['executor_id']['value'],
                    labels=labels,
                    resources=resources))

        return tasks
class ZookeeperDatabase(Database):
    hosts: List[str]
    # Used as a prefix for keys, to namespace all queries.
    namespace: str
    timeout: Numeric(1, 60) = 5.  # Request timeout in seconds (tries another host). [s]
    ssl: Optional[SSL] = None

    def __post_init__(self):
        from kazoo.client import KazooClient
        if self.ssl:
            if isinstance(self.ssl.server_verify, str):
                self._client = KazooClient(
                    hosts=self.hosts,
                    timeout=self.timeout,
                    handler=SecureSequentialThreadingHandler(),
                    use_ssl=True,
                    verify_certs=True,
                    ca=self.ssl.server_verify,
                    certfile=self.ssl.client_cert_path,
                    keyfile=self.ssl.client_key_path,
                )
            elif isinstance(self.ssl.server_verify, bool):
                self._client = KazooClient(
                    hosts=self.hosts,
                    timeout=self.timeout,
                    handler=SecureSequentialThreadingHandler(),
                    use_ssl=True,
                    verify_certs=self.ssl.server_verify,
                    certfile=self.ssl.client_cert_path,
                    keyfile=self.ssl.client_key_path,
                )
            else:
                raise ValidationError('SSL server verify must be of type Path or boolean!')
        else:
            self._client = KazooClient(hosts=self.hosts, timeout=self.timeout)

        self._client.start()

    def set(self, key: bytes, value: bytes):
        _validate_key(key)
        _validate_value(value)
        formatted_key = key.decode('ascii')
        full_path = os.path.join(self.namespace, formatted_key)
        self._client.ensure_path(full_path)
        self._client.set(full_path, value)

    def get(self, key: bytes) -> bytes:
        from kazoo.exceptions import NoNodeError
        _validate_key(key)
        formatted_key = key.decode('ascii')
        full_path = os.path.join(self.namespace, formatted_key)
        try:
            data = self._client.get(full_path)
            return bytes(data[0])
        except NoNodeError:
            return None
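A minimal usage sketch (not from the original source; the hosts value is made up). Keys are ascii-decoded and joined with the namespace to form znode paths, and a missing node reads back as None:

# Hypothetical values; assumes a reachable zookeeper ensemble.
db = ZookeeperDatabase(hosts=['127.0.0.1:2181'], namespace='wca')
db.set(b'model_version', b'42')
assert db.get(b'model_version') == b'42'
assert db.get(b'missing_key') is None  # NoNodeError is mapped to None.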
class EtcdDatabase(Database):
    """Accesses etcd through its internal grpc-gateway.

    Supported version: 3.2.x (other versions require a change of api_path).
    https://coreos.com/etcd/docs/latest/dev-guide/api_grpc_gateway.html
    """
    hosts: List[str]
    timeout: Optional[Numeric(1, 60)] = 5.0
    api_path: Optional[str] = '/v3alpha'
    ssl: Optional[SSL] = None

    def _send(self, url, data):
        response_data = None
        for host in self.hosts:
            try:
                full_url = '{}{}{}'.format(host, self.api_path, url)
                if self.ssl:
                    s = requests.Session()
                    s.mount(host, HTTPSAdapter())
                    r = s.post(full_url,
                               data=json.dumps(data),
                               timeout=self.timeout,
                               verify=self.ssl.server_verify,
                               cert=self.ssl.get_client_certs())
                else:
                    r = requests.post(full_url,
                                      data=json.dumps(data),
                                      timeout=self.timeout)
                r.raise_for_status()
                response_data = r.json()
                break
            except requests.exceptions.Timeout:
                log.warning('EtcdDatabase: Timeout on host {}'.format(host))

        return response_data

    def _format_data(self, data):
        formatted_data = dict()
        for key in data.keys():
            formatted_data[key] = base64.b64encode(data[key]).decode('ascii')
        return formatted_data

    def set(self, key: bytes, value: bytes):
        _validate_key(key)
        _validate_value(value)

        data = {'key': key, 'value': value}
        formatted_data = self._format_data(data)

        url = '/kv/put'
        response_data = self._send(url, formatted_data)
        if not response_data:
            raise TimeoutOnAllHosts(
                'EtcdDatabase: Cannot put key "{}": Timeout on all hosts!'.format(key))

    def get(self, key) -> bytes:
        _validate_key(key)

        data = {'key': key}
        formatted_data = self._format_data(data)

        url = '/kv/range'
        response_data = self._send(url, formatted_data)
        if not response_data:
            raise TimeoutOnAllHosts(
                'EtcdDatabase: Cannot get key "{}": Timeout on all hosts!'.format(key))

        if 'kvs' in response_data and 'value' in response_data['kvs'][0]:
            return base64.b64decode(response_data['kvs'][0]['value'])

        return None
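A minimal usage sketch (not from the original source; the host value is made up). Callers pass raw bytes and _format_data base64-encodes them, as the grpc-gateway requires:

# Hypothetical values; assumes a reachable etcd exposing the /v3alpha gateway.
db = EtcdDatabase(hosts=['https://127.0.0.1:2379'])
db.set(b'config/threshold', b'0.75')
assert db.get(b'config/threshold') == b'0.75'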
class KafkaStorage(Storage):
    """rst
    Storage for saving metrics in Kafka.

    - ``topic``: **Str**
        name of a kafka topic where messages should be saved
    - ``brokers_ips``: **List[IpPort]** = *"127.0.0.1:9092"*
        list of addresses with ports of all kafka brokers (kafka nodes)
    - ``max_timeout_in_seconds``: **Numeric(0, 5)** = *0.5*
        if a message was not delivered in max_timeout_in_seconds seconds,
        self.store will throw FailedDeliveryException
    - ``extra_config``: **Dict[Str, Str]** = *None*
        additional key-value pairs that will be passed to the kafka driver
        https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
        e.g. {'debug': 'broker,topic,msg'} to enable logging for kafka producer threads
    - ``ssl``: **Optional[SSL]** = *None*
        secure socket layer object
    """
    topic: Str
    brokers_ips: List[IpPort] = field(default=("127.0.0.1:9092",))
    max_timeout_in_seconds: Numeric(0, 5) = 0.5  # defaults to half a second
    extra_config: Dict[Str, Str] = None
    ssl: Optional[SSL] = None

    def __post_init__(self) -> None:
        check_kafka_dependency()
        try:
            self._get_ssl_config()
            self.producer = create_kafka_consumer(self.brokers_ips, self.extra_config)
        except Exception as e:
            log.exception('Exception during kafka consumer initialization:')
            raise KafkaConsumerInitializationException(str(e))

        self.error_from_callback = None
        """Used to pass an error from within callback_on_delivery
        (called from a different thread) to the KafkaStorage instance."""

    def _get_ssl_config(self) -> None:
        """https://github.com/edenhill/librdkafka/wiki/Using-SSL-with-librdkafka"""
        if self.ssl is None:
            return

        if self.extra_config is None:
            self.extra_config = dict()

        self.extra_config['security.protocol'] = 'ssl'

        if isinstance(self.ssl.server_verify, str):
            if 'ssl.ca.location' in self.extra_config:
                log.warning('KafkaStorage `ssl.ca.location` in config replaced with SSL object!')
            self.extra_config['ssl.ca.location'] = self.ssl.server_verify
        elif self.ssl.server_verify is True:
            raise SSLConfigError(
                "It's necessary to provide a CA cert path if you want to check it!")

        client_certs = self.ssl.get_client_certs()
        if isinstance(client_certs, tuple):
            if 'ssl.certificate.location' in self.extra_config:
                log.warning('KafkaStorage `ssl.certificate.location` '
                            'in config replaced with SSL object!')
            self.extra_config['ssl.certificate.location'] = client_certs[0]
            if 'ssl.key.location' in self.extra_config:
                log.warning('KafkaStorage `ssl.key.location` '
                            'in config replaced with SSL object!')
            self.extra_config['ssl.key.location'] = client_certs[1]
        else:
            raise SSLConfigError(
                "It's necessary to provide both client cert and key paths!")

        if 'ssl.cipher.suites' in self.extra_config:
            log.warning('KafkaStorage SSL uses extra config cipher suites!')
        else:
            self.extra_config['ssl.cipher.suites'] = SECURE_CIPHERS

        if 'ssl.enabled.protocols' in self.extra_config:
            log.warning('KafkaStorage SSL `ssl.enabled.protocols` not supported!')
            self.extra_config.pop('ssl.enabled.protocols')

    def callback_on_delivery(self, err, msg) -> None:
        """Called once for each message produced to indicate the delivery result.
        Triggered by poll() or flush()."""
        if err is not None:
            self.error_from_callback = err
            log.error('KafkaStorage failed to send message; error message: {}'.format(err))
        else:
            log.log(logger.TRACE,
                    'KafkaStorage succeeded to send message; message: {}'.format(msg))

    @staticmethod
    def divide_message(msg):
        """Kafka won't accept messages larger than 1MB, so overly big messages
        are divided into smaller chunks. Comment lines ('# HELP'/'# TYPE') are
        kept together with the metric line that follows them."""
        MAX_SIZE = 10 ** 5
        if sys.getsizeof(msg) < MAX_SIZE:
            return [msg]

        divided_message = []
        lines = msg.split('\n')
        new_message = ''
        i = 0
        while i < len(lines):
            # Group a metric with its preceding comment lines.
            new_metric = ''
            while i < len(lines) and lines[i].startswith('#'):
                new_metric += lines[i] + '\n'
                i += 1
            if i < len(lines):
                new_metric += lines[i] + '\n'
                i += 1
            if sys.getsizeof(new_message + new_metric) > MAX_SIZE and new_message:
                divided_message.append(new_message)
                new_message = new_metric
            else:
                new_message += new_metric
        if new_message:
            divided_message.append(new_message)
        return divided_message

    def store(self, metrics: List[Metric]) -> None:
        """Synchronously stores metrics in kafka.

        The function returns only after sending the message - by using the
        synchronous self.producer.flush to block until the message (metrics)
        is delivered to kafka.

        Raises:
            * InconvertibleToPrometheusExpositionFormat - if metrics are not
              convertible into prometheus exposition format.
            * FailedDeliveryException - if a message could not be written to kafka.
        """
        if not metrics:
            log.warning('Empty list of metrics, store is skipped!')
            return

        is_convertible, error_message = is_convertable_to_prometheus_exposition_format(metrics)
        if not is_convertible:
            log.error('KafkaStorage failed to convert metrics into '
                      'prometheus exposition format; error: "{}"'.format(error_message))
            raise InconvertibleToPrometheusExpositionFormat(error_message)

        timestamp = get_current_time()
        msg = convert_to_prometheus_exposition_format(metrics, timestamp)
        messages = self.divide_message(msg)

        for message in messages:
            self.producer.produce(self.topic,
                                  message.encode('utf-8'),
                                  callback=self.callback_on_delivery)
            r = self.producer.flush(self.max_timeout_in_seconds)  # Block until all sent.

            # Check if the timeout expired.
            if r > 0:
                raise FailedDeliveryException(
                    "Maximum timeout {} for sending message has passed.".format(
                        self.max_timeout_in_seconds))

            # Check if any message failed to be delivered.
            if self.error_from_callback is not None:
                # Before resetting self.error_from_callback, keep a reference
                # to the original value to pass it to the exception.
                error_from_callback__original_ref = self.error_from_callback
                self.error_from_callback = None
                raise FailedDeliveryException(
                    "Message has failed to be written to kafka. API error message: {}.".format(
                        error_from_callback__original_ref))

        log.debug('KafkaStorage: Message size=%i with timestamp=%s stored in kafka topic=%r',
                  len(msg), timestamp, self.topic)

        return  # The message has been sent to kafka.
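A small illustrative check of divide_message (not from the original source): text well below the 100KB threshold comes back as a single chunk, and '# HELP'/'# TYPE' lines stay attached to the sample that follows them when splitting is needed:

# Illustrative check only; values are made up.
exposition = ('# HELP up Whether the target is up.\n'
              '# TYPE up gauge\n'
              'up 1\n')
assert KafkaStorage.divide_message(exposition) == [exposition]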
import pytest

from enum import Enum
from typing import Dict, List, Optional, Union

from wca.config import assure_type, ValidationError, WeakValidationError, \
    Url, Path, Numeric, Str, IpPort


class Foo:
    pass


class FooEnum(Enum):
    BAR = 1
    BAZ = 2


@pytest.mark.parametrize(
    'value, expected_type',
    [
        (1, int),
        (1, Numeric(0, 3)),
        (3.5, Numeric(2., 5.)),
        (1.2, float),
        (True, bool),
        (True, Optional[bool]),
        (None, Optional[bool]),
        (1, Optional[int]),
        (None, Optional[int]),
        ('str', str),
        ('str', Union[str, float]),
        (1.2, Union[str, float]),
        (Foo(), Foo),
        ([Foo()], List[Foo]),
        ([[1]], List[List[int]]),
        ({'x': 2}, Dict[str, int]),
        ({'x': 2.5}, Dict[str, Union[int, float]]),
        ({2: {'x': 2.5}}, Dict[int, Dict[str, Union[int, float]]]),
        (FooEnum.BAR, FooEnum),
        (1, FooEnum),
        (1, Numeric(0, 3)),
        (3.5, Numeric(2., 5.)),
        ('small_string', Str),
        ('small_string', Str()),
class KubernetesNode(Node):
    # We need to know which cgroup driver is used to properly build cgroup paths for pods.
    # Reference in source code for kubernetes version stable 1.13:
    # https://github.com/kubernetes/kubernetes/blob/v1.13.3/pkg/kubelet/cm/cgroup_manager_linux.go#L207
    cgroup_driver: CgroupDriverType = field(
        default_factory=lambda: CgroupDriverType(CgroupDriverType.CGROUPFS))

    ssl: Optional[SSL] = None

    # By default use localhost, however kubelet may not listen on it.
    kubelet_endpoint: Url = 'https://127.0.0.1:10250'

    # Timeout to access kubernetes agent.
    timeout: Numeric(1, 60) = 5  # [s]

    # List of namespaces to monitor pods in.
    monitored_namespaces: List[Str] = field(default_factory=lambda: ["default"])

    def _request_kubelet(self):
        PODS_PATH = '/pods'
        full_url = urljoin(self.kubelet_endpoint, PODS_PATH)

        if self.ssl:
            s = requests.Session()
            s.mount(self.kubelet_endpoint, HTTPSAdapter())
            r = s.get(full_url,
                      json=dict(type='GET_STATE'),
                      timeout=self.timeout,
                      verify=self.ssl.server_verify,
                      cert=self.ssl.get_client_certs())
        else:
            r = requests.get(full_url,
                             json=dict(type='GET_STATE'),
                             timeout=self.timeout)

        r.raise_for_status()
        return r.json()

    def get_tasks(self) -> List[Task]:
        """Returns only running tasks."""
        try:
            kubelet_json_response = self._request_kubelet()
        except requests.exceptions.ConnectionError as e:
            raise TaskSynchronizationException('%s' % e) from e

        tasks = []
        for pod in kubelet_json_response.get('items'):
            container_statuses = pod.get('status').get('containerStatuses')
            if not container_statuses:
                # Lacking needed information.
                continue

            # Ignore pods in namespaces that are not monitored.
            if pod.get('metadata').get('namespace') not in self.monitored_namespaces:
                continue

            # Read essential information about the pod into variables.
            pod_id = pod.get('metadata').get('uid')
            pod_name = pod.get('metadata').get('name')
            qos = pod.get('status').get('qosClass').lower()
            task_name = pod.get('metadata').get('namespace') + "/" + pod_name
            assert QosClass.has_value(qos)
            if pod.get('metadata').get('labels'):
                labels = {_sanitize_label(key): value
                          for key, value in pod.get('metadata').get('labels').items()}
            else:
                labels = {}
            labels[_sanitize_label(QOS_LABELNAME)] = qos  # Add label with the QOS class of the pod.

            # Apart from the obvious part, the loop below checks whether all
            # containers are in the ready state -
            # if at least one is not ready, then the pod is skipped.
            containers_cgroups = []
            are_all_containers_ready = True
            for container in container_statuses:
                if not container.get('ready'):
                    are_all_containers_ready = False
                    container_state = list(container.get('state').keys())[0]
                    log.debug('Ignore pod with uid={} name={}. Container {} is in state={} .'
                              .format(pod_id, pod_name, container.get('name'), container_state))
                    break

                container_id = container.get('containerID').split('docker://')[1]
                containers_cgroups.append(
                    _build_cgroup_path(self.cgroup_driver, qos, pod_id, container_id))

            if not are_all_containers_ready:
                continue

            log.debug('Pod with uid={} name={} is ready and monitored by the system.'
                      .format(pod_id, pod_name))

            container_spec = pod.get('spec').get('containers')
            tasks.append(
                KubernetesTask(
                    name=task_name,
                    task_id=pod_id,
                    qos=qos,
                    labels=labels,
                    resources=_calculate_pod_resources(container_spec),
                    cgroup_path=_build_cgroup_path(self.cgroup_driver, qos, pod_id),
                    subcgroups_paths=containers_cgroups))

        _log_found_tasks(tasks)

        return tasks
class MesosNode(Node):
    """rst
    Class to communicate with the orchestrator: Mesos.
    Derived from the abstract Node class providing the get_tasks interface.

    - ``mesos_agent_endpoint``: **Url** = *'https://127.0.0.1:5051'*
        By default localhost.
    - ``timeout``: **Numeric(1, 60)** = *5*
        Timeout to access the mesos agent [seconds].
    - ``ssl``: **Optional[SSL]** = *None*
        ssl object used to communicate with mesos
    """
    mesos_agent_endpoint: Url = 'https://127.0.0.1:5051'

    # Timeout to access mesos agent.
    timeout: Numeric(1, 60) = 5.  # [s]

    # https://github.com/kennethreitz/requests/blob/5c1f72e80a7d7ac129631ea5b0c34c7876bc6ed7/requests/api.py#L41
    ssl: Optional[SSL] = None

    METHOD = 'GET_STATE'
    api_path = '/api/v1'

    def __post_init__(self):
        log.info('Mesos task discovery on: %r', self.mesos_agent_endpoint)

    def get_tasks(self):
        """Returns only running tasks."""
        full_url = urllib.parse.urljoin(self.mesos_agent_endpoint, self.api_path)
        try:
            if self.ssl:
                s = requests.Session()
                s.mount(self.mesos_agent_endpoint, HTTPSAdapter())
                r = s.post(full_url,
                           json=dict(type=self.METHOD),
                           timeout=self.timeout,
                           verify=self.ssl.server_verify,
                           cert=self.ssl.get_client_certs())
            else:
                r = requests.post(full_url,
                                  json=dict(type=self.METHOD),
                                  timeout=self.timeout)
        except requests.exceptions.ConnectionError as e:
            raise TaskSynchronizationException('%s' % e) from e
        r.raise_for_status()
        state = r.json()

        tasks = []

        # Fast return path if there are no launched tasks.
        if 'launched_tasks' not in state['get_state']['get_tasks']:
            return []

        for launched_task in state['get_state']['get_tasks']['launched_tasks']:
            if 'statuses' not in launched_task or not len(launched_task['statuses']):
                continue

            statuses = launched_task['statuses']
            last_status = statuses[-1]  # Assume the last one is the latest state.  # TODO: confirm
            if last_status['state'] != MESOS_TASK_STATE_RUNNING:
                continue

            if 'executor_pid' not in last_status['container_status']:
                log.warning("'executor_pid' not found in container status for task %s on agent %s",
                            last_status['task_id']['value'],
                            last_status['agent_id']['value'])
                continue

            executor_pid = last_status['container_status']['executor_pid']
            task_name = launched_task['name']

            try:
                cgroup_path = find_cgroup(executor_pid)
            except MesosCgroupNotFoundException as e:
                log.warning('Cannot determine proper cgroup for task=%r! '
                            'Ignoring this task. Reason: %s', task_name, e)
                continue

            labels = {sanitize_label(label['key']): label['value']
                      for label in launched_task['labels']['labels']}

            # Extract scalar resources.
            resources = calculate_scalar_resources(launched_task['resources'])

            tasks.append(
                MesosTask(
                    name=task_name,
                    executor_pid=executor_pid,
                    cgroup_path=cgroup_path,
                    subcgroups_paths=[],
                    container_id=last_status['container_status']['container_id']['value'],
                    task_id=last_status['task_id']['value'],
                    agent_id=last_status['agent_id']['value'],
                    executor_id=last_status['executor_id']['value'],
                    labels=labels,
                    resources=resources))

        return tasks
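A minimal usage sketch (not from the original source): discover running Mesos tasks on the local agent and print their cgroup paths:

# Hypothetical usage; assumes a Mesos agent on the default endpoint.
node = MesosNode()
for task in node.get_tasks():
    print(task.name, task.cgroup_path)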
class KafkaStorage(Storage):
    """Storage for saving metrics in Kafka.

    Args:
        brokers_ips: list of addresses with ports of all kafka brokers (kafka nodes)
        topic: name of a kafka topic where messages should be saved
        max_timeout_in_seconds: if a message was not delivered in max_timeout_in_seconds
            seconds, self.store will throw FailedDeliveryException
        extra_config: additional key-value pairs that will be passed to the kafka driver
            https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
            e.g. {'debug': 'broker,topic,msg'} to enable logging for kafka producer threads
    """
    topic: Str
    brokers_ips: List[IpPort] = field(default=("127.0.0.1:9092",))
    max_timeout_in_seconds: Numeric(0, 5) = 0.5  # defaults to half a second
    extra_config: Dict[Str, Str] = None

    def __post_init__(self) -> None:
        check_kafka_dependency()
        try:
            self.producer = create_kafka_consumer(self.brokers_ips, self.extra_config)
        except Exception as e:
            log.exception('Exception during kafka consumer initialization:')
            raise KafkaConsumerInitializationException(str(e))

        self.error_from_callback = None
        """Used to pass an error from within callback_on_delivery
        (called from a different thread) to the KafkaStorage instance."""

    def callback_on_delivery(self, err, msg) -> None:
        """Called once for each message produced to indicate the delivery result.
        Triggered by poll() or flush()."""
        if err is not None:
            self.error_from_callback = err
            log.error('KafkaStorage failed to send message; error message: {}'.format(err))
        else:
            log.log(logger.TRACE,
                    'KafkaStorage succeeded to send message; message: {}'.format(msg))

    def store(self, metrics: List[Metric]) -> None:
        """Synchronously stores metrics in kafka.

        The function returns only after sending the message - by using the
        synchronous self.producer.flush to block until the message (metrics)
        is delivered to kafka.

        Raises:
            * InconvertibleToPrometheusExpositionFormat - if metrics are not
              convertible into prometheus exposition format.
            * FailedDeliveryException - if a message could not be written to kafka.
        """
        if not metrics:
            log.warning('Empty list of metrics, store is skipped!')
            return

        is_convertible, error_message = is_convertable_to_prometheus_exposition_format(metrics)
        if not is_convertible:
            log.error('KafkaStorage failed to convert metrics into '
                      'prometheus exposition format; error: "{}"'.format(error_message))
            raise InconvertibleToPrometheusExpositionFormat(error_message)

        timestamp = get_current_time()
        msg = convert_to_prometheus_exposition_format(metrics, timestamp)
        self.producer.produce(self.topic, msg.encode('utf-8'),
                              callback=self.callback_on_delivery)
        r = self.producer.flush(self.max_timeout_in_seconds)  # Block until all sent.

        # Check if the timeout expired.
        if r > 0:
            raise FailedDeliveryException(
                "Maximum timeout {} for sending message has passed.".format(
                    self.max_timeout_in_seconds))

        # Check if any message failed to be delivered.
        if self.error_from_callback is not None:
            # Before resetting self.error_from_callback, keep a reference
            # to the original value to pass it to the exception.
            error_from_callback__original_ref = self.error_from_callback
            self.error_from_callback = None
            raise FailedDeliveryException(
                "Message has failed to be written to kafka. API error message: {}.".format(
                    error_from_callback__original_ref))

        log.debug('message size=%i with timestamp=%s stored in kafka topic=%r',
                  len(msg), timestamp, self.topic)

        return  # The message has been sent to kafka.
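A minimal usage sketch (not from the original source; it assumes `Metric` is wca's metric dataclass with at least `name` and `value` fields). store() blocks until delivery or raises FailedDeliveryException after max_timeout_in_seconds:

# Hypothetical usage; assumes a reachable kafka broker.
kafka_storage = KafkaStorage(topic='wca_metrics', brokers_ips=['127.0.0.1:9092'])
kafka_storage.store([Metric(name='example_metric', value=1.0)])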
def __init__(
        self,
        node: Node,
        metrics_storage: Storage = DEFAULT_STORAGE,
        interval: Numeric(0, 60) = 1.,
        rdt_enabled: Optional[bool] = None,
        gather_hw_mm_topology: Optional[bool] = None,
        extra_labels: Optional[Dict[Str, Str]] = None,
        event_names: List[str] = [],
        perf_aggregate_cpus: bool = True,
        enable_derived_metrics: bool = False,
        uncore_event_names: List[Union[List[str], str]] = [],
        task_label_generators: Optional[Dict[str, TaskLabelGenerator]] = None,
        allocation_configuration: Optional[AllocationConfiguration] = None,
        wss_reset_cycles: Optional[int] = None,
        wss_stable_cycles: int = 0,
        wss_membw_threshold: Optional[float] = None,
        include_optional_labels: bool = False,
        zoneinfo: Union[Str, bool] = True,
        vmstat: Union[Str, bool] = True,
        sched: Union[Str, bool] = False,
):
    self._node = node
    self._metrics_storage = metrics_storage
    self._interval = interval
    self._rdt_enabled = rdt_enabled
    self._gather_hw_mm_topology = gather_hw_mm_topology
    self._include_optional_labels = include_optional_labels

    self._extra_labels = ({k: str(v) for k, v in extra_labels.items()}
                          if extra_labels else dict())
    log.debug('Extra labels: %r', self._extra_labels)
    self._finish = False  # Guard to stop iterations.
    self._last_iteration = time.time()  # Used internally by the wait function.
    self._allocation_configuration = allocation_configuration
    self._event_names = event_names
    self._perf_aggregate_cpus = perf_aggregate_cpus

    # TODO: fix those workarounds for dynamic levels and dynamic perf event metrics.
    # First, add dynamic metrics...
    for event_name in event_names:
        # Check whether it is a dynamic raw event.
        if '__r' in event_name:
            log.debug('Creating metadata for dynamic metric: %r', event_name)
            METRICS_METADATA[event_name] = MetricMetadata(
                'Hardware PMU counter (raw event)',
                MetricType.COUNTER,
                MetricUnit.NUMERIC,
                MetricSource.PERF_SUBSYSTEM_WITH_CGROUPS,
                MetricGranularity.TASK,
                [],
                'no (event_names)',
            )

    # ...then, because we had to modify the levels for all metrics,
    # set the proper levels based on the perf_aggregate_cpus value.
    if not perf_aggregate_cpus:
        log.debug('Enabling "cpu" level for PERF_SUBSYSTEM_WITH_CGROUPS and derived metrics.')
        for metric_metadata in METRICS_METADATA.values():
            if metric_metadata.source == MetricSource.PERF_SUBSYSTEM_WITH_CGROUPS:
                metric_metadata.levels = ['cpu']
            if metric_metadata.source == MetricSource.DERIVED_PERF_WITH_CGROUPS:
                metric_metadata.levels = ['cpu']

    self._enable_derived_metrics = enable_derived_metrics
    self._uncore_events = uncore_event_names

    self._task_label_generators = task_label_generators or {}

    self._wss_reset_cycles = wss_reset_cycles
    self._wss_stable_cycles = wss_stable_cycles
    self._wss_membw_threshold = wss_membw_threshold

    self._uncore_pmu = None

    self._initialize_rdt_callback = None
    self._iterate_body_callback = None
    self._cached_bandwidth = None

    if zoneinfo is True:
        self._zoneinfo = zoneinfo
        zoneinfo_regexp = zoneinfo_module.DEFAULT_REGEXP
        log.debug('Enabled zoneinfo collection')
    elif zoneinfo is False:
        self._zoneinfo = zoneinfo
        log.debug('Disabled zoneinfo collection')
        zoneinfo_regexp = None
    else:
        zoneinfo_regexp = zoneinfo
        self._zoneinfo = True

    # Validate zoneinfo regexp.
    log.debug('zoneinfo=%r regexp=%r', self._zoneinfo, zoneinfo_regexp)
    self._zoneinfo_regexp_compiled = None
    if self._zoneinfo:
        try:
            self._zoneinfo_regexp_compiled = re.compile(zoneinfo_regexp)
        except re.error as e:
            raise ValidationError('zoneinfo_regexp_compile improper regexp: %s' % e)
        if self._zoneinfo_regexp_compiled.groups != 2:
            raise ValidationError(
                'zoneinfo_regexp_compile improper number of groups: should be 2')

    # Validate config and vmstat regexp.
    if vmstat in (True, False):
        self._vmstat = vmstat
    else:
        # Got a regexp - compile and check it...
        try:
            self._vmstat = re.compile(vmstat)
        except re.error as e:
            raise ValidationError('vmstat_regexp_compile improper regexp: %s' % e)

    # Validate config and sched regexp.
    if sched in (True, False):
        self._sched = sched
    else:
        # Got a regexp - compile and check it...
        try:
            self._sched = re.compile(sched)
        except re.error as e:
            raise ValidationError('sched regex compile improper regexp: %s' % e)
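An illustrative example (not from the original source; the regexp value is made up) of what the two-group requirement above means: a custom zoneinfo regexp must expose exactly two capture groups, a key and a numeric value, e.g. to match /proc/zoneinfo lines such as "nr_free_pages 12345":

# Illustrative only; assumes `import re`.
example_zoneinfo_regexp = r'([a-z_]+)\s+(\d+)'
assert re.compile(example_zoneinfo_regexp).groups == 2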
def reschedule_interval(self, interval: Numeric(0, 60)):
    while True:
        self.reschedule()
        time.sleep(interval)
class KubernetesNode(Node):
    """rst
    Class to communicate with the orchestrator: Kubernetes.
    Derived from the abstract Node class providing the get_tasks interface.

    - ``cgroup_driver``: **CgroupDriverType** = *CgroupDriverType.CGROUPFS*
        We need to know which cgroup driver is used to properly build cgroup paths for pods.
        Reference in source code for kubernetes version stable 1.13:
        https://github.com/kubernetes/kubernetes/blob/v1.13.3/pkg/kubelet/cm/cgroup_manager_linux.go#L207
    - ``ssl``: **Optional[SSL]** = *None*
        ssl object used to communicate with kubernetes
    - ``client_token_path``: **Optional[Path]** = *SERVICE_TOKEN_FILENAME*
        The default path is used by pods. You can override it to use wca outside a pod.
    - ``server_cert_ca_path``: **Optional[Path]** = *SERVICE_CERT_FILENAME*
        The default path is used by pods. You can override it to use wca outside a pod.
    - ``kubelet_enabled``: **bool** = *False*
        If true use **kubelet**, otherwise **kubeapi**.
    - ``kubelet_endpoint``: **Url** = *'https://127.0.0.1:10250'*
        By default localhost.
    - ``kubeapi_host``: **Str** = *None*
    - ``kubeapi_port``: **Str** = *None*
    - ``node_ip``: **Str** = *None*
    - ``timeout``: **Numeric(1, 60)** = *5*
        Timeout to access kubernetes agent [seconds].
    - ``monitored_namespaces``: **List[Str]** = *["default"]*
        List of namespaces to monitor pods in.
    """
    cgroup_driver: CgroupDriverType = CgroupDriverType.CGROUPFS
    ssl: Optional[SSL] = None
    client_token_path: Optional[Path(absolute=True, mode=os.R_OK)] = SERVICE_TOKEN_FILENAME
    server_cert_ca_path: Optional[Path(absolute=True, mode=os.R_OK)] = SERVICE_CERT_FILENAME
    kubelet_enabled: bool = False
    kubelet_endpoint: Url = 'https://127.0.0.1:10250'
    kubeapi_host: Str = None
    kubeapi_port: Str = None  # Because !Env is String and another type cast might be problematic.
    node_ip: Str = None
    timeout: Numeric(1, 60) = 5  # [s]
    monitored_namespaces: List[Str] = field(default_factory=lambda: ["default"])

    def _request_kubeapi(self):
        kubeapi_endpoint = "https://{}:{}".format(self.kubeapi_host, self.kubeapi_port)
        log.debug("Created kubeapi endpoint %s", kubeapi_endpoint)

        with pathlib.Path(self.client_token_path).open() as f:
            service_token = f.read()

        pod_list_from_all_namespaces = []
        for namespace in self.monitored_namespaces:
            full_url = urljoin(kubeapi_endpoint,
                               "/api/v1/namespaces/{}/pods".format(namespace))
            r = requests.get(
                full_url,
                headers={"Authorization": "Bearer {}".format(service_token)},
                timeout=self.timeout,
                verify=self.server_cert_ca_path)

            if not r.ok:
                log.error('An unexpected error occurred for namespace "%s": %i %s - %s',
                          namespace, r.status_code, r.reason, r.raw)
            r.raise_for_status()

            pod_list_from_namespace = r.json().get('items')
            pod_list_from_all_namespaces.extend(pod_list_from_namespace)

        return pod_list_from_all_namespaces

    def _request_kubelet(self):
        PODS_PATH = '/pods'
        full_url = urljoin(self.kubelet_endpoint, PODS_PATH)

        if self.ssl:
            s = requests.Session()
            s.mount(self.kubelet_endpoint, HTTPSAdapter())
            r = s.get(full_url,
                      json=dict(type='GET_STATE'),
                      timeout=self.timeout,
                      verify=self.ssl.server_verify,
                      cert=self.ssl.get_client_certs())
        else:
            r = requests.get(full_url,
                             json=dict(type='GET_STATE'),
                             timeout=self.timeout)

        if not r.ok:
            log.error('%i %s - %s', r.status_code, r.reason, r.raw)
        r.raise_for_status()
        return r.json().get('items')

    def get_tasks(self) -> List[Task]:
        """Returns only running tasks."""
        try:
            if self.kubelet_enabled:
                podlist_json_response = self._request_kubelet()
            else:
                podlist_json_response = self._request_kubeapi()
                if self.node_ip is None:
                    raise ValueError("node_ip is not set in config")
        except requests.exceptions.ConnectionError as e:
            raise TaskSynchronizationException('connection error: %s' % e) from e
        except requests.exceptions.ReadTimeout as e:
            raise TaskSynchronizationException('timeout: %s' % e) from e

        tasks = []
        for pod in podlist_json_response:
            container_statuses = pod.get('status').get('containerStatuses')

            # Kubeapi returns all pods in the cluster; keep only those on this node.
            if not self.kubelet_enabled and pod["status"]["hostIP"] != self.node_ip.strip():
                continue

            # Kubelet returns all pods on the node. Ignore pods in namespaces
            # that are not monitored.
            if self.kubelet_enabled and \
                    pod.get('metadata').get('namespace') not in self.monitored_namespaces:
                continue

            # Lacking needed information.
            if not container_statuses:
                continue

            # Read essential information about the pod into variables.
            pod_id = pod.get('metadata').get('uid')
            pod_name = pod.get('metadata').get('name')
            qos = pod.get('status').get('qosClass').lower()
            task_name = pod.get('metadata').get('namespace') + "/" + pod_name
            assert QosClass.has_value(qos)
            if pod.get('metadata').get('labels'):
                labels = {_sanitize_label(key): value
                          for key, value in pod.get('metadata').get('labels').items()}
            else:
                labels = {}
            labels[_sanitize_label(QOS_LABELNAME)] = qos  # Add label with the QOS class of the pod.

            # Apart from the obvious part, the loop below checks whether all
            # containers are in the ready state -
            # if at least one is not ready, then the pod is skipped.
            containers_cgroups = []
            are_all_containers_ready = True
            for container in container_statuses:
                if not container.get('ready'):
                    are_all_containers_ready = False
                    container_state = list(container.get('state').keys())[0]
                    log.debug('Ignore pod with uid={} name={}. Container {} is in state={} .'
                              .format(pod_id, pod_name, container.get('name'), container_state))
                    break

                container_id = container.get('containerID').split('docker://')[1]
                containers_cgroups.append(
                    _build_cgroup_path(self.cgroup_driver, qos, pod_id, container_id))

            if not are_all_containers_ready:
                continue

            log.debug('Pod with uid={} name={} is ready and monitored by the system.'
                      .format(pod_id, pod_name))

            container_spec = pod.get('spec').get('containers')
            tasks.append(
                KubernetesTask(
                    name=task_name,
                    task_id=pod_id,
                    qos=qos,
                    labels=labels,
                    resources=calculate_pod_resources(container_spec),
                    cgroup_path=_build_cgroup_path(self.cgroup_driver, qos, pod_id),
                    subcgroups_paths=containers_cgroups))

        _log_found_tasks(tasks)

        return tasks
def __init__(
        self,
        node: Node,
        metrics_storage: Storage = DEFAULT_STORAGE,
        interval: Numeric(0, 60) = 1.,
        rdt_enabled: Optional[bool] = None,
        gather_hw_mm_topology: bool = False,
        extra_labels: Optional[Dict[Str, Str]] = None,
        event_names: List[str] = [],
        perf_aggregate_cpus: bool = True,
        enable_derived_metrics: bool = False,
        enable_perf_uncore: Optional[bool] = None,
        task_label_generators: Optional[Dict[str, TaskLabelGenerator]] = None,
        allocation_configuration: Optional[AllocationConfiguration] = None,
        wss_reset_interval: int = 0,
        include_optional_labels: bool = False,
):
    self._node = node
    self._metrics_storage = metrics_storage
    self._interval = interval
    self._rdt_enabled = rdt_enabled
    self._gather_hw_mm_topology = gather_hw_mm_topology
    self._include_optional_labels = include_optional_labels

    self._extra_labels = ({k: str(v) for k, v in extra_labels.items()}
                          if extra_labels else dict())
    log.debug('Extra labels: %r', self._extra_labels)
    self._finish = False  # Guard to stop iterations.
    self._last_iteration = time.time()  # Used internally by the wait function.
    self._allocation_configuration = allocation_configuration
    self._event_names = event_names
    log.info('Enabling %i perf events: %s',
             len(self._event_names), ', '.join(self._event_names))
    self._perf_aggregate_cpus = perf_aggregate_cpus

    # TODO: fix those workarounds for dynamic levels and dynamic perf event metrics.
    # First, add dynamic metrics...
    for event_name in event_names:
        # Check whether it is a dynamic raw event.
        if '__r' in event_name:
            log.debug('Creating metadata for dynamic metric: %r', event_name)
            METRICS_METADATA[event_name] = MetricMetadata(
                'Hardware PMU counter (raw event)',
                MetricType.COUNTER,
                MetricUnit.NUMERIC,
                MetricSource.PERF_SUBSYSTEM_WITH_CGROUPS,
                MetricGranularity.TASK,
                [],
                'no (event_names)',
            )

    # ...then, because we had to modify the levels for all metrics,
    # set the proper levels based on the perf_aggregate_cpus value.
    if not perf_aggregate_cpus:
        log.debug('Enabling "cpu" level for PERF_SUBSYSTEM_WITH_CGROUPS metrics.')
        for metric_metadata in METRICS_METADATA.values():
            if metric_metadata.source == MetricSource.PERF_SUBSYSTEM_WITH_CGROUPS:
                metric_metadata.levels = ['cpu']

    self._enable_derived_metrics = enable_derived_metrics
    self._enable_perf_uncore = enable_perf_uncore

    # Default value for task_label_generators.
    if task_label_generators is None:
        self._task_label_generators = {
            'application':
                TaskLabelRegexGenerator('$', '', 'task_name'),
            'application_version_name':
                TaskLabelRegexGenerator('.*$', '', 'task_name'),
        }
    else:
        self._task_label_generators = task_label_generators

    self._wss_reset_interval = wss_reset_interval

    self._uncore_pmu = None

    self._initialize_rdt_callback = None
    self._iterate_body_callback = None