async def compute_util_and_waste(self):
    info = await get_container_utilization()
    try:
        # Strip the '/docker/' prefix to obtain the container hashes
        containers = [docker_id[len('/docker/'):] for docker_id in info.keys()]
        util_list = [self.get_util(value) for value in info.values()]
        self.filter_dadvisor(containers, util_list)
    except Exception as e:
        log.error(e)
        return
    if not util_list:
        return
    # zip yields tuples; convert to lists in case scale_list modifies them in place
    cpu_util_list, mem_util_list = (list(t) for t in zip(*util_list))
    self.scale_list(cpu_util_list)
    self.scale_list(mem_util_list)
    cpu_waste_list = self.get_waste(cpu_util_list)
    mem_waste_list = self.get_waste(mem_util_list)
    log.info(f'CPU utilization: {cpu_util_list}')
    for i, container in enumerate(containers):
        self.cpu_util_container_sum.labels(src=container, src_host=IP) \
            .inc(cpu_util_list[i] * FACTOR)
        self.mem_util_container_sum.labels(src=container, src_host=IP) \
            .inc(mem_util_list[i] * FACTOR)
        self.cpu_waste_container_sum.labels(src=container, src_host=IP) \
            .inc(cpu_waste_list[i] * FACTOR)
        self.mem_waste_container_sum.labels(src=container, src_host=IP) \
            .inc(mem_waste_list[i] * FACTOR)
def filter_dadvisor(self, containers, values):
    """ Don't compute utilization values for dAdvisor itself """
    try:
        dadvisor_index = containers.index(self.container_collector.dadvisor_id)
        del containers[dadvisor_index]
        del values[dadvisor_index]
    except ValueError:
        log.error(f'dadvisor_id unknown: {self.container_collector.dadvisor_id}')
async def _send_get_json(url):
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                return await resp.json()
    except Exception as e:
        log.error(e)
        log.error(f'Cannot reach {url}')
        return None
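# The helpers get_container_utilization() and get_container_stats() used by the
# collectors above are not defined in this section. A plausible sketch, assuming
# they wrap _send_get_json() around the cAdvisor v2.0 HTTP API reachable at
# CADVISOR_URL (the endpoint paths and query parameters are assumptions):
#
#   async def get_container_utilization():
#       # '/summary' returns derived stats, including the 'minute_usage'
#       # means that get_util() reads
#       return await _send_get_json(
#           f'{CADVISOR_URL}/api/v2.0/summary?type=docker&recursive=true')
#
#   async def get_container_stats():
#       # '/stats' returns raw samples, including the per-interface
#       # 'tx_bytes' that get_network() sums
#       return await _send_get_json(
#           f'{CADVISOR_URL}/api/v2.0/stats?type=docker&recursive=true')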
def get_util(self, value):
    try:
        cores = self.node_collector.my_node_stats.get('num_cores', 1)
        memory = self.node_collector.my_node_stats.get('memory', 8 * 2 ** 30)
        # cAdvisor reports CPU usage in millicores; normalize to a 0-1 fraction
        cpu = value['minute_usage']['cpu']['mean'] / (cores * 1000.0)
        memory_percentage = value['minute_usage']['memory']['mean'] / memory
        return cpu, memory_percentage
    except Exception as e:
        log.error(e)
        # Return a (cpu, memory) pair so callers can still zip the results
        return 0, 0
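# For reference, get_util() above assumes each cAdvisor value carries a
# 'minute_usage' summary shaped roughly as follows (field names taken from the
# accesses above; the concrete numbers are invented for illustration):
#
#   value = {
#       'minute_usage': {
#           'cpu': {'mean': 1500},            # millicores, one-minute mean
#           'memory': {'mean': 2 * 2 ** 30},  # bytes, one-minute mean
#       },
#   }
#
# With 4 cores and 8 GiB of memory this yields:
#   cpu               = 1500 / (4 * 1000.0)  = 0.375  (37.5% of host CPU)
#   memory_percentage = 2 GiB / 8 GiB        = 0.25   (25% of host memory)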
@staticmethod
def get_network(value):
    # No self: this is called as self.get_network(value) in compute_network_usage
    amount = 0
    try:
        row = value[0]
        network = row['network']
        interfaces = network['interfaces']
        amount = sum(interface['tx_bytes'] for interface in interfaces)
    except Exception as e:
        log.error(e)
    return amount
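# get_network() assumes each stats value is a list of samples whose first
# element looks roughly like this (key names taken from the accesses above;
# the numbers are invented):
#
#   value = [{
#       'network': {
#           'interfaces': [
#               {'name': 'eth0', 'tx_bytes': 123456},
#               {'name': 'eth1', 'tx_bytes': 789},
#           ],
#       },
#   }]
#
#   get_network(value)  # == 124245, the sum of tx_bytes over all interfaces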
async def _send_post(url, data):
    try:
        async with aiohttp.ClientSession() as session:
            # Serialize with the custom encoder ourselves; passing the dumped
            # string via json= would JSON-encode it a second time
            await session.post(url,
                               data=json.dumps(data, cls=JSONCustomEncoder),
                               headers={'Content-Type': 'application/json'})
            return True
    except Exception as e:
        log.error(e)
        log.error(f'Cannot reach {url}')
        return False
async def compute_network_usage(self):
    data = await get_container_stats()
    try:
        # Strip the '/docker/' prefix to obtain the container hashes
        containers = [docker_id[len('/docker/'):] for docker_id in data.keys()]
        network_values = [self.get_network(value) for value in data.values()]
        self.filter_dadvisor(containers, network_values)
        for i, container in enumerate(containers):
            prev = self.prev_network_container.get(container, 0)
            log.info(f'Container {container}: previous tx bytes: {prev}')
            self.prev_network_container[container] = network_values[i]
            # The counter is monotonic, so only add the delta since the last run
            self.network_container_sum.labels(src=container, src_host=IP) \
                .inc(network_values[i] - prev)
    except Exception as e:
        log.error(e)
async def run(self):
    elapsed = 0
    while self.running:
        try:
            log.info(f'Sleeping {SLEEP_TIME - elapsed} sec')
            await asyncio.sleep(SLEEP_TIME - elapsed)
            now = datetime.utcnow()
            # Execute once per SLEEP_TIME, compensating for collection time
            await self.compute_network_usage()
            await self.compute_util_and_waste()
            elapsed = (datetime.utcnow() - now).seconds
        except Exception as e:
            log.error(e)
    log.info('StatsCollector stopped')
def run(self):
    self.check_installation()
    command = self.get_tcpdump_command()
    multiplier = 1
    while self.running:
        # One iteration of this loop performs the following actions:
        # 1. Run the tcpdump command that captures TRAFFIC_SAMPLE requests.
        #    This takes X seconds.
        # 2. Resolve these requests by communicating with the other nodes.
        # 3. Sleep k*X seconds, clamped between a lower and an upper bound.
        start_time = time.time()
        p = subprocess.Popen(command, stdout=subprocess.PIPE)
        # Parse the captured packets
        for row in iter(p.stdout.readline, b''):
            try:
                dataflow = parse_row(self.container_collector, row.decode('utf-8'))
                dataflow.size = (dataflow.size + HEADER_SIZE) * multiplier
                self.analyser.loop.create_task(self.analyser.analyse_dataflow(dataflow))
            except Exception as e:
                log.error(e)
                log.error('Cannot parse row: {}'.format(row.decode('utf-8').rstrip()))
        end_time = time.time()
        elapsed = end_time - start_time
        log.info('Monitored {} packets in {} sec'.format(TRAFFIC_SAMPLE, elapsed))
        self.analyser.loop.create_task(self.analyser.cache.resolve(self.node_collector))
        # Sleep K times the elapsed time, minus the time it took to resolve the cache
        sleep_time = TRAFFIC_K * elapsed - (time.time() - end_time)
        sleep_time = min(max(sleep_time, TRAFFIC_SLEEP_MIN), TRAFFIC_SLEEP_MAX)
        # Scale the next sample up to the full sample-plus-sleep window
        if elapsed != 0:
            multiplier = (sleep_time + elapsed) / elapsed
        else:
            multiplier = 1
        log.info(f'Multiplier: {multiplier}')
        log.info('Sleeping for: {} sec'.format(sleep_time))
        time.sleep(sleep_time)
    log.info('Inspector thread stopped')
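# The multiplier above extrapolates the sampled traffic to the whole
# sample-plus-sleep window. A worked example with invented numbers:
#
#   # say TRAFFIC_K = 9 and capturing TRAFFIC_SAMPLE packets took 5 seconds
#   elapsed = 5.0
#   sleep_time = 9 * elapsed                        # 45 s, before clamping
#   multiplier = (sleep_time + elapsed) / elapsed   # (45 + 5) / 5 = 10.0
#
# Every dataflow captured in the next 5 s sample is then scaled by 10, so the
# sample stands in for the full 50 s window.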
async def run(self):
    """
    Performs the following two actions:
    - only at initialization: collect static information about the host price
    - continuously (every SLEEP_TIME seconds) perform the following actions:
        - find new containers
        - validate own containers (find out their ip address and whether they're alive)
    """
    while self.running:
        try:
            await asyncio.sleep(SLEEP_TIME)
            await self.collect_own_containers()
            await self.validate_own_containers()
        except Exception as e:
            log.error(e)
    log.info('ContainerCollector stopped')
async def run(self):
    """
    This run method performs the following two actions:
    1. register this peer in the tracker
    2. continuously perform the following actions:
        - validate other nodes
    """
    register_node(self.loop, self.my_node)
    while self.running:
        try:
            await asyncio.sleep(SLEEP_TIME)
            self.loop.create_task(self.add_nodes(await get_all_nodes()))
            self.check_removal_counter += 1
            if self.check_removal_counter == CHECK_REMOVE:
                self.check_removal_counter = 0
                self.loop.create_task(self.check_nodes())
        except Exception as e:
            log.error(e)
async def resolve(self, nodes_collector):
    """
    Ask all nodes to resolve their ports into a container hash.
    After this function has been called, the cache is empty.
    """
    for node_ip, data_list in list(self.cache.items()):
        node = nodes_collector.is_other_node(node_ip)
        if not node:
            log.error(f'Node not found {node_ip}')
            continue
        try:
            mapping = await get_mapping(node)
            # Ports are encoded as strings in JSON, so decode the keys to int
            ports = {int(port): ip for port, ip in mapping['ports'].items()}
            containers = mapping['containers']
            for (from_to, local_hash, port, size) in data_list:
                # Resolve the remote port to the remote container's ip and hash
                remote_ip = ports.get(port, None)
                remote_hash = containers.get(remote_ip, None)
                if local_hash and remote_hash:
                    if from_to == TO:
                        self.counter.labels(src=local_hash, dst=remote_hash, src_host=IP) \
                            .inc(size)
                    elif from_to == FROM:
                        self.counter.labels(src=remote_hash, dst=local_hash, src_host=IP) \
                            .inc(size)
            try:
                del self.cache[node_ip]
            except KeyError:
                log.debug(f'Cannot remove {node_ip} from self.cache')
        except Exception as e:
            log.error(e)
    self.cache = {}
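# resolve() expects get_mapping(node) to return a structure like the following
# (key names inferred from the accesses above; the values are illustrative):
#
#   mapping = {
#       # host port (JSON object keys are strings) -> container ip on that node
#       'ports': {'8080': '172.17.0.2'},
#       # container ip -> container hash
#       'containers': {'172.17.0.2': '1a2b3c...'},
#   }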
@staticmethod
def check_installation():
    try:
        # Popen raises FileNotFoundError when the binary is missing
        subprocess.Popen(['tcpdump', '-D'], stdout=subprocess.PIPE)
    except FileNotFoundError:
        log.error('tcpdump is not installed. Please install it before running this code.')
        exit(-1)
"""
This file contains all configurable options. In the future, all of these
values should be settable through environment variables.
"""
import os
import socket
from datetime import datetime

from dadvisor.log import log

# INTERNAL PORTS AND ADDRESSES
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
try:
    ip = socket.gethostbyname(socket.gethostname())
except socket.gaierror as e:
    log.error(e)
    ip = 'localhost'

IP = os.environ.get('IP', ip)
IS_SUPER_NODE = os.environ.get('TYPE', 'NODE') == 'SUPERNODE'
PROXY_PORT = int(os.environ.get('DADVISOR_PORT', 14100))
INTERNAL_PORT = 14101
PROMETHEUS_PORT = 14102
CADVISOR_URL = 'http://localhost:14104'
PROMETHEUS_URL = f'http://localhost:{PROXY_PORT}/prometheus'
TRACKER = os.environ.get('TRACKER', 'http://35.204.250.252:14100')
FILTER_PORTS = os.environ.get(
    'FILTER_PORTS',