def _source(self, node):
    """Resolve the telemetry 'source' (host/VM identifier) for a node.

    Tries, in order: the physical-layer 'allocation' attribute (descending
    nested 'attributes' dicts if needed), the VM name, the physical machine
    hosting an instance disk, the machine 'name', and finally the machine
    hosting a docker container. Returns None when nothing matches.
    """
    attrs = InfoGraphNode.get_attributes(node)
    if InfoGraphNode.get_layer(node) == GRAPH_LAYER.PHYSICAL:
        if 'allocation' in attrs:
            return attrs['allocation']  # fix due to the landscape
        else:
            # Some landscape payloads nest the real attributes under
            # successive 'attributes' keys; walk down until a level
            # carrying 'allocation' is found.
            # NOTE(review): collapsed source is ambiguous on whether this
            # check sat inside or after the loop — confirm against VCS.
            while attrs.get('attributes', None):
                attrs = attrs['attributes']
                if 'allocation' in attrs:
                    return attrs['allocation']
    if InfoGraphNode.get_type(node) == NODE_TYPE.VIRTUAL_MACHINE:
        if 'vm_name' in attrs:
            return attrs['vm_name']
    if InfoGraphNode.get_type(node) == NODE_TYPE.INSTANCE_DISK:
        # The machine is the source as this is a libvirt disk.
        disk_name = InfoGraphNode.get_name(node)
        vm = self.landscape.get_neighbour_by_type(
            disk_name, NODE_TYPE.VIRTUAL_MACHINE)
        machine = self.landscape.get_neighbour_by_type(
            vm, NODE_TYPE.PHYSICAL_MACHINE)
        return machine
    if InfoGraphNode.get_type(node) == NODE_TYPE.PHYSICAL_MACHINE:
        if 'name' in attrs:
            return attrs['name']
    if InfoGraphNode.get_type(node) == NODE_TYPE.DOCKER_CONTAINER:
        # Containers resolve through their docker node to the host machine.
        docker_node = self.landscape.get_neighbour_by_type(
            InfoGraphNode.get_name(node), 'docker_node')
        if docker_node:
            machine = self.landscape.get_neighbour_by_type(
                docker_node, 'machine')
            return machine
    return None
def extract_infrastructure_graph(workload_name, ts_from, ts_to):
    """Return the entire landscape graph at the current time.

    Node attributes arrive as strings and are converted to dicts, either
    in parallel worker threads (ParallelLandscape) or inline.

    :param workload_name: unused here (historical; see commented call below)
    :param ts_from: epoch start time (unused in the current implementation)
    :param ts_to: epoch end time (unused in the current implementation)
    :return: landscape graph with attributes converted to dicts
    """
    landscape_ip = ConfigHelper.get("LANDSCAPE", "host")
    landscape_port = ConfigHelper.get("LANDSCAPE", "port")
    subgraph_extraction = SubGraphExtraction(landscape_ip=landscape_ip,
                                             landscape_port=landscape_port)
    # res = subgraph_extraction.get_workload_view_graph(
    #     workload_name, int(ts_from), int(ts_to),
    #     name_filtering_support=True)
    res = landscape.get_graph()
    # NOTE(review): PARALLEL's local assignment was commented out in the
    # original; it is assumed to be defined at module level — confirm.
    if PARALLEL:
        i = 0
        threads = []
        cpu_count = multiprocessing.cpu_count()
        all_node = res.nodes(data=True)
        # Integer chunk size: '//' keeps Python 2 semantics and avoids a
        # float threshold under Python 3 true division.
        no_node_thread = len(res.nodes()) // cpu_count
        node_pool = []
        for node in all_node:
            if i < no_node_thread:
                node_pool.append(node)
                i = i + 1
            else:
                thread1 = ParallelLandscape(
                    i, "Thread-{}".format(InfoGraphNode.get_name(node)), i,
                    node_pool)
                thread1.start()
                threads.append(thread1)
                # Bug fix: start the next pool with the current node (it was
                # previously discarded and never processed). Mirrors the
                # chunking logic used by get_annotated_graph.
                i = 1
                node_pool = [node]
        if len(node_pool) != 0:
            # Flush the final, partially-filled pool.
            thread1 = ParallelLandscape(
                i, "Thread-{}".format(InfoGraphNode.get_name(node)), i,
                node_pool)
            thread1.start()
            threads.append(thread1)
        [t.join() for t in threads]
    else:
        # Serial fallback: normalize每 node's attributes in place.
        for node in res.nodes(data=True):
            attrs = InfoGraphNode.get_attributes(node)
            attrs = InfoGraphUtilities.str_to_dict(attrs)
            InfoGraphNode.set_attributes(node, attrs)
    return res
def annotate_machine_disk_util(internal_graph, node):
    """Copy a disk node's procfs utilization onto its host machine node.

    Skips the copy when the machine already carries the aggregated
    'intel/use/disk/utilization' metric.
    """
    host_name = InfoGraphNode.get_attributes(node)['allocation']
    host = InfoGraphNode.get_node(internal_graph, host_name)
    host_util = InfoGraphNode.get_disk_utilization(host)
    if 'intel/use/disk/utilization' in host_util.columns:
        LOG.debug('Found use disk for node {}'.format(InfoGraphNode.get_name(node)))
        return
    metric = 'intel/procfs/disk/utilization_percentage'
    node_util = InfoGraphNode.get_disk_utilization(node)
    if metric not in node_util.columns:
        LOG.info('Disk util not Found use for node {}'.format(InfoGraphNode.get_name(node)))
        return
    # Store the disk's series under its own name on the machine frame.
    series = node_util[metric].fillna(0)
    host_util[InfoGraphNode.get_attributes(node)['name']] = series
    InfoGraphNode.set_disk_utilization(host, host_util)
def get_metrics(graph, metrics='all'):
    """
    Returns all the metrics associated with the input graph
    :param graph: (NetworkX Graph) Graph to be annotated with data
    :param metrics: metric type to be considered. default = all
    :return: the list of metrics associated with the graph
    """
    result = []
    for node in graph.nodes(data=True):
        name = InfoGraphNode.get_name(node)
        layer = InfoGraphNode.get_layer(node)
        kind = InfoGraphNode.get_type(node)
        # Either raw telemetry-agent metrics or utilization metrics
        # are exported, depending on the caller's choice.
        if metrics == 'all':
            frame = InfoGraphNode.get_telemetry_data(node)
        else:
            frame = InfoGraphNode.get_utilization(node)
        for metric_name in frame.columns.values:
            if metric_name == 'timestamp':
                continue
            label = "{}@{}@{}@{}".format(name, layer, kind, metric_name)
            result.append(label.replace(".", "_"))
    return result
def get_compute_node_view(self, compute_node_hostnames, ts_from=None,
                          ts_to=None, name_filtering_support=False):
    """Return a landscape view for one or more compute nodes.

    :param compute_node_hostnames: a single hostname (str) or a list of
        hostnames; any other type leaves res as None.
    :param ts_from: optional epoch start time
    :param ts_to: optional epoch end time
    :param name_filtering_support: when True, copies each node's name into
        a 'node_name' attribute.
    :return: the (possibly merged) subgraph, or None for unsupported input
    """
    res = None
    if isinstance(compute_node_hostnames, str):
        res = self._get_compute_node_subgraph(compute_node_hostnames,
                                              ts_from, ts_to)
    elif isinstance(compute_node_hostnames, list):
        # Start from the network subgraph and merge each host's view in.
        res = self._get_network_subgraph(ts_from, ts_to)
        for hostname in compute_node_hostnames:
            if isinstance(hostname, str):
                graph = self._get_compute_node_subgraph(
                    hostname, ts_from, ts_to)
                if len(graph.nodes()) > 0:
                    # NOTE(review): return value ignored here, whereas
                    # get_workload_view_graph assigns it back
                    # (res = graphs.merge_graph(...)). Confirm merge_graph
                    # mutates res in place, otherwise merges are lost.
                    graphs.merge_graph(res, graph)
    # NOTE(review): res may still be None here for unsupported input types,
    # which would crash on res.nodes() below — confirm callers' contract.
    if name_filtering_support:
        for node in res.nodes(data=True):
            name = InfoGraphNode.get_name(node)
            InfoGraphNode.set_attribute(node, 'node_name', name)
    return res
def _create_pandas_data_frame_from_graph(graph, metrics='all'):
    """Build one wide DataFrame with all node metrics, merged on timestamp.

    Each metric column is renamed to 'node@layer@type@metric' (dots
    replaced by underscores). Nodes with no data beyond 'timestamp' are
    skipped.

    :param graph: (NetworkX Graph) graph whose nodes carry telemetry
    :param metrics: 'all' exports raw telemetry data, anything else
        exports the utilization frames instead
    :return: pandas.DataFrame outer-joined on 'timestamp'
    """
    result = pandas.DataFrame()
    for node in graph.nodes(data=True):
        node_name = InfoGraphNode.get_name(node)
        node_layer = InfoGraphNode.get_layer(node)
        node_type = InfoGraphNode.get_type(node)
        # This method supports export of either normal metrics coming
        # from telemetry agent or utilization type of metrics.
        if metrics == 'all':
            node_telemetry_data = InfoGraphNode.get_telemetry_data(node)
        else:
            node_telemetry_data = InfoGraphNode.get_utilization(node)
        # Normalize timestamps to rounded ints so frames join cleanly.
        # NOTE(review): this runs before the empty-frame check below, so a
        # frame lacking a 'timestamp' column would raise KeyError — confirm
        # upstream always provides one.
        node_telemetry_data['timestamp'] = node_telemetry_data[
            'timestamp'].astype(float)
        node_telemetry_data['timestamp'] = node_telemetry_data[
            'timestamp'].round()
        node_telemetry_data['timestamp'] = node_telemetry_data[
            'timestamp'].astype(int)
        for metric_name in node_telemetry_data.columns.values:
            if metric_name == 'timestamp':
                continue
            col_name = "{}@{}@{}@{}".\
                format(node_name, node_layer, node_type, metric_name)
            col_name = col_name.replace(".", "_")
            node_telemetry_data = node_telemetry_data.rename(
                columns={metric_name: col_name})
        # Skip nodes that only carry the timestamp column.
        if node_telemetry_data.empty or len(
                node_telemetry_data.columns) <= 1:
            continue
        if result.empty:
            result = node_telemetry_data.copy()
        else:
            node_telemetry_data = \
                node_telemetry_data.drop_duplicates(subset='timestamp')
            result = pandas.merge(result, node_telemetry_data,
                                  how='outer', on='timestamp')
    # TODO: Try with this removed
    # result.set_index(['timestamp'])
    return result
def annotate_machine_network_util(internal_graph, node):
    """Copy a NIC node's procfs utilization onto its host machine node.

    Skips the copy when the machine already carries the aggregated
    'intel/use/network/utilization' metric.
    """
    host_name = InfoGraphNode.get_attributes(node)['allocation']
    host = InfoGraphNode.get_node(internal_graph, host_name)
    host_util = InfoGraphNode.get_network_utilization(host)
    if 'intel/use/network/utilization' in host_util.columns:
        LOG.debug('Found use network for node {}'.format(InfoGraphNode.get_name(node)))
        return
    metric = 'intel/psutil/net/utilization_percentage'
    node_util = InfoGraphNode.get_network_utilization(node)
    if metric not in node_util.columns:
        LOG.info('Net util not Found use for node {}'.format(InfoGraphNode.get_name(node)))
        return
    # Store the NIC's series under its own name on the machine frame.
    series = node_util[metric].fillna(0)
    host_util[InfoGraphNode.get_attributes(node)['name']] = series
    InfoGraphNode.set_network_utilization(host, host_util)
def annotate_machine_pu_util(internal_graph, node):
    """Copy a PU node's procfs CPU utilization onto its host machine node.

    Skips the copy when the machine already carries the aggregated
    'intel/use/compute/utilization' metric.

    :param internal_graph: graph containing both the PU and machine nodes
    :param node: the physical PU node whose utilization is propagated
    """
    source = InfoGraphNode.get_machine_name_of_pu(node)
    machine = InfoGraphNode.get_node(internal_graph, source)
    machine_util = InfoGraphNode.get_compute_utilization(machine)
    if 'intel/use/compute/utilization' not in machine_util.columns:
        cpu_metric = 'intel/procfs/cpu/utilization_percentage'
        pu_util_df = InfoGraphNode.get_compute_utilization(node)
        if cpu_metric in pu_util_df.columns:
            # Fix: dropped dead local 'sum_util = None' (never used).
            pu_util = pu_util_df[cpu_metric].fillna(0)
            machine_util[InfoGraphNode.get_attributes(node)['name']] = pu_util
            InfoGraphNode.set_compute_utilization(machine, machine_util)
        else:
            LOG.info('CPU util not Found use for node {}'.format(InfoGraphNode.get_name(node)))
    else:
        LOG.debug('Found use for node {}'.format(InfoGraphNode.get_name(node)))
def _nova_uuid(self, node):
    """Return the nova VM identifier associated with a node.

    Instance disks resolve to their neighbouring VM; physical machines
    consume one VM from the self.vms list cached by _get_nova_uuids.
    Returns None for other node types or when no VM is available.
    """
    if InfoGraphNode.get_type(node) == NODE_TYPE.INSTANCE_DISK:
        disk_name = InfoGraphNode.get_name(node)
        vm = self.landscape.get_neighbour_by_type(disk_name, "vm")
        return vm
    if InfoGraphNode.get_type(node) == NODE_TYPE.PHYSICAL_MACHINE:
        # Fix: guard the pop — the original raised IndexError when the
        # cached VM list was exhausted/empty.
        if self.vms:
            return self.vms.pop()
        return None
    return None
def filter_graph(graph):
    """
    Returns the graph filtered removing all the nodes with no telemetry
    """
    template_mapping = dict()
    res = graph.copy()
    # Pass 1: strip service templates (saved aside for pass 3) and
    # stringify attributes so they survive JSON conversion in analytics.
    for node in res.nodes(data=True):
        template = node[1]['attributes']['template'] \
            if 'template' in node[1]['attributes'] else None
        # If node is a service node, need to remove the template
        if template:
            template_mapping[InfoGraphNode.get_name(node)] = template
            node[1]['attributes'].pop('template')
        # Fix format for conversion to JSON (happening in analytics)
        node[1]['attributes'] = \
            str(misc.convert_unicode_dict_to_string(node[1]['attributes'])).\
            replace("'", '"')
    # Pass 2: drop nodes whose telemetry holds nothing beyond a timestamp
    # column, except service-layer nodes which are always kept.
    for node in res.nodes(data=True):
        node_name = InfoGraphNode.get_name(node)
        telemetry = InfoGraphNode.get_telemetry_data(node)
        layer = InfoGraphNode.get_layer(node)
        if len(telemetry.columns) <= 1 and \
                not layer == InfoGraphNodeLayer.SERVICE:
            InfoGraphNode.set_telemetry_data(node, dict())
            res.filter_nodes('node_name', node_name)
    # Pass 3: convert attributes back to dicts and restore templates on
    # service-compute nodes.
    for node in res.nodes(data=True):
        string = InfoGraphNode.get_attributes(node)
        attrs = InfoGraphUtilities.str_to_dict(string)
        if InfoGraphNode.get_type(node) == \
                InfoGraphNodeType.SERVICE_COMPUTE:
            attrs['template'] = \
                template_mapping[InfoGraphNode.get_name(node)]
        InfoGraphNode.set_attributes(node, attrs)
    return res
def _disk(self, node):
    """Return the OS storage device name for a node, or None.

    Physical disks/machines expose it via the 'osdev_storage-name'
    attribute; instance disks encode it in the node name after '_'.
    """
    kind = InfoGraphNode.get_type(node)
    if kind in (NODE_TYPE.PHYSICAL_DISK, NODE_TYPE.PHYSICAL_MACHINE):
        return InfoGraphNode.get_attributes(node).get("osdev_storage-name")
    if kind == NODE_TYPE.INSTANCE_DISK:
        return InfoGraphNode.get_name(node).split("_")[1]
    return None
def get_annotated_graph(self, graph, ts_from, ts_to, utilization=True,
                        saturation=True):
    """Annotate a copy of the graph with telemetry, in parallel.

    Nodes are chunked into pools of roughly len(nodes)/cpu_count and each
    pool is annotated by a ParallelTelemetryAnnotation thread. Afterwards
    PU/disk/NIC utilization is propagated up to the machine nodes.

    :param graph: (NetworkX Graph) graph to annotate
    :param ts_from: epoch start time
    :param ts_to: epoch end time
    :param utilization: unused in this body — TODO confirm intent
    :param saturation: unused in this body — TODO confirm intent
    :return: annotated copy of the input graph
    """
    internal_graph = graph.copy()
    i = 0
    threads = []
    cpu_count = multiprocessing.cpu_count()
    # NOTE(review): true division under Python 3 makes this a float
    # threshold; chunk sizes still work but differ from Python 2.
    no_node_thread = len(internal_graph.nodes()) / (cpu_count)
    node_pool = []
    node_pools = []
    for node in internal_graph.nodes(data=True):
        if i < no_node_thread:
            node_pool.append(node)
            i = i + 1
        else:
            # Chunk full: hand it to a worker and start a new pool with
            # the current node.
            thread1 = ParallelTelemetryAnnotation(
                i, "Thread-{}".format(InfoGraphNode.get_name(node)), i,
                node_pool, internal_graph, self.telemetry, ts_to, ts_from)
            threads.append(thread1)
            node_pools.append(node_pool)
            i = 1
            node_pool = [node]
    if len(node_pool) != 0:
        # Flush the final partial pool.
        # NOTE(review): 'node' here is the last loop variable; this raises
        # NameError on an empty graph — confirm callers never pass one.
        node_pools.append(node_pool)
        thread1 = ParallelTelemetryAnnotation(
            i, "Thread-{}".format(InfoGraphNode.get_name(node)), i,
            node_pool, internal_graph, self.telemetry, ts_to, ts_from)
        threads.append(thread1)
    [t.start() for t in threads]
    [t.join() for t in threads]
    # Propagate per-device utilization up to the owning machine nodes.
    for node in internal_graph.nodes(data=True):
        if InfoGraphNode.get_type(node) == InfoGraphNodeType.PHYSICAL_PU:
            self.utils.annotate_machine_pu_util(internal_graph, node)
        elif InfoGraphNode.node_is_disk(node):
            self.utils.annotate_machine_disk_util(internal_graph, node)
        elif InfoGraphNode.node_is_nic(node):
            self.utils.annotate_machine_network_util(internal_graph, node)
    return internal_graph
def _stack(self, node):
    """Return the stack name of the service node owning a VM, or None."""
    if InfoGraphNode.get_type(node) != NODE_TYPE.VIRTUAL_MACHINE:
        return None
    # Taking service node to which the VM is connected
    vm_name = InfoGraphNode.get_name(node)
    for predecessor in self.landscape.predecessors(vm_name):
        candidate = self.landscape.node[predecessor]
        if candidate['type'] == NODE_TYPE.SERVICE_COMPUTE \
                and 'stack_name' in candidate:
            return candidate["stack_name"]
    return None
def get_correlation(node_a, node_b, metric_a, metric_b):
    """Pearson correlation between one metric of each of two nodes.

    The two frames are outer-joined on 'timestamp', rows with missing
    values dropped, and the correlation of the two renamed columns
    returned.

    :param node_a: first graph node
    :param node_b: second graph node
    :param metric_a: column name in node_a's frame ('utilization' selects
        the utilization frame instead of raw telemetry)
    :param metric_b: column name in node_b's frame (same convention)
    :return: correlation coefficient, or 0 when both frames are empty
    :raises ValueError: when a metric is absent from its node's frame
    """
    # TODO: Add node validation
    node_name_a = InfoGraphNode.get_name(node_a)
    node_name_b = InfoGraphNode.get_name(node_b)
    if metric_a == 'utilization':
        telemetry_a = InfoGraphNode.get_utilization(node_a)
    else:
        telemetry_a = InfoGraphNode.get_telemetry_data(node_a)
    if metric_b == 'utilization':
        telemetry_b = InfoGraphNode.get_utilization(node_b)
    else:
        telemetry_b = InfoGraphNode.get_telemetry_data(node_b)
    if metric_a not in telemetry_a.columns.values:
        raise ValueError(
            "Metric {} is not in Telemetry data of Node {}".format(
                metric_a, node_name_a))
    if metric_b not in telemetry_b.columns.values:
        raise ValueError(
            "Metric {} is not in Telemetry data of Node {}".format(
                metric_b, node_name_b))
    if telemetry_a.empty and telemetry_b.empty:
        return 0
    # Fix: removed dead 'res = telemetry_a.corrwith(telemetry_b)' whose
    # result was immediately overwritten below.
    df_a = telemetry_a.\
        rename(columns={metric_a: "a-{}".format(metric_a)}).astype(float)
    df_b = telemetry_b.\
        rename(columns={metric_b: "b-{}".format(metric_b)}).astype(float)
    correlation = pandas.merge(df_a, df_b, how='outer', on='timestamp')
    correlation = correlation.dropna()
    res = correlation["a-{}".format(metric_a)].\
        corr(correlation["b-{}".format(metric_b)])
    return res
def _source_metrics(self, node):
    """
    Retrieves metrics associated with a source/host. The source is
    identified by the node and then all metrics types are collected for
    that source. If the node is physical then the metric types are
    retrieved using just the machine name as the source, if the node is
    virtual then the source (the vm hostname) and the stack name are
    required.
    """
    metric_types = []
    node_layer = InfoGraphNode.get_layer(node)
    node_type = InfoGraphNode.get_type(node)
    if node_layer == GRAPH_LAYER.PHYSICAL \
            or node_type == NODE_TYPE.INSTANCE_DISK:
        try:
            source = self._source(node)
            identifier = source
            query_tags = {"source": source}
            metric_types = self._cached_metrics(identifier, query_tags)
        except Exception as ex:
            # Source resolution can fail on malformed landscape data;
            # log and fall through with an empty result.
            LOG.error('Malformed graph: {}'.format(
                InfoGraphNode.get_name(node)))
            LOG.error(ex)
    elif node_layer == GRAPH_LAYER.VIRTUAL:
        source = self._source(node)
        stack = self._stack(node)
        if stack is not None:
            # Cache key combines VM hostname and stack; the query itself
            # only filters by stack name.
            identifier = "{}-{}".format(source, stack)
            # query_tags = {"source": source, "stack": stack}
            query_tags = {"stack_name": stack}
            metric_types = self._cached_metrics(identifier, query_tags)
    elif node_type == NODE_TYPE.DOCKER_CONTAINER:
        source = self._source(node)
        docker_id = InfoGraphNode.get_docker_id(node)
        if docker_id is not None and source is not None:
            identifier = "{}-{}".format(source, docker_id)
            query_tags = {"docker_id": docker_id, "source": source}
            metric_types = self._cached_metrics(identifier, query_tags)
    return metric_types
def _node_is_nic_on_management_net(node, graph, mng_net_name):
    """Return True when node is a virtual NIC attached to the management
    network named mng_net_name, False otherwise."""
    node_name = InfoGraphNode.get_name(node)
    node_type = InfoGraphNode.get_type(node)
    if node_type not in (InfoGraphNodeType.VIRTUAL_NIC,
                         InfoGraphNodeType.VIRTUAL_NIC_2):
        return False
    for neighbour_name in graph.neighbors(node_name):
        neighbour = InfoGraphNode.get_node(graph, neighbour_name)
        if InfoGraphNode.get_type(neighbour) != \
                InfoGraphNodeType.VIRTUAL_NETWORK:
            continue
        network_name = InfoGraphNode.get_attributes(neighbour)['name']
        if network_name == mng_net_name:
            return True
    return False
def machine_capacity_usage(annotated_subgraph):
    """
    This is a type of fingerprint from the infrastructure perspective
    """
    # TODO: Validate graph
    # TODO: Add a Volume to the workloads to get HD usage
    # TODO: Get telemetry for Memory
    categories = [
        InfoGraphNodeCategory.COMPUTE,
        InfoGraphNodeCategory.NETWORK,
        InfoGraphNodeCategory.STORAGE,
        InfoGraphNodeCategory.MEMORY,
    ]
    fingerprint = {category: 0 for category in categories}
    counter = {category: 0 for category in categories}
    # Work on a filtered copy: only physical, non-machine nodes remain.
    local_subgraph = annotated_subgraph.copy()
    local_subgraph.filter_nodes('layer', "virtual")
    local_subgraph.filter_nodes('layer', "service")
    local_subgraph.filter_nodes('type', 'machine')
    for node in local_subgraph.nodes(data=True):
        category = InfoGraphNode.get_category(node)
        utilization = InfoGraphNode.get_utilization(node)
        if 'utilization' in utilization.columns.values:
            fingerprint[category] += utilization['utilization'].mean()
            counter[category] += 1
    # This is just an average
    # TODO: Improve the average
    for category in categories:
        if counter[category] > 0:
            fingerprint[category] = \
                fingerprint[category] / counter[category]
    return fingerprint
def get_workload_view_graph(self, stack_names, ts_from=None, ts_to=None,
                            name_filtering_support=False):
    """
    Returns a graph which only includes the resources related to the
    execution of the stack names indicated in the input parameter.

    :param stack_names: a single stack name (str) or a list of them
    :param ts_from: optional epoch start time
    :param ts_to: optional epoch end time
    :param name_filtering_support: when True, copies each node's name
        into a 'node_name' attribute
    :return: merged subgraph, or None when no subgraph could be built
    """
    res = None
    if isinstance(stack_names, str):
        res = self._get_workload_subgraph(stack_names, ts_from, ts_to)
    # TODO - URGENT: Check this with the new Lanscape
    elif isinstance(stack_names, list):
        temp_res = list()
        for stack_name in stack_names:
            graph = self._get_workload_subgraph(str(stack_name),
                                                ts_from, ts_to)
            if len(graph.nodes()) > 0:
                temp_res.append(graph)
        for graph in temp_res:
            if not res and len(graph.nodes()) > 0:
                res = graph
            elif len(graph.nodes()) > 0:
                # TODO - URGENT: Fix this. Put Merge within the analytics
                res = graphs.merge_graph(res, graph)
    # Fix: previously fell through and crashed on res.nodes() whenever no
    # subgraph was produced (unsupported input type or all-empty graphs).
    if res is None:
        return None
    # Fix: removed dead machine-count loop whose result was never used.
    if name_filtering_support:
        for node in res.nodes(data=True):
            name = InfoGraphNode.get_name(node)
            InfoGraphNode.set_attribute(node, 'node_name', name)
    return res
def _source(self, node):
    """Resolve the telemetry source identifier for a node, or None.

    Order of precedence: physical 'allocation', VM name (falling back to
    'name'), the machine hosting an instance disk, then the physical
    machine's own 'name'.
    """
    attrs = InfoGraphNode.get_attributes(node)
    node_type = InfoGraphNode.get_type(node)
    if InfoGraphNode.get_layer(node) == GRAPH_LAYER.PHYSICAL \
            and 'allocation' in attrs:
        return attrs['allocation']
    if node_type == NODE_TYPE.VIRTUAL_MACHINE:
        if 'vm_name' in attrs:
            return attrs['vm_name']
        if 'name' in attrs:
            return attrs['name']
    if node_type == NODE_TYPE.INSTANCE_DISK:
        # The machine is the source as this is a libvirt disk.
        disk_name = InfoGraphNode.get_name(node)
        vm = self.landscape.get_neighbour_by_type(
            disk_name, NODE_TYPE.VIRTUAL_MACHINE)
        return self.landscape.get_neighbour_by_type(
            vm, NODE_TYPE.PHYSICAL_MACHINE)
    if node_type == NODE_TYPE.PHYSICAL_MACHINE and 'name' in attrs:
        return attrs['name']
    return None
def saturation_scores(graph):
    """
    Returns a dictionary with the saturation scores of all the nodes of
    the graph, per category (compute/disk/network/memory), each scaled
    to [0, 1].
    :param graph: InfoGraph
    :return: dict[node_name] = score
    """
    res = dict()
    for node in graph.nodes(data=True):
        node_name = InfoGraphNode.get_name(node)
        # Default every category to 0; overwritten below when data exists.
        res[node_name] = {
            'compute': 0,
            'disk': 0,
            'network': 0,
            'memory': 0,
        }
        sat = InfoGraphNode.get_saturation(node)
        # Fix: removed dead per-iteration import of analytics_engine.common
        # (the bound LOG was never used) and replaced 'sat == None' with an
        # identity check.
        if isinstance(sat, pandas.DataFrame):
            if sat.empty:
                continue
        elif sat is None:
            continue
        if 'intel/use/compute/saturation' in sat:
            res[node_name]['compute'] = (
                sat.get('intel/use/compute/saturation').mean()) / 100.0
        if 'intel/use/memory/saturation' in sat:
            res[node_name]['memory'] = (
                sat.get('intel/use/memory/saturation').mean()) / 100.0
        if 'intel/use/disk/saturation' in sat:
            res[node_name]['disk'] = (
                sat.get('intel/use/disk/saturation').mean()) / 100.0
        if 'intel/use/network/saturation' in sat:
            res[node_name]['network'] = (
                sat.get('intel/use/network/saturation').mean()) / 100.0
    return res
def get_queries(self, graph, node, ts_from, ts_to):
    """Build the telemetry queries for one node's metrics.

    :param graph: graph the node belongs to (unused here, kept for API)
    :param node: node whose metrics are queried
    :param ts_from: epoch start time
    :param ts_to: epoch end time
    :return: list of {"<metric>_<node_name>": query} dicts; empty for
        service-layer nodes and for metrics whose query build failed
    """
    node_name = InfoGraphNode.get_name(node)
    node_layer = InfoGraphNode.get_layer(node)
    queries = list()
    # No point to ask for Service Resources
    if node_layer == InfoGraphNodeLayer.SERVICE:
        return queries
    for metric in self._get_metrics(node):
        try:
            query = self._build_query(metric, node, ts_from, ts_to)
        except Exception:
            LOG.error('Exception for metric: {}'.format(metric))
            # Fix: previously execution fell through to the append below,
            # raising NameError on the first failure (query unbound) or
            # silently re-appending the previous metric's stale query.
            continue
        queries.append({"{}_{}".format(metric, node_name): query})
    return queries
def _get_nova_uuids(self, node):
    """Cache on self.vms the VM neighbours of a physical machine node.

    No-op for any other node type.
    """
    if InfoGraphNode.get_type(node) != NODE_TYPE.PHYSICAL_MACHINE:
        return
    phy_name = InfoGraphNode.get_name(node)
    self.vms = self.landscape.get_neighbours_by_type(phy_name, "vm")
def compute_node_resources(annotated_subgraph, hostname=None):
    """
    This is a type of fingerprint from the infrastructure perspective.

    Collects, for every physical non-core node (optionally restricted to
    one host via its 'allocation' attribute), the utilization series and
    its summary statistics.

    :param annotated_subgraph: graph annotated with utilization data
    :param hostname: when set, only nodes allocated to this host count
    :return: [data, statistics] — dicts keyed by node name
    """
    # TODO: Validate graph
    data = dict()
    statistics = dict()
    local_subgraph = annotated_subgraph.copy()
    for node in local_subgraph.nodes(data=True):
        layer = InfoGraphNode.get_layer(node)
        if layer == InfoGraphNodeLayer.VIRTUAL:
            continue
        if layer == InfoGraphNodeLayer.SERVICE:
            continue
        # Fix: renamed 'type' (shadowed builtin) to node_type.
        node_type = InfoGraphNode.get_type(node)
        if node_type == 'core':
            continue
        # If hostname has been specified, need to take into account only
        # nodes that are related to the specific host
        attrs = InfoGraphNode.get_attributes(node)
        allocation = attrs.get('allocation')
        if hostname and not hostname == allocation:
            continue
        name = InfoGraphNode.get_name(node)
        utilization = InfoGraphNode.get_utilization(node)
        try:
            # Drop the timestamp column before computing statistics.
            utilization = utilization.drop('timestamp', axis=1)
        except (KeyError, ValueError):
            # Fix: newer pandas raises KeyError (older: ValueError) for a
            # missing label — catch both and keep the original frame.
            utilization = InfoGraphNode.get_utilization(node)
        data[name] = utilization
        if not data[name].empty:
            series = data[name]['utilization']
            mean = series.mean()
            median = series.median()
            # Fix: renamed 'min' (shadowed builtin) to minimum.
            minimum = series.min()
            maximum = series.max()
            var = series.var()
            std_dev = math.sqrt(var)
        else:
            mean = median = minimum = maximum = var = std_dev = 0
        statistics[name] = \
            {'mean': mean,
             'median': median,
             'min': minimum,
             'max': maximum,
             'var': var,
             'std_dev': std_dev}
    return [data, statistics]
def run(self, workload, optimal_node_type='machine'):
    """
    Ranks machines by CPU utilization.

    Builds one row per node of optimal_node_type (plus docker containers
    for non-optimal workloads), sorted by the configured utilization
    fields; nodes without telemetry are appended at the end unless
    'telemetry_filter' is set in the workload configuration.

    :param workload: Contains workload related info and results.
    :param optimal_node_type: node type to rank (default 'machine')
    :return: heuristic results
    :raises KeyError: when the workload carries no graph
    """
    workload_config = workload.get_configuration()
    graph = workload.get_latest_graph()
    if not graph:
        raise KeyError('No graph to be processed.')
    scores = LandscapeScore.utilization_scores(graph)
    scores_sat = LandscapeScore.saturation_scores(graph)
    heuristic_results = pd.DataFrame(columns=[
        'node_name', 'type', 'ipaddress',
        'compute utilization', 'compute saturation',
        'memory utilization', 'memory saturation',
        'network utilization', 'network saturation',
        'disk utilization', 'disk saturation',
    ])
    # Separate frame for nodes lacking telemetry ("nt").
    heuristic_results_nt = heuristic_results.copy()
    device_id_col_name = None
    project = None
    if workload_config.get('project'):
        project = workload_config['project']
        device_id_col_name = workload_config['project'] + '_device_id'
        heuristic_results[device_id_col_name] = None
    telemetry_filter = workload_config.get('telemetry_filter')
    for node in graph.nodes(data=True):
        node_name = InfoGraphNode.get_name(node)
        node_type = InfoGraphNode.get_type(node)
        list_node_name = node_name
        if node_type == optimal_node_type:
            # Display the VM name when available instead of the node name.
            if InfoGraphNode.node_is_vm(node):
                vm_name = InfoGraphNode.get_properties(node).get('vm_name')
                if vm_name:
                    list_node_name = vm_name
            data = {
                'node_name': list_node_name,
                'type': node_type,
                'ipaddress':
                    InfoGraphNode.get_attributes(node).get('ipaddress'),
                'compute utilization': scores[node_name]['compute'],
                'compute saturation': scores_sat[node_name]['compute'],
                'memory utilization': scores[node_name]['memory'],
                'memory saturation': scores_sat[node_name]['memory'],
                'network utilization': scores[node_name]['network'],
                'network saturation': scores_sat[node_name]['network'],
                'disk utilization': scores[node_name]['disk'],
                'disk saturation': scores_sat[node_name]['disk']
            }
            if device_id_col_name:
                dev_id = InfoGraphNode.get_properties(node).get(
                    device_id_col_name)
                # Fix: guard against a missing device id before replace().
                if project == 'mf2c' and dev_id is not None:
                    dev_id = dev_id.replace('_', '-')
                data[device_id_col_name] = dev_id
            if InfoGraphNode.get_properties(node).get(
                    "telemetry_data") is not None:
                heuristic_results = heuristic_results.append(
                    data, ignore_index=True)
            elif not telemetry_filter:
                # Bug fix: append to heuristic_results_nt (was
                # heuristic_results), which discarded previously collected
                # no-telemetry rows and duplicated telemetry rows.
                heuristic_results_nt = heuristic_results_nt.append(
                    data, ignore_index=True)
        if not workload.get_workload_name().startswith('optimal_'):
            if InfoGraphNode.get_type(
                    node) == "docker_container" \
                    and optimal_node_type == 'machine':
                node_name = InfoGraphNode.get_docker_id(node)
                heuristic_results = heuristic_results.append(
                    {
                        'node_name': node_name,
                        'type': node_type,
                        'ipaddress': None,
                        'compute utilization': scores[node_name]['compute'],
                        'compute saturation': None,
                        'memory utilization': scores[node_name]['memory'],
                        'memory saturation': None,
                        'network utilization': scores[node_name]['network'],
                        'network saturation': None,
                        'disk utilization': scores[node_name]['disk'],
                        'disk saturation': None
                    },
                    ignore_index=True)
    # Sort fields default to CPU; overridable via 'sort_order' config.
    sort_fields = ['compute utilization']
    sort_order = workload_config.get('sort_order')
    if sort_order:
        sort_fields = []
        for val in sort_order:
            if val == 'cpu':
                sort_fields.append('compute utilization')
            if val == 'memory':
                sort_fields.append('memory utilization')
            if val == 'network':
                sort_fields.append('network utilization')
            if val == 'disk':
                sort_fields.append('disk utilization')
    heuristic_results_nt = heuristic_results_nt.replace([0], [None])
    heuristic_results = heuristic_results.sort_values(by=sort_fields,
                                                      ascending=True)
    heuristic_results = heuristic_results.append(heuristic_results_nt,
                                                 ignore_index=True)
    workload.append_metadata(self.__filter_name__, heuristic_results)
    LOG.info('AVG: {}'.format(heuristic_results))
    return heuristic_results
def utilization_scores(graph):
    """
    Returns a dictionary with the utilization scores of all the nodes of
    the graph, per category (compute/disk/network/memory), each scaled
    to [0, 1].
    :param graph: InfoGraph
    :return: dict[node_name] = score
    """
    res = dict()
    for node in graph.nodes(data=True):
        node_name = InfoGraphNode.get_name(node)
        res[node_name] = dict()
        util = InfoGraphNode.get_utilization(node)
        # NOTE(review): per-iteration import; LOG is unused in this body.
        import analytics_engine.common as common
        LOG = common.LOG
        res[node_name]['compute'] = 0
        res[node_name]['disk'] = 0
        res[node_name]['network'] = 0
        res[node_name]['memory'] = 0
        # Skip nodes without utilization data.
        # NOTE(review): 'util==None' is an equality (not identity) check.
        if (isinstance(util, pandas.DataFrame) and util.empty) or \
                (not isinstance(util, pandas.DataFrame) and util==None):
            continue
        # intel/use/ metrics take precedence over the procfs/psutil ones.
        if 'intel/use/compute/utilization' in util:
            res[node_name]['compute'] = (
                util.get('intel/use/compute/utilization').mean()) / 100.0
        elif 'intel/procfs/cpu/utilization_percentage' in util:
            res[node_name]['compute'] = (util.get(
                'intel/procfs/cpu/utilization_percentage').mean()) / 100.0
        if 'intel/use/memory/utilization' in util:
            res[node_name]['memory'] = (
                util.get('intel/use/memory/utilization').mean()) / 100.0
        elif 'intel/procfs/memory/utilization_percentage' in util:
            res[node_name]['memory'] = (
                util.get('intel/procfs/memory/utilization_percentage'
                         ).mean()) / 100.0
        if 'intel/use/disk/utilization' in util:
            res[node_name]['disk'] = (
                util.get('intel/use/disk/utilization').mean()) / 100.0
        elif 'intel/procfs/disk/utilization_percentage' in util:
            res[node_name]['disk'] = (util.get(
                'intel/procfs/disk/utilization_percentage').mean()) / 100.0
        if 'intel/use/network/utilization' in util:
            res[node_name]['network'] = (
                util.get('intel/use/network/utilization').mean()) / 100.0
        elif 'intel/psutil/net/utilization_percentage' in util:
            res[node_name]['network'] = (util.get(
                'intel/psutil/net/utilization_percentage').mean()) / 100.0
        # special handling of cpu, disk & network utilization if node is
        # a machine: average the per-device columns gathered on the
        # machine's frames.
        # NOTE(review): adding the 'total' column mutates the node's own
        # DataFrame in place, and the row average includes every column
        # present (including 'timestamp' if it is there) — confirm.
        if InfoGraphNode.node_is_machine(node):
            # mean from all cpu columns
            cpu_util = InfoGraphNode.get_compute_utilization(node)
            cpu_util['total'] = [
                sum(row) / len(row) for index, row in cpu_util.iterrows()
            ]
            res[node_name]['compute'] = cpu_util['total'].mean() / 100
            # mean from all disk columns
            disk_util = InfoGraphNode.get_disk_utilization(node)
            if disk_util.empty:
                res[node_name]['disk'] = 0.0
            else:
                disk_util['total'] = [
                    sum(row) / len(row)
                    for index, row in disk_util.iterrows()
                ]
                res[node_name]['disk'] = disk_util['total'].mean() / 100
            # mean from all nic columns
            net_util = InfoGraphNode.get_network_utilization(node)
            if net_util.empty:
                res[node_name]['network'] = 0.0
            else:
                net_util['total'] = [
                    sum(row) / len(row)
                    for index, row in net_util.iterrows()
                ]
                res[node_name]['network'] = net_util['total'].mean() / 100
        # custom metric: docker containers are re-keyed by docker id.
        if InfoGraphNode.get_type(
                node) == InfoGraphNodeType.DOCKER_CONTAINER:
            node_name = InfoGraphNode.get_docker_id(node)
            res[node_name] = {}
            if 'intel/docker/stats/cgroups/cpu_stats/cpu_usage/percentage' in util.columns:
                res[node_name]['compute'] = util[
                    'intel/docker/stats/cgroups/cpu_stats/cpu_usage/percentage'].mean(
                    ) / 100
            else:
                res[node_name]['compute'] = 0
            if 'intel/docker/stats/cgroups/memory_stats/usage/percentage' in util.columns:
                res[node_name]['memory'] = util[
                    'intel/docker/stats/cgroups/memory_stats/usage/percentage'].mean(
                    ) / 100
            else:
                res[node_name]['memory'] = 0
            if 'intel/docker/stats/network/utilization_percentage' in util.columns:
                res[node_name]['network'] = util[
                    'intel/docker/stats/network/utilization_percentage'].mean(
                    ) / 100
            else:
                res[node_name]['network'] = 0
            if 'intel/docker/stats/cgroups/blkio_stats/io_time_recursive/percentage' in util.columns:
                res[node_name]['disk'] = util[
                    'intel/docker/stats/cgroups/blkio_stats/io_time_recursive/percentage'].mean(
                    ) / 100
            else:
                res[node_name]['disk'] = 0
    return res
def get_annotated_graph(self, graph, ts_from, ts_to, utilization=False,
                        saturation=False):
    """
    Collect data from cimmaron tsdb in relation to the specified graph
    and time windows and store an annotated subgraph in specified
    directory.

    Dispatches on the telemetry backend: SnapAnnotation and the generic
    fallback also compute utilization/saturation; the Prometheus branch
    currently only attaches queries and raw telemetry (propagation code
    is commented out pending porting).

    :param graph: (NetworkX Graph) Graph to be annotated with data
    :param ts_from: (str) Epoch time representation of start time
    :param ts_to: (str) Epoch time representation of stop time
    :param utilization: (bool) if True the method calculates also
        utilization for each node, if available
    :param saturation: (bool) if True, saturation is also annotated
    :return: NetworkX Graph annotated with telemetry data
    """
    TelemetryAnnotation._get_annotated_graph_input_validation(
        graph, ts_from, ts_to)
    internal_graph = graph.copy()
    self.internal_graph = internal_graph
    for node in internal_graph.nodes(data=True):
        if isinstance(self.telemetry, SnapAnnotation):
            queries = list()
            try:
                queries = self.telemetry.get_queries(
                    internal_graph, node, ts_from, ts_to)
            except Exception as e:
                # Best-effort: log and continue with no queries.
                LOG.error("Exception: {}".format(e))
                LOG.error(e)
                import traceback
                traceback.print_exc()
            if len(queries) != 0:
                InfoGraphNode.set_queries(node, queries)
            telemetry_data = self.telemetry.get_data(node)
            InfoGraphNode.set_telemetry_data(node, telemetry_data)
            if utilization and not telemetry_data.empty:
                SnapUtils.utilization(internal_graph, node, self.telemetry)
                # if only procfs is available, results needs to be
                # propagated at machine level
                if InfoGraphNode.get_type(
                        node) == InfoGraphNodeType.PHYSICAL_PU:
                    SnapUtils.annotate_machine_pu_util(
                        internal_graph, node)
                if InfoGraphNode.node_is_disk(node):
                    SnapUtils.annotate_machine_disk_util(
                        internal_graph, node)
                if InfoGraphNode.node_is_nic(node):
                    SnapUtils.annotate_machine_network_util(
                        internal_graph, node)
            if saturation:
                SnapUtils.saturation(internal_graph, node, self.telemetry)
        elif isinstance(self.telemetry, PrometheusAnnotation):
            queries = list()
            try:
                queries = self.telemetry.get_queries(
                    internal_graph, node, ts_from, ts_to)
            except Exception as e:
                LOG.error("Exception: {}".format(e))
                LOG.error(e)
                import traceback
                traceback.print_exc()
            if len(queries) != 0:
                InfoGraphNode.set_queries(node, queries)
            telemetry_data = self.telemetry.get_data(node)
            InfoGraphNode.set_telemetry_data(node, telemetry_data)
            # TODO(review): utilization/saturation propagation for the
            # Prometheus backend is commented out in the original
            # (PrometheusUtils.utilization / annotate_machine_*_util /
            # saturation) — port it when PrometheusUtils is ready.
        else:
            # Generic backend: same utilization propagation as Snap, but
            # the PU-to-machine roll-up is done inline.
            telemetry_data = self.telemetry.get_data(node)
            InfoGraphNode.set_telemetry_data(node, telemetry_data)
            if utilization and not telemetry_data.empty:
                SnapUtils.utilization(internal_graph, node, self.telemetry)
                # if only procfs is available, results needs to be
                # propagated at machine level
                if InfoGraphNode.get_type(
                        node) == InfoGraphNodeType.PHYSICAL_PU:
                    source = InfoGraphNode.get_machine_name_of_pu(node)
                    machine = InfoGraphNode.get_node(
                        internal_graph, source)
                    machine_util = InfoGraphNode.get_compute_utilization(
                        machine)
                    # NOTE(review): this checks the key with a leading
                    # slash ('/intel/...') unlike the helpers elsewhere
                    # ('intel/...') — confirm which form the frames use.
                    if '/intel/use/compute/utilization' not in machine_util.columns:
                        sum_util = None
                        pu_util = InfoGraphNode.get_compute_utilization(
                            node
                        )['intel/procfs/cpu/utilization_percentage']
                        pu_util = pu_util.fillna(0)
                        # Accumulate this PU's series onto the machine's.
                        if 'intel/procfs/cpu/utilization_percentage' in machine_util.columns:
                            machine_util = machine_util[
                                'intel/procfs/cpu/utilization_percentage']
                            machine_util = machine_util.fillna(0)
                            sum_util = machine_util.add(pu_util,
                                                        fill_value=0)
                        else:
                            sum_util = pu_util
                        if isinstance(sum_util, pandas.Series):
                            sum_util = pandas.DataFrame(
                                sum_util,
                                columns=[
                                    'intel/procfs/cpu/utilization_percentage'
                                ])
                        InfoGraphNode.set_compute_utilization(
                            machine, sum_util)
                    else:
                        LOG.debug('Found use for node {}'.format(
                            InfoGraphNode.get_name(node)))
            if saturation:
                self._saturation(internal_graph, node, self.telemetry)
    return internal_graph