class Monitoring_Controller:
    """Collect Ganglia monitoring data for the proxy, web and backend nodes.

    The controller keeps the ``ServicePerformance`` object in sync with the
    nodes reported by the manager, reads per-node metrics from the Ganglia RRD
    files through ``rrdtool fetch`` and reports every node's boot time and
    detected instance type to the cost controller.
    """

    # Metrics that feed the instance type detection / VM usage accounting.
    _INSTANCE_METRICS = ('cpu_num', 'mem_total', 'boottime')

    def __init__(self, logger, cost_controller, config_parser, config_file_path,
                 manager_host, manager_port, process_state, ganglia_rrd_dir):
        """Store the collaborators and load the configuration file.

        :param logger: object used for all debug/info/warning/error output.
        :param cost_controller: receives ``update_vm_usage`` notifications.
        :param config_parser: parser whose ``read`` is called on the file.
        :param config_file_path: path handed to ``config_parser.read``.
        :param manager_host: host used for the manager client calls.
        :param manager_port: port used for the manager client calls.
        :param process_state: forwarded to every new ``ServiceNodePerf``.
        :param ganglia_rrd_dir: directory holding one sub-directory of RRD
            files per monitored host.

        Exits the process with status 1 when the configuration cannot be read.
        """
        self.cost_controller = cost_controller
        self.config_parser = config_parser
        self.manager_host = manager_host
        self.manager_port = manager_port
        self.logger = logger
        self.process_state = process_state
        self.ganglia_rrd_dir = ganglia_rrd_dir
        self.last_collect_time = time()
        self.stat_utils = StatUtils()

        try:
            self.config_parser.read(config_file_path)
        except Exception:
            sys.stderr.write('Failed to read configuration file\n')
            sys.exit(1)

        self.perf_info = ServicePerformance()
        self._performance_info_set(self.perf_info)

        # Metrics are processed in list order, so 'cpu_num' and 'mem_total'
        # must come before 'boottime' for the instance type detection to see
        # them.
        self.monitoring_metrics_web = [
            'web_request_rate', 'web_response_time', 'cpu_user', 'boottime']
        self.monitoring_metrics_backend = [
            'php_request_rate', 'php_response_time', 'cpu_user', 'cpu_system',
            'cpu_num', 'mem_total', 'boottime']
        self.monitoring_metrics_proxy = [
            'web_request_rate_lb', 'web_response_time_lb',
            'php_request_rate_lb', 'php_response_time_lb', 'cpu_user',
            'boottime']

    def _performance_info_get(self):
        """Return the currently stored performance information object."""
        return self.performance_info

    def _performance_info_set(self, perf_info):
        """Store ``perf_info`` as the current performance information."""
        self.performance_info = perf_info

    def nodes_info_update(self, killed_backends):
        """Synchronise the known service nodes with the manager's node list.

        Nodes listed by the manager are flagged with their role (or created),
        missing IPs are looked up through the manager, and nodes that are no
        longer registered or whose id is in ``killed_backends`` are removed.

        :param killed_backends: container of node ids to drop from the list.
        """
        # conpaas_init_ssl_ctx(self.certs_dir, 'manager')
        print('MANAGER %s' % self.manager_host)
        print('PORT %s' % self.manager_port)
        nodes = client.list_nodes(self.manager_host, self.manager_port)
        self.logger.debug('Got update info from manager')

        perf_info = self._performance_info_get()
        perf_info.reset_role_info()
        self.logger.debug('Updating nodes...')

        # (manager key, role attribute, (proxy, web, backend) constructor flags)
        roles = (
            ('proxy', 'isRunningProxy', (True, False, False)),
            ('web', 'isRunningWeb', (False, True, False)),
            ('backend', 'isRunningBackend', (False, False, True)),
        )
        for role, role_attr, flags in roles:
            for node_id in nodes[role]:
                node = perf_info.serviceNodes.get(node_id)
                if node is not None:
                    node.registered_with_manager = True
                    setattr(node, role_attr, True)
                else:
                    perf_info.serviceNodes[node_id] = ServiceNodePerf(
                        node_id, '', flags[0], flags[1], flags[2],
                        self.process_state)

        self.logger.info('Filtering backend nodes killed_backends : '
                         + str(killed_backends) + ' '
                         + str(perf_info.serviceNodes))

        # Iterate over a snapshot: entries are deleted inside the loop.
        for node_id, node in list(perf_info.serviceNodes.items()):
            if node.ip == '':
                response = client.get_node_info(
                    self.manager_host, self.manager_port, node_id)
                node.ip = response['serviceNode']['ip']
            if not node.registered_with_manager:
                del perf_info.serviceNodes[node_id]
            # FIXME TO FILTER REMOVE OF BACKENDS
            if node_id in killed_backends:
                self.logger.info('Filtered backend with id: ' + str(node_id))
                try:
                    del perf_info.serviceNodes[node_id]
                except KeyError:
                    self.logger.warning(
                        'Backend already removed or not containing in serviceNodes: '
                        + str(node_id))

        self._performance_info_set(perf_info)
        self.logger.info('Updating nodes information from ConPaaS manager...')
        self.logger.info('Updated service nodes: %s' % str(perf_info.serviceNodes))

    def _ganglia_dir_for(self, node_ip):
        """Return the name of the Ganglia RRD sub-directory used for ``node_ip``.

        Added for EC2, where the RRD directory names in Ganglia are host names
        and not IPs. Returns ``''`` when no directory entry matches.
        """
        if node_ip.find('amazonaws') > 0:
            # Already a DNS name: use it as is.
            return node_ip

        # Otherwise look for the directory named after the reverse-resolved
        # host name. The lookup is done once, on the first candidate entry.
        hostname = None
        for ganglia_host in listdir(self.ganglia_rrd_dir):
            if ganglia_host.find('Summary') > 0:
                continue
            if hostname is None:
                try:
                    hostname = socket.gethostbyaddr(node_ip)[0]
                except Exception as ex:
                    self.logger.warning(
                        'Found private ip when trying to get the hostname for ip %s: %s. '
                        % (str(node_ip), ex))
                    return node_ip
            if ganglia_host == hostname:
                return ganglia_host
        return ''

    @staticmethod
    def _parse_fetch_output(output):
        """Parse ``rrdtool fetch`` text into ``[timestamps, values]``.

        Lines containing ``sum`` (the column header) or fewer than two tokens
        are skipped. ``nan`` samples are stored as ``-1``.
        """
        timestamps = []
        param_values = []
        for line in output.splitlines():
            tokens = line.split()
            if line.find('sum') >= 0 or len(tokens) < 2:
                continue
            timestamps.append(int(tokens[0].replace(':', '')))
            if tokens[1].find('nan') < 0:
                param_values.append(float(tokens[1]))
            else:
                param_values.append(-1)
        return [timestamps, param_values]

    def collect_monitoring_metric(self, node_ip, metric_name):
        """Fetch the recent samples of one metric for one node.

        :param node_ip: address (or EC2 host name) of the node.
        :param metric_name: metric whose ``<metric_name>.rrd`` file is read.
        :returns: ``[]`` when the RRD file does not exist, otherwise
            ``[timestamps, values]`` where ``nan`` samples are ``-1``.
        """
        ganglia_dir_name = self._ganglia_dir_for(node_ip)
        rrd_file_name = path.join(
            self.ganglia_rrd_dir, ganglia_dir_name, metric_name + '.rrd')

        if not path.isfile(rrd_file_name):
            self.logger.error('RRD file not found: ' + rrd_file_name)
            return []

        # Look back as far before the last collection as the time elapsed
        # since that collection.
        collect_from = self.last_collect_time - (time() - self.last_collect_time)
        fetch_cmd = ['rrdtool', 'fetch', '-s', str(int(collect_from)),
                     '-r', '15', str(rrd_file_name), 'AVERAGE']
        self.logger.debug("Fetching data with command: %s" % ' '.join(fetch_cmd))

        # universal_newlines gives text output on both Python 2 and 3.
        proc = Popen(fetch_cmd, stdout=PIPE, stderr=PIPE, close_fds=True,
                     universal_newlines=True)
        # communicate() waits for the child and closes the pipes, so no
        # terminate() is needed afterwards (the PID is already reaped).
        stdout_req, stderr_req = proc.communicate()
        if proc.returncode != 0:
            self.logger.warning('rrdtool fetch exited with status %s for %s: %s'
                                % (proc.returncode, rrd_file_name, stderr_req))

        return self._parse_fetch_output(stdout_req)

    def init_collect_monitoring_data(self):
        """Reload ``self.perf_info`` from the stored performance information."""
        self.perf_info = self._performance_info_get()  # FIXME: dead code?

    def _store_metric(self, node_data, node_ip, metric, role):
        """Fetch ``metric`` for a node and record it in ``node_data``.

        :returns: the list of samples, or ``None`` when no data was found
            (a warning naming the ``role`` is logged in that case).
        """
        ret = self.collect_monitoring_metric(node_ip, metric)
        if len(ret) == 0:
            # monitoring data was not found
            self.logger.warning(
                "Could not retrieve data for metric %s: failed to collect all data for %s node."
                % (metric, role))
            return None
        if 'timestamps' not in node_data:
            node_data['timestamps'] = ret[0]
        node_data[metric] = ret[1]
        return ret[1]

    def _report_vm_usage(self, node_ip, boottime, cpu_num, mem_total):
        """Report a node's boot time and detected instance type."""
        self.cost_controller.update_vm_usage(
            node_ip, boottime,
            self.cost_controller.instance_type_detector(cpu_num, mem_total))

    def _first_valid_sample(self, metric, values):
        """Return the first strictly positive sample of ``values`` or ``None``.

        This cleans the negative and wrong values out of the monitoring data.
        """
        if values and values[0] > 0:
            self.logger.info('There is a metric %s with content: %s'
                             % (metric, str(values[0])))
            return values[0]
        self.logger.info('There is a metric %s with content equal or minus to zero '
                         % metric)
        for value in values:
            if value > 0:
                self.logger.info('There is a metric %s with content: %s'
                                 % (metric, str(value)))
                return value
        return None

    def _collect_simple_node(self, node_ip, metric_names, role,
                             skip_invalid_boottime):
        """Collect all metrics of a web or proxy node.

        :param role: ``'web'`` or ``'proxy'``, used in the warning message.
        :param skip_invalid_boottime: when true a ``-1`` boot time is not
            reported to the cost controller.
        :returns: the metric dictionary of the node, or ``None`` as soon as
            one metric could not be retrieved.
        """
        node_data = {}
        cpu_num = DEFAULT_NUM_CPU
        mem_total = DEFAULT_RAM_MEMORY
        for metric in metric_names:
            values = self._store_metric(node_data, node_ip, metric, role)
            if values is None:
                return None
            if metric not in self._INSTANCE_METRICS:
                continue
            if not values:
                self.logger.warning('No samples returned for metric %s of %s node %s'
                                    % (metric, role, node_ip))
                continue
            first = values[0]
            if metric == 'cpu_num':
                if first != -1:
                    cpu_num = first
            elif metric == 'mem_total':
                if first != -1:
                    mem_total = str(first)
            elif first != -1 or not skip_invalid_boottime:
                self._report_vm_usage(node_ip, first, cpu_num, mem_total)
        return node_data

    def collect_monitoring_data(self):
        """Collect the metrics of every web, backend and proxy node.

        The collected data is printed; ``last_collect_time`` is refreshed on
        success.

        :returns: ``True`` on success, ``False`` as soon as one metric of one
            node could not be retrieved.
        """
        web_monitoring_data = {}
        backend_monitoring_data = {}
        proxy_monitoring_data = {}

        for web_node in self.perf_info.getWebServiceNodes():
            self.logger.info('Getting web monitoring info for %s ...' % web_node.ip)
            node_data = self._collect_simple_node(
                web_node.ip, self.monitoring_metrics_web, 'web', True)
            if node_data is None:
                return False
            web_monitoring_data[web_node.ip] = node_data

        for backend_node in self.perf_info.getBackendServiceNodes():
            self.logger.info('Getting backend monitoring info for %s ...'
                             % backend_node.ip)
            node_data = {}
            backend_monitoring_data[backend_node.ip] = node_data
            cpu_num = DEFAULT_NUM_CPU
            mem_total = DEFAULT_RAM_MEMORY
            # Metrics are visited in list order so that cpu_num and mem_total
            # are known when boottime triggers the instance type detection.
            for metric in self.monitoring_metrics_backend:
                values = self._store_metric(
                    node_data, backend_node.ip, metric, 'backend')
                if values is None:
                    return False
                self.logger.info('There is a metric name: ' + str(metric))
                if metric not in self._INSTANCE_METRICS:
                    continue
                first_is_valid = bool(values) and values[0] > 0
                sample = self._first_valid_sample(metric, values)
                if sample is None:
                    continue
                if metric == 'boottime':
                    # Report the valid sample (not the invalid leading one).
                    self._report_vm_usage(
                        backend_node.ip, sample, cpu_num, mem_total)
                    continue
                # NOTE(review): historically the value is passed as a string
                # only when it is the leading sample; kept as is because the
                # type expected by instance_type_detector is not visible here.
                if first_is_valid:
                    sample = str(sample)
                if metric == 'cpu_num':
                    cpu_num = sample
                else:
                    mem_total = sample

        for proxy_node in self.perf_info.getProxyServiceNodes():
            self.logger.info('Getting proxy monitoring info for %s ...'
                             % proxy_node.ip)
            node_data = self._collect_simple_node(
                proxy_node.ip, self.monitoring_metrics_proxy, 'proxy', True)
            if node_data is None:
                return False
            proxy_monitoring_data[proxy_node.ip] = \
                self.stat_utils.filter_monitoring_data(
                    node_data, self.monitoring_metrics_proxy)

        print(proxy_monitoring_data)
        print(web_monitoring_data)
        print(backend_monitoring_data)
        self.last_collect_time = time()
        print("Done getting monitoring data...")
        return True

    def collect_monitoring_data_web(self):
        """Collect the metrics of every web node.

        :returns: ``{node_ip: {metric: samples}}``, or ``{}`` as soon as one
            metric could not be retrieved.
        """
        web_monitoring_data = {}
        for web_node in self.perf_info.getWebServiceNodes():
            self.logger.info('Getting web monitoring info for %s ...' % web_node.ip)
            node_data = self._collect_simple_node(
                web_node.ip, self.monitoring_metrics_web, 'web', False)
            if node_data is None:
                return {}
            web_monitoring_data[web_node.ip] = node_data
        return web_monitoring_data

    def collect_monitoring_data_backend(self):
        """Collect the metrics of every backend node.

        The boot time is always reported to the cost controller, using the
        first positive sample when there is one and the leading sample
        otherwise.

        :returns: ``{node_ip: {metric: samples}}``, or ``{}`` as soon as one
            metric could not be retrieved.
        """
        backend_monitoring_data = {}
        for backend_node in self.perf_info.getBackendServiceNodes():
            self.logger.info('Getting backend monitoring info for %s ...'
                             % backend_node.ip)
            node_data = {}
            backend_monitoring_data[backend_node.ip] = node_data
            cpu_num = DEFAULT_NUM_CPU
            mem_total = DEFAULT_RAM_MEMORY
            # Metrics are visited in list order so that cpu_num and mem_total
            # are known when boottime triggers the instance type detection.
            for metric in self.monitoring_metrics_backend:
                values = self._store_metric(
                    node_data, backend_node.ip, metric, 'backend')
                if values is None:
                    return {}
                if metric not in self._INSTANCE_METRICS:
                    continue
                sample = self._first_valid_sample(metric, values)
                if metric == 'cpu_num':
                    if sample is not None:
                        cpu_num = sample
                elif metric == 'mem_total':
                    if sample is not None:
                        mem_total = sample
                elif sample is not None:
                    self._report_vm_usage(
                        backend_node.ip, sample, cpu_num, mem_total)
                elif values:
                    # No valid sample: fall back to the leading one.
                    self._report_vm_usage(
                        backend_node.ip, float(values[0]), cpu_num, mem_total)
                else:
                    self.logger.warning(
                        'No samples returned for metric %s of backend node %s'
                        % (metric, backend_node.ip))
        return backend_monitoring_data

    def collect_monitoring_data_proxy(self):
        """Collect and filter the metrics of every proxy node.

        Reloads ``self.perf_info`` first and refreshes ``last_collect_time``
        on success.

        :returns: ``{node_ip: filtered data}``, or ``{}`` as soon as one
            metric could not be retrieved.
        """
        self.perf_info = self._performance_info_get()
        proxy_monitoring_data = {}
        for proxy_node in self.perf_info.getProxyServiceNodes():
            self.logger.info('Getting proxy monitoring info for %s ...'
                             % proxy_node.ip)
            node_data = self._collect_simple_node(
                proxy_node.ip, self.monitoring_metrics_proxy, 'proxy', False)
            if node_data is None:
                return {}
            proxy_monitoring_data[proxy_node.ip] = \
                self.stat_utils.filter_monitoring_data(
                    node_data, self.monitoring_metrics_proxy)
        self.last_collect_time = time()
        return proxy_monitoring_data
class Monitoring_Controller: def __init__(self, logger, cost_controller, config_parser, config_file_path, manager_host, manager_port, process_state, ganglia_rrd_dir): self.cost_controller = cost_controller self.config_parser = config_parser self.manager_host = manager_host self.manager_port = manager_port self.logger = logger self.process_state = process_state self.ganglia_rrd_dir = ganglia_rrd_dir self.last_collect_time = time() self.stat_utils = StatUtils() try: self.config_parser.read(config_file_path) except: print >> sys.stderr, 'Failed to read configuration file' sys.exit(1) #initialize a memcache client memcache_addr = config_parser.get('manager', 'MEMCACHE_ADDR') if memcache_addr == '': print >> sys.stderr, 'Failed to find memcache address in the config file' sys.exit(1) self.memcache = memcache.Client([memcache_addr]) self.perf_info = ServicePerformance() self._performance_info_set(self.perf_info) self.monitoring_metrics_web = [ 'web_request_rate', 'web_response_time', 'cpu_user', 'boottime' ] self.monitoring_metrics_backend = [ 'php_request_rate', 'php_response_time', 'cpu_user', 'cpu_system', 'cpu_num', 'mem_total', 'boottime' ] self.monitoring_metrics_proxy = ['web_request_rate_lb', 'web_response_time_lb', \ 'php_request_rate_lb', 'php_response_time_lb', 'cpu_user', 'boottime'] def _performance_info_get(self): return self.memcache.get('performance_info') def _performance_info_set(self, perf_info): self.memcache.set('performance_info', perf_info) def nodes_info_update(self, killed_backends): #conpaas_init_ssl_ctx(self.certs_dir, 'manager') print('MANAGER %s' % self.manager_host) print('PORT %s' % self.manager_port) nodes = client.list_nodes(self.manager_host, self.manager_port) self.logger.debug('Got update info from manager') perf_info = self._performance_info_get() perf_info.reset_role_info() self.logger.debug('Updating nodes...') for node_id in nodes['proxy']: node = perf_info.serviceNodes.get(node_id) if node != None: node.registered_with_manager = 
True node.isRunningProxy = True else: perf_info.serviceNodes[node_id] = ServiceNodePerf( node_id, '', True, False, False, self.process_state) for node_id in nodes['web']: node = perf_info.serviceNodes.get(node_id) if node != None: node.registered_with_manager = True node.isRunningWeb = True else: perf_info.serviceNodes[node_id] = ServiceNodePerf( node_id, '', False, True, False, self.process_state) for node_id in nodes['backend']: node = perf_info.serviceNodes.get(node_id) if node != None: node.registered_with_manager = True node.isRunningBackend = True else: perf_info.serviceNodes[node_id] = ServiceNodePerf( node_id, '', False, False, True, self.process_state) self.logger.info('Filtering backend nodes killed_backends : ' + str(killed_backends) + ' ' + str(perf_info.serviceNodes)) for id, node in perf_info.serviceNodes.items(): if node.ip == '': response = client.get_node_info(self.manager_host, self.manager_port, id) node.ip = response['serviceNode']['ip'] if node.registered_with_manager == False: del perf_info.serviceNodes[id] ####FIXME TO FILTER REMOVE OF BACKENDS ##### if id in killed_backends: self.logger.info('Filtered backend with id: ' + str(id)) try: del perf_info.serviceNodes[id] except: self.logger.warning( 'Backend already removed or not containing in serviceNodes: ' + str(id)) ########################################### self.logger.info('Filtered backend nodes killed_backends : ' + str(killed_backends) + ' ' + str(perf_info.serviceNodes)) self._performance_info_set(perf_info) self.logger.info('Updating nodes information from ConPaaS manager...') self.logger.info('Updated service nodes: %s' % str(perf_info.serviceNodes)) def collect_monitoring_metric(self, node_ip, metric_name): timestamps = [] param_values = [] # Added this for EC2, where the RRD directory names in Ganglia are hosts and not IPs: ganglia_dir_name = '' if node_ip.find('amazonaws') > 0: # this is an IP address ganglia_dir_name = node_ip else: # this is a DNS name for ganglia_host in 
listdir(self.ganglia_rrd_dir): #self.logger.error('collect from ganglia host: ' + str(ganglia_host)) if ganglia_host.find('Summary') > 0: continue try: hostname, array, array_ip = socket.gethostbyaddr(node_ip) except: self.logger.warning( 'Found private ip when trying to get the hostname for ip: ' + str(node_ip)) ganglia_dir_name = node_ip break #self.logger.error('gethostbyaddr: ' + hostname) if ganglia_host == hostname: ganglia_dir_name = ganglia_host break rrd_file_name = self.ganglia_rrd_dir + ganglia_dir_name + '/' + metric_name + '.rrd' self.logger.error('rrd_file_name: ' + str(rrd_file_name)) # logger.info('Searching in RRD file:' + rrd_file_name) if (not path.isfile(rrd_file_name)): self.logger.error('RRD file not found: ' + rrd_file_name) return [] #logger.info('Getting monitoring info for node %s, parameter %s ...' % (node_ip, metric_name)) # logger.info('last collect time: ' + str(int(self.last_collect_time))) collect_from = self.last_collect_time - (time() - self.last_collect_time) #collect_from = self.last_collect_time proc = Popen(['rrdtool', 'fetch', '-s', str(int(collect_from)), '-r', '15', \ str(rrd_file_name), 'AVERAGE'], stdout=PIPE, stderr=PIPE, close_fds=True) stdout_req, stderr_req = proc.communicate() lines = stdout_req.splitlines() for line in lines: #logger.debug(line) tokens = line.split() if (line.find('sum') >= 0 or len(tokens) < 2): continue timestamps.append(int(tokens[0].replace(':', ''))) if (tokens[1].find('nan') < 0): param_values.append(float(tokens[1])) else: param_values.append(-1) ## Cleaning the memory allocated by subprocess.Popen() try: proc.terminate() except OSError: # logger.critical("Cannot kill the subprocess.popen rrdtool") # can't kill a dead proc pass #logger.debug('timestamps: ' + str(timestamps)) #logger.debug('param values: ' + str(param_values)) return [timestamps, param_values] def init_collect_monitoring_data(self): self.perf_info = self._performance_info_get() def collect_monitoring_data(self): 
web_monitoring_data = {} backend_monitoring_data = {} proxy_monitoring_data = {} for web_node in self.perf_info.getWebServiceNodes(): self.logger.info('Getting web monitoring info for %s ...' % web_node.ip) #if web_node.ip not in web_monitoring_data: web_monitoring_data[web_node.ip] = {} cpu_num = DEFAULT_NUM_CPU mem_total = DEFAULT_RAM_MEMORY self.logger.info('Getting web monitoring info 1') for it in range(len(self.monitoring_metrics_web)): self.logger.info('Getting web monitoring info 2') ret = self.collect_monitoring_metric( web_node.ip, self.monitoring_metrics_web[it]) self.logger.info('Getting web monitoring info 3') if len(ret) == 0: # monitoring data was not found self.logger.info('Getting web monitoring info 4') return False if 'timestamps' not in web_monitoring_data[web_node.ip]: web_monitoring_data[web_node.ip]['timestamps'] = ret[0] web_monitoring_data[web_node.ip][ self.monitoring_metrics_web[it]] = ret[1] if self.monitoring_metrics_web[ it] == 'cpu_num' and web_monitoring_data[web_node.ip][ self.monitoring_metrics_web[it]][0] != -1: cpu_num = backend_monitoring_data[web_node.ip][ self.monitoring_metrics_web[it]][0] if self.monitoring_metrics_web[ it] == 'mem_total' and web_monitoring_data[ web_node.ip][ self.monitoring_metrics_web[it]][0] != -1: mem_total = str(backend_monitoring_data[web_node.ip][ self.monitoring_metrics_web[it]][0]) if self.monitoring_metrics_web[ it] == 'boottime' and web_monitoring_data[web_node.ip][ self.monitoring_metrics_web[it]][0] != -1: self.cost_controller.update_vm_usage( web_node.ip, web_monitoring_data[web_node.ip][ self.monitoring_metrics_web[it]][0], self.cost_controller.instance_type_detector( cpu_num, mem_total)) for backend_node in self.perf_info.getBackendServiceNodes(): self.logger.info('Getting backend monitoring info for %s ...' 
% backend_node.ip) backend_monitoring_data[backend_node.ip] = {} cpu_num = DEFAULT_NUM_CPU mem_total = DEFAULT_RAM_MEMORY """ It iterates over the array to get the metrics in the same order, they defined added. It allows to detect the type of instance by analyzing the cpu, mem_total. """ for it in range(len(self.monitoring_metrics_backend)): ret = self.collect_monitoring_metric( backend_node.ip, self.monitoring_metrics_backend[it]) if len(ret) == 0: # monitoring data was not found return False if 'timestamps' not in backend_monitoring_data[ backend_node.ip]: backend_monitoring_data[ backend_node.ip]['timestamps'] = ret[0] backend_monitoring_data[backend_node.ip][ self.monitoring_metrics_backend[it]] = ret[1] self.logger.info('There is a metric name: ' + str(self.monitoring_metrics_backend[it])) if self.monitoring_metrics_backend[it] == 'cpu_num': if backend_monitoring_data[backend_node.ip][ self.monitoring_metrics_backend[it]][0] > 0: self.logger.info( 'There is a metric cpu_num with content: ' + str(backend_monitoring_data[backend_node.ip][ self.monitoring_metrics_backend[it]][0])) cpu_num = str(backend_monitoring_data[backend_node.ip][ self.monitoring_metrics_backend[it]][0]) else: ## This is done to clean the negative and worng values from the monitoring data self.logger.info( 'There is a metric cpu_num with content equal or minus to zero ' ) for value in backend_monitoring_data[backend_node.ip][ self.monitoring_metrics_backend[it]]: if value > 0: self.logger.info( 'There is a metric cpu_num with content: ' + str(value)) cpu_num = value break if self.monitoring_metrics_backend[it] == 'mem_total': if backend_monitoring_data[backend_node.ip][ self.monitoring_metrics_backend[it]][0] > 0: self.logger.info( 'There is a metric mem_total with content: ' + str(backend_monitoring_data[backend_node.ip][ self.monitoring_metrics_backend[it]][0])) mem_total = str( backend_monitoring_data[backend_node.ip][ self.monitoring_metrics_backend[it]][0]) else: ## This is done to 
clean the negative and worng values from the monitoring data self.logger.info( 'There is a metric mem_total with content equal or minus to zero ' ) for value in backend_monitoring_data[backend_node.ip][ self.monitoring_metrics_backend[it]]: if value > 0: self.logger.info( 'There is a metric mem_total with content: ' + str(value)) mem_total = value break if self.monitoring_metrics_backend[it] == 'boottime': if backend_monitoring_data[backend_node.ip][ self.monitoring_metrics_backend[it]][0] > 0: self.logger.info( 'There is a metric boottime with content: ' + str(backend_monitoring_data[backend_node.ip][ self.monitoring_metrics_backend[it]][0])) self.cost_controller.update_vm_usage( backend_node.ip, backend_monitoring_data[backend_node.ip][ self.monitoring_metrics_backend[it]][0], self.cost_controller.instance_type_detector( cpu_num, mem_total)) else: ## This is done to clean the negative and worng values from the monitoring data boottime = backend_monitoring_data[backend_node.ip][ self.monitoring_metrics_backend[it]][0] self.logger.info( 'There is a metric boottime with content equal or minus to zero ' ) for value in backend_monitoring_data[backend_node.ip][ self.monitoring_metrics_backend[it]]: if value > 0: self.logger.info( 'There is a metric boottime with content: ' + str(value)) self.cost_controller.update_vm_usage( backend_node.ip, float(boottime), self.cost_controller. instance_type_detector(cpu_num, mem_total)) boottime = value break for proxy_node in self.perf_info.getProxyServiceNodes(): self.logger.info('Getting proxy monitoring info for %s ...' 
% proxy_node.ip) proxy_monitoring_data[proxy_node.ip] = {} for it in range(len(self.monitoring_metrics_proxy)): ret = self.collect_monitoring_metric( proxy_node.ip, self.monitoring_metrics_proxy[it]) if len(ret) == 0: # monitoring data was not found return False if 'timestamps' not in proxy_monitoring_data[proxy_node.ip]: proxy_monitoring_data[proxy_node.ip]['timestamps'] = ret[0] proxy_monitoring_data[proxy_node.ip][ self.monitoring_metrics_proxy[it]] = ret[1] if self.monitoring_metrics_proxy[ it] == 'cpu_num' and proxy_monitoring_data[ proxy_node.ip][ self.monitoring_metrics_proxy[it]][0] != -1: cpu_num = proxy_monitoring_data[proxy_node.ip][ self.monitoring_metrics_proxy[it]][0] if self.monitoring_metrics_proxy[ it] == 'mem_total' and proxy_monitoring_data[ proxy_node.ip][ self.monitoring_metrics_proxy[it]][0] != -1: mem_total = str(proxy_monitoring_data[proxy_node.ip][ self.monitoring_metrics_proxy[it]][0]) if self.monitoring_metrics_proxy[ it] == 'boottime' and proxy_monitoring_data[ proxy_node.ip][ self.monitoring_metrics_proxy[it]][0] != -1: self.cost_controller.update_vm_usage( proxy_node.ip, proxy_monitoring_data[proxy_node.ip][ self.monitoring_metrics_proxy[it]][0], self.cost_controller.instance_type_detector( cpu_num, mem_total)) proxy_monitoring_data[ proxy_node.ip] = self.stat_utils.filter_monitoring_data( proxy_monitoring_data[proxy_node.ip], self.monitoring_metrics_proxy) print proxy_monitoring_data print web_monitoring_data print backend_monitoring_data self.last_collect_time = time() print "Done getting monitoring data..." return True def collect_monitoring_data_web(self): web_monitoring_data = {} for web_node in self.perf_info.getWebServiceNodes(): self.logger.info('Getting web monitoring info for %s ...' 
% web_node.ip) #if web_node.ip not in web_monitoring_data: web_monitoring_data[web_node.ip] = {} cpu_num = DEFAULT_NUM_CPU mem_total = DEFAULT_RAM_MEMORY for it in range(len(self.monitoring_metrics_web)): ret = self.collect_monitoring_metric( web_node.ip, self.monitoring_metrics_web[it]) if len(ret) == 0: # monitoring data was not found return {} if 'timestamps' not in web_monitoring_data[web_node.ip]: web_monitoring_data[web_node.ip]['timestamps'] = ret[0] web_monitoring_data[web_node.ip][ self.monitoring_metrics_web[it]] = ret[1] if self.monitoring_metrics_web[ it] == 'cpu_num' and web_monitoring_data[web_node.ip][ self.monitoring_metrics_web[it]][0] != -1: cpu_num = web_monitoring_data[web_node.ip][ self.monitoring_metrics_web[it]][0] if self.monitoring_metrics_web[ it] == 'mem_total' and web_monitoring_data[ web_node.ip][ self.monitoring_metrics_web[it]][0] != -1: mem_total = str(web_monitoring_data[web_node.ip][ self.monitoring_metrics_web[it]][0]) if self.monitoring_metrics_web[it] == 'boottime': self.cost_controller.update_vm_usage( web_node.ip, web_monitoring_data[web_node.ip][ self.monitoring_metrics_web[it]][0], self.cost_controller.instance_type_detector( cpu_num, mem_total)) return web_monitoring_data def collect_monitoring_data_backend(self): backend_monitoring_data = {} for backend_node in self.perf_info.getBackendServiceNodes(): self.logger.info('Getting backend monitoring info for %s ...' % backend_node.ip) backend_monitoring_data[backend_node.ip] = {} cpu_num = DEFAULT_NUM_CPU mem_total = DEFAULT_RAM_MEMORY """ It iterates over the array to get the metrics in the same order, they defined added. It allows to detect the type of instance by analyzing the cpu, mem_total. 
""" for it in range(len(self.monitoring_metrics_backend)): ret = self.collect_monitoring_metric( backend_node.ip, self.monitoring_metrics_backend[it]) if len(ret) == 0: # monitoring data was not found return {} if 'timestamps' not in backend_monitoring_data[ backend_node.ip]: backend_monitoring_data[ backend_node.ip]['timestamps'] = ret[0] backend_monitoring_data[backend_node.ip][ self.monitoring_metrics_backend[it]] = ret[1] #self.logger.info('There is a metric name: '+str(self.monitoring_metrics_backend[it])) if self.monitoring_metrics_backend[it] == 'cpu_num': if backend_monitoring_data[backend_node.ip][ self.monitoring_metrics_backend[it]][0] > 0: self.logger.info( 'There is a metric cpu_num with content: ' + str(backend_monitoring_data[backend_node.ip][ self.monitoring_metrics_backend[it]][0])) cpu_num = backend_monitoring_data[backend_node.ip][ self.monitoring_metrics_backend[it]][0] else: ## This is done to clean the negative and worng values from the monitoring data self.logger.info( 'There is a metric cpu_num with content equal or minus to zero ' ) for value in backend_monitoring_data[backend_node.ip][ self.monitoring_metrics_backend[it]]: if value > 0: self.logger.info( 'There is a metric cpu_num with content: ' + str(value)) cpu_num = value break if self.monitoring_metrics_backend[it] == 'mem_total': if backend_monitoring_data[backend_node.ip][ self.monitoring_metrics_backend[it]][0] > 0: self.logger.info( 'There is a metric mem_total with content: ' + str(backend_monitoring_data[backend_node.ip][ self.monitoring_metrics_backend[it]][0])) mem_total = backend_monitoring_data[backend_node.ip][ self.monitoring_metrics_backend[it]][0] else: ## This is done to clean the negative and worng values from the monitoring data self.logger.info( 'There is a metric mem_total with content equal or minus to zero ' ) for value in backend_monitoring_data[backend_node.ip][ self.monitoring_metrics_backend[it]]: if value > 0: self.logger.info( 'There is a metric mem_total 
with content: ' + str(value)) mem_total = value break if self.monitoring_metrics_backend[it] == 'boottime': if backend_monitoring_data[backend_node.ip][ self.monitoring_metrics_backend[it]][0] > 0: self.logger.info( 'There is a metric boottime with content: ' + str(backend_monitoring_data[backend_node.ip][ self.monitoring_metrics_backend[it]][0])) self.cost_controller.update_vm_usage( backend_node.ip, backend_monitoring_data[backend_node.ip][ self.monitoring_metrics_backend[it]][0], self.cost_controller.instance_type_detector( cpu_num, mem_total)) else: ## This is done to clean the negative and worng values from the monitoring data boottime = backend_monitoring_data[backend_node.ip][ self.monitoring_metrics_backend[it]][0] self.logger.info( 'There is a metric boottime with content equal or minus to zero ' ) for value in backend_monitoring_data[backend_node.ip][ self.monitoring_metrics_backend[it]]: if value > 0: self.logger.info( 'There is a metric boottime with content: ' + str(value)) boottime = value break self.cost_controller.update_vm_usage( backend_node.ip, float(boottime), self.cost_controller.instance_type_detector( cpu_num, mem_total)) return backend_monitoring_data def collect_monitoring_data_proxy(self): self.perf_info = self._performance_info_get() proxy_monitoring_data = {} for proxy_node in self.perf_info.getProxyServiceNodes(): self.logger.info('Getting proxy monitoring info for %s ...' 
% proxy_node.ip) proxy_monitoring_data[proxy_node.ip] = {} cpu_num = DEFAULT_NUM_CPU mem_total = DEFAULT_RAM_MEMORY for it in range(len(self.monitoring_metrics_proxy)): ret = self.collect_monitoring_metric( proxy_node.ip, self.monitoring_metrics_proxy[it]) if len(ret) == 0: # monitoring data was not found return {} if 'timestamps' not in proxy_monitoring_data[proxy_node.ip]: proxy_monitoring_data[proxy_node.ip]['timestamps'] = ret[0] proxy_monitoring_data[proxy_node.ip][ self.monitoring_metrics_proxy[it]] = ret[1] if self.monitoring_metrics_proxy[ it] == 'cpu_num' and proxy_monitoring_data[ proxy_node.ip][ self.monitoring_metrics_proxy[it]][0] != -1: cpu_num = proxy_monitoring_data[proxy_node.ip][ self.monitoring_metrics_proxy[it]][0] if self.monitoring_metrics_proxy[ it] == 'mem_total' and proxy_monitoring_data[ proxy_node.ip][ self.monitoring_metrics_proxy[it]][0] != -1: mem_total = str(proxy_monitoring_data[proxy_node.ip][ self.monitoring_metrics_proxy[it]][0]) if self.monitoring_metrics_proxy[it] == 'boottime': self.cost_controller.update_vm_usage( proxy_node.ip, proxy_monitoring_data[proxy_node.ip][ self.monitoring_metrics_proxy[it]][0], self.cost_controller.instance_type_detector( cpu_num, mem_total)) proxy_monitoring_data[ proxy_node.ip] = self.stat_utils.filter_monitoring_data( proxy_monitoring_data[proxy_node.ip], self.monitoring_metrics_proxy) self.last_collect_time = time() return proxy_monitoring_data