def query_service_network(endpoint, stack_name, service_name):
    id = None
    log = logging.getLogger('pk_docker')
    client = docker.DockerClient(base_url=endpoint)
    full_service_name = stack_name + "_" + service_name
    if pk_config.simulate():
        return None
    service_list = client.services.list()
    i = 0
    while i < len(service_list) and service_list[i].name != full_service_name:
        i += 1
    if i < len(service_list) and service_list[i].name == full_service_name:
        networks = service_list[i].attrs.get("Spec").get("TaskTemplate").get("Networks")
        if len(networks) == 1:
            id = networks[0].get("Target")
            log.debug('Docker service "{0}" in stack "{1}" is connected to network "{2}" with id "{3}".'
                      .format(service_name, stack_name, client.networks.get(id).name, str(id)))
        else:
            log.warning('Docker service "{0}" is connected to more than one network.'
                        .format(full_service_name))
    else:
        log.warning('Docker service "{0}" is not found in stack "{1}".'
                    .format(service_name, stack_name))
    return id
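# Usage sketch (hypothetical endpoint and stack/service names; assumes a
# Swarm manager is reachable on the local socket):
#
#   net_id = query_service_network('unix://var/run/docker.sock',
#                                  'wordpress', 'db')
#   if net_id:
#       print('service network id: {0}'.format(net_id))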
def query_list_of_nodes(endpoint, worker_name='micado-worker', status='ready'):
    log = logging.getLogger('pk_k8s')
    list_of_nodes = []
    if pk_config.simulate():
        return list_of_nodes
    kubernetes.config.load_kube_config()
    client = kubernetes.client.CoreV1Api()
    try:
        nodes = []
        if status == 'ready':
            nodes = [x for x in client.list_node().items if not x.spec.taints]
            nodes = [x for x in nodes
                     if x.metadata.labels.get('micado.eu/node_type') == worker_name]
        elif status == 'down':
            nodes = [x for x in client.list_node().items
                     if x.spec.taints and 'master' not in x.spec.taints[0].key]
        for n in nodes:
            a = {}
            a['ID'] = n.metadata.name
            a['Addr'] = n.status.addresses[0].address
            list_of_nodes.append(a.copy())
        return list_of_nodes
    except Exception:
        log.exception('(Q) Query of k8s nodes failed.')
        return list_of_nodes
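# Usage sketch (the endpoint argument is unused by the k8s variant; assumes a
# cluster reachable through ~/.kube/config and the module's default worker
# label value):
#
#   for node in query_list_of_nodes(None, worker_name='micado-worker',
#                                   status='ready'):
#       print('{0} -> {1}'.format(node['ID'], node['Addr']))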
def scale_occopus_worker_node(endpoint, infra_name, worker_name, replicas):
    log = logging.getLogger('pk_occopus')
    log.info('(S) => m_node_count: {0}'.format(replicas))
    wscall = '{0}/infrastructures/{1}/scaleto/{2}/{3}'.format(
        endpoint, infra_name, worker_name, replicas)
    log.debug('-->curl -X POST {0}'.format(wscall))
    if not pk_config.simulate():
        response = requests.post(wscall).json()
        log.debug('-->response: {0}'.format(response))
    return
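# Usage sketch (hypothetical Occopus REST endpoint and infrastructure id):
#
#   scale_occopus_worker_node('http://127.0.0.1:5000', 'infra-0123', 'worker', 3)
#
# which issues:
#   POST http://127.0.0.1:5000/infrastructures/infra-0123/scaleto/worker/3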
def notify_to_reload_config(endpoint):
    log = logging.getLogger('pk_prometheus')
    try:
        if not pk_config.simulate():
            requests.post(endpoint + "/-/reload")
        log.info('(C) Notification to reload config sent to Prometheus.')
    except Exception:
        log.exception('Sending config reload notification to Prometheus failed:')
def remove_node(endpoint, id):
    log = logging.getLogger('pk_docker')
    if pk_config.simulate():
        return
    try:
        client = docker.APIClient(endpoint)
        client.remove_node(id, True)
    except Exception:
        log.error('(M) => Removing docker node failed.')
    return
def remove_alerts_under_prometheus(rules_directory, alerts, stack):
    if not alerts:
        return
    log = logging.getLogger('pk_prometheus')
    try:
        rule_file = os.path.join(rules_directory, stack + '.rules')
        if not pk_config.simulate():
            os.remove(rule_file)
    except Exception:
        log.exception('Removing alerts under Prometheus failed:')
    return
def remove_node(endpoint, id):
    log = logging.getLogger('pk_k8s')
    if pk_config.simulate():
        return
    kubernetes.config.load_kube_config()
    client = kubernetes.client.CoreV1Api()
    try:
        client.delete_node(id, body={})
    except Exception:
        log.error('(M) => Removing k8s node failed.')
    return
def query_number_of_worker_nodes(endpoint, infra_name, worker_name):
    log = logging.getLogger('pk_occopus')
    instances = 1
    wscall = '{0}/infrastructures/{1}'.format(endpoint, infra_name)
    log.debug('-->curl -X GET {0}'.format(wscall))
    if not pk_config.simulate():
        response = requests.get(wscall).json()
        instances = response.get(worker_name, dict()).get('scaling', dict()).get('target', 0)
        log.debug('-->instances: {0}, response: {1}'.format(instances, response))
    return instances
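# Usage sketch (same hypothetical endpoint/infrastructure as above); returns
# the 'scaling.target' count reported by Occopus, or 1 in simulate mode:
#
#   count = query_number_of_worker_nodes('http://127.0.0.1:5000',
#                                        'infra-0123', 'worker')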
def scale_k8s_deploy(endpoint, service_name, replicas):
    # Strip the "<stack>_" prefix to get the k8s deployment name
    service_name = '-'.join(service_name.split('_')[1:])
    log = logging.getLogger('pk_k8s')
    log.info('(S) => m_container_count: {0}'.format(replicas))
    if pk_config.simulate():
        return
    kubernetes.config.load_kube_config()
    client = kubernetes.client.ExtensionsV1beta1Api()
    try:
        dep = client.read_namespaced_deployment(service_name, "default")
        dep.spec.replicas = replicas
        client.patch_namespaced_deployment_scale(service_name, "default", dep)
    except Exception as e:
        log.warning('(S) Scaling of k8s service "{0}" failed: {1}'.format(
            service_name, str(e)))
    return
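# Usage sketch (hypothetical stack-prefixed service name; the endpoint
# argument is unused by the k8s variant). The "mystack_" prefix is stripped
# and the deployment "app" in namespace "default" is patched to 5 replicas:
#
#   scale_k8s_deploy(None, 'mystack_app', 5)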
def query_k8s_replicas(endpoint, service_name):
    # Strip the "<stack>_" prefix to get the k8s deployment name
    service_name = '-'.join(service_name.split('_')[1:])
    log = logging.getLogger('pk_k8s')
    instance = 1
    if pk_config.simulate():
        return instance
    kubernetes.config.load_kube_config()
    client = kubernetes.client.ExtensionsV1beta1Api()
    try:
        dep = client.read_namespaced_deployment(service_name, "default")
        instance = dep.spec.replicas
        log.debug('(C) => m_container_count for {0}: {1}'.format(service_name, instance))
    except Exception as e:
        log.warning('(C) Querying k8s service "{0}" replicas failed: {1}'.format(
            service_name, str(e)))
    return instance
def deploy_alerts_under_prometheus(rules_directory, alerts, stack):
    if not alerts:
        return
    log = logging.getLogger('pk_prometheus')
    try:
        content = {'groups': [{'name': 'micado', 'rules': []}]}
        for alert in alerts:
            content['groups'][0]['rules'].append(dict(alert))
        rule_file = os.path.join(rules_directory, stack + '.rules')
        if not pk_config.simulate():
            with open(rule_file, 'w') as outfile:
                yaml.round_trip_dump(content, outfile, default_flow_style=False)
    except Exception:
        log.exception('Deploying alerts under Prometheus failed:')
    return
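# Usage sketch (hypothetical alert and directory; writes
# /etc/prometheus/rules/mystack.rules containing a single 'micado' rule group):
#
#   alert = {'alert': 'service_down', 'expr': 'up == 0', 'for': '1m'}
#   deploy_alerts_under_prometheus('/etc/prometheus/rules', [alert], 'mystack')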
def query_docker_service_replicas(endpoint, service_name):
    log = logging.getLogger('pk_docker')
    instance = 1
    if pk_config.simulate():
        return instance
    client = docker.APIClient(endpoint)
    try:
        response = client.inspect_service(service_name)
        instance = response.get('Spec', dict()).get('Mode', dict()).get(
            'Replicated', dict()).get('Replicas', 1)
        log.debug('(C) => m_container_count for {0}: {1}'.format(service_name, instance))
    except Exception as e:
        log.warning('(C) Querying docker service "{0}" replicas failed: {1}'.format(
            service_name, str(e)))
    return instance
def evaluate_data_queries_and_alerts_for_a_service(endpoint, policy, servicename):
    log = logging.getLogger('pk_prometheus')
    queries, alerts = dict(), dict()
    if 'query_results' not in policy['data']:
        policy['data']['query_results'] = dict()
    all_services = policy.get('scaling', dict()).get('services', dict())
    target_service = [srv for srv in all_services if srv.get('name', '') == servicename]
    scaling_rule_str = target_service[0].get('scaling_rule', '') if target_service else ''
    for param, query in policy.get('data', dict()).get('queries', dict()).items():
        try:
            if scaling_rule_str is not None and param in scaling_rule_str:
                if pk_config.simulate():
                    continue
                response = requests.get(endpoint + "/api/v1/query?query=" + query).json()
                log.debug('Prometheus response query "{0}":{1}'.format(query, response))
                val = extract_value_from_prometheus_response(query, response, dict())
                policy['data']['query_results'][param] = float(val)
                queries[param] = float(val)
        except Exception as e:
            policy['data']['query_results'][param] = None
            queries[param] = None
            log.warning('Evaluating expression for query "{0}" failed: {1}'.format(
                param, str(e)))
    policy['data']['alert_results'] = {}
    for item in policy.get('data', dict()).get('alerts', dict()):
        attrname = item['alert']
        if scaling_rule_str is not None and attrname in scaling_rule_str:
            if alerts_query(attrname) is not None:
                policy['data']['alert_results'][attrname] = True
                alerts[attrname] = True
            else:
                policy['data']['alert_results'][attrname] = False
                alerts[attrname] = False
    return queries, alerts
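# Usage sketch (hypothetical policy fragment; only queries whose parameter
# name appears in the service's scaling_rule are sent to the Prometheus
# HTTP API):
#
#   policy = {'data': {'queries': {'load': 'avg(node_load1)'}, 'alerts': []},
#             'scaling': {'services': [
#                 {'name': 'app',
#                  'scaling_rule': 'm_container_count = 2 if load > 1 else 1'}]}}
#   queries, alerts = evaluate_data_queries_and_alerts_for_a_service(
#       'http://127.0.0.1:9090', policy, 'app')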
def query_list_of_nodes(endpoint, status='ready'):
    log = logging.getLogger('pk_docker')
    list_of_nodes = []
    if pk_config.simulate():
        return None
    client = docker.APIClient(endpoint)
    try:
        nodes = client.nodes(filters={'role': 'worker'})
        for n in nodes:
            if n.get('Status', dict()).get('State', '') == status:
                a = {}
                a['ID'] = n.get('ID', 'undefID')
                a['Addr'] = n.get('Status', dict()).get('Addr', '')
                list_of_nodes.append(a.copy())
        return list_of_nodes
    except Exception:
        log.exception('(Q) Query of docker nodes failed.')
        return None
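# Usage sketch (hypothetical Swarm endpoint; lists worker nodes in the
# requested state as {'ID': ..., 'Addr': ...} dicts):
#
#   for node in query_list_of_nodes('unix://var/run/docker.sock', status='ready'):
#       print('{0} -> {1}'.format(node['ID'], node['Addr']))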
def scale_docker_service(endpoint, service_name, replicas):
    log = logging.getLogger('pk_docker')
    log.info('(S) => m_container_count: {0}'.format(replicas))
    if pk_config.simulate():
        return
    client = docker.APIClient(endpoint)
    try:
        version = client.inspect_service(service_name)['Version']['Index']
        client.update_service(service_name,
                              version,
                              mode={'Replicated': {'Replicas': replicas}},
                              fetch_current_spec=True)
    except Exception as e:
        log.warning('(S) Scaling of docker service "{0}" failed: {1}'.format(
            service_name, str(e)))
    return
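# Usage sketch (hypothetical endpoint and service name; sets the replicated
# service to 4 tasks):
#
#   scale_docker_service('unix://var/run/docker.sock', 'mystack_app', 4)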
def detach_prometheus_from_exporters_network(policy, swarm_endpoint):
    log = logging.getLogger('pk_prometheus')
    for exporter_endpoint in policy.get('data', dict()).get('sources', dict()):
        try:
            exporter_name = exporter_endpoint.split(':')[0]
            if '.' not in exporter_name:
                log.info('(C) => detaching prometheus from network of exporter "{0}"'
                         .format(exporter_endpoint))
                if pk_config.simulate():
                    continue
                exporter_netid = dock.query_service_network(swarm_endpoint,
                                                            policy['stack'],
                                                            exporter_name)
                if exporter_netid:
                    dock.detach_container_from_network(swarm_endpoint,
                                                       'prometheus',
                                                       exporter_netid)
        except Exception:
            log.exception('Detaching prometheus from network of exporter failed:')
def remove_exporters_from_prometheus_config(template_file, config_file):
    # Restore the pristine template as the active config
    if not pk_config.simulate():
        shutil.copyfile(template_file, config_file)
def add_exporters_to_prometheus_config(policy, template_file, config_file):
    log = logging.getLogger('pk_prometheus')
    try:
        config_content = dict()
        if not pk_config.simulate():
            with open(template_file, 'r') as f:
                config_content = yaml.round_trip_load(f)
        if 'scrape_configs' not in config_content:
            config_content['scrape_configs'] = []
        # Find the proper scrape_config or create it
        scrape_config = [x for x in config_content['scrape_configs']
                         if x.get('job_name', '') == 'micado' and 'static_configs' in x]
        if not scrape_config:
            config_content['scrape_configs'].append({'job_name': 'micado',
                                                     'static_configs': []})
            scrape_config = [x for x in config_content['scrape_configs']
                             if x.get('job_name', '') == 'micado' and 'static_configs' in x][0]
        else:
            scrape_config = scrape_config[0]
        # Find the proper static_config or create it
        static_config = [x for x in scrape_config['static_configs'] if 'targets' in x.keys()]
        if not static_config:
            scrape_config['static_configs'].append({'targets': []})
            static_config = [x for x in scrape_config['static_configs']
                             if 'targets' in x.keys()][0]
        else:
            static_config = static_config[0]
        config_changed = False
        for exporter_endpoint in policy.get('data', dict()).get('sources', dict()):
            if exporter_endpoint not in static_config['targets']:
                static_config['targets'].append(exporter_endpoint)
                config_changed = True
                log.info('(C) => exporter "{0}" added to config'.format(exporter_endpoint))
            else:
                log.info('(C) => exporter "{0}" skipped, already part of config'
                         .format(exporter_endpoint))
        if config_changed and not pk_config.simulate():
            with open(config_file, 'w') as outfile:
                yaml.round_trip_dump(config_content, outfile, default_flow_style=False)
    except Exception:
        log.exception('Adding exporters to prometheus config failed:')
    return
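# Usage sketch (hypothetical paths and policy; every endpoint listed under
# data/sources is appended to the 'micado' job's static targets):
#
#   policy = {'data': {'sources': ['192.168.1.10:9100', '192.168.1.11:9100']}}
#   add_exporters_to_prometheus_config(policy,
#                                      '/etc/prometheus/prometheus.yml.tmpl',
#                                      '/etc/prometheus/prometheus.yml')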
def pkmain():
    global log
    parser = argparse.ArgumentParser(
        description='MiCADO component to realise scaling policies')
    parser.add_argument('--cfg', dest='cfg_path', default='./config.yaml',
                        help='path to configuration file')
    parser.add_argument('--policy', dest='cfg_policy',
                        help='specifies the policy to execute')
    parser.add_argument('--srv', action='store_true', dest='cfg_srv',
                        default=False, help='run in service mode')
    parser.add_argument('--simulate', action='store_true', dest='cfg_simulate',
                        default=False,
                        help='omit manipulating surrounding components')
    args = parser.parse_args()
    # read configuration
    try:
        with open(args.cfg_path, 'r') as c:
            pk_config.config(yaml.safe_load(c))
    except Exception as e:
        print('ERROR: Cannot read configuration file "{0}": {1}'.format(
            args.cfg_path, str(e)))
        sys.exit(1)
    config = pk_config.config()
    # initialise logging facility based on the configuration
    try:
        logging.config.dictConfig(config['logging'])
        log = logging.getLogger('pk')
    except Exception as e:
        print('ERROR: Cannot process configuration file "{0}": {1}'.format(
            args.cfg_path, str(e)))
        sys.exit(1)
    # set simulate mode
    pk_config.simulate(args.cfg_simulate)
    if args.cfg_simulate:
        log.warning('SIMULATION mode is active! No changes will be performed.')
    # read policy file and start periodic policy evaluation in command-line mode
    if not args.cfg_srv:
        if not args.cfg_policy:
            log.error('Policy file must be specified for standalone execution!')
            sys.exit(1)
        try:
            policy_yaml = load_policy_from_file(args.cfg_policy)
            start(policy_yaml)
        except KeyboardInterrupt:
            log.warning('Keyboard interruption detected! Shutting down...')
            stop(policy_yaml)
        except Exception:
            log.exception('An error occurred during policy execution:')
        return
    # launch web service and wait for incoming requests
    if args.cfg_srv:
        if args.cfg_policy:
            log.warning('Policy file parameter is unused; it must be defined '
                        'through the API in service mode!')
        pk_rest.init_service()
        pk_rest.app.run(debug=True, host='0.0.0.0', port=12345)
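# Typical invocations (the module file name below is hypothetical):
#
#   python policy_keeper.py --cfg ./config.yaml --policy ./policy.yaml
#   python policy_keeper.py --cfg ./config.yaml --srv
#   python policy_keeper.py --cfg ./config.yaml --policy ./policy.yaml --simulate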
def add_exporters_to_prometheus_config(policy, template_file, config_file):
    # Extended variant: in-cluster exporters (no dot in the host part) are
    # wired in through the 'kube-services' job's relabel rules instead of
    # being added as static targets.
    log = logging.getLogger('pk_prometheus')
    try:
        config_content = dict()
        if not pk_config.simulate():
            shutil.copy(config_file, template_file)
            with open(template_file, 'r') as f:
                config_content = yaml.round_trip_load(f)
        if 'scrape_configs' not in config_content:
            config_content['scrape_configs'] = []
        # Find the proper scrape_config or create it
        scrape_config = [x for x in config_content['scrape_configs']
                         if x.get('job_name', '') == 'micado' and 'static_configs' in x]
        if not scrape_config:
            config_content['scrape_configs'].append({'job_name': 'micado',
                                                     'static_configs': []})
            scrape_config = [x for x in config_content['scrape_configs']
                             if x.get('job_name', '') == 'micado' and 'static_configs' in x][0]
        else:
            scrape_config = scrape_config[0]
        # Find the proper static_config or create it
        static_config = [x for x in scrape_config['static_configs'] if 'targets' in x.keys()]
        if not static_config:
            scrape_config['static_configs'].append({'targets': []})
            static_config = [x for x in scrape_config['static_configs']
                             if 'targets' in x.keys()][0]
        else:
            static_config = static_config[0]
        config_changed = False
        for exporter_endpoint in policy.get('data', dict()).get('sources', dict()):
            if exporter_endpoint not in static_config['targets']:
                exp = exporter_endpoint.split(':')
                if len(exp) == 1:
                    continue
                elif '.' not in exp[0]:
                    # In-cluster exporter: extend the keep-regex of the
                    # kube-services job
                    kube_job = [x for x in config_content['scrape_configs']
                                if x.get('job_name') == 'kube-services']
                    if not kube_job:
                        continue
                    relabel = kube_job[0].setdefault('relabel_configs', [])
                    old_label = [x for x in relabel if x.get('action') == 'keep']
                    if old_label:
                        old_label = old_label[0]
                        old_regex = old_label.get('regex')
                        new_regex = '{}|{}:{}'.format(old_regex, exp[0], exp[1])
                        old_label['regex'] = new_regex
                    else:
                        label = {'source_labels': ['endpoint'],
                                 'action': 'keep',
                                 'regex': '(^a)|{}:{}'.format(exp[0], exp[1])}
                        relabel.append(label)
                else:
                    static_config['targets'].append(exporter_endpoint)
                config_changed = True
                log.info('(C) => exporter "{0}" added to config'.format(exporter_endpoint))
            else:
                log.info('(C) => exporter "{0}" skipped, already part of config'
                         .format(exporter_endpoint))
        if config_changed and not pk_config.simulate():
            with open(config_file, 'w') as outfile:
                yaml.round_trip_dump(config_content, outfile, default_flow_style=False)
    except Exception:
        log.exception('Adding exporters to prometheus config failed:')
    return
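# Usage sketch for the extended variant (hypothetical in-cluster exporter
# "myexporter:8080": no dot in the host part, so it is appended to the keep
# regex of the 'kube-services' job rather than to the static targets):
#
#   policy = {'data': {'sources': ['myexporter:8080']}}
#   add_exporters_to_prometheus_config(policy,
#                                      '/etc/prometheus/prometheus.yml.tmpl',
#                                      '/etc/prometheus/prometheus.yml')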