def remove_node(endpoint, id):
    """Delete the Kubernetes node named `id` from the cluster.

    A no-op when dryrun mode is active. Failures are logged, never raised.
    """
    log = logging.getLogger('pk_k8s')
    if pk_config.dryrun_get(dryrun_id):
        log.info('(M) DRYRUN enabled. Skipping...')
        return
    kubernetes.config.load_kube_config()
    core_api = kubernetes.client.CoreV1Api()
    try:
        core_api.delete_node(id)
    except Exception:
        log.error('(M) => Removing k8s node failed.')
    return
def drop_worker_node(endpoint, infra_name, worker_name, replica):
    """Ask Occopus to scale down one specific replica of a worker node.

    Issues a POST against the Occopus scaledown REST endpoint; the JSON
    reply is only logged.
    """
    log = logging.getLogger('pk_occopus')
    if pk_config.dryrun_get(dryrun_id):
        log.info('(S) DRYRUN enabled. Skipping...')
        return
    log.info('(S) => node drop: {0}'.format(replica))
    scaledown_url = '{0}/infrastructures/{1}/scaledown/{2}/{3}'.format(
        endpoint, infra_name, worker_name, replica)
    log.debug('-->curl -X POST {0}'.format(scaledown_url))
    reply = requests.post(scaledown_url).json()
    log.debug('-->response: {0}'.format(reply))
    return
def calling_rest_api_sample(sample=None):
    """POST a monitoring sample to the optimizer's /sample REST endpoint.

    Fix: the original signature used a mutable default argument
    (``sample=dict()``), which is shared across all calls; replaced with
    the standard None-sentinel idiom. Behavior for existing callers is
    unchanged.

    Args:
        sample: dict of sample data to serialize as YAML; defaults to {}.
    """
    log = logging.getLogger('pk_optimizer')
    config = pk_config.config()
    if pk_config.dryrun_get(dryrun_id):
        log.info('(O) DRYRUN enabled. Skipping...')
        return
    # Optimizer may have been disabled by a failed init() call.
    if not m_opt_accessible:
        return
    if sample is None:
        sample = dict()
    url = config.get('optimizer_endpoint') + '/sample'
    log.debug('(O) Calling optimizer REST API sample() method: ' + url)
    response = requests.post(url, data=yaml.dump(sample))
    log.debug('(O) Response: ' + str(response))
    return
def remove_alerts_under_prometheus(rules_directory, alerts, stack):
    """Delete the per-stack Prometheus alert rule file, if any alerts exist.

    A no-op under dryrun or when `alerts` is empty; removal errors are
    logged with traceback and swallowed.
    """
    log = logging.getLogger('pk_prometheus')
    if pk_config.dryrun_get(dryrun_id):
        log.info('(C) DRYRUN enabled. Skipping...')
        return
    if not alerts:
        return
    try:
        os.remove(os.path.join(rules_directory, stack + '.rules'))
    except Exception:
        log.exception('Removing alerts under Prometheus failed:')
    return
def query_number_of_worker_nodes(config, worker_name):
    """Return the Occopus scaling target for `worker_name` (default 1).

    Queries the infrastructure description via the Occopus REST API and
    extracts the node's scaling target; returns 0 if absent from the reply.
    """
    log = logging.getLogger('pk_occopus')
    instances = 1
    if pk_config.dryrun_get(dryrun_id):
        log.info('(C) DRYRUN enabled. Skipping...')
        return instances
    endpoint = config[CONFIG_ENDPOINT]
    infra_name = config[CONFIG_INFRA_NAME]
    infra_url = '{0}/infrastructures/{1}'.format(endpoint, infra_name)
    log.debug('-->curl -X GET {0}'.format(infra_url))
    reply = requests.get(infra_url).json()
    instances = reply.get(worker_name, dict()).get('scaling', dict()).get('target', 0)
    log.debug('-->instances: {0}, response: {1}'.format(instances, reply))
    return instances
def scale_worker_node(config, scaling_info_list):
    """Scale each worker node in `scaling_info_list` via Occopus `scaleto`.

    Each entry supplies 'node_name' and the desired 'replicas' count; the
    JSON reply of each call is only logged.
    """
    log = logging.getLogger('pk_occopus')
    if pk_config.dryrun_get(dryrun_id):
        log.info('(S) DRYRUN enabled. Skipping...')
        return
    endpoint = config[CONFIG_ENDPOINT]
    infra_name = config[CONFIG_INFRA_NAME]
    for entry in scaling_info_list:
        name = entry.get('node_name')
        count = entry.get('replicas')
        log.info('(S) {0} => m_node_count: {1}'.format(name, count))
        scaleto_url = '{0}/infrastructures/{1}/scaleto/{2}/{3}'.format(
            endpoint, infra_name, name, count)
        log.debug('-->curl -X POST {0}'.format(scaleto_url))
        reply = requests.post(scaleto_url).json()
        log.debug('-->response: {0}'.format(reply))
    return
def evaluate_data_queries_and_alerts_for_a_service(endpoint, policy, servicename):
    """Evaluate the policy's Prometheus queries and alerts for one service.

    Mutates policy['data']['query_results'] and ['alert_results'] in place
    and returns (queries, alerts) dicts with the same values.
    Under dryrun the query string itself is stored instead of a value.
    """
    log=logging.getLogger('pk_prometheus')
    if pk_config.dryrun_get(dryrun_id):
        # NOTE: no return here — dryrun still runs the loops below, where the
        # inner dryrun check substitutes query strings for real values.
        log.info('(Q) DRYRUN enabled. Skipping...')
    queries, alerts = dict(), dict()
    if 'query_results' not in policy['data']:
        policy['data']['query_results']=dict()
    # Locate this service's scaling rule; parameters not referenced by the
    # rule text are skipped entirely.
    all_services = policy.get('scaling',dict()).get('services',dict())
    target_service = [ srv for srv in all_services if srv.get('name','')==servicename ]
    scaling_rule_str = target_service[0].get('scaling_rule','') if target_service else ''
    for param,query in policy.get('data',dict()).get('queries',dict()).items():
        try:
            if scaling_rule_str is not None and scaling_rule_str.find(param) != -1:
                if pk_config.dryrun_get(dryrun_id):
                    # Dryrun: store the raw query text as a dummy value.
                    policy['data']['query_results'][param]=query
                    queries[param]=query
                else:
                    response = requests.get(endpoint+"/api/v1/query?query="+query).json()
                    log.debug('Prometheus response query "{0}":{1}'.format(query,response))
                    val = extract_value_from_prometheus_response(query,response,dict())
                    policy['data']['query_results'][param]=float(val)
                    queries[param]=float(val)
        except Exception as e:
            # A failing query maps to None rather than aborting evaluation.
            policy['data']['query_results'][param]=None
            queries[param]=None
            log.warning('Evaluating expression for query "{0}" failed: {1}'.format(param,e))
    # Alerts referenced by the scaling rule become True/False depending on
    # whether they are currently firing (alerts_query returns non-None).
    policy['data']['alert_results']={}
    for item in policy.get('data',dict()).get('alerts',dict()):
        attrname = item['alert']
        if scaling_rule_str is not None and scaling_rule_str.find(attrname) != -1:
            if alerts_query(attrname) is not None:
                policy['data']['alert_results'][attrname]=True
                alerts[attrname]=True
            else:
                policy['data']['alert_results'][attrname]=False
                alerts[attrname]=False
    return queries, alerts
def remove_node(endpoint, id):
    """Delete the node named `id` from the cluster through pykube.

    Reloads the object before deletion; any failure is logged, not raised.
    """
    log = logging.getLogger("pk_k8s")
    if pk_config.dryrun_get(dryrun_id):
        log.info("(M) DRYRUN enabled. Skipping...")
        return
    try:
        matches = pykube.Node.objects(kube).filter(
            field_selector={"metadata.name": id})
        target = list(matches)[0]
        target.reload()
        target.delete()
    except Exception:
        log.error("(M) => Removing k8s node failed.")
    return
def query_number_of_worker_nodes(config, worker_name):
    """
    Return the number of instances of a worker node, pulled from tfstate
    """
    # Fallback value reported under dryrun or on any failure below.
    instances = 1
    if pk_config.dryrun_get(dryrun_id):
        # NOTE(review): `log` is not defined in this function (sibling
        # functions create one via logging.getLogger) — presumably a
        # module-level logger exists; verify against the full module.
        log.info("(C) DRYRUN enabled. Skipping...")
        return instances
    try:
        # Count = length of the first matching resource's instance list.
        resources = _get_resources_from_state(config, worker_name)
        instances = len(resources[0]["instances"])
    except Exception:
        # Best-effort: keep the default of 1 if state lookup fails.
        log.error("Failed to get no. of instances for {}".format(worker_name))
    log.debug("-->instances: {0}".format(instances))
    return instances
def scale_k8s_deploy(endpoint, service_name, replicas):
    """Scale the k8s deployment behind `service_name` to `replicas`.

    The leading '<stack>_' prefix of the service name is stripped to obtain
    the deployment name. Failures are logged as warnings, never raised.

    Fixes:
    - ExtensionsV1beta1Api is a deprecated API group (Deployment was removed
      from extensions/v1beta1 in Kubernetes 1.16); use AppsV1Api instead.
    - The original passed a V1Deployment object to
      patch_namespaced_deployment_scale, which expects a V1Scale body;
      patching the deployment itself with the updated replica count is the
      correct call for this data.
    """
    service_name = '-'.join(service_name.split('_')[1:])
    log = logging.getLogger('pk_k8s')
    log.info('(S) => m_container_count: {0}'.format(replicas))
    if pk_config.dryrun_get(dryrun_id):
        log.info('(S) DRYRUN enabled. Skipping...')
        return
    kubernetes.config.load_kube_config()
    client = kubernetes.client.AppsV1Api()
    try:
        dep = client.read_namespaced_deployment(service_name, "default")
        dep.spec.replicas = replicas
        client.patch_namespaced_deployment(service_name, "default", dep)
    except Exception as e:
        log.warning('(S) Scaling of k8s service "{0}" failed: {1}'.format(
            service_name, str(e)))
    return
def deploy_alerts_under_prometheus(rules_directory, alerts, stack):
    """Write the given alerts into a per-stack Prometheus rule file.

    Produces '<rules_directory>/<stack>.rules' containing one 'micado'
    group with all alerts. A no-op under dryrun or with no alerts.
    """
    log = logging.getLogger('pk_prometheus')
    if pk_config.dryrun_get(dryrun_id):
        log.info('(C) DRYRUN enabled. Skipping...')
        return
    if not alerts:
        return
    try:
        rules = [dict(alert) for alert in alerts]
        content = {'groups': [{'name': 'micado', 'rules': rules}]}
        rule_file = os.path.join(rules_directory, stack + '.rules')
        with open(rule_file, 'w') as outfile:
            yaml.round_trip_dump(content, outfile, default_flow_style=False)
    except Exception:
        log.exception('Deploying alerts under Prometheus failed:')
    return
def scale_k8s_deploy(endpoint, service_name, replicas):
    """Scale the pykube Deployment behind `service_name` to `replicas`.

    The leading '<stack>_' prefix of the service name is stripped to get
    the deployment name; failures are logged as warnings, never raised.
    """
    service_name = "-".join(service_name.split("_")[1:])
    log = logging.getLogger("pk_k8s")
    log.info("(S) => m_container_count: {0}".format(replicas))
    if pk_config.dryrun_get(dryrun_id):
        log.info("(S) DRYRUN enabled. Skipping...")
        return
    try:
        matches = pykube.Deployment.objects(kube).filter(
            field_selector={"metadata.name": service_name})
        target = list(matches)[0]
        target.reload()
        target.scale(replicas)
    except Exception as e:
        log.warning('(S) Scaling of k8s service "{0}" failed: {1}'.format(
            service_name, str(e)))
    return
def calling_rest_api_init():
    """Call the optimizer's /init REST endpoint with the collected params.

    Sets the module flag m_opt_accessible: True on success, False on any
    exception (which also disables the optimizer for the current policy).
    """
    global m_opt_accessible
    log = logging.getLogger('pk_optimizer')
    config = pk_config.config()
    if pk_config.dryrun_get(dryrun_id):
        log.info('(O) DRYRUN enabled. Skipping...')
        return
    url = config.get('optimizer_endpoint') + '/init'
    log.debug('(O) Calling optimizer REST API init() method: ' + url)
    try:
        response = requests.post(url, data=yaml.dump(m_opt_init_params))
    except Exception:
        m_opt_accessible = False
        log.exception('(O) Calling optimizer REST API init() method raised exception: ')
        log.info('(O) WARNING: Optimizer is disabled for the current policy.')
        return
    m_opt_accessible = True
    log.debug('(O) Response: ' + str(response))
    return
def query_k8s_replicas(endpoint, service_name):
    """Return the current replica count of the k8s deployment for a service.

    Bug fix: the original stored the queried count in a local named
    ``replicas`` but returned ``instance``, so the function always returned
    the default value of 1 regardless of the actual deployment state.

    Returns:
        int: the deployment's spec.replicas, or 1 under dryrun / on failure.
    """
    service_name = '-'.join(service_name.split('_')[1:])
    log = logging.getLogger('pk_k8s')
    instance = 1  # fallback under dryrun or when the query fails
    if pk_config.dryrun_get(dryrun_id):
        log.info('(I) DRYRUN enabled. Skipping...')
        return instance
    kubernetes.config.load_kube_config()
    client = kubernetes.client.ExtensionsV1beta1Api()
    try:
        dep = client.read_namespaced_deployment(service_name, "default")
        instance = dep.spec.replicas
        log.debug('(I) => m_container_count for {0}: {1}'.format(
            service_name, instance))
    except Exception as e:
        log.warning(
            '(Q) Querying k8s service "{0}" replicas failed: {1}'.format(
                service_name, str(e)))
    return instance
def query_k8s_replicas(endpoint, service_name):
    """Return the replica count of the named Deployment via pykube.

    The '<stack>_' prefix of the service name is stripped first; on any
    failure the default of 1 is returned and a warning is logged.
    """
    service_name = "-".join(service_name.split("_")[1:])
    log = logging.getLogger("pk_k8s")
    instance = 1
    if pk_config.dryrun_get(dryrun_id):
        log.info("(I) DRYRUN enabled. Skipping...")
        return instance
    try:
        matches = pykube.Deployment.objects(kube).filter(
            field_selector={"metadata.name": service_name})
        deployment = list(matches)[0]
        deployment.reload()
        instance = deployment.replicas
        log.debug("(I) => m_container_count for {0}: {1}".format(
            service_name, instance))
    except Exception as e:
        log.warning(
            '(Q) Querying k8s service "{0}" replicas failed: {1}'.format(
                service_name, str(e)))
    return instance
def query_list_of_nodes(endpoint, worker_name='micado-worker', status='ready'):
    """List non-master k8s nodes as [{'ID': name, 'Addr': address}, ...].

    status='ready' keeps untainted nodes labelled with the given
    micado.eu/node_type; status='down' keeps nodes carrying the NOTREADY
    taint. Under dryrun a single dummy entry is returned.

    Fix: the failure path originally returned an empty dict while every
    other path returns a list; callers iterating entries now always get a
    list. Also dropped the unused exception binding.

    Returns:
        list of dicts with 'ID' and 'Addr' keys ([] on failure).
    """
    log = logging.getLogger('pk_k8s')
    list_of_nodes = []
    if pk_config.dryrun_get(dryrun_id):
        log.info('(I) DRYRUN enabled. Skipping...')
        a = {}
        a['ID'] = 'dummyID'
        a['Addr'] = '127.0.0.1'
        list_of_nodes.append(a.copy())
        return list_of_nodes
    kubernetes.config.load_kube_config()
    client = kubernetes.client.CoreV1Api()
    try:
        # Exclude the master; then filter by readiness (taints) and type label.
        nodes = [x for x in client.list_node().items
                 if MASTER not in x.metadata.labels]
        if status == 'ready':
            nodes = [x for x in nodes
                     if NOTREADY not in [y.key for y in x.spec.taints or []]]
            nodes = [x for x in nodes
                     if x.metadata.labels.get('micado.eu/node_type') == worker_name]
        elif status == 'down':
            nodes = [x for x in nodes
                     if NOTREADY in [y.key for y in x.spec.taints or []]]
        for n in nodes:
            a = {}
            a['ID'] = n.metadata.name
            a['Addr'] = n.status.addresses[0].address
            list_of_nodes.append(a.copy())
        return list_of_nodes
    except Exception:
        log.exception('(Q) Query of k8s nodes failed.')
        return []
def collect_init_params_and_variables(policy):
    """Populate optimizer init parameters and variables from the policy.

    Fills the module-level m_opt_init_params (constants, input/target
    metrics, min/max VM numbers) and m_opt_variables (query bindings)
    based on the policy's data and scaling sections. No return value.
    """
    log = logging.getLogger('pk_optimizer')
    config = pk_config.config()
    if pk_config.dryrun_get(dryrun_id):
        log.info('(O) DRYRUN enabled. Skipping...')
        return
    reset_variables()
    # Constants whose name matches the optimizer init prefix become
    # optimizer constants under their stripped name.
    m_opt_init_params['constants'] = dict()
    for varname, value in policy.get('data', dict()).get('constants', dict()).items():
        retvarname = varname_if_init(varname)
        if retvarname:
            log.info('(O) => INIT: {0}:{1}'.format(retvarname, value))
            m_opt_init_params['constants'][retvarname] = value
    # Queries matching the optimizer input prefix are registered both as
    # input metrics and as variables bound to their Prometheus query.
    m_opt_init_params['constants']['input_metrics'] = list()
    for varname, query in policy.get('data', dict()).get('queries', dict()).items():
        retvarname = varname_if_input(varname)
        if retvarname:
            log.info('(O) => INPUT: {0}:{1}'.format(retvarname, query))
            m_opt_init_params['constants']['input_metrics'].append(
                dict(name=retvarname))
            m_opt_variables.append(
                dict(lname=varname, sname=retvarname, query=query))
    # Target metrics get their own structure in the init parameters.
    m_opt_init_params['constants']['target_metrics'] = list()
    for varname, query in policy.get('data', dict()).get('queries', dict()).items():
        if check_if_target(varname):
            insert_target_structure(m_opt_init_params, varname, query)
    # The min/max VM numbers come from the (first matching) node whose
    # scaling rule consults the optimizer's advice.
    for onenode in policy.get('scaling', dict()).get('nodes', []):
        if 'm_opt_advice' in onenode.get('scaling_rule', ''):
            _, omin, omax = limit_instances(None,
                                            onenode.get('min_instances'),
                                            onenode.get('max_instances'))
            m_opt_init_params['constants']['min_vm_number'] = omin
            m_opt_init_params['constants']['max_vm_number'] = omax
    log.debug('(O) m_opt_init_params (yaml) => {0}'.format(
        yaml.dump(m_opt_init_params)))
    log.debug('(O) m_opt_variables (yaml) => {0}'.format(
        yaml.dump(m_opt_variables)))
    return
def query_list_of_nodes(endpoint, worker_name="micado-worker", status="ready"):
    """List worker nodes via pykube as [{'ID': name, 'Addr': address}, ...].

    status='ready' selects untainted nodes carrying the requested
    micado.eu/node_type label; status='down' selects non-master nodes whose
    'Ready' condition is Unknown. Under dryrun a single dummy entry is
    returned.

    Fix: the failure path originally returned an empty dict while every
    other path returns a list; callers iterating entries now always get a
    list.

    Returns:
        list of dicts with 'ID' and 'Addr' keys ([] on failure).
    """
    log = logging.getLogger("pk_k8s")
    list_of_nodes = []
    if pk_config.dryrun_get(dryrun_id):
        log.info("(I) DRYRUN enabled. Skipping...")
        a = {}
        a["ID"] = "dummyID"
        a["Addr"] = "127.0.0.1"
        list_of_nodes.append(a.copy())
        return list_of_nodes
    try:
        if status == "ready":
            # Ready workers: matching node_type label and no taints at all.
            query = pykube.Node.objects(kube).filter(
                selector={"micado.eu/node_type__in": {worker_name}})
            nodes = [x for x in query if "taints" not in x.obj["spec"]]
        elif status == "down":
            # Down workers: non-master nodes whose Ready condition is Unknown.
            nodes = []
            worker_nodes = [
                x for x in pykube.Node.objects(kube) if MASTER not in x.labels
            ]
            for node in worker_nodes:
                ready_condition = [
                    x.items()
                    for x in node.obj["status"]["conditions"]
                    if x.get("type") == "Ready"
                ][0]
                if ("status", "Unknown") in ready_condition:
                    nodes.append(node)
        for n in nodes:
            a = {}
            n.reload()
            a["ID"] = n.metadata["name"]
            a["Addr"] = n.obj["status"]["addresses"][0]["address"]
            list_of_nodes.append(a.copy())
        return list_of_nodes
    except Exception:
        log.exception("(Q) Query of k8s nodes failed.")
        return []
def remove_exporters_from_prometheus_config(template_file, config_file):
    """Reset the Prometheus config by restoring it from the saved template.

    A no-op when dryrun mode is active.
    """
    log = logging.getLogger('pk_prometheus')
    if pk_config.dryrun_get(dryrun_id):
        log.info('(C) DRYRUN enabled. Skipping...')
        return
    shutil.copyfile(template_file, config_file)
def evaluate_data_queries_and_alerts_for_nodes(endpoint, policy, node):
    """Evaluate the policy's Prometheus queries and alerts for a node.

    Mutates policy['data']['query_results'] and ['alert_results'] in place
    and returns (queries, alerts) dicts with the same values. Under dryrun
    (and for the optimizer min/max threshold parameters) the query text is
    stored as a dummy value instead of being evaluated.

    Fixes (Python 3 compatibility, matching the sibling per-service
    evaluator which already uses these forms):
    - dict.iteritems() does not exist in Python 3 -> .items()
    - Exception.message was removed in Python 3 -> format the exception
      object directly.
    """
    log = logging.getLogger('pk_prometheus')
    if pk_config.dryrun_get(dryrun_id):
        # No return: dryrun still walks the loops, assigning query strings.
        log.info(
            '(Q) DRYRUN enabled. Assigning queries as values to metrics...')
    queries, alerts = dict(), dict()
    if 'data' not in policy:
        policy['data'] = {}
    if 'query_results' not in policy['data']:
        policy['data']['query_results'] = dict()
    scaling_rule_str = node.get('scaling_rule', '')
    for param, query in policy.get('data', dict()).get('queries', dict()).items():
        try:
            # Only evaluate parameters the scaling rule (or the optimizer,
            # via the m_opt prefix) actually references.
            if param.find('m_opt') != -1 or \
                    (scaling_rule_str is not None and
                     scaling_rule_str.find(param) != -1):
                if pk_config.dryrun_get(dryrun_id) or \
                        param.startswith("m_opt_target_minth_") or \
                        param.startswith("m_opt_target_maxth_"):
                    # TODO: handle dummy value more appropriately
                    policy['data']['query_results'][param] = query
                    queries[param] = query
                else:
                    if isinstance(query, list):
                        # List form: first element is the query string; the
                        # extracted value is kept as-is (no float coercion).
                        response = requests.get(
                            endpoint + "/api/v1/query?query=" + query[0]).json()
                        log.debug('Prometheus response query "{0}":{1}'.format(
                            query[0], response))
                        val = extract_value_from_prometheus_response(
                            query, response, dict())
                        policy['data']['query_results'][param] = val
                        queries[param] = val
                    else:
                        response = requests.get(
                            endpoint + "/api/v1/query?query=" + query).json()
                        log.debug('Prometheus response query "{0}":{1}'.format(
                            query, response))
                        val = extract_value_from_prometheus_response(
                            query, response, dict())
                        policy['data']['query_results'][param] = float(val)
                        queries[param] = float(val)
        except Exception as e:
            # A failing query maps to None rather than aborting evaluation.
            policy['data']['query_results'][param] = None
            queries[param] = None
            log.warning(
                'Evaluating expression for query "{0}" failed: {1}'.format(
                    param, e))
    # Alerts referenced by the scaling rule become True/False depending on
    # whether they are currently firing (alerts_query returns non-None).
    policy['data']['alert_results'] = {}
    for item in policy.get('data', dict()).get('alerts', dict()):
        attrname = item['alert']
        if scaling_rule_str is not None and scaling_rule_str.find(
                attrname) != -1:
            if alerts_query(attrname) is not None:
                policy['data']['alert_results'][attrname] = True
                alerts[attrname] = True
            else:
                policy['data']['alert_results'][attrname] = False
                alerts[attrname] = False
    return queries, alerts
def add_exporters_to_prometheus_config(policy, template_file, config_file):
    """Register the policy's exporter endpoints in the Prometheus config.

    Snapshots the live config to the template, then adds each endpoint in
    policy['data']['sources'] either to the 'micado' job's static targets
    (dotted hostnames) or to the 'kube-services' job's keep-relabel regex
    (bare service names). Rewrites config_file only when it changed.
    """
    log = logging.getLogger('pk_prometheus')
    try:
        config_content = dict()
        if pk_config.dryrun_get(dryrun_id):
            log.info('(C) DRYRUN enabled. Skipping...')
            return
        # Keep a pristine copy of the current config as the template.
        shutil.copy(config_file, template_file)
        with open(template_file, 'r') as f:
            config_content = yaml.round_trip_load(f)
        if 'scrape_configs' not in config_content:
            config_content['scrape_configs'] = []
        # Find proper scrape_config or create
        scrape_config = [
            x for x in config_content['scrape_configs']
            if x.get('job_name', '') == 'micado' and 'static_configs' in x
        ]
        if not scrape_config:
            config_content['scrape_configs'].append({
                'job_name': 'micado',
                'static_configs': []
            })
            scrape_config = [
                x for x in config_content['scrape_configs']
                if x.get('job_name', '') == 'micado' and 'static_configs' in x
            ][0]
        else:
            scrape_config = scrape_config[0]
        # Find proper static_config or create
        static_config = [
            x for x in scrape_config['static_configs'] if 'targets' in x.keys()
        ]
        if not static_config:
            scrape_config['static_configs'].append({'targets': []})
            static_config = [
                x for x in scrape_config['static_configs']
                if 'targets' in x.keys()
            ][0]
        else:
            static_config = static_config[0]
        config_changed = False
        for exporter_endpoint in policy.get('data', dict()).get('sources', dict()):
            if exporter_endpoint not in static_config['targets']:
                exp = exporter_endpoint.split(':')
                if len(exp) == 1:
                    # No port given: endpoint cannot be registered; skip it.
                    continue
                elif '.' not in exp[0]:
                    # Bare (undotted) host: treat as an in-cluster service and
                    # widen the kube-services keep-relabel regex instead of
                    # adding a static target.
                    kube_job = [
                        x for x in config_content['scrape_configs']
                        if x.get('job_name') == 'kube-services'
                    ]
                    if not kube_job:
                        continue
                    relabel = kube_job[0].get('relabel_configs', [])
                    old_label = [
                        x for x in relabel if x.get('action') == 'keep'
                    ]
                    if old_label:
                        # Extend the existing keep-regex with host:port.
                        old_label = old_label[0]
                        old_regex = old_label.get('regex')
                        new_regex = '{}|{}:{}'.format(old_regex, exp[0], exp[1])
                        old_label['regex'] = new_regex
                    else:
                        # No keep rule yet: create one matching host:port
                        # ('(^a)' is a never-matching first alternative).
                        label = {
                            'source_labels': ['endpoint'],
                            'action': 'keep',
                            'regex': '(^a)|{}:{}'.format(exp[0], exp[1])
                        }
                        relabel.append(label)
                else:
                    # Dotted hostname: scrape it directly as a static target.
                    static_config['targets'].append(exporter_endpoint)
                config_changed = True
                log.info('(C) => exporter "{0}" added to config'.format(
                    exporter_endpoint))
            else:
                log.info(
                    '(C) => exporter "{0}" skipped, already part of config'.
                    format(exporter_endpoint))
        if config_changed:
            with open(config_file, 'w') as outfile:
                yaml.round_trip_dump(config_content,
                                     outfile,
                                     default_flow_style=False)
    except Exception as e:
        log.exception('Adding exporters to prometheus config failed:')
    return