def get_deployment_names_from_list(service_instance_list):
    app_names = []
    for service_instance in service_instance_list:
        try:
            service, instance, _, __ = decompose_job_id(service_instance)
            app_name = get_kubernetes_app_name(service, instance)
            app_names.append(app_name)
        except InvalidJobNameError:
            log.error(
                f"Invalid service instance specified. Format is service{SPACER}instance."
            )
            sys.exit(1)
    return app_names

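# A minimal usage sketch (hypothetical job id; assumes the usual PaaSTA name
# sanitisation, where "_" becomes "--" in Kubernetes app names):
#
#     get_deployment_names_from_list(["example_service.main"])
#     # -> ["example--service-main"]
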
def main():
    args = parse_args()
    soa_dir = args.soa_dir
    cluster = args.cluster
    instances = get_services_for_cluster(
        cluster=cluster, instance_type="kubernetes", soa_dir=soa_dir
    )
    service_instances = []
    for name, instance in instances:
        if args.sanitise:
            app_name = kubernetes_tools.get_kubernetes_app_name(name, instance)
        else:
            app_name = compose_job_id(name, instance)
        service_instances.append(app_name)
    print("\n".join(service_instances))
    sys.exit(0)

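# Illustrative output (hypothetical names): without --sanitise each line is a
# composed job id such as "example_service.main"; with --sanitise it is the
# Kubernetes app name for that instance, e.g. "example--service-main", one
# entry per line.
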
def create_instance_cpu_scaling_rule(
    service: str,
    instance: str,
    autoscaling_config: AutoscalingParamsDict,
    paasta_cluster: str,
) -> PrometheusAdapterRule:
    """
    Creates a Prometheus adapter rule config for a given service instance.
    """
    deployment_name = get_kubernetes_app_name(service=service, instance=instance)
    sanitized_instance_name = sanitise_kubernetes_name(instance)
    metric_name = f"{deployment_name}-cpu-prom"
    moving_average_window = autoscaling_config.get(
        "moving_average_window_seconds",
        DEFAULT_CPU_AUTOSCALING_MOVING_AVERAGE_WINDOW,
    )

    # this series query is a bit of a hack: we don't use the Prometheus adapter as expected (i.e., very generic rules)
    # but we still need to give it a query that returns something even though we're not going to use the series/label
    # templates that are auto-extracted for us. That said: we still need this query to return labels that can be tied
    # back to k8s objects WITHOUT using label_replace
    series_query = f"""
        kube_deployment_labels{{
            deployment='{deployment_name}',
            paasta_cluster='{paasta_cluster}',
            namespace='paasta'
        }}
    """
    cpu_usage = f"""
        avg(
            irate(
                container_cpu_usage_seconds_total{{
                    namespace='paasta',
                    container='{sanitized_instance_name}',
                    paasta_cluster='{paasta_cluster}'
                }}[1m]
            )
        ) by (pod, container)
    """
    cpus_available = f"""
        sum(
            container_spec_cpu_quota{{
                namespace='paasta',
                container='{sanitized_instance_name}',
                paasta_cluster='{paasta_cluster}'
            }} / container_spec_cpu_period{{
                namespace='paasta',
                paasta_cluster='{paasta_cluster}'
            }}
        ) by (pod, container)
    """
    # NOTE: we only have Pod names in our container_cpu* metrics, but we can't get a
    # Deployment from those consistently due to k8s limitations on certain field lengths
    # - thus we need to extract this information from the ReplicaSet name (which is made
    # possible by the fact that our ReplicaSets are named
    # {deployment}-{10 character hex string}) so that our query only considers the
    # service that we want to autoscale - without this we're only filtering by instance
    # name and these are very much not unique
    # k8s:pod:info is an internal recording rule that joins kube_pod_info with
    # kube_pod_status_phase
    pod_info_join = f"""
        on (pod) group_left(kube_deployment) label_replace(
            k8s:pod:info{{
                created_by_name=~'{deployment_name}.*',
                created_by_kind='ReplicaSet',
                namespace='paasta',
                paasta_cluster='{paasta_cluster}',
                phase='Running'
            }},
            'kube_deployment', '$1', 'created_by_name', '(.+)-[a-f0-9]{{10}}'
        )
    """
    # get the total usage of all of our Pods divided by the number of CPUs available to
    # those Pods (i.e., the k8s CPU limit) in order to get the % of CPU used and then add
    # some labels to this vector
    load = f"""
        sum(
            (({cpu_usage}) / ({cpus_available}))
            * {pod_info_join}
        ) by (kube_deployment)
    """
    current_replicas = f"""
        (
            scalar(
                kube_deployment_spec_replicas{{paasta_cluster='{paasta_cluster}',deployment='{deployment_name}'}} >= 0
                or
                max_over_time(
                    kube_deployment_spec_replicas{{paasta_cluster='{paasta_cluster}',deployment='{deployment_name}'}}[{DEFAULT_EXTRAPOLATION_TIME}s]
                )
            )
        )
    """
    # we want to calculate:
    # * the desired replicas based on instantaneous load,
    # * smooth that over time,
    # * and then divide by the non-smoothed current number of replicas.
    # otherwise, if we do the naive thing and take the average of the load inside avg_over_time,
    # then we'll see the oscillations that we fixed in PR #2862
    moving_average_load = f"""
        avg_over_time(({load})[{moving_average_window}s:]) / {current_replicas}
    """
    # for some reason, during bounces we lose the labels from the previous timeseries (and thus end up with two
    # timeseries), so we avg these to merge them together
    # NOTE: we multiply by 100 to return a number between [0, 100] to the HPA
    moving_average_load_percent = f"avg({moving_average_load}) * 100"
    # we need to do some somewhat hacky label_replaces to inject labels that will then be used for association
    # without these, the adapter doesn't know what deployment to associate the query result with
    # NOTE: these labels MUST match the equivalent ones in the seriesQuery
    metrics_query = f"""
        label_replace(
            label_replace(
                {moving_average_load_percent},
                'deployment', '{deployment_name}', '', ''
            ),
            'namespace', 'paasta', '', ''
        )
    """

    return {
        "name": {"as": metric_name},
        "seriesQuery": _minify_promql(series_query),
        "metricsQuery": _minify_promql(metrics_query),
        "resources": {
            "overrides": {
                "namespace": {"resource": "namespace"},
                "deployment": {"group": "apps", "resource": "deployments"},
            },
        },
    }

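# A minimal usage sketch (hypothetical service/instance/cluster names; only the
# autoscaling fields read above are shown):
#
#     rule = create_instance_cpu_scaling_rule(
#         service="example",
#         instance="canary",
#         autoscaling_config={"moving_average_window_seconds": 1800},
#         paasta_cluster="test-cluster",
#     )
#     rule["name"]["as"]  # -> "example-canary-cpu-prom"
#
# The resulting metricsQuery yields a per-replica CPU utilisation in [0, 100],
# which is what the HPA compares against its target.
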
def create_instance_uwsgi_scaling_rule(
    service: str,
    instance: str,
    autoscaling_config: AutoscalingParamsDict,
    paasta_cluster: str,
) -> PrometheusAdapterRule:
    """
    Creates a Prometheus adapter rule config for a given service instance.
    """
    setpoint = autoscaling_config["setpoint"]
    moving_average_window = autoscaling_config.get(
        "moving_average_window_seconds",
        DEFAULT_UWSGI_AUTOSCALING_MOVING_AVERAGE_WINDOW,
    )
    # this should always be set, but we default to 0 for safety as the worst thing that would happen
    # is that we take a couple more iterations than required to hit the desired setpoint
    offset = autoscaling_config.get("offset", 0)
    deployment_name = get_kubernetes_app_name(service=service, instance=instance)
    worker_filter_terms = f"paasta_cluster='{paasta_cluster}',paasta_service='{service}',paasta_instance='{instance}'"
    replica_filter_terms = (
        f"paasta_cluster='{paasta_cluster}',deployment='{deployment_name}'"
    )

    current_replicas = f"""
        sum(
            label_join(
                (
                    kube_deployment_spec_replicas{{{replica_filter_terms}}} >= 0
                    or
                    max_over_time(
                        kube_deployment_spec_replicas{{{replica_filter_terms}}}[{DEFAULT_EXTRAPOLATION_TIME}s]
                    )
                ),
                "kube_deployment", "", "deployment"
            )
        ) by (kube_deployment)
    """
    # k8s:deployment:pods_status_ready is a metric created by summing kube_pod_status_ready
    # over paasta service/instance/cluster. it counts the number of ready pods in a paasta
    # deployment.
    ready_pods = f"""
        (sum(
            k8s:deployment:pods_status_ready{{{worker_filter_terms}}} >= 0
            or
            max_over_time(
                k8s:deployment:pods_status_ready{{{worker_filter_terms}}}[{DEFAULT_EXTRAPOLATION_TIME}s]
            )
        ) by (kube_deployment))
    """
    load_per_instance = f"""
        avg(
            uwsgi_worker_busy{{{worker_filter_terms}}}
        ) by (kube_pod, kube_deployment)
    """
    missing_instances = f"""
        clamp_min(
            {ready_pods} - count({load_per_instance}) by (kube_deployment),
            0
        )
    """
    total_load = f"""
        (
            sum(
                {load_per_instance}
            ) by (kube_deployment)
            +
            {missing_instances}
        )
    """
    desired_instances_at_each_point_in_time = f"""
        {total_load} / {setpoint - offset}
    """
    desired_instances = f"""
        avg_over_time(
            (
                {desired_instances_at_each_point_in_time}
            )[{moving_average_window}s:]
        )
    """
    metrics_query = f"""
        {desired_instances} / {current_replicas}
    """
    metric_name = f"{deployment_name}-uwsgi-prom"

    return {
        "name": {"as": metric_name},
        "seriesQuery": f"uwsgi_worker_busy{{{worker_filter_terms}}}",
        "resources": {"template": "kube_<<.Resource>>"},
        "metricsQuery": _minify_promql(metrics_query),
    }

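# Worked example with illustrative numbers: with setpoint=0.8 and offset=0, an
# average uwsgi_worker_busy of 0.8 in each of 20 ready pods gives
# total_load = 16, so desired_instances = 16 / 0.8 = 20 and metricsQuery
# returns 20 / current_replicas; the HPA multiplies that ratio by the current
# replica count, converging on 20 replicas.
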
def create_instance_arbitrary_promql_scaling_rule(
    service: str,
    instance: str,
    autoscaling_config: AutoscalingParamsDict,
    paasta_cluster: str,
) -> PrometheusAdapterRule:
    prometheus_adapter_config = autoscaling_config["prometheus_adapter_config"]
    deployment_name = get_kubernetes_app_name(service=service, instance=instance)

    if "seriesQuery" in prometheus_adapter_config:
        # If the user specifies seriesQuery, don't wrap their metricsQuery, under the assumption that they may not want
        # us to mess with their labels.
        series_query = prometheus_adapter_config["seriesQuery"]
        metrics_query = prometheus_adapter_config["metricsQuery"]
    else:
        # If the user doesn't specify seriesQuery, assume they want to just write some promql that returns a number.
        # Set up series_query to match the default `resources`
        series_query = f"""
            kube_deployment_labels{{
                deployment='{deployment_name}',
                paasta_cluster='{paasta_cluster}',
                namespace='paasta'
            }}
        """
        # Wrap their promql with label_replace() calls that add `deployment` / `namespace` labels which match the
        # default `resources`.
        metrics_query = f"""
            label_replace(
                label_replace(
                    {prometheus_adapter_config["metricsQuery"]},
                    'deployment', '{deployment_name}', '', ''
                ),
                'namespace', 'paasta', '', ''
            )
        """

    return {
        "name": {
            "as": f"{deployment_name}-arbitrary-promql",
        },
        "seriesQuery": _minify_promql(series_query),
        "metricsQuery": _minify_promql(metrics_query),
        "resources": prometheus_adapter_config.get(
            "resources",
            {
                "overrides": {
                    "namespace": {"resource": "namespace"},
                    "deployment": {"group": "apps", "resource": "deployments"},
                },
            },
        ),
    }

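# A minimal sketch of the config this consumes (hypothetical PromQL):
#
#     autoscaling_config["prometheus_adapter_config"] = {
#         "metricsQuery": "sum(rate(example_requests_total[5m]))",
#     }
#
# Because no seriesQuery is given here, the query would be wrapped in the
# label_replace() calls above so the adapter can tie the result back to the
# instance's Deployment.
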
def create_instance_piscina_scaling_rule(
    service: str,
    instance: str,
    autoscaling_config: AutoscalingParamsDict,
    paasta_cluster: str,
) -> PrometheusAdapterRule:
    """
    Creates a Prometheus adapter rule config for a given service instance.
    """
    setpoint = autoscaling_config["setpoint"]
    moving_average_window = autoscaling_config.get(
        "moving_average_window_seconds",
        DEFAULT_PISCINA_AUTOSCALING_MOVING_AVERAGE_WINDOW,
    )
    deployment_name = get_kubernetes_app_name(service=service, instance=instance)
    worker_filter_terms = f"paasta_cluster='{paasta_cluster}',paasta_service='{service}',paasta_instance='{instance}'"
    replica_filter_terms = (
        f"paasta_cluster='{paasta_cluster}',deployment='{deployment_name}'"
    )

    current_replicas = f"""
        sum(
            label_join(
                (
                    kube_deployment_spec_replicas{{{replica_filter_terms}}} >= 0
                    or
                    max_over_time(
                        kube_deployment_spec_replicas{{{replica_filter_terms}}}[{DEFAULT_EXTRAPOLATION_TIME}s]
                    )
                ),
                "kube_deployment", "", "deployment"
            )
        ) by (kube_deployment)
    """
    # k8s:deployment:pods_status_ready is a metric created by summing kube_pod_status_ready
    # over paasta service/instance/cluster. it counts the number of ready pods in a paasta
    # deployment.
    ready_pods = f"""
        (sum(
            k8s:deployment:pods_status_ready{{{worker_filter_terms}}} >= 0
            or
            max_over_time(
                k8s:deployment:pods_status_ready{{{worker_filter_terms}}}[{DEFAULT_EXTRAPOLATION_TIME}s]
            )
        ) by (kube_deployment))
    """
    load_per_instance = f"""
        (piscina_pool_utilization{{{worker_filter_terms}}})
    """
    missing_instances = f"""
        clamp_min(
            {ready_pods} - count({load_per_instance}) by (kube_deployment),
            0
        )
    """
    total_load = f"""
        (
            sum(
                {load_per_instance}
            ) by (kube_deployment)
            +
            {missing_instances}
        )
    """
    desired_instances_at_each_point_in_time = f"""
        {total_load} / {setpoint}
    """
    desired_instances = f"""
        avg_over_time(
            (
                {desired_instances_at_each_point_in_time}
            )[{moving_average_window}s:]
        )
    """
    metrics_query = f"""
        {desired_instances} / {current_replicas}
    """

    return {
        "name": {"as": f"{deployment_name}-piscina-prom"},
        "seriesQuery": f"piscina_pool_utilization{{{worker_filter_terms}}}",
        "resources": {"template": "kube_<<.Resource>>"},
        "metricsQuery": _minify_promql(metrics_query),
    }

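# This mirrors the uwsgi rule with two differences visible above:
# piscina_pool_utilization is already a per-pod utilisation (so there is no
# avg() over individual workers), and no "offset" is subtracted from the
# setpoint.
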
def sync_boto_secrets(
    kube_client: KubeClient,
    cluster: str,
    service: str,
    secret_provider_name: str,
    vault_cluster_config: Mapping[str, str],
    soa_dir: str,
    namespace: str,
) -> bool:
    # Update boto key secrets
    config_loader = PaastaServiceConfigLoader(service=service, soa_dir=soa_dir)
    for instance_config in config_loader.instance_configs(
        cluster=cluster, instance_type_class=KubernetesDeploymentConfig
    ):
        instance = instance_config.instance
        boto_keys = instance_config.config_dict.get("boto_keys", [])
        if not boto_keys:
            continue
        boto_keys.sort()
        secret_data = {}
        for key in boto_keys:
            for filetype in ["sh", "yaml", "json", "cfg"]:
                this_key = key + "." + filetype
                sanitised_key = this_key.replace(".", "-").replace("_", "--")
                try:
                    with open(f"/etc/boto_cfg_private/{this_key}") as f:
                        secret_data[sanitised_key] = base64.b64encode(
                            f.read().encode("utf-8")
                        ).decode("utf-8")
                except IOError:
                    log.warning(
                        f"Boto key {this_key} required for {service} could not be found."
                    )
        if not secret_data:
            continue
        # In order to prevent slamming the k8s API, add some artificial delay here
        time.sleep(0.3)
        app_name = get_kubernetes_app_name(service, instance)
        secret = limit_size_with_hash(f"paasta-boto-key-{app_name}")
        hashable_data = "".join([secret_data[key] for key in secret_data])
        signature = hashlib.sha1(hashable_data.encode("utf-8")).hexdigest()
        kubernetes_signature = get_kubernetes_secret_signature(
            kube_client=kube_client,
            secret=secret,
            service=service,
            namespace=namespace,
        )
        if not kubernetes_signature:
            log.info(f"{secret} for {service} in {namespace} not found, creating")
            try:
                create_plaintext_dict_secret(
                    kube_client=kube_client,
                    secret_name=secret,
                    secret_data=secret_data,
                    service=service,
                    namespace=namespace,
                )
            except ApiException as e:
                if e.status == 409:
                    log.warning(
                        f"Secret {secret} for {service} already exists in {namespace} but no signature found. Updating secret and signature."
                    )
                    update_plaintext_dict_secret(
                        kube_client=kube_client,
                        secret_name=secret,
                        secret_data=secret_data,
                        service=service,
                        namespace=namespace,
                    )
                else:
                    raise
            create_kubernetes_secret_signature(
                kube_client=kube_client,
                secret=secret,
                service=service,
                secret_signature=signature,
                namespace=namespace,
            )
        elif signature != kubernetes_signature:
            log.info(
                f"{secret} for {service} in {namespace} needs updating as signature changed"
            )
            update_plaintext_dict_secret(
                kube_client=kube_client,
                secret_name=secret,
                secret_data=secret_data,
                service=service,
                namespace=namespace,
            )
            update_kubernetes_secret_signature(
                kube_client=kube_client,
                secret=secret,
                service=service,
                secret_signature=signature,
                namespace=namespace,
            )
        else:
            log.info(f"{secret} for {service} in {namespace} up to date")
    return True

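# Key naming sketch (hypothetical boto key): a "boto_keys" entry of
# "example_key" expands to files like "example_key.yaml", whose Secret data key
# becomes "example--key-yaml" (dots -> "-", underscores -> "--"), with the file
# contents stored base64-encoded in the Secret.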