def manifest(self, namespace, name, app_type, target, **kwargs):
    min_replicas = kwargs.get('min')
    max_replicas = kwargs.get('max')
    cpu_percent = kwargs.get('cpu_percent')
    if min_replicas < 1:
        raise KubeException('min replicas needs to be 1 or higher')

    if max_replicas < min_replicas:
        raise KubeException('max replicas can not be smaller than min replicas')

    labels = {
        'app': namespace,
        'type': app_type,
        'heritage': 'drycc',
    }

    manifest = {
        'kind': 'HorizontalPodAutoscaler',
        'apiVersion': self.api_version,
        'metadata': {
            'name': name,
            'namespace': namespace,
            'labels': labels,
        },
        'spec': {
            'minReplicas': min_replicas,
            'maxReplicas': max_replicas,
        }
    }

    if self.version() >= parse("1.3.0"):
        manifest['spec']['targetCPUUtilizationPercentage'] = cpu_percent
        manifest['spec']['scaleTargetRef'] = {
            'apiVersion': target['apiVersion'],
            # only works with Deployments, ReplicaSets and ReplicationControllers
            'kind': target['kind'],
            'name': target['metadata']['name'],
        }
    elif self.version() <= parse("1.2.0"):
        # the API changed between versions
        # NOTE: versions strictly between 1.2.0 and 1.3.0 fall through
        # both branches and get no CPU target at all
        manifest['spec']['cpuUtilization'] = {
            'targetPercentage': cpu_percent
        }
        manifest['spec']['scaleRef'] = {
            # only works with Deployments, ReplicaSets and ReplicationControllers
            'kind': target['kind'],
            'name': target['metadata']['name'],
            # the subresource of the above that performs the scale action
            'subresource': 'scale',
        }

    return manifest
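# --- Hedged usage sketch (not part of the class above) ------------------------
# How a caller might drive manifest(); the resource instance, app names, and
# target Deployment are all hypothetical. On a cluster whose version() reports
# >= 1.3.0 the returned spec carries scaleTargetRef and
# targetCPUUtilizationPercentage.
def _example_hpa_manifest(hpa_resource):
    target = {
        'apiVersion': 'extensions/v1beta1',
        'kind': 'Deployment',
        'metadata': {'name': 'myapp-web'},
    }
    return hpa_resource.manifest(
        'myapp', 'myapp-web', 'web', target,
        min=1, max=4, cpu_percent=80)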
def _handle_not_ready_pods(self, namespace, labels):
    """
    Detects if any pod is in the Running phase but not Ready and handles
    any potential issues around that, mainly failed health checks
    """
    pods = self.get(namespace, labels=labels).json()['items']
    if not pods:
        pods = []

    for pod in pods:
        # only care about pods that are in the Running phase
        if pod['status']['phase'] != 'Running':
            continue

        name = '{}-{}'.format(pod['metadata']['labels']['app'], pod['metadata']['labels']['type'])  # noqa
        # find the right container in case there are many on the pod
        container = self.find_container(name, pod['status']['containerStatuses'])
        # "ready" is a boolean in the API response, not the string 'true'
        if container is None or container['ready']:
            continue

        for event in self.events(pod):
            if event['reason'] == 'Unhealthy':
                # strip out whitespace on either side of each line
                message = "\n".join([x.strip() for x in event['message'].split("\n")])
                raise KubeException(message)
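# --- Hedged sketch -------------------------------------------------------------
# The message normalization used above, pulled out for illustration: every line
# of a (possibly indented) event message is stripped before rejoining.
def _strip_event_message(message):
    return "\n".join(x.strip() for x in message.split("\n"))

# e.g. _strip_event_message("Liveness probe failed:\n    HTTP 500")
# returns "Liveness probe failed:\nHTTP 500"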
def _handle_pod_errors(self, pod, reason, message):
    """
    Handle potential pod errors based on the Pending reason passed into
    the function

    Covers image pull problems, FailedScheduling and other event-reported errors
    """
    # image errors are reported on the container level
    container_errors = [
        'Pending',  # often an indication that deeper inspection is needed
        'ErrImagePull',
        'ImagePullBackOff',
        'RegistryUnavailable',
        'ErrImageInspect',
    ]

    # image event reason mapping
    event_errors = {
        "Failed": "FailedToPullImage",
        "InspectFailed": "FailedToInspectImage",
        "ErrImageNeverPull": "ErrImageNeverPullPolicy",
        # Not including this one for now as the message is not useful
        # "BackOff": "BackOffPullImage",
    }

    # we want to be able to ignore pod scheduling errors as they might be temporary
    if not os.environ.get("DEIS_IGNORE_SCHEDULING_FAILURE", False):
        # FailedScheduling relates to resource limits
        event_errors["FailedScheduling"] = "FailedScheduling"

    # nicer error than the one from the event
    # often this gets to ImagePullBackOff before we can introspect, though
    if reason == 'ErrImagePull':
        raise KubeException(message)

    # collect all error messages of worth
    messages = []
    if reason in container_errors:
        for event in self.events(pod):
            if event['reason'] in event_errors.keys():
                # only show a given error once
                event_errors.pop(event['reason'])
                # strip out whitespace on either side of each line
                message = "\n".join([x.strip() for x in event['message'].split("\n")])
                messages.append(message)

    if messages:
        raise KubeException("\n".join(messages))
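# --- Hedged sketch -------------------------------------------------------------
# The de-duplication above pops a reason out of event_errors the first time it
# is seen, so repeated events with the same reason yield a single message.
# A stand-alone version with made-up sample data:
def _collect_unique_event_messages(events, interesting_reasons):
    remaining = set(interesting_reasons)
    messages = []
    for event in events:
        if event['reason'] in remaining:
            remaining.discard(event['reason'])  # only show a given error once
            messages.append(event['message'].strip())
    return messages

# _collect_unique_event_messages(
#     [{'reason': 'Failed', 'message': 'pull failed'},
#      {'reason': 'Failed', 'message': 'pull failed again'}],
#     ['Failed'])
# -> ['pull failed']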
def deploy(self, namespace, name, image, entrypoint, command, **kwargs):  # noqa
    """Deploy Deployment depending on what's requested"""
    app_type = kwargs.get('app_type')
    version = kwargs.get('version')
    spec_annotations = {}

    # if an RC already exists then stop processing of the deploy
    try:
        # construct old school RC name
        rc_name = '{}-{}-{}'.format(namespace, version, app_type)
        self.rc.get(namespace, rc_name)
        self.log(namespace, 'RC {} already exists. Stopping deploy'.format(rc_name))
        return
    except KubeHTTPException:
        # if the RC doesn't exist then let the app continue
        pass

    # create a deployment if missing, otherwise update to trigger a release
    try:
        # labels that represent the pod(s)
        labels = {
            'app': namespace,
            'version': version,
            'type': app_type,
            'heritage': 'deis',
        }

        # this depends on the deployment object having the latest information
        deployment = self.deployment.get(namespace, name).json()
        # a hack to persist the spec annotations on the deployment object to the
        # next release; spec_annotations was initialized to an empty dict above
        # so a missing key does not cause errors
        if 'annotations' in deployment['spec']['template']['metadata'].keys():
            old_spec_annotations = deployment['spec']['template']['metadata']['annotations']
            spec_annotations = old_spec_annotations
        if deployment['spec']['template']['metadata']['labels'] == labels:
            self.log(namespace, 'Deployment {} with release {} already exists. Stopping deploy'.format(name, version))  # noqa
            return
    except KubeException:
        # create the initial deployment object (and the first revision)
        self.deployment.create(
            namespace, name, image, entrypoint, command, spec_annotations, **kwargs)
    else:
        try:
            # kick off a new revision of the deployment
            self.deployment.update(
                namespace, name, image, entrypoint, command, spec_annotations, **kwargs)
        except KubeException as e:
            raise KubeException(
                'There was a problem while deploying {} of {}-{}. '
                "Additional information:\n{}".format(version, namespace, app_type, str(e))
            ) from e
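# --- Hedged usage sketch --------------------------------------------------------
# A minimal kwargs set a caller might pass to deploy() above; every value here
# (registry, app name, process type, version tag) is hypothetical.
def _example_deploy(scheduler):
    scheduler.deploy(
        'myapp', 'myapp-web', 'registry.example.com/myapp:v2',
        entrypoint='gunicorn',
        command=['-b', '0.0.0.0:5000', 'wsgi:app'],
        app_type='web', version='v2')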
def run(self, namespace, name, image, entrypoint, command, **kwargs):
    """Run a one-off command."""
    self.log(namespace, 'run {}, img {}, entrypoint {}, cmd "{}"'.format(
        name, image, entrypoint, command))

    # run pods never restart
    kwargs['restartPolicy'] = 'Never'
    kwargs['command'] = entrypoint
    kwargs['args'] = command

    self.pod.create(namespace, name, image, **kwargs)

    try:
        # give the pod 20 minutes to execute (after it got into the ready state)
        # this is a fairly arbitrary limit but the gunicorn worker / LBs
        # will time it out around 20 minutes anyway
        # TODO: Revisit in the future so it can run longer
        state = 'up'  # pod is still running
        waited = 0
        timeout = 1200  # 20 minutes
        while state == 'up' and waited < timeout:
            pod = self.pod.get(namespace, name).json()
            state = str(self.pod.state(pod))
            # default data
            exit_code = 0

            waited += 1
            time.sleep(1)

        if state == 'down':  # run finished successfully
            exit_code = 0  # successful run
        elif state == 'crashed':  # run failed
            pod_state = pod['status']['containerStatuses'][0]['state']
            exit_code = pod_state['terminated']['exitCode']

        # timed out!
        if waited == timeout:
            raise KubeException('Timed out (20 mins) while running')

        # check if it is possible to get logs
        state = self.pod.state(self.pod.get(namespace, name).json())
        # states below "up" do not have logs
        if not isinstance(state, PodState) or state < PodState.up:
            return exit_code, 'Could not get logs. Pod is in state {}'.format(str(state))

        # grab log information
        log = self.pod.logs(namespace, name)
        log.encoding = 'utf-8'  # defaults to "ISO-8859-1" otherwise...

        return exit_code, log.text
    finally:
        # cleanup
        self.pod.delete(namespace, name)
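# --- Hedged sketch --------------------------------------------------------------
# The polling pattern above, distilled: check state once per second until the
# pod leaves 'up' or the budget is spent. Stand-alone and framework-free;
# `get_state` is a hypothetical callable returning the pod state as a string.
import time

def _poll_until_not_up(get_state, timeout=1200):
    waited = 0
    state = 'up'
    while state == 'up' and waited < timeout:
        state = get_state()
        waited += 1
        time.sleep(1)
    return state, waited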
def http_delete(self, path, **kwargs):
    """
    Make a DELETE request to the k8s server.
    """
    try:
        url = urljoin(self.url, path)
        response = self.session.delete(url, **kwargs)
    except requests.exceptions.ConnectionError as err:
        # reraise as KubeException, but log the stacktrace
        message = "There was a problem deleting data from " \
                  "the Kubernetes API server. URL: {}".format(url)
        logger.error(message)
        raise KubeException(message) from err

    return response
def http_get(self, path, params=None, **kwargs):
    """
    Make a GET request to the k8s server.
    """
    try:
        url = urljoin(self.url, path)
        response = self.session.get(url, params=params, **kwargs)
    except requests.exceptions.ConnectionError as err:
        # reraise as KubeException, but log the stacktrace
        message = "There was a problem retrieving data from " \
                  "the Kubernetes API server. URL: {}, params: {}".format(url, params)
        logger.error(message)
        raise KubeException(message) from err

    return response
def http_put(self, path, data=None, **kwargs):
    """
    Make a PUT request to the k8s server.
    """
    try:
        url = urljoin(self.url, path)
        response = self.session.put(url, data=data, **kwargs)
    except requests.exceptions.ConnectionError as err:
        # reraise as KubeException, but log the stacktrace
        message = "There was a problem putting data to " \
                  "the Kubernetes API server. URL: {}, " \
                  "data: {}".format(url, data)
        logger.error(message)
        raise KubeException(message) from err

    return response
def _check_for_failed_events(self, namespace, labels):
    """
    Request the Deployment's new ReplicaSet and search for failed events
    involving that RS

    Raises: KubeException when the RS has events with the FailedCreate reason
    """
    response = self.rs.get(namespace, labels=labels)
    data = response.json()
    fields = {
        'involvedObject.kind': 'ReplicaSet',
        'involvedObject.name': data['items'][0]['metadata']['name'],
        'involvedObject.namespace': namespace,
        'involvedObject.uid': data['items'][0]['metadata']['uid'],
    }
    events_list = self.ns.events(namespace, fields=fields).json()
    events = events_list.get('items', [])
    if events is not None and len(events) != 0:
        for event in events:
            if event['reason'] == 'FailedCreate':
                log = self._get_formatted_messages(events)
                self.log(namespace, log)
                raise KubeException(log)
def http_patch(self, path, data=None, **kwargs):
    """
    Make a PATCH request to the k8s server.
    """
    try:
        url = urljoin(self.url, path)
        # accepted media types include:
        # application/json-patch+json,
        # application/merge-patch+json,
        # application/apply-patch+yaml
        # self.session.headers["Content-Type"] = "application/json-patch+json"
        response = self.session.patch(url, data=data, **kwargs)
    except requests.exceptions.ConnectionError as err:
        # reraise as KubeException, but log the stacktrace
        message = "There was a problem patching data to " \
                  "the Kubernetes API server. URL: {}, " \
                  "data: {}".format(url, data)
        logger.error(message)
        raise KubeException(message) from err

    return response
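# --- Hedged usage sketch --------------------------------------------------------
# Sending an RFC 6902 JSON Patch through http_patch() above. requests merges a
# per-call `headers` kwarg over the session defaults, so the Content-Type can be
# set per request instead of mutating self.session.headers. The client instance,
# namespace, and Deployment name are hypothetical.
import json

def _example_json_patch(client, namespace, name):
    patch = [{'op': 'replace', 'path': '/spec/replicas', 'value': 3}]
    return client.http_patch(
        '/apis/apps/v1/namespaces/{}/deployments/{}'.format(namespace, name),
        data=json.dumps(patch),
        headers={'Content-Type': 'application/json-patch+json'})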
def _check_for_failed_events(self, namespace, labels):
    """
    Request the Deployment's new ReplicaSet and search for failed events
    involving that RS, retrying while the ReplicaSet list is still empty

    Raises: KubeException when the RS has events with the FailedCreate reason
    """
    max_retries = 3
    retry_sleep_sec = 3.0
    for try_ in range(max_retries):
        response = self.rs.get(namespace, labels=labels)
        data = response.json()
        try:
            fields = {
                'involvedObject.kind': 'ReplicaSet',
                'involvedObject.name': data['items'][0]['metadata']['name'],
                'involvedObject.namespace': namespace,
                'involvedObject.uid': data['items'][0]['metadata']['uid'],
            }
        except Exception as e:
            if try_ + 1 < max_retries:
                self.log(
                    namespace,
                    "Got an empty ReplicaSet list. Trying one more time. {}".format(
                        json.dumps(labels)))
                time.sleep(retry_sleep_sec)
                continue
            self.log(
                namespace,
                "Did not find the ReplicaSet for {}".format(json.dumps(labels)),
                "WARN")
            raise e

        events_list = self.ns.events(namespace, fields=fields).json()
        events = events_list.get('items', [])
        if events is not None and len(events) != 0:
            for event in events:
                if event['reason'] == 'FailedCreate':
                    log = self._get_formatted_messages(events)
                    self.log(namespace, log)
                    raise KubeException(log)
        # the RS lookup succeeded, so there is nothing left to retry
        break
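# --- Hedged sketch --------------------------------------------------------------
# The `fields` dict above is a Kubernetes field selector; serialized for the
# events API it becomes comma-separated key=value pairs. A stand-alone helper
# for illustration (the real serialization presumably lives in self.ns.events):
def _field_selector(fields):
    return ','.join('{}={}'.format(k, v) for k, v in fields.items())

# _field_selector({'involvedObject.kind': 'ReplicaSet',
#                  'involvedObject.namespace': 'myapp'})
# -> 'involvedObject.kind=ReplicaSet,involvedObject.namespace=myapp'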
def manifest(self, namespace, name, data, secret_type='Opaque', labels=None):
    secret_types = ['Opaque', 'kubernetes.io/dockerconfigjson']
    if secret_type not in secret_types:
        raise KubeException(
            '{} is not a supported secret type. Use one of the following: {}'.format(
                secret_type, ', '.join(secret_types)))  # noqa

    manifest = {
        'kind': 'Secret',
        'apiVersion': 'v1',
        'metadata': {
            'name': name,
            'namespace': namespace,
            'labels': {
                'app': namespace,
                'heritage': 'drycc'
            }
        },
        'type': secret_type,
        'data': {}
    }

    # add in any additional label info (default of None avoids a shared mutable default)
    manifest['metadata']['labels'].update(labels or {})

    for key, value in data.items():
        if value is None:
            manifest['data'].update({key: ''})
            continue

        value = value if isinstance(value, bytes) else bytes(str(value), 'UTF-8')
        item = base64.b64encode(value).decode(encoding='UTF-8')
        manifest['data'].update({key: item})

    return manifest
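# --- Hedged sketch --------------------------------------------------------------
# The value encoding used above and its inverse; useful when reading a value
# back out of manifest['data']. The key and plaintext below are made up.
import base64

def _decode_secret_value(encoded):
    return base64.b64decode(encoded).decode('UTF-8')

# encoded = base64.b64encode(b'postgres-password').decode('UTF-8')
# _decode_secret_value(encoded) == 'postgres-password'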
def run(self, namespace, name, image, entrypoint, command, **kwargs):
    """Run a one-off command."""
    self.log(namespace, 'run {}, img {}, entrypoint {}, cmd "{}"'.format(
        name, image, entrypoint, command))

    # force the app_type
    kwargs['app_type'] = 'run'
    # run pods never restart
    kwargs['restartPolicy'] = 'Never'
    kwargs['command'] = entrypoint
    kwargs['args'] = command

    # create the application config and build the pod manifest
    self.set_application_config(namespace, kwargs.get('envs', {}), kwargs.get('version'))
    manifest = self.pod.manifest(namespace, name, image, **kwargs)

    url = self.pods.api("/namespaces/{}/pods", namespace)
    response = self.http_post(url, json=manifest)
    if self.unhealthy(response.status_code):
        raise KubeHTTPException(response, 'create Pod in Namespace "{}"', namespace)

    # wait for the run pod to start - use the same function as scale
    labels = manifest['metadata']['labels']
    containers = manifest['spec']['containers']
    self.pods.wait_until_ready(
        namespace, containers, labels,
        desired=1, timeout=kwargs.get('deploy_timeout')
    )

    try:
        # give the pod 20 minutes to execute (after it got into the ready state)
        # this is a fairly arbitrary limit but the gunicorn worker / LBs
        # will time it out around 20 minutes anyway
        # TODO: Revisit in the future so it can run longer
        state = 'up'  # pod is still running
        waited = 0
        timeout = 1200  # 20 minutes
        while state == 'up' and waited < timeout:
            response = self.pod.get(namespace, name)
            pod = response.json()
            state = str(self.pod.state(pod))
            # default data
            exit_code = 0

            waited += 1
            time.sleep(1)

        if state == 'down':  # run finished successfully
            exit_code = 0  # successful run
        elif state == 'crashed':  # run failed
            pod_state = pod['status']['containerStatuses'][0]['state']
            exit_code = pod_state['terminated']['exitCode']

        # timed out!
        if waited == timeout:
            raise KubeException('Timed out (20 mins) while running')

        # check if it is possible to get logs
        state = self.pod.state(self.pod.get(namespace, name).json())
        # states below "up" do not have logs
        if not isinstance(state, PodState) or state < PodState.up:
            return exit_code, 'Could not get logs. Pod is in state {}'.format(str(state))

        # grab log information
        log = self.pod.logs(namespace, name)
        log.encoding = 'utf-8'  # defaults to "ISO-8859-1" otherwise...

        return exit_code, log.text
    finally:
        # cleanup
        self.pod.delete(namespace, name)
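# --- Hedged usage sketch --------------------------------------------------------
# How a caller might consume the (exit_code, log) pair returned by run() above.
# The scheduler instance, image, and command are hypothetical.
def _example_run(scheduler):
    code, output = scheduler.run(
        'myapp', 'myapp-run-1', 'registry.example.com/myapp:v2',
        entrypoint='/bin/sh', command=['-c', 'python manage.py migrate'])
    if code != 0:
        raise RuntimeError('one-off command failed:\n{}'.format(output))
    return output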