def start(self):
    """Ask the scheduler to start the job named by ``self.job_id``.

    Any exception raised while doing so is recorded against ``self.app``
    at WARNING level and then re-raised unchanged.
    """
    try:
        self._scheduler.start(self.job_id)
    except Exception as exc:
        message = '{} (start): {}'.format(self.job_id, exc)
        log_event(self.app, message, logging.WARNING)
        raise
def _scale_containers(self, scale_types, to_remove):
    """Scale every process type in ``scale_types`` through the scheduler.

    ``scale_types`` maps a process type to the number of containers
    requested for it. Each type is scaled against the latest release; a
    scheduler failure is logged at ERROR level and re-raised. Once every
    type has been scaled, the records in ``to_remove`` are deleted.
    """
    release = self.release_set.latest()
    build_type = app_build_type(release)

    for scale_type in scale_types:
        image = release.image
        version = "v{}".format(release.version)
        options = dict(
            memory=release.config.memory,
            cpu=release.config.cpu,
            tags=release.config.tags,
            envs=release.config.values,
            version=version,
            num=scale_types[scale_type],
            app_type=scale_type,
            build_type=build_type,
            healthcheck=release.config.healthcheck(),
        )
        job_id = self._get_job_id(scale_type)
        command = self._get_command(scale_type)
        try:
            self._scheduler.scale(
                namespace=self.id,
                name=job_id,
                image=image,
                command=command,
                **options
            )
        except Exception as exc:
            message = "{} (scale): {}".format(job_id, exc)
            log_event(self, message, logging.ERROR)
            raise

    # NOTE(review): assumed to run once after the loop; the collapsed source
    # does not show the original indentation of this statement.
    for container in to_remove:
        container.delete()
def create(self, request, **kwargs):
    """Grant ``self.perm`` on the current app to the user named in the request.

    The username is read from ``request.data['username']``; the lookup goes
    through ``get_object_or_404``. Responds with HTTP 201.
    """
    app = self.get_object()
    username = request.data['username']
    user = get_object_or_404(User, username=username)
    assign_perm(self.perm, user, app)
    message = "User {} was granted access to {}".format(user, app)
    models.log_event(app, message)
    return Response(status=status.HTTP_201_CREATED)
def stop(self):
    """Ask the scheduler to stop the job named by ``self.job_id``.

    Any exception raised while doing so is recorded against ``self.app``
    at ERROR level and then re-raised unchanged.
    """
    try:
        self._scheduler.stop(self.job_id)
    except Exception as exc:
        message = '{} (stop): {}'.format(self.job_id, exc)
        log_event(self.app, message, logging.ERROR)
        raise
def destroy(self):
    """Ask the scheduler to destroy the job named by ``self.job_id``.

    Any exception raised while doing so is recorded against ``self.app``
    at ERROR level and then re-raised unchanged.
    """
    try:
        self._scheduler.destroy(self.job_id)
    except Exception as exc:
        message = '{} (destroy): {}'.format(self.job_id, exc)
        log_event(self.app, message, logging.ERROR)
        raise
def _deploy_app(self, scale_types, release, existing):
    """Deploy ``release`` for every process type in ``scale_types``.

    Each type is deployed with ``num`` set to 0 because scaling up happens
    in a separate operation. A scheduler failure is logged at ERROR level
    and re-raised. Once every type has been deployed, the records in
    ``existing`` are deleted.
    """
    build_type = app_build_type(release)

    for scale_type in scale_types:
        image = release.image
        version = "v{}".format(release.version)
        options = dict(
            memory=release.config.memory,
            cpu=release.config.cpu,
            tags=release.config.tags,
            envs=release.config.values,
            num=0,  # Scaling up happens in a separate operation
            version=version,
            app_type=scale_type,
            build_type=build_type,
            healthcheck=release.config.healthcheck(),
        )
        job_id = self._get_job_id(scale_type)
        command = self._get_command(scale_type)
        try:
            self._scheduler.deploy(
                namespace=self.id,
                name=job_id,
                image=image,
                command=command,
                **options
            )
        except Exception as exc:
            message = "{} (deploy): {}".format(job_id, exc)
            log_event(self, message, logging.ERROR)
            raise

    # NOTE(review): assumed to run once after the loop; the collapsed source
    # does not show the original indentation of this statement.
    for container in existing:
        container.delete()
def run(self, command):
    """Run a one-off command through the scheduler.

    Raises ``EnvironmentError`` when the release has no build. Returns the
    ``(rc, output)`` pair produced by the scheduler. Any exception raised
    by the scheduler call is logged against ``self.app`` at ERROR level and
    re-raised.
    """
    build = self.release.build
    if build is None:
        raise EnvironmentError('No build associated with this release '
                               'to run this command')

    image = self.release.image
    config = self.release.config
    options = {
        'memory': config.memory,
        'cpu': config.cpu,
        'tags': config.tags,
        'envs': config.values,
    }

    # if this is a procfile-based app, switch the entrypoint to slugrunner's default
    # FIXME: remove slugrunner's hardcoded entrypoint
    if build.procfile and build.sha and not build.dockerfile:
        entrypoint = '/runner/init'
        command = "'{}'".format(command)
    else:
        entrypoint = '/bin/bash'
        command = "-c '{}'".format(command)

    try:
        rc, output = self._scheduler.run(self.job_id, image, entrypoint, command, **options)
    except Exception as exc:
        message = '{} (run): {}'.format(self.job_id, exc)
        log_event(self.app, message, logging.ERROR)
        raise
    return rc, output
def destroy(self, request, **kwargs):
    """Revoke ``self.perm`` on the current app from the user in the URL kwargs.

    Raises ``PermissionDenied`` when that user does not hold the permission.
    Responds with HTTP 204 once it has been removed.
    """
    app = self.get_object()
    username = kwargs['username']
    user = get_object_or_404(User, username=username)
    if not user.has_perm(self.perm, app):
        raise PermissionDenied()
    remove_perm(self.perm, user, app)
    message = "User {} was revoked access to {}".format(user, app)
    models.log_event(app, message)
    return Response(status=status.HTTP_204_NO_CONTENT)
def create(self, request, **kwargs):
    """Grant ``self.perm`` on an app to the user named in the request body.

    Only the app owner or a superuser may do this; anyone else receives
    HTTP 403. Responds with HTTP 201 on success.
    """
    app = get_object_or_404(self.model, id=kwargs['id'])
    requester = request.user
    if requester != app.owner and not requester.is_superuser:
        return Response(status=status.HTTP_403_FORBIDDEN)

    username = request.DATA['username']
    user = get_object_or_404(User, username=username)
    assign_perm(self.perm, user, app)
    message = "User {} was granted access to {}".format(user, app)
    models.log_event(app, message)
    return Response(status=status.HTTP_201_CREATED)
def create(self, request, **kwargs):
    """Grant ``self.perm`` on the current app to the user named in the request.

    Raises ``PermissionDenied`` unless the ``IsOwnerOrAdmin`` object
    permission check passes for the requester. Responds with HTTP 201.
    """
    app = self.get_object()

    owner_or_admin = permissions.IsOwnerOrAdmin
    allowed = owner_or_admin.has_object_permission(owner_or_admin(), request, self, app)
    if not allowed:
        raise PermissionDenied()

    username = request.data["username"]
    user = get_object_or_404(User, username=username)
    assign_perm(self.perm, user, app)
    message = "User {} was granted access to {}".format(user, app)
    models.log_event(app, message)
    return Response(status=status.HTTP_201_CREATED)
def create(self, request, **kwargs):
    """Grant ``self.perm`` on the current app to the user named in the request.

    The requester must pass the ``IsOwnerOrAdmin`` object permission check,
    otherwise ``PermissionDenied`` is raised. Responds with HTTP 201.
    """
    app = self.get_object()

    checker_cls = permissions.IsOwnerOrAdmin
    if not checker_cls.has_object_permission(checker_cls(), request, self, app):
        raise PermissionDenied()

    grantee = get_object_or_404(User, username=request.data['username'])
    assign_perm(self.perm, grantee, app)
    models.log_event(
        app,
        "User {} was granted access to {}".format(grantee, app),
    )
    return Response(status=status.HTTP_201_CREATED)
def _clean_app_logs(self):
    """Delete application logs stored by the logger component

    Sends an HTTP DELETE to the logger at ``settings.LOGGER_HOST`` /
    ``settings.LOGGER_PORT`` for this app's id. This is best-effort: any
    failure (including a timeout) is logged at WARNING level and swallowed.
    """
    try:
        url = "http://{}:{}/logs/{}".format(settings.LOGGER_HOST, settings.LOGGER_PORT, self.id)
        # Without a timeout, requests waits indefinitely; bound the call so an
        # unresponsive logger cannot stall the caller forever.
        requests.delete(url, timeout=10)
    except Exception as e:
        # Ignore errors deleting application logs. An error here should not interfere with
        # the overall success of deleting an application, but we should log it.
        err = "Error deleting existing application logs: {}".format(e)
        log_event(self, err, logging.WARNING)
def create(self, request, **kwargs):
    """Grant ``self.perm`` on an app to the user named in the request body.

    Only the app owner may do this; anyone else receives HTTP 403. After
    the permission is assigned the app is published and the
    ``converge_controller`` task is awaited before the event is logged.
    Responds with HTTP 201 on success.
    """
    app = get_object_or_404(self.model, id=kwargs["id"])
    if request.user != app.owner:
        return Response(status=status.HTTP_403_FORBIDDEN)

    username = request.DATA["username"]
    user = get_object_or_404(User, username=username)
    assign_perm(self.perm, user, app)

    app.publish()
    converge = tasks.converge_controller.apply_async()
    converge.wait()

    message = "User {} was granted access to {}".format(user, app)
    models.log_event(app, message)
    return Response(status=status.HTTP_201_CREATED)
def destroy(self, request, **kwargs):
    """Revoke ``self.perm`` on an app from the user named in the URL kwargs.

    Only the app owner or a superuser may do this (HTTP 403 otherwise).
    Responds with HTTP 404 when the target user does not hold the
    permission, and HTTP 204 once it has been removed.
    """
    app = get_object_or_404(self.model, id=kwargs['id'])
    requester = request.user
    if requester != app.owner and not requester.is_superuser:
        return Response(status=status.HTTP_403_FORBIDDEN)

    user = get_object_or_404(User, username=kwargs['username'])
    if not user.has_perm(self.perm, app):
        return Response(status=status.HTTP_404_NOT_FOUND)

    remove_perm(self.perm, user, app)
    message = "User {} was revoked access to {}".format(user, app)
    models.log_event(app, message)
    return Response(status=status.HTTP_204_NO_CONTENT)
def list_pods(self, *args, **kwargs):
    """Used to list basic information about pods running for a given application

    Recognised keyword arguments:
      release -- release version to list; the latest release is used when
                 missing or None
      type    -- optional process type added to the label selector
      name    -- when given, only that single pod is fetched

    Returns a list of ``Pod`` items (name, state, release, type, started)
    sorted so the latest start date is first; pods labelled with type
    "run" are skipped. Returns an empty list when the scheduler raises
    ``KubeHTTPException``. Any other exception is logged at ERROR level and
    re-raised.
    """
    try:
        labels = {"app": str(self)}

        # always supply a version, either latest or a specific one
        if kwargs.get("release") is None:
            release = self.release_set.latest()
        else:
            release = self.release_set.get(version=kwargs["release"])

        labels["version"] = "v{}".format(release.version)

        if "type" in kwargs:
            labels["type"] = kwargs["type"]

        # in case a singular pod is requested
        if "name" in kwargs:
            pods = [self._scheduler._get_pod(self.id, kwargs["name"]).json()]
        else:
            pods = self._scheduler._get_pods(self.id, labels=labels).json()["items"]

        data = []
        for p in pods:
            pod_labels = p["metadata"]["labels"]
            # specifically ignore run pods
            if pod_labels["type"] == "run":
                continue

            item = Pod()
            item["name"] = p["metadata"]["name"]
            item["state"] = self._scheduler.resolve_state(p).name
            item["release"] = pod_labels["version"]
            item["type"] = pod_labels["type"]
            if "startTime" in p["status"]:
                started = p["status"]["startTime"]
            else:
                # no start time reported; fall back to the current UTC time
                started = str(datetime.utcnow().strftime(settings.DEIS_DATETIME_FORMAT))
            item["started"] = started

            data.append(item)

        # sorting so latest start date is first
        data.sort(key=lambda x: x["started"], reverse=True)

        return data
    except KubeHTTPException:
        # Previously this fell through and implicitly returned None, which
        # breaks callers that take len() of or iterate over the result.
        return []
    except Exception as e:
        err = "(list pods): {}".format(e)
        log_event(self, err, logging.ERROR)
        raise
def destroy(self, request, **kwargs):
    """Revoke ``self.perm`` on an app from the user named in the URL kwargs.

    Only the app owner may do this (HTTP 403 otherwise). Responds with
    HTTP 404 when the target user does not hold the permission. Otherwise
    the permission is removed, the app is published, the
    ``converge_controller`` task is awaited, and HTTP 204 is returned.
    """
    app = get_object_or_404(self.model, id=kwargs["id"])
    if request.user != app.owner:
        return Response(status=status.HTTP_403_FORBIDDEN)

    user = get_object_or_404(User, username=kwargs["username"])
    if not user.has_perm(self.perm, app):
        return Response(status=status.HTTP_404_NOT_FOUND)

    remove_perm(self.perm, user, app)
    app.publish()
    converge = tasks.converge_controller.apply_async()
    converge.wait()

    message = "User {} was revoked access to {}".format(user, app)
    models.log_event(app, message)
    return Response(status=status.HTTP_204_NO_CONTENT)
def destroy(self, request, **kwargs):
    """Revoke ``self.perm`` on an app from the user named in the URL kwargs.

    Raises ``PermissionDenied`` when the target user does not hold the
    ``api.<perm>`` permission, or when the requester is someone other than
    the target user and fails the ``IsOwnerOrAdmin`` object check.
    Responds with HTTP 204 once the permission has been removed.
    """
    app = get_object_or_404(models.App, id=self.kwargs["id"])
    user = get_object_or_404(User, username=kwargs["username"])

    qualified_perm = "api.{}".format(self.perm)
    if not user.has_perm(qualified_perm, app):
        raise PermissionDenied()

    # users may always drop their own access; otherwise owner/admin only
    if user != request.user:
        owner_or_admin = permissions.IsOwnerOrAdmin
        if not owner_or_admin.has_object_permission(owner_or_admin(), request, self, app):
            raise PermissionDenied()

    remove_perm(self.perm, user, app)
    message = "User {} was revoked access to {}".format(user, app)
    models.log_event(app, message)
    return Response(status=status.HTTP_204_NO_CONTENT)
def destroy(self, request, **kwargs):
    """Revoke ``self.perm`` on an app from the user named in the URL kwargs.

    ``PermissionDenied`` is raised if the target user lacks the
    ``api.<perm>`` permission, or if the requester differs from the target
    user and does not pass the ``IsOwnerOrAdmin`` object check. Responds
    with HTTP 204 after the permission has been removed.
    """
    app = get_object_or_404(models.App, id=self.kwargs['id'])
    target = get_object_or_404(User, username=kwargs['username'])

    if not target.has_perm("api.{}".format(self.perm), app):
        raise PermissionDenied()

    acting_on_other = target != request.user
    if acting_on_other:
        checker_cls = permissions.IsOwnerOrAdmin
        permitted = checker_cls.has_object_permission(checker_cls(), request, self, app)
        if not permitted:
            raise PermissionDenied()

    remove_perm(self.perm, target, app)
    models.log_event(
        app,
        "User {} was revoked access to {}".format(target, app),
    )
    return Response(status=status.HTTP_204_NO_CONTENT)
def _restart_containers(self, to_restart): """Restarts containers via the scheduler""" if not to_restart: return stop_threads = [Thread(target=c.stop) for c in to_restart] start_threads = [Thread(target=c.start) for c in to_restart] [t.start() for t in stop_threads] [t.join() for t in stop_threads] if any(c.state != "created" for c in to_restart): err = "warning, some containers failed to stop" log_event(self, err, logging.WARNING) [t.start() for t in start_threads] [t.join() for t in start_threads] if any(c.state != "up" for c in to_restart): err = "warning, some containers failed to start" log_event(self, err, logging.WARNING)
def run(self, user, command):
    """Run a one-off command in an ephemeral app container.

    Raises ``EnvironmentError`` when the latest release has no build.
    Logs who ran what, creates a new "run" container record numbered one
    above the highest existing "run" container, and returns the result of
    running the single-quote-escaped command on it.
    """
    if self.release_set.latest().build is None:
        raise EnvironmentError("No build associated with this release to run this command")

    # TODO: add support for interactive shell
    log_event(self, "{} runs '{}'".format(user.username, command))

    existing_nums = [c.num for c in self.container_set.filter(type="run")]
    next_num = max(existing_nums or [0]) + 1

    # create database record for run process
    container = Container.objects.create(
        owner=self.owner,
        app=self,
        release=self.release_set.latest(),
        type="run",
        num=next_num,
    )

    # SECURITY: shell-escape user input
    escaped = command.replace("'", "'\\''")
    return container.run(escaped)
def _destroy_containers(self, to_destroy): """Destroys containers via the scheduler""" if not to_destroy: return # for mock scheduler if "scale" not in dir(self._scheduler): destroy_threads = [Thread(target=c.destroy) for c in to_destroy] [t.start() for t in destroy_threads] [t.join() for t in destroy_threads] [c.delete() for c in to_destroy if c.state == "destroyed"] if any(c.state != "destroyed" for c in to_destroy): err = "aborting, failed to destroy some containers" log_event(self, err, logging.ERROR) raise RuntimeError(err) else: [c.delete() for c in to_destroy]
def create(self):
    """Create this container's job in the scheduler.

    The job is created from the release image with the release config's
    memory, cpu, tags and env values. Any exception is logged against
    ``self.app`` at ERROR level and re-raised.
    """
    image = self.release.image
    config = self.release.config
    options = {
        'memory': config.memory,
        'cpu': config.cpu,
        'tags': config.tags,
        'envs': config.values,
    }
    try:
        self._scheduler.create(name=self.job_id, image=image, command=self._command, **options)
    except Exception as exc:
        message = '{} (create): {}'.format(self.job_id, exc)
        log_event(self.app, message, logging.ERROR)
        raise
def _start_containers(self, to_add): """Creates and starts containers via the scheduler""" if not to_add: return create_threads = [Thread(target=c.create) for c in to_add] start_threads = [Thread(target=c.start) for c in to_add] [t.start() for t in create_threads] [t.join() for t in create_threads] if any(c.state != "created" for c in to_add): err = "aborting, failed to create some containers" log_event(self, err, logging.ERROR) self._destroy_containers(to_add) raise RuntimeError(err) [t.start() for t in start_threads] [t.join() for t in start_threads] if set([c.state for c in to_add]) != set(["up"]): err = "warning, some containers failed to start" log_event(self, err, logging.WARNING)
def new(self, user, config, build, summary=None, source_version='latest'):
    """
    Create a new application release using the provided Build and Config
    on behalf of a user.

    Releases start at v1 and auto-increment.

    The new release is numbered ``self.version + 1`` and is published
    right after creation; an ``EnvironmentError`` from publishing is logged
    and otherwise ignored. ``source_version`` is accepted but not used in
    this body.
    """
    # create new release and auto-increment version
    next_version = self.version + 1
    release = Release.objects.create(
        owner=user,
        app=self.app,
        config=config,
        build=build,
        version=next_version,
        summary=summary,
    )

    try:
        release.publish()
    except EnvironmentError as exc:
        # If we cannot publish this app, just log and carry on
        log_event(self.app, exc)

    return release
def scale(self, user, structure):  # noqa
    """Scale containers up or down to match requested structure.

    ``structure`` maps a container type to the number of containers
    requested for it. Raises ``EnvironmentError`` when the latest release
    has no build, or when a requested type other than "cmd" is missing
    from the build's procfile. Returns True when at least one type needed
    a different number of containers, False otherwise. The resulting
    structure is saved on the app either way.
    """
    # use create to make sure minimum resources are created
    self.create()

    if self.release_set.latest().build is None:
        raise EnvironmentError("No build associated with this release")

    pending = structure.copy()
    release = self.release_set.latest()

    # test for available process types
    known_types = release.build.procfile or {}
    for proc_type in pending:
        # allow docker cmd types in case we don't have the image source
        if proc_type == "cmd" or proc_type in known_types:
            continue
        raise EnvironmentError("Container type {} does not exist in application".format(proc_type))

    prefix = "{} scaled containers ".format(user.username)
    pairs = ["{}={}".format(k, v) for k, v in list(pending.items())]
    log_event(self, prefix + " ".join(pairs))

    # iterate and scale by container type (web, worker, etc)
    to_add = []
    to_remove = []
    scale_types = {}

    # iterate on a copy of the container_type keys
    for proc_type in list(pending.keys()):
        current = list(self.container_set.filter(type=proc_type).order_by("created"))

        # increment new container nums off the most recent container
        agg = self.container_set.filter(type=proc_type).aggregate(Max("num"))
        next_num = (agg.get("num__max") or 0) + 1

        requested = pending.pop(proc_type)
        diff = requested - len(current)
        if diff == 0:
            continue

        scale_types[proc_type] = requested

        # too many containers: retire the most recently created ones
        while diff < 0:
            to_remove.append(current.pop())
            diff += 1

        # too few containers: create a database record for each new one
        while diff > 0:
            record = Container.objects.create(
                owner=self.owner,
                app=self,
                release=release,
                type=proc_type,
                num=next_num,
            )
            to_add.append(record)
            next_num += 1
            diff -= 1

    # a type lands in scale_types exactly when its count had to change
    changed = bool(scale_types)

    if changed and "scale" in dir(self._scheduler):
        self._scale_containers(scale_types, to_remove)
    elif changed:
        if to_add:
            self._start_containers(to_add)
        if to_remove:
            self._destroy_containers(to_remove)

    # save new structure to the database
    counts = self.container_set.exclude(type="run").values("type").annotate(Count("pk")).order_by()
    new_structure = structure.copy()
    new_structure.update({row["type"]: row["pk__count"] for row in counts})
    self.structure = new_structure
    self.save()

    return changed
def restart(self, **kwargs):  # noqa
    """
    Restart found pods by deleting them (RC will recreate). Wait until they
    are all drained away and RC has gotten to a good state

    Keyword arguments are forwarded to ``list_pods``. A short pod name
    (exactly one "-") is expanded to ``<id>-v<version>-<name>`` using
    ``kwargs["id"]`` and the requested or latest release.

    Returns the list of pods found after the restart, or an empty list
    when no pods were found to restart. Failures while deleting pods or
    while waiting for them to come back are logged at WARNING level and
    not raised.
    """
    try:
        # Resolve single pod name if short form (worker-asdfg) is passed
        if "name" in kwargs and kwargs["name"].count("-") == 1:
            if kwargs.get("release") is None:
                release = self.release_set.latest()
            else:
                release = self.release_set.get(version=kwargs["release"])

            version = "v{}".format(release.version)
            kwargs["name"] = "{}-{}-{}".format(kwargs["id"], version, kwargs["name"])

        # Fetch the initial set of pods to work from. list_pods may hand
        # back None instead of a list, so normalise that to "nothing found".
        pods = self.list_pods(**kwargs) or []
    except KubeException:
        # Nothing was found
        return []

    if not pods:
        # Nothing to delete and nothing to wait for
        return []

    desired = len(pods)

    try:
        for pod in pods:
            # This function verifies the delete. Gives pod 30 seconds
            self._scheduler._delete_pod(self.id, pod["name"])
    except Exception as e:
        err = "warning, some pods failed to stop:\n{}".format(str(e))
        log_event(self, err, logging.WARNING)

    # Wait for pods to start
    try:
        timeout = 300  # 5 minutes
        elapsed = 0
        while True:
            # timed out
            if elapsed >= timeout:
                raise RuntimeError("timeout - 5 minutes have passed and pods are not up")

            # restarting a single pod behaves differently, fetch the *newest* pod
            # and hope it is the right one. Comes back sorted
            if "name" in kwargs:
                del kwargs["name"]
                newest = self.list_pods(**kwargs) or []
                # Add in the latest name
                kwargs["name"] = newest[0]["name"]

            current = self.list_pods(**kwargs) or []
            actual = sum(1 for pod in current if pod["state"] == "up")

            if desired == actual:
                break

            elapsed += 5
            time.sleep(5)
    except Exception as e:
        err = "warning, some pods failed to start:\n{}".format(str(e))
        log_event(self, err, logging.WARNING)

    # Return the new pods
    return self.list_pods(**kwargs) or []