Example #1
    def add_route(self, routespec, target, data):
        # Create a route with the name being escaped routespec
        # Use full routespec in label
        # 'data' is JSON encoded and put in an annotation - we don't need to query for it
        safe_name = self.safe_name_for_routespec(routespec).lower()
        endpoint, service, ingress = make_ingress(safe_name, routespec, target,
                                                  data)

        @gen.coroutine
        def ensure_object(create_func, patch_func, body, kind):
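            '''Create the object with create_func; on a 409 Conflict,
            patch the existing object instead (a create-or-patch "upsert").
            '''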
            try:
                resp = yield self.asynchronize(create_func,
                                               namespace=self.namespace,
                                               body=body)
                self.log.info('Created %s/%s', kind, safe_name)
            except client.rest.ApiException as e:
                if e.status == 409:
                    # This object already exists, we should patch it to make it be what we want
                    self.log.warn("Trying to patch %s/%s, it already exists",
                                  kind, safe_name)
                    resp = yield self.asynchronize(patch_func,
                                                   namespace=self.namespace,
                                                   body=body,
                                                   name=body.metadata.name)
                else:
                    raise

        yield ensure_object(self.core_api.create_namespaced_endpoints,
                            self.core_api.patch_namespaced_endpoints,
                            body=endpoint,
                            kind='endpoints')

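        # Wait until the reflector's local cache has seen the new object;
        # exponential_backoff polls the lambda until it returns True.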
        yield exponential_backoff(
            lambda: safe_name in self.endpoint_reflector.endpoints,
            'Could not find endpoints/%s after creating it' % safe_name)

        yield ensure_object(self.core_api.create_namespaced_service,
                            self.core_api.patch_namespaced_service,
                            body=service,
                            kind='service')

        yield exponential_backoff(
            lambda: safe_name in self.service_reflector.services,
            'Could not find service/%s after creating it' % safe_name)

        yield ensure_object(self.extension_api.create_namespaced_ingress,
                            self.extension_api.patch_namespaced_ingress,
                            body=ingress,
                            kind='ingress')

        yield exponential_backoff(
            lambda: safe_name in self.ingress_reflector.ingresses,
            'Could not find ingress/%s after creating it' % safe_name)
Example #2
    def _start(self):
        '''Start the user's pod.'''
        retry_times = 4  # Ad-hoc
        pod = yield self.get_pod_manifest()
        if self.modify_pod_hook:
            pod = yield gen.maybe_future(self.modify_pod_hook(self, pod))
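        # A 409 Conflict means a pod with this name already exists;
        # delete it and retry, up to retry_times attempts.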
        for i in range(retry_times):
            try:
                yield self.asynchronize(
                    self.api.create_namespaced_pod,
                    self.namespace,
                    pod,
                )
                break
            except ApiException as e:
                if e.status != 409:
                    # We only want to handle 409 conflict errors
                    self.log.exception("Failed for %s", pod.to_str())
                    raise
                self.log.info('Found existing pod %s, attempting to kill',
                              self.pod_name)
                # TODO: this should show up in events
                yield self.stop(now=True)

                self.log.info('Killed pod %s, will try starting '
                              'singleuser pod again', self.pod_name)
        else:
            raise Exception('Cannot create user pod %s: already exists '
                            'and could not be deleted' % self.pod_name)

        # we need a timeout here even though start itself has a timeout
        # in order for this coroutine to finish at some point.
        # using the same start_timeout here
        # essentially ensures that this timeout should never propagate up
        # because the handler will have stopped waiting after
        # start_timeout, starting from a slightly earlier point.
        try:
            yield exponential_backoff(
                lambda: self.is_pod_running(
                    self.pod_reflector.pods.get(
                        (self.namespace, self.pod_name), None)),
                'pod/%s did not start in %s seconds!' %
                (self.pod_name, self.start_timeout),
                timeout=self.start_timeout,
            )
        except TimeoutError:
            if (self.namespace, self.pod_name) not in self.pod_reflector.pods:
                # if pod never showed up at all,
                # restart the pod reflector which may have
                # become disconnected.
                self.log.error(
                    "Pod %s never showed up in reflector; "
                    "restarting pod reflector.", self.pod_name)
                self._start_watching_pods(replace=True)
            raise

        pod = self.pod_reflector.pods[(self.namespace, self.pod_name)]
        self.pod_id = pod.metadata.uid
        return (pod.status.pod_ip, self.port)
Example #3
    def start(self):
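        '''Start the user's pod, creating its PVC first when
        user_storage_pvc_ensure is set.
        '''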
        if self.user_storage_pvc_ensure:
            pvc = self.get_pvc_manifest()
            try:
                yield self.asynchronize(
                    self.api.create_namespaced_persistent_volume_claim,
                    namespace=self.namespace,
                    body=pvc)
            except ApiException as e:
                if e.status == 409:
                    self.log.info(
                        "PVC %s already exists, so did not create new pvc.",
                        self.pvc_name)
                else:
                    raise

        # If we run into a 409 Conflict error, it means a pod with the
        # same name already exists. We stop it, wait for it to stop, and
        # try again. We try 4 times, and if it still fails we give up.
        # FIXME: Have better / cleaner retry logic!
        retry_times = 4
        pod = yield self.get_pod_manifest()
        if self.modify_pod_hook:
            pod = yield gen.maybe_future(self.modify_pod_hook(self, pod))
        for i in range(retry_times):
            try:
                yield self.asynchronize(self.api.create_namespaced_pod,
                                        self.namespace, pod)
                break
            except ApiException as e:
                if e.status != 409:
                    # We only want to handle 409 conflict errors
                    self.log.exception("Failed for %s", pod.to_str())
                    raise
                self.log.info('Found existing pod %s, attempting to kill',
                              self.pod_name)
                yield self.stop(True)

                self.log.info(
                    'Killed pod %s, will try starting singleuser pod again',
                    self.pod_name)
        else:
            raise Exception(
                'Cannot create user pod %s: already exists and could not '
                'be deleted' % self.pod_name)

        # Note: The self.start_timeout here is kinda superfluous, since
        # there is already a timeout on how long start can run for in
        # jupyterhub itself.
        yield exponential_backoff(
            lambda: self.is_pod_running(
                self.pod_reflector.pods.get(self.pod_name, None)),
            'pod/%s did not start in %s seconds!' %
            (self.pod_name, self.start_timeout),
            timeout=self.start_timeout,
        )

        pod = self.pod_reflector.pods[self.pod_name]
        return (pod.status.pod_ip, self.port)
Example #4
    def stop(self, now=False):
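        '''Delete the user's pod and wait for the reflector to see it
        disappear; now=True skips the grace period.
        '''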
        delete_options = client.V1DeleteOptions()

        if now:
            grace_seconds = 0
        else:
            # Give it some time, but not the default (which is 30s!)
            # FIXME: Move this into pod creation maybe?
            grace_seconds = 1

        delete_options.grace_period_seconds = grace_seconds
        self.log.info("Deleting pod %s", self.pod_name)
        try:
            yield self.asynchronize(
                self.api.delete_namespaced_pod,
                name=self.pod_name,
                namespace=self.namespace,
                body=delete_options,
                grace_period_seconds=grace_seconds,
            )
        except ApiException as e:
            if e.status == 404:
                self.log.warning(
                    "No pod %s to delete. Assuming already deleted.",
                    self.pod_name,
                )
            else:
                raise
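        # Deletion is asynchronous on the Kubernetes side; poll the
        # reflector until the pod is gone from its cache.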
        try:
            yield exponential_backoff(
                lambda: self.pod_reflector.pods.get(
                    (self.namespace, self.pod_name), None) is None,
                'pod/%s did not disappear in %s seconds!' %
                (self.pod_name, self.start_timeout),
                timeout=self.start_timeout,
            )
        except TimeoutError:
            self.log.error("Pod %s did not disappear, " % self.pod_name +
                           "restarting pod reflector")
            self._start_watching_pods(replace=True)
            raise
        if self.delete_namespace_on_stop:
            # Yield these so cleanup finishes (and errors surface) before
            # stop() returns, rather than firing and forgetting.
            yield self.asynchronize(self._maybe_delete_namespace)
            if self.delete_namespaced_pvs_on_stop:
                yield self.asynchronize(self._destroy_namespaced_pvs)
Example #5
    def start(self):
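        '''Start the user's pod, ensuring its PVC exists and watching
        the pod's events during launch.
        '''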
        if self.user_storage_pvc_ensure:
            pvc = self.get_pvc_manifest()
            try:
                yield self.asynchronize(
                    self.api.create_namespaced_persistent_volume_claim,
                    namespace=self.namespace,
                    body=pvc)
            except ApiException as e:
                if e.status == 409:
                    self.log.info(
                        "PVC %s already exists, so did not create new pvc.",
                        self.pvc_name)
                else:
                    raise

        main_loop = IOLoop.current()

        def on_reflector_failure():
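            '''Halt the Hub's IOLoop if the events reflector dies.'''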
            self.log.critical("Events reflector failed, halting Hub.")
            main_loop.stop()

        # events are selected based on pod name, which will include previous launch/stop
        self.events = EventReflector(parent=self,
                                     namespace=self.namespace,
                                     fields={
                                         'involvedObject.kind': 'Pod',
                                         'involvedObject.name': self.pod_name
                                     },
                                     on_failure=on_reflector_failure)
        # If we run into a 409 Conflict error, it means a pod with the
        # same name already exists. We stop it, wait for it to stop, and
        # try again. We try 4 times, and if it still fails we give up.
        # FIXME: Have better / cleaner retry logic!
        retry_times = 4
        pod = yield self.get_pod_manifest()
        if self.modify_pod_hook:
            pod = yield gen.maybe_future(self.modify_pod_hook(self, pod))
        for i in range(retry_times):
            try:
                yield self.asynchronize(self.api.create_namespaced_pod,
                                        self.namespace, pod)
                break
            except ApiException as e:
                if e.status != 409:
                    # We only want to handle 409 conflict errors
                    self.log.exception("Failed for %s", pod.to_str())
                    raise
                self.log.info('Found existing pod %s, attempting to kill',
                              self.pod_name)
                yield self.stop(True)

                self.log.info(
                    'Killed pod %s, will try starting singleuser pod again',
                    self.pod_name)
        else:
            raise Exception(
                'Cannot create user pod %s: already exists and could not '
                'be deleted' % self.pod_name)

        # Note: The self.start_timeout here is kinda superfluous, since
        # there is already a timeout on how long start can run for in
        # jupyterhub itself.
        yield exponential_backoff(
            lambda: self.is_pod_running(
                self.pod_reflector.pods.get(self.pod_name, None)),
            'pod/%s did not start in %s seconds!' %
            (self.pod_name, self.start_timeout),
            timeout=self.start_timeout,
        )

        pod = self.pod_reflector.pods[self.pod_name]
        self.log.debug('pod %s events before launch: %s', self.pod_name,
                       self.events.events)
        # Note: we stop the event watcher once launch is successful, but the reflector
        # will only stop when the next event comes in, likely when it is stopped.
        self.events.stop()
        return (pod.status.pod_ip, self.port)
Example #6
    def add_route(self, routespec, target, data):
        # Create a route with the name being escaped routespec
        # Use full routespec in label
        # 'data' is JSON encoded and put in an annotation - we don't need to query for it
        safe_name = self.safe_name_for_routespec(routespec).lower()
        endpoint, service, ingress = make_ingress(
            safe_name,
            routespec,
            target,
            data
        )

        @gen.coroutine
        def ensure_object(create_func, patch_func, body, kind):
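            '''Create the object with create_func; on a 409 Conflict,
            patch the existing object instead (a create-or-patch "upsert").
            '''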
            try:
                resp = yield self.asynchronize(
                    create_func,
                    namespace=self.namespace,
                    body=body
                )
                self.log.info('Created %s/%s', kind, safe_name)
            except client.rest.ApiException as e:
                if e.status == 409:
                    # This object already exists, we should patch it to make it be what we want
                    self.log.warning(
                        "Trying to patch %s/%s, it already exists",
                        kind, safe_name)
                    resp = yield self.asynchronize(
                        patch_func,
                        namespace=self.namespace,
                        body=body,
                        name=body.metadata.name
                    )
                else:
                    raise

        if endpoint is not None:
            yield ensure_object(
                self.core_api.create_namespaced_endpoints,
                self.core_api.patch_namespaced_endpoints,
                body=endpoint,
                kind='endpoints'
            )

            yield exponential_backoff(
                lambda: safe_name in self.endpoint_reflector.endpoints,
                'Could not find endpoints/%s after creating it' % safe_name
            )
        else:
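            # make_ingress returned no endpoints object for this target
            # (presumably it is not addressed by a plain pod IP), so remove
            # any stale endpoints left over from a previous route.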
            delete_endpoint = self.asynchronize(
                self.core_api.delete_namespaced_endpoints,
                name=safe_name,
                namespace=self.namespace,
                body=client.V1DeleteOptions(grace_period_seconds=0),
            )
            yield self.delete_if_exists('endpoint', safe_name, delete_endpoint)

        yield ensure_object(
            self.core_api.create_namespaced_service,
            self.core_api.patch_namespaced_service,
            body=service,
            kind='service'
        )

        yield exponential_backoff(
            lambda: safe_name in self.service_reflector.services,
            'Could not find service/%s after creating it' % safe_name
        )

        yield ensure_object(
            self.extension_api.create_namespaced_ingress,
            self.extension_api.patch_namespaced_ingress,
            body=ingress,
            kind='ingress'
        )

        yield exponential_backoff(
            lambda: safe_name in self.ingress_reflector.ingresses,
            'Could not find ingress/%s after creating it' % safe_name
        )
Example #7
    def _start(self):
        """Start the user's pod"""
        # Ensure namespace and necessary resources exist
        self._ensure_namespace()
        # record latest event so we don't include old
        # events from previous pods in self.events
        # track by order and name instead of uid
        # so we get events like deletion of a previously stale
        # pod if it's part of this spawn process
        events = self.events
        if events:
            self._last_event = events[-1].metadata.uid

        if self.storage_pvc_ensure:
            # Try and create the pvc. If it succeeds we are good. If
            # returns a 409 indicating it already exists we are good. If
            # it returns a 403, indicating potential quota issue we need
            # to see if pvc already exists before we decide to raise the
            # error for quota being exceeded. This is because quota is
            # checked before determining if the PVC needed to be
            # created.

            pvc = self.get_pvc_manifest()

            try:
                yield self.asynchronize(
                    self.api.create_namespaced_persistent_volume_claim,
                    namespace=self.namespace,
                    body=pvc)
            except ApiException as e:
                if e.status == 409:
                    self.log.info("PVC " + self.pvc_name +
                                  " already exists, so did not create" +
                                  " new pvc.")

                elif e.status == 403:
                    t, v, tb = sys.exc_info()

                    try:
                        yield self.asynchronize(
                            self.api.read_namespaced_persistent_volume_claim,
                            name=self.pvc_name,
                            namespace=self.namespace)
                    except ApiException:
                        raise v.with_traceback(tb)

                    self.log.info("PVC " + self.pvc_name + " already exists," +
                                  " possibly have reached quota though.")

                else:
                    raise
        # If we run into a 409 Conflict error, it means a pod with the
        # same name already exists. We stop it, wait for it to stop, and
        # try again. We try 4 times, and if it still fails we give up.
        # FIXME: Have better / cleaner retry logic!
        retry_times = 4
        pod = yield self.get_pod_manifest()
        if self.modify_pod_hook:
            pod = yield gen.maybe_future(self.modify_pod_hook(self, pod))
        for i in range(retry_times):
            try:
                yield self.asynchronize(
                    self.api.create_namespaced_pod,
                    self.namespace,
                    pod,
                )
                break
            except ApiException as e:
                if e.status != 409:
                    # We only want to handle 409 conflict errors
                    self.log.exception("Failed for %s", pod.to_str())
                    raise
                self.log.info('Found existing pod %s, attempting to kill',
                              self.pod_name)
                # TODO: this should show up in events
                yield self.stop(True)

                self.log.info('Killed pod %s, will try starting '
                              'singleuser pod again', self.pod_name)
        else:
            raise Exception('Cannot create user pod %s: already exists '
                            'and could not be deleted' % self.pod_name)

        # we need a timeout here even though start itself has a timeout
        # in order for this coroutine to finish at some point.
        # using the same start_timeout here
        # essentially ensures that this timeout should never propagate up
        # because the handler will have stopped waiting after
        # start_timeout, starting from a slightly earlier point.
        try:
            yield exponential_backoff(
                lambda: self.is_pod_running(
                    self.pod_reflector.pods.get(
                        (self.namespace, self.pod_name), None)),
                'pod/%s did not start in %s seconds!' %
                (self.pod_name, self.start_timeout),
                timeout=self.start_timeout,
            )
        except TimeoutError:
            if (self.namespace, self.pod_name) not in self.pod_reflector.pods:
                # if pod never showed up at all,
                # restart the pod reflector which may have become disconnected.
                self.log.error("Pod %s never showed up in reflector; "
                               "restarting pod reflector.", self.pod_name)
                self._start_watching_pods(replace=True)
            raise

        pod = self.pod_reflector.pods[(self.namespace, self.pod_name)]
        self.pod_id = pod.metadata.uid
        if self.event_reflector:
            self.log.debug(
                'pod %s events before launch: %s',
                self.pod_name,
                "\n".join([
                    "%s [%s] %s" %
                    (event.last_timestamp, event.type, event.message)
                    for event in self.events
                ]),
            )
        return (pod.status.pod_ip, self.port)