def add_route(self, routespec, target, data):
    # Create a route with the name being escaped routespec
    # Use full routespec in label
    # 'data' is JSON encoded and put in an annotation - we don't need to query for it
    safe_name = self.safe_name_for_routespec(routespec).lower()

    endpoint, service, ingress = make_ingress(safe_name, routespec, target, data)

    @gen.coroutine
    def ensure_object(create_func, patch_func, body, kind):
        try:
            resp = yield self.asynchronize(
                create_func,
                namespace=self.namespace,
                body=body
            )
            self.log.info('Created %s/%s', kind, safe_name)
        except client.rest.ApiException as e:
            if e.status == 409:
                # This object already exists, we should patch it to make it be what we want
                self.log.warn("Trying to patch %s/%s, it already exists", kind, safe_name)
                resp = yield self.asynchronize(
                    patch_func,
                    namespace=self.namespace,
                    body=body,
                    name=body.metadata.name
                )
            else:
                raise

    yield ensure_object(
        self.core_api.create_namespaced_endpoints,
        self.core_api.patch_namespaced_endpoints,
        body=endpoint,
        kind='endpoints'
    )

    yield exponential_backoff(
        lambda: safe_name in self.endpoint_reflector.endpoints,
        'Could not find endpoints/%s after creating it' % safe_name
    )

    yield ensure_object(
        self.core_api.create_namespaced_service,
        self.core_api.patch_namespaced_service,
        body=service,
        kind='service'
    )

    yield exponential_backoff(
        lambda: safe_name in self.service_reflector.services,
        'Could not find service/%s after creating it' % safe_name
    )

    yield ensure_object(
        self.extension_api.create_namespaced_ingress,
        self.extension_api.patch_namespaced_ingress,
        body=ingress,
        kind='ingress'
    )

    yield exponential_backoff(
        lambda: safe_name in self.ingress_reflector.ingresses,
        'Could not find ingress/%s after creating it' % safe_name
    )
def _start(self):
    '''Start the user's pod.
    '''
    retry_times = 4  # Ad-hoc
    pod = yield self.get_pod_manifest()
    if self.modify_pod_hook:
        pod = yield gen.maybe_future(self.modify_pod_hook(self, pod))
    for i in range(retry_times):
        try:
            yield self.asynchronize(
                self.api.create_namespaced_pod,
                self.namespace,
                pod,
            )
            break
        except ApiException as e:
            if e.status != 409:
                # We only want to handle 409 conflict errors
                self.log.exception("Failed for %s", pod.to_str())
                raise
            self.log.info('Found existing pod %s, attempting to kill', self.pod_name)
            # TODO: this should show up in events
            yield self.stop(now=True)
            self.log.info('Killed pod %s, will try starting singleuser pod again',
                          self.pod_name)
    else:
        raise Exception(
            'Cannot create user pod %s: already exists and could not be deleted' % self.pod_name)

    # we need a timeout here even though start itself has a timeout
    # in order for this coroutine to finish at some point.
    # using the same start_timeout here
    # essentially ensures that this timeout should never propagate up
    # because the handler will have stopped waiting after
    # start_timeout, starting from a slightly earlier point.
    try:
        yield exponential_backoff(
            lambda: self.is_pod_running(
                self.pod_reflector.pods.get((self.namespace, self.pod_name), None)),
            'pod/%s did not start in %s seconds!' % (self.pod_name, self.start_timeout),
            timeout=self.start_timeout,
        )
    except TimeoutError:
        if self.pod_name not in self.pod_reflector.pods:
            # if pod never showed up at all,
            # restart the pod reflector which may have
            # become disconnected.
            self.log.error(
                "Pod %s never showed up in reflector; restarting pod reflector.",
                self.pod_name)
            self._start_watching_pods(replace=True)
        raise

    pod = self.pod_reflector.pods[(self.namespace, self.pod_name)]
    self.pod_id = pod.metadata.uid
    return (pod.status.pod_ip, self.port)
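# A minimal, hypothetical sketch of the polling-with-backoff pattern that the
# `exponential_backoff` helper used above provides (presumably imported from
# jupyterhub.utils elsewhere in this module): re-check a condition with growing
# delays until it passes or a timeout expires. `poll_with_backoff` and its
# parameters are illustrative assumptions, not the real helper's API.
from tornado import gen, ioloop


@gen.coroutine
def poll_with_backoff(pass_func, fail_message, timeout=10, start_wait=0.2, max_wait=5):
    deadline = ioloop.IOLoop.current().time() + timeout
    wait = start_wait
    while ioloop.IOLoop.current().time() < deadline:
        if pass_func():
            # Condition satisfied (e.g. the pod showed up in the reflector).
            return
        yield gen.sleep(wait)
        wait = min(wait * 2, max_wait)
    # Condition never passed within the timeout; callers above catch TimeoutError.
    raise TimeoutError(fail_message)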
def start(self):
    if self.user_storage_pvc_ensure:
        pvc = self.get_pvc_manifest()
        try:
            yield self.asynchronize(
                self.api.create_namespaced_persistent_volume_claim,
                namespace=self.namespace,
                body=pvc)
        except ApiException as e:
            if e.status == 409:
                self.log.info(
                    "PVC " + self.pvc_name + " already exists, so did not create new pvc.")
            else:
                raise

    # If we run into a 409 Conflict error, it means a pod with the
    # same name already exists. We stop it, wait for it to stop, and
    # try again. We try 4 times, and if it still fails we give up.
    # FIXME: Have better / cleaner retry logic!
    retry_times = 4
    pod = yield self.get_pod_manifest()
    if self.modify_pod_hook:
        pod = yield gen.maybe_future(self.modify_pod_hook(self, pod))
    for i in range(retry_times):
        try:
            yield self.asynchronize(self.api.create_namespaced_pod, self.namespace, pod)
            break
        except ApiException as e:
            if e.status != 409:
                # We only want to handle 409 conflict errors
                self.log.exception("Failed for %s", pod.to_str())
                raise
            self.log.info('Found existing pod %s, attempting to kill', self.pod_name)
            yield self.stop(True)
            self.log.info(
                'Killed pod %s, will try starting singleuser pod again',
                self.pod_name)
    else:
        raise Exception(
            'Cannot create user pod %s: already exists and could not be deleted' % self.pod_name)

    # Note: The self.start_timeout here is somewhat superfluous, since
    # there is already a timeout on how long start can run for in
    # jupyterhub itself.
    yield exponential_backoff(
        lambda: self.is_pod_running(self.pod_reflector.pods.get(self.pod_name, None)),
        'pod/%s did not start in %s seconds!' % (self.pod_name, self.start_timeout),
        timeout=self.start_timeout)

    pod = self.pod_reflector.pods[self.pod_name]
    return (pod.status.pod_ip, self.port)
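# A minimal, self-contained sketch of the "create, tolerate 409 Conflict"
# pattern used above for idempotent PVC creation against the Kubernetes API.
# The helper name `ensure_pvc` and its arguments are illustrative assumptions,
# not part of the spawner; the kubernetes client call itself is the real API.
from kubernetes.client.rest import ApiException


def ensure_pvc(api, namespace, pvc_manifest, log):
    """Create a PVC, treating an already-existing PVC (409 Conflict) as success."""
    try:
        api.create_namespaced_persistent_volume_claim(
            namespace=namespace, body=pvc_manifest)
    except ApiException as e:
        if e.status == 409:
            # Already exists; creation is idempotent from our point of view.
            log.info("PVC %s already exists", pvc_manifest.metadata.name)
        else:
            raise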
def stop(self, now=False):
    delete_options = client.V1DeleteOptions()

    if now:
        grace_seconds = 0
    else:
        # Give it some time, but not the default (which is 30s!)
        # FIXME: Move this into pod creation maybe?
        grace_seconds = 1

    delete_options.grace_period_seconds = grace_seconds
    self.log.info("Deleting pod %s", self.pod_name)
    try:
        yield self.asynchronize(
            self.api.delete_namespaced_pod,
            name=self.pod_name,
            namespace=self.namespace,
            body=delete_options,
            grace_period_seconds=grace_seconds,
        )
    except ApiException as e:
        if e.status == 404:
            self.log.warning(
                "No pod %s to delete. Assuming already deleted.",
                self.pod_name,
            )
        else:
            raise
    try:
        yield exponential_backoff(
            lambda: self.pod_reflector.pods.get((self.namespace, self.pod_name), None) is None,
            'pod/%s did not disappear in %s seconds!' % (self.pod_name, self.start_timeout),
            timeout=self.start_timeout,
        )
    except TimeoutError:
        self.log.error("Pod %s did not disappear, restarting pod reflector",
                       self.pod_name)
        self._start_watching_pods(replace=True)
        raise
    if self.delete_namespace_on_stop:
        self.asynchronize(self._maybe_delete_namespace)
    if self.delete_namespaced_pvs_on_stop:
        self.asynchronize(self._destroy_namespaced_pvs)
def start(self):
    if self.user_storage_pvc_ensure:
        pvc = self.get_pvc_manifest()
        try:
            yield self.asynchronize(
                self.api.create_namespaced_persistent_volume_claim,
                namespace=self.namespace,
                body=pvc)
        except ApiException as e:
            if e.status == 409:
                self.log.info(
                    "PVC " + self.pvc_name + " already exists, so did not create new pvc.")
            else:
                raise

    main_loop = IOLoop.current()

    def on_reflector_failure():
        self.log.critical("Events reflector failed, halting Hub.")
        main_loop.stop()

    # events are selected based on pod name, which will include previous launch/stop
    self.events = EventReflector(
        parent=self,
        namespace=self.namespace,
        fields={
            'involvedObject.kind': 'Pod',
            'involvedObject.name': self.pod_name
        },
        on_failure=on_reflector_failure)

    # If we run into a 409 Conflict error, it means a pod with the
    # same name already exists. We stop it, wait for it to stop, and
    # try again. We try 4 times, and if it still fails we give up.
    # FIXME: Have better / cleaner retry logic!
    retry_times = 4
    pod = yield self.get_pod_manifest()
    if self.modify_pod_hook:
        pod = yield gen.maybe_future(self.modify_pod_hook(self, pod))
    for i in range(retry_times):
        try:
            yield self.asynchronize(self.api.create_namespaced_pod, self.namespace, pod)
            break
        except ApiException as e:
            if e.status != 409:
                # We only want to handle 409 conflict errors
                self.log.exception("Failed for %s", pod.to_str())
                raise
            self.log.info('Found existing pod %s, attempting to kill', self.pod_name)
            yield self.stop(True)
            self.log.info(
                'Killed pod %s, will try starting singleuser pod again',
                self.pod_name)
    else:
        raise Exception(
            'Cannot create user pod %s: already exists and could not be deleted' % self.pod_name)

    # Note: The self.start_timeout here is somewhat superfluous, since
    # there is already a timeout on how long start can run for in
    # jupyterhub itself.
    yield exponential_backoff(
        lambda: self.is_pod_running(self.pod_reflector.pods.get(self.pod_name, None)),
        'pod/%s did not start in %s seconds!' % (self.pod_name, self.start_timeout),
        timeout=self.start_timeout)

    pod = self.pod_reflector.pods[self.pod_name]
    self.log.debug('pod %s events before launch: %s', self.pod_name, self.events.events)
    # Note: we stop the event watcher once launch is successful, but the reflector
    # will only stop when the next event comes in, likely when it is stopped.
    self.events.stop()
    return (pod.status.pod_ip, self.port)
def add_route(self, routespec, target, data):
    # Create a route with the name being escaped routespec
    # Use full routespec in label
    # 'data' is JSON encoded and put in an annotation - we don't need to query for it
    safe_name = self.safe_name_for_routespec(routespec).lower()

    endpoint, service, ingress = make_ingress(
        safe_name,
        routespec,
        target,
        data
    )

    @gen.coroutine
    def ensure_object(create_func, patch_func, body, kind):
        try:
            resp = yield self.asynchronize(
                create_func,
                namespace=self.namespace,
                body=body
            )
            self.log.info('Created %s/%s', kind, safe_name)
        except client.rest.ApiException as e:
            if e.status == 409:
                # This object already exists, we should patch it to make it be what we want
                self.log.warn("Trying to patch %s/%s, it already exists", kind, safe_name)
                resp = yield self.asynchronize(
                    patch_func,
                    namespace=self.namespace,
                    body=body,
                    name=body.metadata.name
                )
            else:
                raise

    if endpoint is not None:
        yield ensure_object(
            self.core_api.create_namespaced_endpoints,
            self.core_api.patch_namespaced_endpoints,
            body=endpoint,
            kind='endpoints'
        )

        yield exponential_backoff(
            lambda: safe_name in self.endpoint_reflector.endpoints,
            'Could not find endpoints/%s after creating it' % safe_name
        )
    else:
        delete_endpoint = self.asynchronize(
            self.core_api.delete_namespaced_endpoints,
            name=safe_name,
            namespace=self.namespace,
            body=client.V1DeleteOptions(grace_period_seconds=0),
        )
        yield self.delete_if_exists('endpoint', safe_name, delete_endpoint)

    yield ensure_object(
        self.core_api.create_namespaced_service,
        self.core_api.patch_namespaced_service,
        body=service,
        kind='service'
    )

    yield exponential_backoff(
        lambda: safe_name in self.service_reflector.services,
        'Could not find service/%s after creating it' % safe_name
    )

    yield ensure_object(
        self.extension_api.create_namespaced_ingress,
        self.extension_api.patch_namespaced_ingress,
        body=ingress,
        kind='ingress'
    )

    yield exponential_backoff(
        lambda: safe_name in self.ingress_reflector.ingresses,
        'Could not find ingress/%s after creating it' % safe_name
    )
def _start(self):
    """Start the user's pod"""

    # Ensure namespace and necessary resources exist
    self._ensure_namespace()

    # record latest event so we don't include old
    # events from previous pods in self.events
    # track by order and name instead of uid
    # so we get events like deletion of a previously stale
    # pod if it's part of this spawn process
    events = self.events
    if events:
        self._last_event = events[-1].metadata.uid

    if self.storage_pvc_ensure:
        # Try and create the pvc. If it succeeds we are good. If
        # returns a 409 indicating it already exists we are good. If
        # it returns a 403, indicating potential quota issue we need
        # to see if pvc already exists before we decide to raise the
        # error for quota being exceeded. This is because quota is
        # checked before determining if the PVC needed to be
        # created.
        pvc = self.get_pvc_manifest()
        try:
            yield self.asynchronize(
                self.api.create_namespaced_persistent_volume_claim,
                namespace=self.namespace,
                body=pvc)
        except ApiException as e:
            if e.status == 409:
                self.log.info("PVC " + self.pvc_name +
                              " already exists, so did not create new pvc.")
            elif e.status == 403:
                t, v, tb = sys.exc_info()
                try:
                    yield self.asynchronize(
                        self.api.read_namespaced_persistent_volume_claim,
                        name=self.pvc_name,
                        namespace=self.namespace)
                except ApiException:
                    raise v.with_traceback(tb)
                self.log.info("PVC " + self.pvc_name +
                              " already exists, possibly have reached quota though.")
            else:
                raise

    # If we run into a 409 Conflict error, it means a pod with the
    # same name already exists. We stop it, wait for it to stop, and
    # try again. We try 4 times, and if it still fails we give up.
    # FIXME: Have better / cleaner retry logic!
    retry_times = 4
    pod = yield self.get_pod_manifest()
    if self.modify_pod_hook:
        pod = yield gen.maybe_future(self.modify_pod_hook(self, pod))
    for i in range(retry_times):
        try:
            yield self.asynchronize(
                self.api.create_namespaced_pod,
                self.namespace,
                pod,
            )
            break
        except ApiException as e:
            if e.status != 409:
                # We only want to handle 409 conflict errors
                self.log.exception("Failed for %s", pod.to_str())
                raise
            self.log.info('Found existing pod %s, attempting to kill', self.pod_name)
            # TODO: this should show up in events
            yield self.stop(True)
            self.log.info('Killed pod %s, will try starting singleuser pod again',
                          self.pod_name)
    else:
        raise Exception(
            'Cannot create user pod %s: already exists and could not be deleted' % self.pod_name)

    # we need a timeout here even though start itself has a timeout
    # in order for this coroutine to finish at some point.
    # using the same start_timeout here
    # essentially ensures that this timeout should never propagate up
    # because the handler will have stopped waiting after
    # start_timeout, starting from a slightly earlier point.
    try:
        yield exponential_backoff(
            lambda: self.is_pod_running(
                self.pod_reflector.pods.get((self.namespace, self.pod_name), None)),
            'pod/%s did not start in %s seconds!' % (self.pod_name, self.start_timeout),
            timeout=self.start_timeout,
        )
    except TimeoutError:
        if self.pod_name not in self.pod_reflector.pods:
            # if pod never showed up at all,
            # restart the pod reflector which may have become disconnected.
            self.log.error(
                "Pod %s never showed up in reflector; restarting pod reflector.",
                self.pod_name)
            self._start_watching_pods(replace=True)
        raise

    pod = self.pod_reflector.pods[(self.namespace, self.pod_name)]
    self.pod_id = pod.metadata.uid
    if self.event_reflector:
        self.log.debug(
            'pod %s events before launch: %s',
            self.pod_name,
            "\n".join([
                "%s [%s] %s" % (event.last_timestamp, event.type, event.message)
                for event in self.events
            ]),
        )
    return (pod.status.pod_ip, self.port)