def _update_mon_count(current, new):
    # type: (ccl.CephCluster, ccl.CephCluster) -> ccl.CephCluster
    if newcount is None:
        raise orchestrator.OrchestratorError(
            'unable to set mon count to None')
    if not new.spec.mon:
        raise orchestrator.OrchestratorError(
            "mon attribute not specified in new spec")
    new.spec.mon.count = newcount
    return new
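# Note on the helper above: `newcount` is not one of its parameters. In the
# Rook module this function is defined inside the method that receives the
# desired mon count and is handed to a CRD patch call as a callback, so
# `newcount` is captured from the enclosing scope. A minimal sketch of that
# enclosing method, with `self._patch` and its arguments as assumptions:
def update_mon_count(self, newcount):
    # type: (Optional[int]) -> str
    def _update_mon_count(current, new):
        # type: (ccl.CephCluster, ccl.CephCluster) -> ccl.CephCluster
        if newcount is None:
            raise orchestrator.OrchestratorError('unable to set mon count to None')
        new.spec.mon.count = newcount
        return new
    # The patch helper is assumed to fetch the current CephCluster object,
    # let the callback edit a copy, and submit the difference to Kubernetes.
    return self._patch(ccl.CephCluster, 'cephclusters',
                       self.rook_env.cluster_name, _update_mon_count)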
def get_ceph_image(self) -> str:
    try:
        api_response = self.coreV1_api.list_namespaced_pod(
            self.rook_env.namespace,
            label_selector="app=rook-ceph-mon",
            timeout_seconds=10)
        if api_response.items:
            return api_response.items[-1].spec.containers[0].image
        else:
            raise orchestrator.OrchestratorError(
                "Error getting ceph image. Cluster without monitors")
    except ApiException as e:
        raise orchestrator.OrchestratorError("Error getting ceph image: {}".format(e))
def add_stateless_service(self, service_type, spec):
    """
    Add a stateless service in the cluster

    :service_type: Kind of service (nfs, rgw, mds)
    :spec: an Orchestrator.StatelessServiceSpec object
    :returns: Completion object
    """
    # Check service_type is supported
    if service_type not in ["rgw"]:
        raise orchestrator.OrchestratorError(
            "{} service not supported".format(service_type))

    # Add the hosts to the inventory in the right group
    hosts = spec.service_spec.hosts
    if not hosts:
        raise orchestrator.OrchestratorError(
            "No hosts provided. "
            "At least one destination host is needed to install the RGW "
            "service")
    InventoryGroup("{}s".format(service_type), self.ar_client).update(hosts)

    # Limit playbook execution to certain hosts
    limited = ",".join(hosts)

    # Add the settings for this service
    extravars = vars(spec.service_spec)

    # Group hosts by resource (used in rm ops)
    if service_type == "rgw":
        resource_group = "rgw_zone_{}".format(spec.service_spec.rgw_zone)
        InventoryGroup(resource_group, self.ar_client).update(hosts)

    # Execute the playbook to create the service
    playbook_operation = PlaybookOperation(
        client=self.ar_client,
        playbook=SITE_PLAYBOOK,
        logger=self.log,
        result_pattern="",
        params=extravars,
        querystr_dict={"limit": limited})

    # Filter to get the result
    playbook_operation.output_wizard = ProcessPlaybookResult(
        self.ar_client, self.log)
    playbook_operation.event_filter_list = ["playbook_on_stats"]

    # Execute the playbook
    self._launch_operation(playbook_operation)

    return playbook_operation
def _remove_osds_bg(self) -> None:
    """
    Performs actions in the _serve() loop to remove an OSD
    when criteria is met.
    """
    logger.debug(
        f"{len(self.to_remove_osds)} OSDs are scheduled for removal: {list(self.to_remove_osds)}")
    self._update_osd_removal_status()
    remove_osds: set = self.to_remove_osds.copy()
    for osd in remove_osds:
        if not osd.force:
            self.drain_osd(osd.osd_id)
            # skip criteria
            if not self.is_empty(osd.osd_id):
                logger.info(
                    f"OSD <{osd.osd_id}> is not empty yet. Waiting a bit more")
                continue
            if not self.ok_to_destroy([osd.osd_id]):
                logger.info(
                    f"OSD <{osd.osd_id}> is not safe-to-destroy yet. Waiting a bit more")
                continue

        # abort criteria
        if not self.down_osd([osd.osd_id]):
            # also remove it from the remove_osd list and set a health_check warning?
            raise orchestrator.OrchestratorError(
                f"Could not set OSD <{osd.osd_id}> to 'down'")

        if osd.replace:
            if not self.destroy_osd(osd.osd_id):
                # also remove it from the remove_osd list and set a health_check warning?
                raise orchestrator.OrchestratorError(
                    f"Could not destroy OSD <{osd.osd_id}>")
        else:
            if not self.purge_osd(osd.osd_id):
                # also remove it from the remove_osd list and set a health_check warning?
                raise orchestrator.OrchestratorError(
                    f"Could not purge OSD <{osd.osd_id}>")

        self.mgr._remove_daemon(osd.fullname, osd.nodename)
        logger.info(
            f"Successfully removed OSD <{osd.osd_id}> on {osd.nodename}")
        logger.debug(f"Removing {osd.osd_id} from the queue.")
        self.to_remove_osds.remove(osd)
def remove_service(self, service_name: str, force: bool = False) -> str:
    if service_name == 'rbd-mirror':
        return self.rook_cluster.rm_service('cephrbdmirrors', 'default-rbd-mirror')
    service_type, service_id = service_name.split('.', 1)
    if service_type == 'mds':
        return self.rook_cluster.rm_service('cephfilesystems', service_id)
    elif service_type == 'rgw':
        return self.rook_cluster.rm_service('cephobjectstores', service_id)
    elif service_type == 'nfs':
        ret, out, err = self.mon_command({'prefix': 'auth ls'})
        matches = re.findall(rf'client\.nfs-ganesha\.{service_id}\..*', out)
        for match in matches:
            self.check_mon_command({'prefix': 'auth rm', 'entity': match})
        return self.rook_cluster.rm_service('cephnfses', service_id)
    elif service_type == 'rbd-mirror':
        return self.rook_cluster.rm_service('cephrbdmirrors', service_id)
    elif service_type == 'osd':
        if service_id in self._drive_group_map:
            del self._drive_group_map[service_id]
            self._save_drive_groups()
        return f'Removed {service_name}'
    elif service_type == 'ingress':
        self.log.info("{0} service '{1}' does not exist".format(
            'ingress', service_id))
        return 'The Rook orchestrator does not currently support ingress'
    else:
        raise orchestrator.OrchestratorError(
            f'Service type {service_type} not supported')
def remote_from_orchestrator_cli_self_test(self, what: str) -> Any:
    import orchestrator
    if what == 'OrchestratorError':
        return orchestrator.OrchResult(result=None, exception=orchestrator.OrchestratorError('hello, world'))
    elif what == "ZeroDivisionError":
        return orchestrator.OrchResult(result=None, exception=ZeroDivisionError('hello, world'))
    assert False, repr(what)
def get_daemon(self, daemon_name: str) -> orchestrator.DaemonDescription:
    assert not daemon_name.startswith('ha-rgw.')
    for _, dm in self.daemons.items():
        for _, dd in dm.items():
            if dd.name() == daemon_name:
                return dd
    raise orchestrator.OrchestratorError(f'Unable to find {daemon_name} daemon(s)')
def _update_mon_count(current, new):
    # type: (ccl.CephCluster, ccl.CephCluster) -> ccl.CephCluster
    if newcount is None:
        raise orchestrator.OrchestratorError(
            'unable to set mon count to None')
    new.spec.mon.count = newcount
    return new
def _daemon_action(self, action, name):
    if '.' not in name:
        raise orchestrator.OrchestratorError('%s is not a valid daemon name' % name)
    (daemon_type, daemon_id) = name.split('.', 1)
    completion = self.daemon_action(action, daemon_type, daemon_id)
    self._orchestrator_wait([completion])
    orchestrator.raise_if_exception(completion)
    return HandleCommandResult(stdout=completion.result_str())
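# For reference, `name` is the usual "<daemon_type>.<daemon_id>" string; the
# calls below are illustrative only (the daemon names are made up):
#
#   self._daemon_action('restart', 'mds.myfs-a')   # -> daemon_type='mds', daemon_id='myfs-a'
#   self._daemon_action('restart', 'mds')          # no '.' -> OrchestratorError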
def get_daemon(self, daemon_name: str, host: Optional[str] = None) -> orchestrator.DaemonDescription:
    assert not daemon_name.startswith('ha-rgw.')
    dds = self.get_daemons_by_host(host) if host else self._get_daemons()
    for dd in dds:
        if dd.name() == daemon_name:
            return dd
    raise orchestrator.OrchestratorError(f'Unable to find {daemon_name} daemon(s)')
def _daemon_rm(self, names):
    for name in names:
        if '.' not in name:
            raise orchestrator.OrchestratorError('%s is not a valid daemon name' % name)
    completion = self.remove_daemons(names)
    self._orchestrator_wait([completion])
    orchestrator.raise_if_exception(completion)
    return HandleCommandResult(stdout=completion.result_str())
def remove_stateless_service(self, service_type, id_resource):
    """
    Remove a stateless service providing <sv_id> resources

    :svc_type: Kind of service (nfs, rgw, mds)
    :id_resource: Id of the resource provided
                  <zone name> if service is RGW
                  ...
    :returns: Completion object
    """
    # Check service_type is supported
    if service_type not in ["rgw"]:
        raise orchestrator.OrchestratorError(
            "{} service not supported".format(service_type))

    # Ansible Inventory group for the kind of service
    group = "{}s".format(service_type)

    # get the list of hosts where to remove the service
    # (hosts in resource group)
    if service_type == "rgw":
        group_prefix = "rgw_zone_{}"
    resource_group = group_prefix.format(id_resource)

    hosts_list = list(InventoryGroup(resource_group, self.ar_client))
    limited = ",".join(hosts_list)

    # Avoid manual confirmation
    extravars = {"ireallymeanit": "yes"}

    # Execute the playbook to remove the service
    playbook_operation = PlaybookOperation(
        client=self.ar_client,
        playbook=PURGE_PLAYBOOK,
        logger=self.log,
        result_pattern="",
        params=extravars,
        querystr_dict={"limit": limited})

    # Filter to get the result
    playbook_operation.output_wizard = ProcessPlaybookResult(
        self.ar_client, self.log)
    playbook_operation.event_filter_list = ["playbook_on_stats"]

    # Cleaning of inventory after a successful operation
    clean_inventory = {}
    clean_inventory[resource_group] = hosts_list
    clean_inventory[group] = hosts_list
    playbook_operation.clean_hosts_on_success = clean_inventory

    # Execute the playbook
    self.log.info("Removing service %s for resource %s",
                  service_type, id_resource)
    self._launch_operation(playbook_operation)

    return playbook_operation
def remove_service(self, service_name: str) -> str:
    service_type, service_name = service_name.split('.', 1)
    if service_type == 'mds':
        return self.rook_cluster.rm_service('cephfilesystems', service_name)
    elif service_type == 'rgw':
        return self.rook_cluster.rm_service('cephobjectstores', service_name)
    elif service_type == 'nfs':
        return self.rook_cluster.rm_service('cephnfses', service_name)
    else:
        raise orchestrator.OrchestratorError(f'Service type {service_type} not supported')
def remote_from_orchestrator_cli_self_test(self, what):
    import orchestrator

    if what == 'OrchestratorError':
        c = orchestrator.TrivialReadCompletion(result=None)
        c.fail(orchestrator.OrchestratorError('hello, world'))
        return c
    elif what == "ZeroDivisionError":
        c = orchestrator.TrivialReadCompletion(result=None)
        c.fail(ZeroDivisionError('hello, world'))
        return c
    assert False, repr(what)
def _service_rm(self, name):
    if '.' in name:
        (service_type, service_name) = name.split('.')
    else:
        service_type = name
        service_name = None
    if name in ['mon', 'mgr']:
        raise orchestrator.OrchestratorError(
            'The mon and mgr services cannot be removed')
    completion = self.remove_service(service_type, service_name)
    self._orchestrator_wait([completion])
    orchestrator.raise_if_exception(completion)
    return HandleCommandResult(stdout=completion.result_str())
# Nested helper: `action`, `service_type`, `service_name`, `service_id` and
# `self` are presumably captured from the enclosing service_action() method's
# scope (compare the inline variant of service_action() below).
def _proc_daemons(daemons):
    args = []
    for d in daemons:
        args.append((d.service_type, d.service_instance,
                     d.nodename, action))
    if not args:
        if service_name:
            n = service_name + '-*'
        else:
            n = service_id
        raise orchestrator.OrchestratorError(
            'Unable to find %s.%s daemon(s)' % (
                service_type, n))
    return self._service_action(args)
def add_host(self, spec):
    # type: (orchestrator.HostSpec) -> None
    host = spec.hostname
    if host == 'raise_validation_error':
        raise orchestrator.OrchestratorValidationError("MON count must be either 1, 3 or 5")
    if host == 'raise_error':
        raise orchestrator.OrchestratorError("host address is empty")
    if host == 'raise_bug':
        raise ZeroDivisionError()
    if host == 'raise_not_implemented':
        raise NotImplementedError()
    if host == 'raise_no_orchestrator':
        raise orchestrator.NoOrchestrator()
    if host == 'raise_import_error':
        raise ImportError("test_orchestrator not enabled")
    assert isinstance(host, str)
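# Usage sketch for the self-test add_host() above: the hostname itself selects
# the failure path. The trigger names come straight from the function; wrapping
# them in HostSpec and the 'node1' example are assumptions:
#
#   add_host(HostSpec(hostname='raise_validation_error'))  # OrchestratorValidationError
#   add_host(HostSpec(hostname='raise_error'))              # OrchestratorError
#   add_host(HostSpec(hostname='raise_bug'))                 # ZeroDivisionError
#   add_host(HostSpec(hostname='node1'))                     # falls through to the isinstance assert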
def remove_service(self, service_name: str) -> RookCompletion[str]:
    service_type, service_name = service_name.split('.', 1)
    if service_type == 'mds':
        return self._service_rm_decorate(
            'MDS', service_name, lambda: self.rook_cluster.rm_service(
                'cephfilesystems', service_name))
    elif service_type == 'rgw':
        return self._service_rm_decorate(
            'RGW', service_name, lambda: self.rook_cluster.rm_service(
                'cephobjectstores', service_name))
    elif service_type == 'nfs':
        return self._service_rm_decorate(
            'NFS', service_name, lambda: self.rook_cluster.rm_service(
                'cephnfses', service_name))
    else:
        raise orchestrator.OrchestratorError(
            f'Service type {service_type} not supported')
def service_action(self, action, service_type, service_name=None, service_id=None):
    self.log.debug('service_action action %s type %s name %s id %s' % (
        action, service_type, service_name, service_id))
    if action == 'reload':
        return trivial_result(["Reload is a no-op"])
    daemons = self._get_services(
        service_type,
        service_name=service_name,
        service_id=service_id)
    args = []
    for d in daemons:
        args.append((d.service_type, d.service_instance,
                     d.nodename, action))
    if not args:
        if service_name:
            n = service_name + '-*'
        else:
            n = service_id
        raise orchestrator.OrchestratorError(
            'Unable to find %s.%s daemon(s)' % (
                service_type, n))
    return self._service_action(args)
def process_removal_queue(self) -> None:
    """
    Performs actions in the _serve() loop to remove an OSD
    when criteria is met.

    we can't hold self.lock, as we're calling _remove_daemon in the loop
    """
    # make sure that we don't run on OSDs that are not in the cluster anymore.
    self.cleanup()

    # find osds that are ok-to-stop and not yet draining
    ready_to_drain_osds = self._ready_to_drain_osds()
    if ready_to_drain_osds:
        # start draining those
        _ = [osd.start_draining() for osd in ready_to_drain_osds]

    all_osds = self.all_osds()

    logger.debug(
        f"{self.queue_size()} OSDs are scheduled "
        f"for removal: {all_osds}")

    # Check all osds for their state and take action (remove, purge etc)
    new_queue: Set[OSD] = set()
    for osd in all_osds:  # type: OSD
        if not osd.force:
            # skip criteria
            if not osd.is_empty:
                logger.debug(f"{osd} is not empty yet. Waiting a bit more")
                new_queue.add(osd)
                continue

            if not osd.safe_to_destroy():
                logger.debug(
                    f"{osd} is not safe-to-destroy yet. Waiting a bit more")
                new_queue.add(osd)
                continue

        # abort criteria
        if not osd.down():
            # also remove it from the remove_osd list and set a health_check warning?
            raise orchestrator.OrchestratorError(
                f"Could not mark {osd} down")

        # stop and remove daemon
        assert osd.hostname is not None

        if self.mgr.cache.has_daemon(f'osd.{osd.osd_id}'):
            CephadmServe(self.mgr)._remove_daemon(f'osd.{osd.osd_id}', osd.hostname)
            logger.info(f"Successfully removed {osd} on {osd.hostname}")
        else:
            logger.info(
                f"Daemon {osd} on {osd.hostname} was already removed")

        if osd.replace:
            # mark destroyed in osdmap
            if not osd.destroy():
                raise orchestrator.OrchestratorError(
                    f"Could not destroy {osd}")
            logger.info(
                f"Successfully destroyed old {osd} on {osd.hostname}; ready for replacement")
        else:
            # purge from osdmap
            if not osd.purge():
                raise orchestrator.OrchestratorError(
                    f"Could not purge {osd}")
            logger.info(f"Successfully purged {osd} on {osd.hostname}")

        if osd.zap:
            # throws an exception if the zap fails
            logger.info(f"Zapping devices for {osd} on {osd.hostname}")
            osd.do_zap()
            logger.info(
                f"Successfully zapped devices for {osd} on {osd.hostname}")

        logger.debug(f"Removing {osd} from the queue.")

    # self could change while this is processing (osds get added from the CLI)
    # The new set is: 'an intersection of all osds that are still not empty/removed (new_queue) and
    # osds that were added while this method was executed'
    with self.lock:
        self.osds.intersection_update(new_queue)
        self._save_to_store()
def process_removal_queue(self) -> None:
    """
    Performs actions in the _serve() loop to remove an OSD
    when criteria is met.
    """
    # make sure that we don't run on OSDs that are not in the cluster anymore.
    self.cleanup()

    logger.debug(
        f"{self.mgr.to_remove_osds.queue_size()} OSDs are scheduled "
        f"for removal: {self.mgr.to_remove_osds.all_osds()}")

    # find osds that are ok-to-stop and not yet draining
    ok_to_stop_osds = self.find_osd_stop_threshold(
        self.mgr.to_remove_osds.idling_osds())
    if ok_to_stop_osds:
        # start draining those
        _ = [osd.start_draining() for osd in ok_to_stop_osds]

    # Check all osds for their state and take action (remove, purge etc)
    to_remove_osds = self.mgr.to_remove_osds.all_osds()
    new_queue = set()
    for osd in to_remove_osds:
        if not osd.force:
            # skip criteria
            if not osd.is_empty:
                logger.info(
                    f"OSD <{osd.osd_id}> is not empty yet. Waiting a bit more")
                new_queue.add(osd)
                continue

            if not osd.safe_to_destroy():
                logger.info(
                    f"OSD <{osd.osd_id}> is not safe-to-destroy yet. Waiting a bit more")
                new_queue.add(osd)
                continue

        # abort criteria
        if not osd.down():
            # also remove it from the remove_osd list and set a health_check warning?
            raise orchestrator.OrchestratorError(
                f"Could not set OSD <{osd.osd_id}> to 'down'")

        if osd.replace:
            if not osd.destroy():
                raise orchestrator.OrchestratorError(
                    f"Could not destroy OSD <{osd.osd_id}>")
        else:
            if not osd.purge():
                raise orchestrator.OrchestratorError(
                    f"Could not purge OSD <{osd.osd_id}>")

        if not osd.exists:
            continue
        self.mgr._remove_daemon(osd.fullname, osd.hostname)
        logger.info(
            f"Successfully removed OSD <{osd.osd_id}> on {osd.hostname}")
        logger.debug(f"Removing {osd.osd_id} from the queue.")

    # self.mgr.to_remove_osds could change while this is processing (osds get added from the CLI)
    # The new set is: 'an intersection of all osds that are still not empty/removed (new_queue) and
    # osds that were added while this method was executed'
    self.mgr.to_remove_osds.intersection_update(new_queue)
    self.save_to_store()
def _execute_blight_job(self, ident_fault: str, on: bool,
                        loc: orchestrator.DeviceLightLoc) -> str:
    operation_id = str(hash(loc))
    message = ""

    # job definition
    job_metadata = client.V1ObjectMeta(name=operation_id,
                                       namespace=self.rook_env.namespace,
                                       labels={"ident": operation_id})
    pod_metadata = client.V1ObjectMeta(labels={"ident": operation_id})
    pod_container = client.V1Container(
        name="ceph-lsmcli-command",
        security_context=client.V1SecurityContext(privileged=True),
        image=self.get_ceph_image(),
        command=["lsmcli", ],
        args=['local-disk-%s-led-%s' % (ident_fault, 'on' if on else 'off'),
              '--path', loc.path or loc.dev, ],
        volume_mounts=[
            client.V1VolumeMount(name="devices", mount_path="/dev"),
            client.V1VolumeMount(name="run-udev", mount_path="/run/udev")
        ])
    pod_spec = client.V1PodSpec(
        containers=[pod_container],
        active_deadline_seconds=30,  # Max time to terminate pod
        restart_policy="Never",
        node_selector={"kubernetes.io/hostname": loc.host},
        volumes=[
            client.V1Volume(
                name="devices",
                host_path=client.V1HostPathVolumeSource(path="/dev")),
            client.V1Volume(
                name="run-udev",
                host_path=client.V1HostPathVolumeSource(path="/run/udev"))
        ])
    pod_template = client.V1PodTemplateSpec(metadata=pod_metadata,
                                            spec=pod_spec)
    job_spec = client.V1JobSpec(
        active_deadline_seconds=60,  # Max time to terminate job
        ttl_seconds_after_finished=10,  # Alpha. Lifetime after finishing (either Complete or Failed)
        backoff_limit=0,
        template=pod_template)
    job = client.V1Job(api_version="batch/v1",
                       kind="Job",
                       metadata=job_metadata,
                       spec=job_spec)

    # delete previous job if it exists
    try:
        try:
            api_response = self.batchV1_api.delete_namespaced_job(
                operation_id,
                self.rook_env.namespace,
                propagation_policy="Background")
        except ApiException as e:
            if e.status != 404:  # No problem if the job does not exist
                raise

        # wait until the job is not present
        deleted = False
        retries = 0
        while not deleted and retries < 10:
            api_response = self.batchV1_api.list_namespaced_job(
                self.rook_env.namespace,
                label_selector="ident=%s" % operation_id,
                timeout_seconds=10)
            deleted = not api_response.items
            if retries > 5:
                sleep(0.1)
            retries += 1
        if retries == 10 and not deleted:
            raise orchestrator.OrchestratorError(
                "Light <{}> in <{}:{}> cannot be executed. Cannot delete previous job <{}>".format(
                    on, loc.host, loc.path or loc.dev, operation_id))

        # create the job
        api_response = self.batchV1_api.create_namespaced_job(
            self.rook_env.namespace, job)

        # get the result
        finished = False
        while not finished:
            api_response = self.batchV1_api.read_namespaced_job(
                operation_id, self.rook_env.namespace)
            finished = api_response.status.succeeded or api_response.status.failed
            if finished:
                message = api_response.status.conditions[-1].message

        # get the result of the lsmcli command
        api_response = self.coreV1_api.list_namespaced_pod(
            self.rook_env.namespace,
            label_selector="ident=%s" % operation_id,
            timeout_seconds=10)
        if api_response.items:
            pod_name = api_response.items[-1].metadata.name
            message = self.coreV1_api.read_namespaced_pod_log(
                pod_name, self.rook_env.namespace)

    except ApiException as e:
        log.exception('K8s API failed. {}'.format(e))
        raise

    # Finally, delete the job. The job sets <ttl_seconds_after_finished>, so
    # the TTL controller should delete it automatically, but that feature is
    # still in Alpha state, so an explicit delete is issued here as well.
    try:
        api_response = self.batchV1_api.delete_namespaced_job(
            operation_id,
            self.rook_env.namespace,
            propagation_policy="Background")
    except ApiException as e:
        if e.status != 404:  # No problem if the job does not exist
            raise

    return message
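# Hedged usage sketch for _execute_blight_job(): the device-light entry point
# presumably invokes it once per device location, where DeviceLightLoc names
# the host, device id and /dev path. The wrapper below is illustrative, not
# necessarily the module's actual entry point:
def blink_device_light(self, ident_fault, on, locs):
    # type: (str, bool, List[orchestrator.DeviceLightLoc]) -> List[str]
    return [self._execute_blight_job(ident_fault, on, loc) for loc in locs]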
def add_rgw(self, spec):
    # type: (orchestrator.RGWSpec) -> orchestrator.Completion
    """
    Add a RGW service in the cluster

    :spec: an Orchestrator.RGWSpec object
    :returns: Completion object
    """
    # Add the hosts to the inventory in the right group
    hosts = spec.placement.hosts
    if not hosts:
        raise orchestrator.OrchestratorError(
            "No hosts provided. "
            "At least one destination host is needed to install the RGW "
            "service")

    def set_rgwspec_defaults(spec):
        spec.rgw_multisite = spec.rgw_multisite if spec.rgw_multisite is not None else True
        spec.rgw_zonemaster = spec.rgw_zonemaster if spec.rgw_zonemaster is not None else True
        spec.rgw_zonesecondary = spec.rgw_zonesecondary \
            if spec.rgw_zonesecondary is not None else False
        spec.rgw_multisite_proto = spec.rgw_multisite_proto \
            if spec.rgw_multisite_proto is not None else "http"
        spec.rgw_frontend_port = spec.rgw_frontend_port \
            if spec.rgw_frontend_port is not None else 8080

        spec.rgw_zonegroup = spec.rgw_zonegroup if spec.rgw_zonegroup is not None else "default"
        spec.rgw_zone_user = spec.rgw_zone_user if spec.rgw_zone_user is not None else "zone.user"
        spec.rgw_realm = spec.rgw_realm if spec.rgw_realm is not None else "default"

        spec.system_access_key = spec.system_access_key \
            if spec.system_access_key is not None else spec.genkey(20)
        spec.system_secret_key = spec.system_secret_key \
            if spec.system_secret_key is not None else spec.genkey(40)

    set_rgwspec_defaults(spec)
    InventoryGroup("rgws", self.ar_client).update(hosts)

    # Limit playbook execution to certain hosts
    limited = ",".join(str(host) for host in hosts)

    # Add the settings for this service
    extravars = {k: v for (k, v) in spec.__dict__.items() if k.startswith('rgw_')}
    extravars['rgw_zone'] = spec.name
    extravars['rgw_multisite_endpoint_addr'] = spec.rgw_multisite_endpoint_addr
    extravars['rgw_multisite_endpoints_list'] = spec.rgw_multisite_endpoints_list
    extravars['rgw_frontend_port'] = str(spec.rgw_frontend_port)

    # Group hosts by resource (used in rm ops)
    resource_group = "rgw_zone_{}".format(spec.name)
    InventoryGroup(resource_group, self.ar_client).update(hosts)

    # Execute the playbook to create the service
    op = playbook_operation(client=self.ar_client,
                            playbook=SITE_PLAYBOOK,
                            result_pattern="",
                            params=extravars,
                            querystr_dict={"limit": limited},
                            output_wizard=ProcessPlaybookResult(self.ar_client),
                            event_filter_list=["playbook_on_stats"])

    # Execute the playbook
    self._launch_operation(op)

    return op
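# Worked example of what set_rgwspec_defaults() fills in for fields the caller
# left as None (the defaults are the ones hard-coded above; the spec object
# itself is assumed to come from the caller):
#
#   set_rgwspec_defaults(spec)
#   spec.rgw_multisite        -> True
#   spec.rgw_zonemaster       -> True
#   spec.rgw_zonesecondary    -> False
#   spec.rgw_multisite_proto  -> "http"
#   spec.rgw_frontend_port    -> 8080
#   spec.system_access_key    -> spec.genkey(20)  # 20-character generated key
#   spec.system_secret_key    -> spec.genkey(40)  # 40-character generated key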