#!/usr/bin/env python3
"""SlurmdCharm."""
import copy
import logging

from ops.charm import CharmBase
from ops.framework import StoredState
from ops.model import ActiveStatus, BlockedStatus, WaitingStatus

# NOTE: the charm-local module paths below are assumptions inferred from
# the class names used in this file.
from charms.fluentbit.v0.fluentbit import FluentbitClient
from interface_nrpe_external_master import Nrpe
from interface_slurmd import Slurmd
from interface_slurmd_peer import SlurmdPeer
from slurm_ops_manager import SlurmManager
from utils import random_string

logger = logging.getLogger(__name__)


def __init__(self, *args):
    """Init _stored attributes and interfaces, observe events."""
    super().__init__(*args)

    self._stored.set_default(
        nhc_conf=str(),
        slurm_installed=False,
        slurmctld_available=False,
        slurmctld_started=False,
        cluster_name=str(),
    )

    self._slurm_manager = SlurmManager(self, "slurmd")
    self._fluentbit = FluentbitClient(self, "fluentbit")

    # interface to slurmctld, should only have one slurmctld per slurmd app
    self._slurmd = Slurmd(self, "slurmd")
    self._slurmd_peer = SlurmdPeer(self, "slurmd-peer")

    event_handler_bindings = {
        self.on.install: self._on_install,
        self.on.upgrade_charm: self._on_upgrade,
        self.on.update_status: self._on_update_status,
        self.on.config_changed: self._on_config_changed,
        self.on.slurmctld_started: self._on_slurmctld_started,
        self.on.slurmd_start: self._on_slurmd_start,
        self.on.check_etcd: self._on_check_etcd,
        self._slurmd.on.slurmctld_available: self._on_slurmctld_available,
        self._slurmd.on.slurmctld_unavailable: self._on_slurmctld_unavailable,
        # fluentbit
        self.on["fluentbit"].relation_created: self._on_configure_fluentbit,
        # actions
        self.on.version_action: self._on_version_action,
        self.on.node_configured_action: self._on_node_configured_action,
        self.on.get_node_inventory_action: self._on_get_node_inventory_action,
        self.on.show_nhc_config_action: self._on_show_nhc_config,
        # infiniband actions
        self.on.get_infiniband_repo_action: self.get_infiniband_repo,
        self.on.set_infiniband_repo_action: self.set_infiniband_repo,
        self.on.install_infiniband_action: self.install_infiniband,
        self.on.uninstall_infiniband_action: self.uninstall_infiniband,
        self.on.start_infiniband_action: self.start_infiniband,
        self.on.enable_infiniband_action: self.enable_infiniband,
        self.on.stop_infiniband_action: self.stop_infiniband,
        self.on.is_active_infiniband_action: self.is_active_infiniband,
        # nvidia actions
        self.on.nvidia_repo_action: self.nvidia_repo,
        self.on.nvidia_package_action: self.nvidia_package,
        self.on.nvidia_install_action: self.nvidia_install,
    }
    for event, handler in event_handler_bindings.items():
        self.framework.observe(event, handler)
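# Each action bound above resolves to a handler that receives an ops
# ActionEvent and reports back through set_results()/fail(). A minimal
# sketch of one such handler; `self._slurm_manager.version()` is a
# hypothetical helper, not a confirmed SlurmManager API:
def _on_version_action(self, event):
    """Return the installed Slurm version to the action caller."""
    try:
        event.set_results({"version": self._slurm_manager.version()})
    except Exception as exc:
        event.fail(f"Could not determine the slurm version: {exc}")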
class SlurmdCharm(CharmBase):
    """Slurmd lifecycle events."""

    _stored = StoredState()

    def __init__(self, *args):
        """Init _stored attributes and interfaces, observe events."""
        super().__init__(*args)

        self._stored.set_default(
            user_node_state=str(),
            partition_name=str(),
            config_available=False,
            slurm_installed=False,
        )

        self._nrpe = Nrpe(self, "nrpe-external-master")
        self._slurm_manager = SlurmManager(self, "slurmd")
        self._slurmd = Slurmd(self, "slurmd")
        self._slurmd_peer = SlurmdPeer(self, "slurmd-peer")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.upgrade_charm: self._on_upgrade,
            self.on.config_changed: self._on_config_changed,
            self._slurmd_peer.on.slurmd_peer_available:
            self._on_send_slurmd_info,
            self._slurmd.on.slurm_config_available:
            self._on_check_status_and_write_config,
            self.on.set_node_state_action: self._on_set_node_state_action,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_config_changed(self, event):
        self.get_set_return_partition_name()
        self._on_send_slurmd_info(event)

    def _on_install(self, event):
        self._slurm_manager.install()
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("Slurm Installed")

    def _on_upgrade(self, event):
        self._slurm_manager.upgrade()

    def _on_set_node_state_action(self, event):
        """Set the node state."""
        self._stored.user_node_state = event.params["node-state"]
        self._on_send_slurmd_info(event)

    def _on_send_slurmd_info(self, event):
        if self.framework.model.unit.is_leader():
            if self._slurmd.is_joined:
                partition = self._assemble_partition()
                if partition:
                    self._slurmd.set_slurmd_info_on_app_relation_data(
                        partition
                    )
                    return
            event.defer()
        return

    def _on_check_status_and_write_config(self, event):
        if not self._check_status():
            event.defer()
            return
        slurm_config = dict(self._slurmd.get_slurm_config())
        self._slurm_manager.render_config_and_restart(slurm_config)
        self.unit.status = ActiveStatus("Slurmd Available")

    def _check_status(self):
        slurm_installed = self._stored.slurm_installed
        config_available = self._stored.config_available
        if not (slurm_installed and config_available):
            self.unit.status = BlockedStatus(
                "NEED RELATION TO SLURM CONFIGURATOR"
            )
            return False
        return True

    def _assemble_partition(self):
        """Assemble the partition info."""
        partition_name = self._stored.partition_name
        partition_config = self.model.config.get('partition-config')
        partition_state = self.model.config.get('partition-state')
        slurmd_info = self._assemble_slurmd_info()

        return {
            'inventory': slurmd_info,
            'partition_name': partition_name,
            'partition_state': partition_state,
            'partition_config': partition_config,
        }

    def _assemble_slurmd_info(self):
        """Apply mutations to nodes in the partition, return slurmd nodes."""
        slurmd_info = self._slurmd_peer.get_slurmd_info()
        if not slurmd_info:
            return None

        # If the user has set custom state for nodes,
        # ensure we update the state for the targeted nodes.
        user_node_state = self._stored.user_node_state
        if user_node_state:
            node_states = {
                item.split("=")[0]: item.split("=")[1]
                for item in user_node_state.split(",")
            }
            # Deep copy the slurmd_info returned from the slurmd-peer
            # relation and mutate the copy in place, leaving the original
            # inventory untouched.
            slurmd_info_tmp = copy.deepcopy(slurmd_info)
            # Iterate over the slurmd nodes in each partition and update
            # the state of any node the user has targeted.
            for partition in slurmd_info_tmp:
                for slurmd_node in partition['inventory']:
                    if slurmd_node['hostname'] in node_states:
                        slurmd_node['state'] = \
                            node_states[slurmd_node['hostname']]
        else:
            slurmd_info_tmp = slurmd_info

        return slurmd_info_tmp

    def get_set_return_partition_name(self):
        """Set and return the partition name."""
        # Determine if a partition-name config exists; if so,
        # ensure the partition_name known by the charm is consistent
        # with it. If no partition name has been specified, generate one.
        partition_name = self.model.config.get('partition-name')
        if partition_name:
            if partition_name != self._stored.partition_name:
                self._stored.partition_name = partition_name
        elif not self._stored.partition_name:
            self._stored.partition_name = f"juju-compute-{random_string()}"
        return self._stored.partition_name

    def get_slurm_component(self):
        """Return the slurm component."""
        return self._slurm_manager.slurm_component

    def get_hostname(self):
        """Return the hostname."""
        return self._slurm_manager.hostname

    def get_port(self):
        """Return the port."""
        return self._slurm_manager.port
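# The set-node-state action above takes `node-state` as a comma-separated
# list of hostname=state pairs, invoked from the CLI as, e.g.:
#     juju run-action slurmd/0 set-node-state node-state="node-1=DRAIN"
# A standalone sketch of the parsing used by _assemble_slurmd_info (the
# helper name and example values here are illustrative only):
def _parse_node_states(user_node_state: str) -> dict:
    """Parse 'host=state,host=state' into a {host: state} mapping."""
    return {
        item.split("=")[0]: item.split("=")[1]
        for item in user_node_state.split(",")
    }


# >>> _parse_node_states("node-1=DRAIN,node-2=DOWN")
# {'node-1': 'DRAIN', 'node-2': 'DOWN'}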
class SlurmdCharm(CharmBase):
    """Slurmd lifecycle events."""

    _stored = StoredState()

    def __init__(self, *args):
        """Init _stored attributes and interfaces, observe events."""
        super().__init__(*args)

        self._stored.set_default(
            munge_key_available=False,
            slurm_installed=False,
            slurmd_restarted=False,
            user_node_state=str(),
            partition_name=str(),
        )

        self._nrpe = Nrpe(self, "nrpe-external-master")
        self._slurm_manager = SlurmManager(self, "slurmd")
        self._slurmd = Slurmd(self, "slurmd")
        self._slurmd_peer = SlurmdPeer(self, "slurmd-peer")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.upgrade_charm: self._on_upgrade,
            self.on.start: self._on_check_status_and_write_config,
            self.on.config_changed: self._on_config_changed,
            self._slurmd_peer.on.slurmd_peer_available:
            self._on_set_partition_info_on_app_relation_data,
            self._slurmd_peer.on.slurmd_peer_departed:
            self._on_set_partition_info_on_app_relation_data,
            self._slurmd.on.slurm_config_available:
            self._on_check_status_and_write_config,
            self._slurmd.on.slurm_config_unavailable:
            self._on_check_status_and_write_config,
            self._slurmd.on.restart_slurmd: self._on_restart_slurmd,
            self._slurmd.on.munge_key_available: self._on_write_munge_key,
            self.on.set_node_state_action: self._on_set_node_state_action,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        self._slurm_manager.install(self.config["snapstore-channel"])

        if self.model.unit.is_leader():
            self._get_set_partition_name()
            logger.debug(f"PARTITION_NAME: {self._stored.partition_name}")

        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("Slurm installed")

    def _on_upgrade(self, event):
        slurm_config = self._check_status()
        if not slurm_config:
            event.defer()
            return
        self._slurm_manager.upgrade(
            slurm_config, self.config["snapstore-channel"]
        )

    def _on_config_changed(self, event):
        if self.model.unit.is_leader():
            self._get_set_partition_name()
            if self._check_status():
                self._on_set_partition_info_on_app_relation_data(event)

    def _on_write_munge_key(self, event):
        if not self._stored.slurm_installed:
            event.defer()
            return
        munge_key = self._slurmd.get_stored_munge_key()
        self._slurm_manager.configure_munge_key(munge_key)
        self._slurm_manager.restart_munged()
        self._stored.munge_key_available = True

    def _on_check_status_and_write_config(self, event):
        slurm_config = self._check_status()
        if not slurm_config:
            event.defer()
            return

        # if slurm_config['configless']:
        #     slurmctld_hostname = slurm_config['active_controller_hostname']
        #     self._slurm_manager.configure_slurmctld_hostname(
        #         slurmctld_hostname
        #     )
        #     self._slurm_manager.restart_slurm_component()
        # else:

        # Ensure we aren't dealing with a StoredDict before trying
        # to render the slurm.conf.
        slurm_config = dict(slurm_config)
        self._slurm_manager.render_slurm_configs(slurm_config)

        # Only restart slurmd the first time the node is brought up.
        if not self._stored.slurmd_restarted:
            self._slurm_manager.restart_slurm_component()
            self._stored.slurmd_restarted = True

        self.unit.status = ActiveStatus("slurmd available")

    def _on_restart_slurmd(self, event):
        self._slurm_manager.restart_slurm_component()

    def _check_status(self):
        munge_key_available = self._stored.munge_key_available
        slurm_installed = self._stored.slurm_installed
        slurm_config = self._slurmd.get_stored_slurm_config()
        slurmd_joined = self._slurmd.is_joined

        if not slurmd_joined:
            self.unit.status = BlockedStatus(
                "Needed relations: slurm-configurator"
            )
            return None
        elif not (munge_key_available and slurm_config and slurm_installed):
            self.unit.status = WaitingStatus("Waiting on: configuration")
            return None

        return dict(slurm_config)

    def _on_set_node_state_action(self, event):
        """Set the node state."""
        self._stored.user_node_state = event.params["node-state"]
        self._on_set_partition_info_on_app_relation_data(event)

    def _on_set_partition_info_on_app_relation_data(self, event):
        """Set the slurm partition info on the application relation data."""
        # Only the leader can set data on the relation.
        if self.framework.model.unit.is_leader():
            # If the relation with slurm-configurator exists then set our
            # partition info on the application relation data.
            # This handler shouldn't fire if the relation isn't made,
            # but add this extra check here just in case.
            if self._slurmd.is_joined:
                partition = self._assemble_partition()
                if partition:
                    self._slurmd.set_partition_info_on_app_relation_data(
                        partition
                    )
                    return
            event.defer()
        return

    def _assemble_partition(self):
        """Assemble the partition info."""
        partition_name = self._stored.partition_name
        partition_config = self.config.get("partition-config")
        partition_state = self.config.get("partition-state")
        slurmd_inventory = self._assemble_slurmd_inventory()

        return {
            "inventory": slurmd_inventory,
            "partition_name": partition_name,
            "partition_state": partition_state,
            "partition_config": partition_config,
        }

    def _assemble_slurmd_inventory(self):
        """Apply mutations to nodes in the partition, return slurmd nodes."""
        slurmd_inventory = self._slurmd_peer.get_slurmd_inventory()
        if not slurmd_inventory:
            return None

        # If the user has set custom state for nodes,
        # ensure we update the state for the targeted nodes.
        user_node_state = self._stored.user_node_state
        if user_node_state:
            node_states = {
                item.split("=")[0]: item.split("=")[1]
                for item in user_node_state.split(",")
            }
            # Deep copy the inventory returned from the slurmd-peer
            # relation and update node state on the copy, leaving the
            # relation data untouched.
            slurmd_inventory = copy.deepcopy(slurmd_inventory)
            # Iterate over the slurmd nodes in each partition and update
            # the state of any node the user has targeted.
            for partition in slurmd_inventory:
                for slurmd_node in partition["inventory"]:
                    if slurmd_node["hostname"] in node_states:
                        slurmd_node["state"] = \
                            node_states[slurmd_node["hostname"]]

        return slurmd_inventory

    def _get_set_partition_name(self):
        """Set the partition name."""
        # Determine if a partition-name config exists; if so,
        # ensure self._stored.partition_name is consistent with the
        # supplied config. If no partition name has been specified,
        # generate one.
        partition_name = self.config.get("partition-name")
        if partition_name:
            if partition_name != self._stored.partition_name:
                self._stored.partition_name = partition_name
        elif not self._stored.partition_name:
            self._stored.partition_name = f"juju-compute-{random_string()}"
        return

    def get_partition_name(self):
        """Return the partition_name."""
        return self._stored.partition_name

    def get_slurm_component(self):
        """Return the slurm component."""
        return self._slurm_manager.slurm_component

    def get_hostname(self):
        """Return the hostname."""
        return self._slurm_manager.hostname

    def get_port(self):
        """Return the port."""
        return self._slurm_manager.port
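# A charm class like the one above is normally wired up as the charm's
# entrypoint (e.g. src/charm.py) with the standard ops main; a minimal
# sketch, assuming this module defines SlurmdCharm:
if __name__ == "__main__":
    from ops.main import main

    main(SlurmdCharm)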