Example #1
0
    def __init__(self, *args):
        """Set state defaults, build interfaces and register observers."""
        super().__init__(*args)

        # Default values for the attributes kept in the stored state.
        self._stored.set_default(
            user_node_state="",
            partition_name="",
            config_available=False,
        )

        # Helper objects, each bound to this charm and a named endpoint.
        self._nrpe = Nrpe(self, "nrpe-external-master")
        self._slurm_manager = SlurmManager(self, "slurmd")
        self._slurmd = Slurmd(self, "slurmd")
        self._slurmd_peer = SlurmdPeer(self, "slurmd-peer")

        # (event, handler) pairs, observed in the order listed.
        observers = (
            (self.on.install, self._on_install),
            (self.on.upgrade_charm, self._on_upgrade),
            (self.on.config_changed, self._on_send_slurmd_info),
            (
                self._slurmd_peer.on.slurmd_peer_available,
                self._on_send_slurmd_info,
            ),
            (
                self._slurmd.on.slurm_config_available,
                self._on_check_status_and_write_config,
            ),
            (
                self.on.set_node_state_action,
                self._on_set_node_state_action,
            ),
        )
        for source, callback in observers:
            self.framework.observe(source, callback)
    def __init__(self, *args):
        """Set state defaults, build interfaces and register observers."""
        super().__init__(*args)

        # Default values for the attributes kept in the stored state.
        self._stored.set_default(
            nhc_conf="",
            slurm_installed=False,
            slurmctld_available=False,
            slurmctld_started=False,
            cluster_name="",
        )

        self._slurm_manager = SlurmManager(self, "slurmd")
        self._fluentbit = FluentbitClient(self, "fluentbit")

        # Relation to slurmctld; a slurmd application is expected to be
        # related to a single slurmctld only.
        self._slurmd = Slurmd(self, "slurmd")
        self._slurmd_peer = SlurmdPeer(self, "slurmd-peer")

        on = self.on
        slurmd_on = self._slurmd.on

        # (event, handler) pairs, observed in the order listed.
        observers = (
            # lifecycle and custom charm events
            (on.install, self._on_install),
            (on.upgrade_charm, self._on_upgrade),
            (on.update_status, self._on_update_status),
            (on.config_changed, self._on_config_changed),
            (on.slurmctld_started, self._on_slurmctld_started),
            (on.slurmd_start, self._on_slurmd_start),
            (on.check_etcd, self._on_check_etcd),
            # slurmctld relation
            (slurmd_on.slurmctld_available, self._on_slurmctld_available),
            (
                slurmd_on.slurmctld_unavailable,
                self._on_slurmctld_unavailable,
            ),
            # fluentbit relation
            (
                on["fluentbit"].relation_created,
                self._on_configure_fluentbit,
            ),
            # general actions
            (on.version_action, self._on_version_action),
            (on.node_configured_action, self._on_node_configured_action),
            (
                on.get_node_inventory_action,
                self._on_get_node_inventory_action,
            ),
            (on.show_nhc_config_action, self._on_show_nhc_config),
            # infiniband actions
            (on.get_infiniband_repo_action, self.get_infiniband_repo),
            (on.set_infiniband_repo_action, self.set_infiniband_repo),
            (on.install_infiniband_action, self.install_infiniband),
            (on.uninstall_infiniband_action, self.uninstall_infiniband),
            (on.start_infiniband_action, self.start_infiniband),
            (on.enable_infiniband_action, self.enable_infiniband),
            (on.stop_infiniband_action, self.stop_infiniband),
            (on.is_active_infiniband_action, self.is_active_infiniband),
            # nvidia actions
            (on.nvidia_repo_action, self.nvidia_repo),
            (on.nvidia_package_action, self.nvidia_package),
            (on.nvidia_install_action, self.nvidia_install),
        )
        for source, callback in observers:
            self.framework.observe(source, callback)
Example #3
0
class SlurmdCharm(CharmBase):
    """Slurmd lifecycle events.

    Wires the charm's lifecycle, relation and action events to handlers
    that install slurm, publish this application's partition info on the
    slurmd relation and render the slurm config once it is available.
    """

    _stored = StoredState()

    def __init__(self, *args):
        """Init _stored attributes and interfaces, observe events."""
        super().__init__(*args)

        self._stored.set_default(
            user_node_state=str(),
            partition_name=str(),
            config_available=False,
            # Read by _check_status(); give it a default so that read
            # cannot fail when the install handler has not run yet.
            slurm_installed=False,
        )

        self._nrpe = Nrpe(self, "nrpe-external-master")

        self._slurm_manager = SlurmManager(self, "slurmd")

        self._slurmd = Slurmd(self, "slurmd")
        self._slurmd_peer = SlurmdPeer(self, "slurmd-peer")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.upgrade_charm: self._on_upgrade,
            # Route through _on_config_changed so the partition name is
            # reconciled with the config before the info is sent.
            self.on.config_changed: self._on_config_changed,
            self._slurmd_peer.on.slurmd_peer_available:
            self._on_send_slurmd_info,
            self._slurmd.on.slurm_config_available:
            self._on_check_status_and_write_config,
            self.on.set_node_state_action: self._on_set_node_state_action,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_config_changed(self, event):
        """Reconcile the partition name, then (re)send the slurmd info."""
        self.get_set_return_partition_name()
        self._on_send_slurmd_info(event)

    def _on_install(self, event):
        """Install slurm and record that the installation happened."""
        self._slurm_manager.install()
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("Slurm Installed")

    def _on_upgrade(self, event):
        """Delegate the charm upgrade to the slurm manager."""
        self._slurm_manager.upgrade()

    def _on_set_node_state_action(self, event):
        """Store the requested node state and resend the slurmd info.

        The "node-state" action parameter is parsed later as a comma
        separated list of ``hostname=state`` items.
        """
        self._stored.user_node_state = event.params["node-state"]
        # Action events cannot be deferred, so report the problem on the
        # action instead of going through the deferring event handler.
        if not self._send_slurmd_info():
            event.fail(
                "Node state stored, but the slurmd info could not be sent "
                "yet; it will be applied the next time the info is sent."
            )

    def _send_slurmd_info(self):
        """Set the partition info on the application relation data.

        Returns:
            True when the info was set or when this unit is not the
            leader (nothing to do); False when the leader could not set
            it yet and the caller should retry later.
        """
        if not self.framework.model.unit.is_leader():
            return True
        if not self._slurmd.is_joined:
            return False

        partition = self._assemble_partition()
        if not partition:
            return False

        self._slurmd.set_slurmd_info_on_app_relation_data(partition)
        return True

    def _on_send_slurmd_info(self, event):
        """Send the slurmd info, deferring the event if not possible yet."""
        if not self._send_slurmd_info():
            event.defer()

    def _on_check_status_and_write_config(self, event):
        """Render the slurm config and restart once the unit is ready."""
        if not self._check_status():
            event.defer()
            return
        # Convert to a plain dict before handing it to the manager.
        slurm_config = dict(self._slurmd.get_slurm_config())
        self._slurm_manager.render_config_and_restart(slurm_config)
        self.unit.status = ActiveStatus("Slurmd Available")

    def _check_status(self):
        """Return True when slurm is installed and the config is available.

        Sets a blocked unit status when either precondition is missing.
        """
        # NOTE(review): config_available is never written in this class;
        # presumably one of the interfaces sets it - confirm.
        slurm_installed = self._stored.slurm_installed
        config_available = self._stored.config_available

        if slurm_installed and config_available:
            return True

        self.unit.status = BlockedStatus(
            "NEED RELATION TO SLURM CONFIGURATOR")
        return False

    def _assemble_partition(self):
        """Assemble the partition info.

        Returns:
            A dict with the keys 'inventory', 'partition_name',
            'partition_state' and 'partition_config'.
        """
        partition_name = self._stored.partition_name
        partition_config = self.model.config.get('partition-config')
        partition_state = self.model.config.get('partition-state')

        slurmd_info = self._assemble_slurmd_info()

        # NOTE(review): this dict is always truthy, so a caller's
        # `if partition` check passes even when 'inventory' is None.
        # Confirm whether sending should wait for the inventory.
        return {
            'inventory': slurmd_info,
            'partition_name': partition_name,
            'partition_state': partition_state,
            'partition_config': partition_config,
        }

    def _assemble_slurmd_info(self):
        """Apply mutations to nodes in the partition, return slurmd nodes.

        Returns:
            None when the slurmd-peer interface has no info, the info
            unchanged when no custom node state is stored, otherwise a
            new list in which the targeted nodes carry the custom state.
        """
        slurmd_info = self._slurmd_peer.get_slurmd_info()
        if not slurmd_info:
            return None

        # If the user has set custom state for nodes
        # ensure we update the state for the targeted nodes.
        user_node_state = self._stored.user_node_state
        if not user_node_state:
            return slurmd_info

        node_states = self._parse_node_states(user_node_state)
        return [
            self._apply_node_states(partition, node_states)
            for partition in slurmd_info
        ]

    @staticmethod
    def _parse_node_states(user_node_state):
        """Parse "host=state,host=state" into a {hostname: state} dict.

        Items without an "=" (e.g. left by a trailing comma) are skipped
        instead of raising IndexError.
        """
        node_states = {}
        for item in user_node_state.split(","):
            fields = item.split("=")
            if len(fields) < 2:
                continue
            node_states[fields[0]] = fields[1]
        return node_states

    @staticmethod
    def _apply_node_states(partition, node_states):
        """Return a deep copy of partition with node states overridden.

        Nodes whose state is overridden are placed after the untouched
        nodes of the inventory, both groups keeping their relative order.
        """
        updated = copy.deepcopy(partition)
        untouched = []
        overridden = []
        for slurmd_node in updated['inventory']:
            hostname = slurmd_node['hostname']
            if hostname in node_states:
                slurmd_node['state'] = node_states[hostname]
                overridden.append(slurmd_node)
            else:
                untouched.append(slurmd_node)
        updated['inventory'] = untouched + overridden
        return updated

    def get_set_return_partition_name(self):
        """Set the partition name and return it."""
        # Determine if a partition-name config exists, if so
        # ensure the partition_name known by the charm is consistent.
        # If no partition name has been specified then generate one.
        partition_name = self.model.config.get('partition-name')
        if partition_name:
            if partition_name != self._stored.partition_name:
                self._stored.partition_name = partition_name
        elif not self._stored.partition_name:
            self._stored.partition_name = f"juju-compute-{random_string()}"
        return self._stored.partition_name

    def get_slurm_component(self):
        """Return the slurm component."""
        return self._slurm_manager.slurm_component

    def get_hostname(self):
        """Return the hostname."""
        return self._slurm_manager.hostname

    def get_port(self):
        """Return the port."""
        return self._slurm_manager.port
Example #4
0
class SlurmdCharm(CharmBase):
    """Slurmd lifecycle events.

    Wires the charm's lifecycle, relation and action events to handlers
    that install slurm, write the munge key, publish the partition info
    on the slurmd relation and render the slurm configuration.
    """

    _stored = StoredState()

    def __init__(self, *args):
        """Init _stored attributes and interfaces, observe events."""
        super().__init__(*args)

        self._stored.set_default(
            munge_key_available=False,
            slurmd_restarted=False,
            # Read by _on_write_munge_key() and _check_status() to detect
            # "not installed yet"; needs a default so that read cannot
            # fail before the install handler has run.
            slurm_installed=False,
            user_node_state=str(),
            partition_name=str(),
        )

        self._nrpe = Nrpe(self, "nrpe-external-master")

        self._slurm_manager = SlurmManager(self, "slurmd")

        self._slurmd = Slurmd(self, "slurmd")
        self._slurmd_peer = SlurmdPeer(self, "slurmd-peer")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.upgrade_charm: self._on_upgrade,
            self.on.start: self._on_check_status_and_write_config,
            self.on.config_changed: self._on_config_changed,
            self._slurmd_peer.on.slurmd_peer_available:
            self._on_set_partition_info_on_app_relation_data,
            self._slurmd_peer.on.slurmd_peer_departed:
            self._on_set_partition_info_on_app_relation_data,
            self._slurmd.on.slurm_config_available:
            self._on_check_status_and_write_config,
            self._slurmd.on.slurm_config_unavailable:
            self._on_check_status_and_write_config,
            self._slurmd.on.restart_slurmd:
            self._on_restart_slurmd,
            self._slurmd.on.munge_key_available: self._on_write_munge_key,
            self.on.set_node_state_action: self._on_set_node_state_action,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        """Install slurm and, on the leader, settle the partition name."""
        self._slurm_manager.install(self.config["snapstore-channel"])

        if self.model.unit.is_leader():
            self._get_set_partition_name()
            logger.debug("PARTITION_NAME: %s", self._stored.partition_name)
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("Slurm installed")

    def _on_upgrade(self, event):
        """Upgrade slurm once the unit is ready, otherwise defer."""
        slurm_config = self._check_status()
        if not slurm_config:
            event.defer()
            return

        self._slurm_manager.upgrade(
            slurm_config,
            self.config["snapstore-channel"]
        )

    def _on_config_changed(self, event):
        """On the leader, refresh the partition name and republish it."""
        if self.model.unit.is_leader():
            self._get_set_partition_name()
            if self._check_status():
                self._on_set_partition_info_on_app_relation_data(
                    event
                )

    def _on_write_munge_key(self, event):
        """Write the munge key and restart munged once slurm is installed."""
        if not self._stored.slurm_installed:
            event.defer()
            return
        munge_key = self._slurmd.get_stored_munge_key()
        self._slurm_manager.configure_munge_key(munge_key)
        self._slurm_manager.restart_munged()
        self._stored.munge_key_available = True

    def _on_check_status_and_write_config(self, event):
        """Render the slurm configs once the unit is ready.

        Defers the event while _check_status() reports the unit as not
        ready.
        """
        slurm_config = self._check_status()
        if not slurm_config:
            event.defer()
            return

        # TODO: configless mode - when slurm_config['configless'] is set,
        # call self._slurm_manager.configure_slurmctld_hostname() with
        # slurm_config['active_controller_hostname'] and restart the
        # slurm component instead of rendering the configs.

        # _check_status() already returns a plain dict (not a StoredDict),
        # so it can be rendered as is.
        self._slurm_manager.render_slurm_configs(slurm_config)

        # Only restart slurmd the first time the node is brought up.
        if not self._stored.slurmd_restarted:
            self._slurm_manager.restart_slurm_component()
            self._stored.slurmd_restarted = True

        self.unit.status = ActiveStatus("slurmd available")

    def _on_restart_slurmd(self, event):
        """Restart the slurm component."""
        self._slurm_manager.restart_slurm_component()

    def _check_status(self):
        """Return the slurm config if the unit is ready, else None.

        The unit is ready when the slurmd relation is joined and the
        munge key, the slurm config and the slurm installation are all
        available. Sets a blocked or waiting unit status otherwise.
        """
        munge_key_available = self._stored.munge_key_available
        slurm_installed = self._stored.slurm_installed
        slurm_config = self._slurmd.get_stored_slurm_config()

        if not self._slurmd.is_joined:
            self.unit.status = BlockedStatus(
                "Needed relations: slurm-configurator"
            )
            return None

        if not (munge_key_available and slurm_config and slurm_installed):
            self.unit.status = WaitingStatus(
                "Waiting on: configuration"
            )
            return None

        return dict(slurm_config)

    def _on_set_node_state_action(self, event):
        """Store the requested node state and republish partition info.

        The "node-state" action parameter is parsed later as a comma
        separated list of ``hostname=state`` items.
        """
        self._stored.user_node_state = event.params["node-state"]
        # Action events cannot be deferred, so report the problem on the
        # action instead of going through the deferring event handler.
        if not self._publish_partition_info():
            event.fail(
                "Node state stored, but the partition info could not be "
                "set yet; it will be applied the next time it is set."
            )

    def _publish_partition_info(self):
        """Set the partition info on the application relation data.

        Returns:
            True when the info was set or when this unit is not the
            leader (nothing to do); False when the leader could not set
            it yet and the caller should retry later.
        """
        # Only the leader can set data on the relation.
        if not self.framework.model.unit.is_leader():
            return True
        # The callers shouldn't fire if the relation isn't made,
        # but keep this extra check just in case.
        if not self._slurmd.is_joined:
            return False

        partition = self._assemble_partition()
        if not partition:
            return False

        self._slurmd.set_partition_info_on_app_relation_data(partition)
        return True

    def _on_set_partition_info_on_app_relation_data(self, event):
        """Set the slurm partition info on the application relation data.

        Defers the event when the leader cannot set the info yet.
        """
        if not self._publish_partition_info():
            event.defer()

    def _assemble_partition(self):
        """Assemble the partition info.

        Returns:
            A dict with the keys "inventory", "partition_name",
            "partition_state" and "partition_config".
        """
        partition_name = self._stored.partition_name
        partition_config = self.config.get("partition-config")
        partition_state = self.config.get("partition-state")

        slurmd_inventory = self._assemble_slurmd_inventory()

        # NOTE(review): this dict is always truthy, so a caller's
        # `if partition` check passes even when "inventory" is None.
        # Confirm whether publishing should wait for the inventory.
        return {
            "inventory": slurmd_inventory,
            "partition_name": partition_name,
            "partition_state": partition_state,
            "partition_config": partition_config,
        }

    def _assemble_slurmd_inventory(self):
        """Apply mutations to nodes in the partition, return slurmd nodes.

        Returns:
            None when the slurmd-peer interface has no inventory, the
            inventory unchanged when no custom node state is stored,
            otherwise a new list in which the targeted nodes carry the
            custom state. The list obtained from the peer interface is
            not modified.
        """
        slurmd_inventory = self._slurmd_peer.get_slurmd_inventory()
        if not slurmd_inventory:
            return None

        # If the user has set custom state for nodes
        # ensure we update the state for the targeted nodes.
        user_node_state = self._stored.user_node_state
        if not user_node_state:
            return slurmd_inventory

        node_states = self._parse_node_states(user_node_state)
        return [
            self._apply_node_states(partition, node_states)
            for partition in slurmd_inventory
        ]

    @staticmethod
    def _parse_node_states(user_node_state):
        """Parse "host=state,host=state" into a {hostname: state} dict.

        Items without an "=" are skipped (with a warning when they are
        not empty) instead of raising IndexError.
        """
        node_states = {}
        for item in user_node_state.split(","):
            fields = item.split("=")
            if len(fields) < 2:
                if item:
                    logger.warning(
                        "Ignoring malformed node-state item: %r", item
                    )
                continue
            node_states[fields[0]] = fields[1]
        return node_states

    @staticmethod
    def _apply_node_states(partition, node_states):
        """Return a deep copy of partition with node states overridden.

        Nodes whose state is overridden are placed after the untouched
        nodes of the inventory, both groups keeping their relative order.
        """
        updated = copy.deepcopy(partition)
        untouched = []
        overridden = []
        for slurmd_node in updated["inventory"]:
            hostname = slurmd_node["hostname"]
            if hostname in node_states:
                slurmd_node["state"] = node_states[hostname]
                overridden.append(slurmd_node)
            else:
                untouched.append(slurmd_node)
        updated["inventory"] = untouched + overridden
        return updated

    def _get_set_partition_name(self):
        """Set the partition name."""
        # Determine if a partition-name config exists, if so
        # ensure the self._stored.partition_name is consistent with the
        # supplied config.
        # If no partition name has been specified then generate one.
        partition_name = self.config.get("partition-name")
        if partition_name:
            if partition_name != self._stored.partition_name:
                self._stored.partition_name = partition_name
        elif not self._stored.partition_name:
            self._stored.partition_name = f"juju-compute-{random_string()}"

    def get_partition_name(self):
        """Return the partition_name."""
        return self._stored.partition_name

    def get_slurm_component(self):
        """Return the slurm component."""
        return self._slurm_manager.slurm_component

    def get_hostname(self):
        """Return the hostname."""
        return self._slurm_manager.hostname

    def get_port(self):
        """Return the port."""
        return self._slurm_manager.port