Example 1
    def __init__(self, *args):
        """Init _stored attributes and interfaces, observe events."""
        super().__init__(*args)

        self._stored.set_default(
            user_node_state=str(),
            partition_name=str(),
            config_available=False,
        )

        self._nrpe = Nrpe(self, "nrpe-external-master")

        self._slurm_manager = SlurmManager(self, "slurmd")

        self._slurmd = Slurmd(self, "slurmd")
        self._slurmd_peer = SlurmdPeer(self, "slurmd-peer")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.upgrade_charm: self._on_upgrade,
            self.on.config_changed: self._on_send_slurmd_info,
            self._slurmd_peer.on.slurmd_peer_available:
            self._on_send_slurmd_info,
            self._slurmd.on.slurm_config_available:
            self._on_check_status_and_write_config,
            self.on.set_node_state_action: self._on_set_node_state_action,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)
Example 2
    def __init__(self, *args):
        """Init _stored attributes and interfaces, observe events."""
        super().__init__(*args)

        self._stored.set_default(nhc_conf=str(),
                                 slurm_installed=False,
                                 slurmctld_available=False,
                                 slurmctld_started=False,
                                 cluster_name=str())

        self._slurm_manager = SlurmManager(self, "slurmd")
        self._fluentbit = FluentbitClient(self, "fluentbit")

        # Interface to slurmctld; there should be only one slurmctld per slurmd app
        self._slurmd = Slurmd(self, "slurmd")
        self._slurmd_peer = SlurmdPeer(self, "slurmd-peer")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.upgrade_charm: self._on_upgrade,
            self.on.update_status: self._on_update_status,
            self.on.config_changed: self._on_config_changed,
            self.on.slurmctld_started: self._on_slurmctld_started,
            self.on.slurmd_start: self._on_slurmd_start,
            self.on.check_etcd: self._on_check_etcd,
            self._slurmd.on.slurmctld_available: self._on_slurmctld_available,
            self._slurmd.on.slurmctld_unavailable:
            self._on_slurmctld_unavailable,
            # fluentbit
            self.on["fluentbit"].relation_created:
            self._on_configure_fluentbit,
            # actions
            self.on.version_action: self._on_version_action,
            self.on.node_configured_action: self._on_node_configured_action,
            self.on.get_node_inventory_action:
            self._on_get_node_inventory_action,
            self.on.show_nhc_config_action: self._on_show_nhc_config,
            # infiniband actions
            self.on.get_infiniband_repo_action: self.get_infiniband_repo,
            self.on.set_infiniband_repo_action: self.set_infiniband_repo,
            self.on.install_infiniband_action: self.install_infiniband,
            self.on.uninstall_infiniband_action: self.uninstall_infiniband,
            self.on.start_infiniband_action: self.start_infiniband,
            self.on.enable_infiniband_action: self.enable_infiniband,
            self.on.stop_infiniband_action: self.stop_infiniband,
            self.on.is_active_infiniband_action: self.is_active_infiniband,
            # nvidia actions
            self.on.nvidia_repo_action: self.nvidia_repo,
            self.on.nvidia_package_action: self.nvidia_package,
            self.on.nvidia_install_action: self.nvidia_install,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)
Example 3
    def __init__(self, *args):
        """Init charm, _stored defaults, interfaces and observe events."""
        super().__init__(*args)

        self._stored.set_default(
            default_partition=str(),
            munge_key=str(),
            slurm_installed=False,
            slurmctld_available=False,
            slurmdbd_available=False,
            slurmd_available=False,
            slurmrestd_available=False,
        )

        self._elasticsearch = Elasticsearch(self, "elasticsearch")
        self._grafana = GrafanaSource(self, "grafana-source")
        self._influxdb = InfluxDB(self, "influxdb-api")
        self._nhc = Nhc(self, "nhc")
        self._slurmrestd = Slurmrestd(self, "slurmrestd")
        self._slurm_manager = SlurmManager(self, "slurmd")
        self._slurmctld = Slurmctld(self, "slurmctld")
        self._slurmdbd = Slurmdbd(self, "slurmdbd")
        self._slurmd = Slurmd(self, "slurmd")
        self._prolog_epilog = PrologEpilog(self, "prolog-epilog")

        # #### Charm lifecycle events #### #
        event_handler_bindings = {
            # #### Juju lifecycle events #### #
            self.on.install:
            self._on_install,

            # self.on.start:
            # self._on_check_status_and_write_config,
            self.on.config_changed:
            self._on_check_status_and_write_config,
            self.on.upgrade_charm:
            self._on_upgrade,

            # ######## Addons lifecycle events ######## #
            self._elasticsearch.on.elasticsearch_available:
            self._on_check_status_and_write_config,
            self._elasticsearch.on.elasticsearch_unavailable:
            self._on_check_status_and_write_config,
            self._grafana.on.grafana_available:
            self._on_grafana_available,
            self._influxdb.on.influxdb_available:
            self._on_influxdb_available,
            self._influxdb.on.influxdb_unavailable:
            self._on_check_status_and_write_config,
            self._nhc.on.nhc_bin_available:
            self._on_check_status_and_write_config,

            # ######## Slurm component lifecycle events ######## #
            self._slurmctld.on.slurmctld_available:
            self._on_check_status_and_write_config,
            self._slurmctld.on.slurmctld_unavailable:
            self._on_check_status_and_write_config,
            self._slurmdbd.on.slurmdbd_available:
            self._on_check_status_and_write_config,
            self._slurmdbd.on.slurmdbd_unavailable:
            self._on_check_status_and_write_config,
            self._slurmd.on.slurmd_available:
            self._on_check_status_and_write_config,
            self._slurmd.on.slurmd_unavailable:
            self._on_check_status_and_write_config,
            self._slurmrestd.on.slurmrestd_available:
            self._on_check_status_and_write_config,
            self._slurmrestd.on.slurmrestd_unavailable:
            self._on_check_status_and_write_config,
            self._prolog_epilog.on.prolog_epilog_available:
            self._on_check_status_and_write_config,
            self._prolog_epilog.on.prolog_epilog_unavailable:
            self._on_check_status_and_write_config,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)
Example 4
class SlurmConfiguratorCharm(CharmBase):
    """Facilitate slurm configuration operations."""

    _stored = StoredState()

    def __init__(self, *args):
        """Init charm, _stored defaults, interfaces and observe events."""
        super().__init__(*args)

        self._stored.set_default(
            default_partition=str(),
            munge_key=str(),
            slurm_installed=False,
            slurmctld_available=False,
            slurmdbd_available=False,
            slurmd_available=False,
            slurmrestd_available=False,
        )

        self._elasticsearch = Elasticsearch(self, "elasticsearch")
        self._grafana = GrafanaSource(self, "grafana-source")
        self._influxdb = InfluxDB(self, "influxdb-api")
        self._nhc = Nhc(self, "nhc")
        self._slurmrestd = Slurmrestd(self, "slurmrestd")
        self._slurm_manager = SlurmManager(self, "slurmd")
        self._slurmctld = Slurmctld(self, "slurmctld")
        self._slurmdbd = Slurmdbd(self, "slurmdbd")
        self._slurmd = Slurmd(self, "slurmd")
        self._prolog_epilog = PrologEpilog(self, "prolog-epilog")

        # #### Charm lifecycle events #### #
        event_handler_bindings = {
            # #### Juju lifecycle events #### #
            self.on.install:
            self._on_install,

            # self.on.start:
            # self._on_check_status_and_write_config,
            self.on.config_changed:
            self._on_check_status_and_write_config,
            self.on.upgrade_charm:
            self._on_upgrade,

            # ######## Addons lifecycle events ######## #
            self._elasticsearch.on.elasticsearch_available:
            self._on_check_status_and_write_config,
            self._elasticsearch.on.elasticsearch_unavailable:
            self._on_check_status_and_write_config,
            self._grafana.on.grafana_available:
            self._on_grafana_available,
            self._influxdb.on.influxdb_available:
            self._on_influxdb_available,
            self._influxdb.on.influxdb_unavailable:
            self._on_check_status_and_write_config,
            self._nhc.on.nhc_bin_available:
            self._on_check_status_and_write_config,

            # ######## Slurm component lifecycle events ######## #
            self._slurmctld.on.slurmctld_available:
            self._on_check_status_and_write_config,
            self._slurmctld.on.slurmctld_unavailable:
            self._on_check_status_and_write_config,
            self._slurmdbd.on.slurmdbd_available:
            self._on_check_status_and_write_config,
            self._slurmdbd.on.slurmdbd_unavailable:
            self._on_check_status_and_write_config,
            self._slurmd.on.slurmd_available:
            self._on_check_status_and_write_config,
            self._slurmd.on.slurmd_unavailable:
            self._on_check_status_and_write_config,
            self._slurmrestd.on.slurmrestd_available:
            self._on_check_status_and_write_config,
            self._slurmrestd.on.slurmrestd_unavailable:
            self._on_check_status_and_write_config,
            self._prolog_epilog.on.prolog_epilog_available:
            self._on_check_status_and_write_config,
            self._prolog_epilog.on.prolog_epilog_unavailable:
            self._on_check_status_and_write_config,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        """Install the slurm snap and set the munge key."""
        self._slurm_manager.install()
        self._stored.munge_key = self._slurm_manager.get_munge_key()
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("Slurm Installed")

    def _on_upgrade(self, event):
        """Upgrade the charm."""
        slurm_config = self._assemble_slurm_config()

        if not slurm_config:
            self.unit.status = BlockedStatus(
                "Cannot generate slurm_config, defering upgrade.")
            event.defer()
            return

        self._slurm_manager.upgrade(slurm_config)

    def _on_grafana_available(self, event):
        """Create the grafana-source if we are the leader and have influxdb."""
        leader = self._is_leader()
        influxdb_info = self._get_influxdb_info()
        grafana = self._grafana

        if leader and influxdb_info:
            grafana.set_grafana_source_info(influxdb_info)

    def _on_influxdb_available(self, event):
        """Create the grafana-source if we have all the things."""
        grafana = self._grafana
        influxdb_info = self._get_influxdb_info()
        leader = self._is_leader()

        if leader and grafana.is_joined and influxdb_info:
            grafana.set_grafana_source_info(influxdb_info)

        self._on_check_status_and_write_config(event)

    def _on_check_status_and_write_config(self, event):
        """Check that we have what we need before we proceed."""
        if not self._check_status():
            event.defer()
            return

        # Generate the slurm_config
        slurm_config = self._assemble_slurm_config()

        if not slurm_config:
            self.unit.status = BlockedStatus(
                "Cannot generate slurm_config - defering event.")
            event.defer()
            return

        self._slurmctld.set_slurm_config_on_app_relation_data(slurm_config)
        self._slurmd.set_slurm_config_on_app_relation_data(slurm_config)
        if self._stored.slurmrestd_available:
            self._slurmrestd.set_slurm_config_on_app_relation_data(
                slurm_config)
        self._slurm_manager.render_config_and_restart(
            {**slurm_config, 'munge_key': self.get_munge_key()})

    def _assemble_slurm_config(self):
        """Assemble and return the slurm config."""
        slurmctld_info = self._slurmctld.get_slurmctld_info()
        slurmdbd_info = self._slurmdbd.get_slurmdbd_info()
        slurmd_info = self._slurmd.get_slurmd_info()

        if not (slurmd_info and slurmctld_info and slurmdbd_info):
            return {}

        addons_info = self._assemble_addons()
        partitions_info = self._assemble_partitions(slurmd_info)

        logger.debug(addons_info)
        logger.debug(partitions_info)
        logger.debug(slurmctld_info)
        logger.debug(slurmdbd_info)

        return {
            'munge_key': self._stored.munge_key,
            'partitions': partitions_info,
            **slurmctld_info,
            **slurmdbd_info,
            **addons_info,
            **self.model.config,
        }

    def _assemble_partitions(self, slurmd_info):
        """Make any needed modifications to partition data."""
        slurmd_info_tmp = copy.deepcopy(slurmd_info)

        for partition in slurmd_info:

            # Deep copy the partition to a tmp var so we can modify it as
            # needed whilst not modifying the object we are iterating over.
            partition_tmp = copy.deepcopy(partition)
            # Extract the partition_name from the partition and from the charm
            # config.
            partition_name = partition['partition_name']
            default_partition_from_config = self.model.config.get(
                'default_partition')

            # Check whether a default_partition is defined in the charm
            # config. If the user hasn't provided one, fall back to the
            # stored default partition (the first related slurmd
            # application).
            if not default_partition_from_config:
                if partition['partition_name'] ==\
                   self._stored.default_partition:
                    partition_tmp['partition_default'] = 'YES'
            else:
                if default_partition_from_config == partition_name:
                    partition_tmp['partition_default'] = 'YES'

            slurmd_info_tmp.remove(partition)
            slurmd_info_tmp.append(partition_tmp)

        return slurmd_info_tmp

    def _assemble_addons(self):
        """Assemble any addon components."""
        acct_gather = self._get_influxdb_info()
        elasticsearch_ingress = \
            self._elasticsearch.get_elasticsearch_ingress()
        nhc_info = self._nhc.get_nhc_info()
        prolog_epilog = self._prolog_epilog.get_prolog_epilog()

        ctxt = dict()

        if prolog_epilog:
            ctxt['prolog_epilog'] = prolog_epilog

        if acct_gather:
            ctxt['acct_gather'] = acct_gather
            acct_gather_custom = self.model.config.get('acct_gather_custom')
            if acct_gather_custom:
                ctxt['acct_gather']['custom'] = acct_gather_custom

        if nhc_info:
            ctxt['nhc'] = {
                'nhc_bin': nhc_info['nhc_bin'],
                'health_check_interval': nhc_info['health_check_interval'],
                'health_check_node_state': nhc_info['health_check_node_state'],
            }

        if elasticsearch_ingress:
            ctxt['elasticsearch_address'] = elasticsearch_ingress

        return ctxt

    def _check_status(self):
        """Check that the core components we need exist."""
        slurmctld_available = self._stored.slurmctld_available
        slurmdbd_available = self._stored.slurmdbd_available
        slurmd_available = self._stored.slurmd_available
        slurm_installed = self._stored.slurm_installed
        default_partition = self._stored.default_partition

        deps = [
            default_partition,
            slurmctld_available,
            slurmdbd_available,
            slurmd_available,
            slurm_installed,
        ]

        if not all(deps):
            if not slurmctld_available:
                self.unit.status = BlockedStatus("NEED RELATION TO SLURMCTLD")
            elif not slurmdbd_available:
                self.unit.status = BlockedStatus("NEED RELATION TO SLURMDBD")
            elif not slurmd_available:
                self.unit.status = BlockedStatus("NEED RELATION TO SLURMD")
            elif not slurm_installed:
                self.unit.status = BlockedStatus("SLURM NOT INSTALLED")
            else:
                self.unit.status = BlockedStatus("PARTITION NAME UNAVAILABLE")
            return False
        else:
            self.unit.status = ActiveStatus("")
            return True

    def _get_influxdb_info(self):
        """Return influxdb info."""
        return self._influxdb.get_influxdb_info()

    def _is_leader(self):
        return self.model.unit.is_leader()

    def get_munge_key(self):
        """Return the slurmdbd_info from stored state."""
        return self._stored.munge_key

    def get_default_partition(self):
        """Return self._stored.default_partition."""
        return self._stored.default_partition

    def is_slurm_installed(self):
        """Return true/false based on whether or not slurm is installed."""
        return self._stored.slurm_installed

    def set_slurmctld_available(self, slurmctld_available):
        """Set slurmctld_available."""
        self._stored.slurmctld_available = slurmctld_available

    def set_slurmdbd_available(self, slurmdbd_available):
        """Set slurmdbd_available."""
        self._stored.slurmdbd_available = slurmdbd_available

    def set_default_partition(self, partition_name):
        """Set self._stored.default_partition."""
        self._stored.default_partition = partition_name

    def set_slurmd_available(self, slurmd_available):
        """Set slurmd_available."""
        self._stored.slurmd_available = slurmd_available

    def set_slurmrestd_available(self, slurmrestd_available):
        """Set slurmrestd_available."""
        self._stored.slurmrestd_available = slurmrestd_available
Example 5
    def __init__(self, *args):
        """Init _stored attributes and interfaces, observe events."""
        super().__init__(*args)

        self._stored.set_default(
            jwt_key=str(),
            munge_key=str(),
            slurm_installed=False,
            slurmd_available=False,
            slurmrestd_available=False,
            slurmdbd_available=False,
            down_nodes=list(),
        )

        self._slurm_manager = SlurmManager(self, "slurmctld")

        self._slurmd = Slurmd(self, "slurmd")
        self._slurmdbd = Slurmdbd(self, "slurmdbd")
        self._slurmrestd = Slurmrestd(self, "slurmrestd")
        self._slurmctld_peer = SlurmctldPeer(self, "slurmctld-peer")
        self._prolog_epilog = PrologEpilog(self, "prolog-epilog")

        self._grafana = GrafanaSource(self, "grafana-source")
        self._influxdb = InfluxDB(self, "influxdb-api")
        self._elasticsearch = Elasticsearch(self, "elasticsearch")
        self._fluentbit = FluentbitClient(self, "fluentbit")

        self._user_group = UserGroupProvides(self, "user-group")
        self._etcd = EtcdOps()

        event_handler_bindings = {
            self.on.install:
            self._on_install,
            self.on.upgrade_charm:
            self._on_upgrade,
            self.on.update_status:
            self._on_update_status,
            self.on.config_changed:
            self._on_write_slurm_config,
            self.on.leader_elected:
            self._on_leader_elected,
            # slurm component lifecycle events
            self._slurmdbd.on.slurmdbd_available:
            self._on_slurmdbd_available,
            self._slurmdbd.on.slurmdbd_unavailable:
            self._on_slurmdbd_unavailable,
            self._slurmd.on.slurmd_available:
            self._on_write_slurm_config,
            self._slurmd.on.slurmd_unavailable:
            self._on_write_slurm_config,
            self._slurmd.on.slurmd_departed:
            self._on_write_slurm_config,
            self._slurmrestd.on.slurmrestd_available:
            self._on_slurmrestd_available,
            self._slurmrestd.on.slurmrestd_unavailable:
            self._on_write_slurm_config,
            # NOTE: a second slurmctld should get the jwt/munge keys and
            # configure them.
            self._slurmctld_peer.on.slurmctld_peer_available:
            self._on_write_slurm_config,
            # fluentbit
            self.on["fluentbit"].relation_created:
            self._on_fluentbit_relation_created,
            # Addons lifecycle events
            self._prolog_epilog.on.prolog_epilog_available:
            self._on_write_slurm_config,
            self._prolog_epilog.on.prolog_epilog_unavailable:
            self._on_write_slurm_config,
            self._grafana.on.grafana_available:
            self._on_grafana_available,
            self._influxdb.on.influxdb_available:
            self._on_influxdb_available,
            self._influxdb.on.influxdb_unavailable:
            self._on_write_slurm_config,
            self._elasticsearch.on.elasticsearch_available:
            self._on_elasticsearch_available,
            self._elasticsearch.on.elasticsearch_unavailable:
            self._on_write_slurm_config,
            self._user_group.on.create_user_group:
            self._on_create_user_group,
            self._user_group.on.remove_user_group:
            self._on_remove_user_group,
            # actions
            self.on.show_current_config_action:
            self._on_show_current_config,
            self.on.drain_action:
            self._drain_nodes_action,
            self.on.resume_action:
            self._resume_nodes_action,
            self.on.influxdb_info_action:
            self._infludb_info_action,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)
Example 6
class SlurmctldCharm(CharmBase):
    """Slurmctld lifecycle events."""

    _stored = StoredState()

    def __init__(self, *args):
        """Init _stored attributes and interfaces, observe events."""
        super().__init__(*args)

        self._stored.set_default(
            jwt_key=str(),
            munge_key=str(),
            slurm_installed=False,
            slurmd_available=False,
            slurmrestd_available=False,
            slurmdbd_available=False,
            down_nodes=list(),
        )

        self._slurm_manager = SlurmManager(self, "slurmctld")

        self._slurmd = Slurmd(self, "slurmd")
        self._slurmdbd = Slurmdbd(self, "slurmdbd")
        self._slurmrestd = Slurmrestd(self, "slurmrestd")
        self._slurmctld_peer = SlurmctldPeer(self, "slurmctld-peer")
        self._prolog_epilog = PrologEpilog(self, "prolog-epilog")

        self._grafana = GrafanaSource(self, "grafana-source")
        self._influxdb = InfluxDB(self, "influxdb-api")
        self._elasticsearch = Elasticsearch(self, "elasticsearch")
        self._fluentbit = FluentbitClient(self, "fluentbit")

        self._user_group = UserGroupProvides(self, "user-group")
        self._etcd = EtcdOps()

        event_handler_bindings = {
            self.on.install:
            self._on_install,
            self.on.upgrade_charm:
            self._on_upgrade,
            self.on.update_status:
            self._on_update_status,
            self.on.config_changed:
            self._on_write_slurm_config,
            self.on.leader_elected:
            self._on_leader_elected,
            # slurm component lifecycle events
            self._slurmdbd.on.slurmdbd_available:
            self._on_slurmdbd_available,
            self._slurmdbd.on.slurmdbd_unavailable:
            self._on_slurmdbd_unavailable,
            self._slurmd.on.slurmd_available:
            self._on_write_slurm_config,
            self._slurmd.on.slurmd_unavailable:
            self._on_write_slurm_config,
            self._slurmd.on.slurmd_departed:
            self._on_write_slurm_config,
            self._slurmrestd.on.slurmrestd_available:
            self._on_slurmrestd_available,
            self._slurmrestd.on.slurmrestd_unavailable:
            self._on_write_slurm_config,
            # NOTE: a second slurmctld should get the jwt/munge keys and
            # configure them.
            self._slurmctld_peer.on.slurmctld_peer_available:
            self._on_write_slurm_config,
            # fluentbit
            self.on["fluentbit"].relation_created:
            self._on_fluentbit_relation_created,
            # Addons lifecycle events
            self._prolog_epilog.on.prolog_epilog_available:
            self._on_write_slurm_config,
            self._prolog_epilog.on.prolog_epilog_unavailable:
            self._on_write_slurm_config,
            self._grafana.on.grafana_available:
            self._on_grafana_available,
            self._influxdb.on.influxdb_available:
            self._on_influxdb_available,
            self._influxdb.on.influxdb_unavailable:
            self._on_write_slurm_config,
            self._elasticsearch.on.elasticsearch_available:
            self._on_elasticsearch_available,
            self._elasticsearch.on.elasticsearch_unavailable:
            self._on_write_slurm_config,
            self._user_group.on.create_user_group:
            self._on_create_user_group,
            self._user_group.on.remove_user_group:
            self._on_remove_user_group,
            # actions
            self.on.show_current_config_action:
            self._on_show_current_config,
            self.on.drain_action:
            self._drain_nodes_action,
            self.on.resume_action:
            self._resume_nodes_action,
            self.on.influxdb_info_action:
            self._infludb_info_action,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    @property
    def hostname(self):
        """Return the hostname."""
        return self._slurm_manager.hostname

    @property
    def port(self):
        """Return the port."""
        return self._slurm_manager.port

    @property
    def cluster_name(self) -> str:
        """Return the cluster name."""
        return self.config.get("cluster-name")

    @property
    def _slurmctld_info(self):
        return self._slurmctld_peer.get_slurmctld_info()

    @property
    def slurmdbd_info(self):
        """Return slurmdbd_info from relation."""
        return self._slurmdbd.get_slurmdbd_info()

    @property
    def _slurmd_info(self) -> list:
        return self._slurmd.get_slurmd_info()

    @property
    def _cluster_info(self):
        """Assemble information about the cluster."""
        cluster_info = {}
        cluster_info['cluster_name'] = self.config.get('cluster-name')
        cluster_info['custom_config'] = self.config.get('custom-config')
        cluster_info['proctrack_type'] = self.config.get('proctrack-type')
        cluster_info['cgroup_config'] = self.config.get('cgroup-config')

        interval = self.config.get('health-check-interval')
        state = self.config.get('health-check-state')
        nhc = self._slurm_manager.slurm_config_nhc_values(interval, state)
        cluster_info.update(nhc)

        return cluster_info

    @property
    def _addons_info(self):
        """Assemble addons for slurm.conf."""
        return {
            **self._assemble_prolog_epilog(),
            **self._assemble_acct_gather_addon(),
            **self._assemble_elastic_search_addon()
        }

    def _assemble_prolog_epilog(self) -> dict:
        """Generate the prolog_epilog section of the addons."""
        logger.debug("## Generating prolog epilog configuration")

        prolog_epilog = self._prolog_epilog.get_prolog_epilog()

        if prolog_epilog:
            return {"prolog_epilog": prolog_epilog}
        else:
            return {}

    def _assemble_acct_gather_addon(self):
        """Generate the acct gather section of the addons."""
        logger.debug("## Generating acct gather configuration")

        addons = dict()

        influxdb_info = self._get_influxdb_info()
        if influxdb_info:
            addons["acct_gather"] = influxdb_info
            addons["acct_gather"]["default"] = "all"
            addons["acct_gather_profile"] = "acct_gather_profile/influxdb"

        # It is possible to set up influxdb or hdf5 profiles without the
        # relation, using the custom-config section of slurm.conf, so we need
        # to support setting up the acct_gather configuration for that
        # scenario as well.
        acct_gather_custom = self.config.get("acct-gather-custom")
        if acct_gather_custom:
            if not addons.get("acct_gather"):
                addons["acct_gather"] = dict()

            addons["acct_gather"]["custom"] = acct_gather_custom

        addons["acct_gather_frequency"] = self.config.get(
            "acct-gather-frequency")

        return addons

    def _assemble_elastic_search_addon(self):
        """Generate the acct gather section of the addons."""
        logger.debug("## Generating elastic search addon configuration")
        addon = dict()

        elasticsearch_ingress = self._elasticsearch.elasticsearch_ingress
        if elasticsearch_ingress:
            suffix = f"/{self.cluster_name}/jobcomp"
            addon = {
                "elasticsearch_address": f"{elasticsearch_ingress}{suffix}"
            }

        return addon

    def set_slurmd_available(self, flag: bool):
        """Set stored value of slurmd available."""
        self._stored.slurmd_available = flag

    def _set_slurmdbd_available(self, flag: bool):
        """Set stored value of slurmdbd available."""
        self._stored.slurmdbd_available = flag

    def set_slurmrestd_available(self, flag: bool):
        """Set stored value of slurmdrest available."""
        self._stored.slurmrestd_available = flag

    def _is_leader(self):
        return self.model.unit.is_leader()

    def is_slurm_installed(self):
        """Return true/false based on whether or not slurm is installed."""
        return self._stored.slurm_installed

    def _on_show_current_config(self, event):
        """Show current slurm.conf."""
        slurm_conf = self._slurm_manager.get_slurm_conf()
        event.set_results({"slurm.conf": slurm_conf})

    def _on_install(self, event):
        """Perform installation operations for slurmctld."""
        self.unit.set_workload_version(Path("version").read_text().strip())

        self.unit.status = WaitingStatus("Installing slurmctld")

        custom_repo = self.config.get("custom-slurm-repo")
        successful_installation = self._slurm_manager.install(custom_repo)

        if successful_installation:
            self._stored.slurm_installed = True

            # Store the munge_key and jwt_rsa key in the stored state.
            # NOTE: Use leadership settings instead of stored state when
            # leadership settings support becomes available in the framework.
            if self._is_leader():
                # NOTE: the backup controller should also have the jwt and
                #       munge keys configured. We should move this information
                #       to the peer relation.
                self._stored.jwt_rsa = self._slurm_manager.generate_jwt_rsa()
                self._stored.munge_key = self._slurm_manager.get_munge_key()
                self._slurm_manager.configure_jwt_rsa(self.get_jwt_rsa())
            else:
                # NOTE: the secondary slurmctld should get the jwt and munge
                #       keys from the peer relation here
                logger.debug("secondary slurmctld")

            # All slurmctld units should restart munged here to ensure munge
            # is working.
            self._slurm_manager.restart_munged()
        else:
            self.unit.status = BlockedStatus("Error installing slurmctld")
            event.defer()
            return

        logger.debug("## Retrieving etcd resource to install it")
        try:
            etcd_path = self.model.resources.fetch("etcd")
            logger.debug(f"## Found etcd resource: {etcd_path}")
        except ModelError:
            logger.error("## Missing etcd resource")
            self.unit.status = BlockedStatus("Missing etcd resource")
            event.defer()
            return
        self._etcd.install(etcd_path)

        self._check_status()

    def _on_fluentbit_relation_created(self, event):
        """Set up Fluentbit log forwarding."""
        logger.debug("## Configuring fluentbit")
        cfg = list()
        cfg.extend(self._slurm_manager.fluentbit_config_nhc)
        cfg.extend(self._slurm_manager.fluentbit_config_slurm)
        self._fluentbit.configure(cfg)

    def _on_upgrade(self, event):
        """Perform upgrade operations."""
        self.unit.set_workload_version(Path("version").read_text().strip())

    def _on_update_status(self, event):
        """Handle update status."""
        self._check_status()

    def _on_leader_elected(self, event: LeaderElectedEvent) -> None:
        logger.debug("## slurmctld - leader elected")
        self._etcd.start()

        # populate etcd with the nodelist
        slurm_config = self._assemble_slurm_config()
        accounted_nodes = self._assemble_all_nodes(
            slurm_config.get("partitions", []))
        logger.debug(
            f"## Sending to etcd list of accounted nodes: {accounted_nodes}")
        self._etcd.set_list_of_accounted_nodes(accounted_nodes)

    def _check_status(self):
        """Check for all relations and set appropriate status.

        This charm needs these conditions to be satisfied in order to be ready:
        - Slurm components installed.
        - Munge running.
        - slurmdbd node running.
        - slurmd inventory.
        """
        # NOTE: slurmd and slurmrestd are not needed for slurmctld to work,
        #       only for the cluster to operate. But we need slurmd inventory
        #       to assemble slurm.conf

        if self._slurm_manager.needs_reboot:
            self.unit.status = BlockedStatus("Machine needs reboot")
            return False

        if not self._stored.slurm_installed:
            self.unit.status = BlockedStatus("Error installing slurmctld")
            return False

        if (self._is_leader() and not self._etcd.is_active()):
            self.unit.status = WaitingStatus("Initializing charm")
            return False

        if not self._slurm_manager.check_munged():
            self.unit.status = BlockedStatus("Error configuring munge key")
            return False

        # statuses of mandatory components:
        # - joined: someone executed juju relate slurmctld foo
        # - available: the units exchanged data through the relation
        # NOTE: slurmrestd is not mandatory for the cluster to work, which is
        #       why it is not accounted for here
        statuses = {
            "slurmd": {
                "available": self._stored.slurmd_available,
                "joined": self._slurmd.is_joined
            },
            "slurmdbd": {
                "available": self._stored.slurmdbd_available,
                "joined": self._slurmdbd.is_joined
            }
        }

        relations_needed = list()
        waiting_on = list()
        for component in statuses.keys():
            if not statuses[component]["joined"]:
                relations_needed.append(component)
            if not statuses[component]["available"]:
                waiting_on.append(component)

        if len(relations_needed):
            msg = f"Need relations: {','.join(relations_needed)}"
            self.unit.status = BlockedStatus(msg)
            return False

        if len(waiting_on):
            msg = f"Wating on: {','.join(waiting_on)}"
            self.unit.status = WaitingStatus(msg)
            return False

        self.unit.status = ActiveStatus("slurmctld available")
        return True

    def get_munge_key(self):
        """Get the stored munge key."""
        return self._stored.munge_key

    def get_jwt_rsa(self):
        """Get the stored jwt_rsa key."""
        return self._stored.jwt_rsa

    def _assemble_partitions(self, slurmd_info):
        """Make any needed modifications to partition data."""
        slurmd_info_tmp = copy.deepcopy(slurmd_info)
        default_partition_from_config = self.config.get("default-partition")

        for partition in slurmd_info:
            # Deep copy the partition to a tmp var so we can modify it as
            # needed whilst not modifying the object we are iterating over.
            partition_tmp = copy.deepcopy(partition)
            # Extract the partition_name from the partition.
            partition_name = partition["partition_name"]

            # If the user has provided a default-partition in the charm
            # config, mark the matching partition as the default.
            if default_partition_from_config:
                if default_partition_from_config == partition_name:
                    partition_tmp["partition_default"] = "YES"

            slurmd_info_tmp.remove(partition)
            slurmd_info_tmp.append(partition_tmp)

        return slurmd_info_tmp

    def _assemble_slurm_config(self):
        """Assemble and return the slurm config."""
        logger.debug('## Assembling new slurm.conf')

        slurmctld_info = self._slurmctld_info
        slurmdbd_info = self.slurmdbd_info
        slurmd_info = self._slurmd_info
        cluster_info = self._cluster_info

        logger.debug("######## INFO")
        logger.debug(f'## slurmd: {slurmd_info}')
        logger.debug(f'## slurmctld_info: {slurmctld_info}')
        logger.debug(f'## slurmdbd_info: {slurmdbd_info}')
        logger.debug(f'## cluster_info: {cluster_info}')
        logger.debug("######## INFO - end")

        if not (slurmctld_info and slurmd_info and slurmdbd_info):
            return {}

        addons_info = self._addons_info
        partitions_info = self._assemble_partitions(slurmd_info)
        down_nodes = self._assemble_down_nodes(slurmd_info)

        logger.debug(f'#### addons: {addons_info}')
        logger.debug(f'#### partitions_info: {partitions_info}')
        logger.debug(f"#### Down nodes: {down_nodes}")

        return {
            "partitions": partitions_info,
            "down_nodes": down_nodes,
            **slurmctld_info,
            **slurmdbd_info,
            **addons_info,
            **cluster_info,
        }

    def _on_slurmrestd_available(self, event):
        """Set slurm_config on the relation when slurmrestd available."""
        if not self._check_status():
            event.defer()
            return

        slurm_config = self._assemble_slurm_config()

        if not slurm_config:
            self.unit.status = BlockedStatus(
                "Cannot generate slurm_config - defering event.")
            event.defer()
            return

        if self._stored.slurmrestd_available:
            self._slurmrestd.set_slurm_config_on_app_relation_data(
                slurm_config)
            self._slurmrestd.restart_slurmrestd()

    def _on_slurmdbd_available(self, event):
        self._set_slurmdbd_available(True)
        self._on_write_slurm_config(event)

    def _on_slurmdbd_unavailable(self, event):
        self._set_slurmdbd_available(False)
        self._check_status()

    def _on_write_slurm_config(self, event):
        """Check that we have what we need before we proceed."""
        logger.debug("### Slurmctld - _on_write_slurm_config()")

        # only the leader should write the config, restart, and scontrol reconf
        if not self._is_leader():
            return

        if not self._check_status():
            event.defer()
            return

        slurm_config = self._assemble_slurm_config()
        if slurm_config:
            self._slurm_manager.render_slurm_configs(slurm_config)

            # restart is needed if nodes are added/removed from the cluster
            self._slurm_manager.slurm_systemctl('restart')
            self._slurm_manager.slurm_cmd('scontrol', 'reconfigure')

            # send the list of hostnames to slurmd via etcd
            accounted_nodes = self._assemble_all_nodes(
                slurm_config["partitions"])
            self._etcd.set_list_of_accounted_nodes(accounted_nodes)

            # send the custom NHC parameters to all slurmd
            self._slurmd.set_nhc_params(self.config.get('health-check-params'))

            # check for "not new anymore" nodes, i.e., nodes that runned the
            # node-configured action. Those nodes are not anymore in the
            # DownNodes section in the slurm.conf, but we need to resume them
            # manually and update the internal cache
            down_nodes = slurm_config['down_nodes']
            configured_nodes = self._assemble_configured_nodes(down_nodes)
            logger.debug(f"### configured nodes: {configured_nodes}")
            self._resume_nodes(configured_nodes)
            self._stored.down_nodes = down_nodes.copy()

            # slurmrestd needs the slurm.conf file, so send it every time it changes
            if self._stored.slurmrestd_available:
                self._slurmrestd.set_slurm_config_on_app_relation_data(
                    slurm_config)
                # NOTE: scontrol reconfigure does not restart slurmrestd
                self._slurmrestd.restart_slurmrestd()
        else:
            logger.debug("## Should rewrite slurm.conf, but we don't have it. "
                         "Deferring.")
            event.defer()

    @staticmethod
    def _assemble_all_nodes(slurmd_info: list) -> List[str]:
        """Parse slurmd_info and return a list with all hostnames."""
        nodes = list()
        for partition in slurmd_info:
            for node in partition["inventory"]:
                nodes.append(node["node_name"])
        return nodes

    @staticmethod
    def _assemble_down_nodes(slurmd_info):
        """Parse partitions' nodes and assemble a list of DownNodes."""
        down_nodes = []
        for partition in slurmd_info:
            for node in partition["inventory"]:
                if node["new_node"]:
                    down_nodes.append(node["node_name"])

        return down_nodes

    def _assemble_configured_nodes(self, down_nodes):
        """Assemble list of nodes that are not new anymore.

        The new_node status is removed with an action; this method returns a
        list of nodes that were previously new but are not anymore.
        """
        configured_nodes = []
        for node in self._stored.down_nodes:
            if node not in down_nodes:
                configured_nodes.append(node)

        return configured_nodes

    def _resume_nodes(self, nodelist):
        """Run scontrol to resume the speficied node list."""
        nodes = ",".join(nodelist)
        update_cmd = f"update nodename={nodes} state=resume"
        self._slurm_manager.slurm_cmd('scontrol', update_cmd)

    def _on_grafana_available(self, event):
        """Create the grafana-source if we are the leader and have influxdb."""
        if not self._is_leader():
            return

        influxdb_info = self._get_influxdb_info()

        if influxdb_info:
            self._grafana.set_grafana_source_info(influxdb_info)
        else:
            logger.error(
                "## Can not set Grafana source: missing influxdb relation")

    def _on_influxdb_available(self, event):
        """Assemble addons to forward slurm data to influxdb."""
        self._on_write_slurm_config(event)

    def _on_elasticsearch_available(self, event):
        """Assemble addons to forward Slurm data to elasticsearch."""
        self._on_write_slurm_config(event)

    def _get_influxdb_info(self) -> dict:
        """Return influxdb info."""
        return self._influxdb.get_influxdb_info()

    def _drain_nodes_action(self, event):
        """Drain specified nodes."""
        nodes = event.params['nodename']
        reason = event.params['reason']

        logger.debug(f'#### Draining {nodes} because {reason}.')
        event.log(f'Draining {nodes} because {reason}.')

        try:
            cmd = f'scontrol update nodename={nodes} state=drain reason="{reason}"'
            subprocess.check_output(shlex.split(cmd))
            event.set_results({'status': 'draining', 'nodes': nodes})
        except subprocess.CalledProcessError as e:
            event.fail(message=f'Error draining {nodes}: {e.output}')

    def _resume_nodes_action(self, event):
        """Resume specified nodes."""
        nodes = event.params['nodename']

        logger.debug(f'#### Resuming {nodes}.')
        event.log(f'Resuming {nodes}.')

        try:
            cmd = f'scontrol update nodename={nodes} state=resume'
            subprocess.check_output(shlex.split(cmd))
            event.set_results({'status': 'resuming', 'nodes': nodes})
        except subprocess.CalledProcessError as e:
            event.fail(message=f'Error resuming {nodes}: {e.output}')

    def _infludb_info_action(self, event):
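        """Return the influxdb info as the action result."""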
        influxdb_info = self._get_influxdb_info()

        if not influxdb_info:
            influxdb_info = "not related"
        logger.debug(f"## InfluxDB-info action: {influxdb_info}")
        event.set_results({"influxdb": influxdb_info})

    def _on_create_user_group(self, event):
        """Create the user and group provided."""
        user = self._user_group.user_name
        user_uid = self._user_group.user_uid
        group = self._user_group.group_name

        # Create the group.
        try:
            subprocess.check_output(["groupadd", "--gid", user_uid,
                                     group])  # use the UID as the GID
        except subprocess.CalledProcessError as e:
            if e.returncode == 9:
                logger.warning("## Group already exists.")
            elif e.returncode == 4:
                logger.warning("## GID already exists.")
                self._user_group._relation.data[self._user_group.model.app][
                    "status"] = "failure: GID already exists"
                return
            else:
                logger.error(f"## Error creating group: {e}")

        # Create the user.
        try:
            subprocess.check_output([
                "useradd",
                "--system",
                "--no-create-home",
                "--gid",
                group,
                "--shell",
                "/usr/sbin/nologin",
                "-u",
                user_uid,
                user,
            ])
        except subprocess.CalledProcessError as e:
            if e.returncode == 9:
                logger.warning("## User already exists.")
            elif e.returncode == 4:
                logger.warning("## UID already exists.")
                self._user_group._relation.data[self._user_group.model.app][
                    "status"] = "failure: UID already exists"
                return
            else:
                logger.error(f"## Error creating user: {e}")

        self._user_group._relation.data[
            self._user_group.model.app]["status"] = "success: User created"

    def _on_remove_user_group(self, event):
        """Remove the user and group provided."""
        user = self._user_group.user_name
        group = self._user_group.group_name

        # Remove the user.
        try:
            subprocess.check_output(["userdel", user])
        except subprocess.CalledProcessError as e:
            logger.error(f"## Error deleting user: {e}")

        # Remove the group.
        try:
            subprocess.check_output(["groupdel", group])
        except subprocess.CalledProcessError as e:
            logger.error(f"## Error deleting group: {e}")
Example 7
class SlurmdCharm(CharmBase):
    """Slurmd lifecycle events."""

    _stored = StoredState()

    def __init__(self, *args):
        """Init _stored attributes and interfaces, observe events."""
        super().__init__(*args)

        self._stored.set_default(
            user_node_state=str(),
            partition_name=str(),
            config_available=False,
            slurm_installed=False,
        )

        self._nrpe = Nrpe(self, "nrpe-external-master")

        self._slurm_manager = SlurmManager(self, "slurmd")

        self._slurmd = Slurmd(self, "slurmd")
        self._slurmd_peer = SlurmdPeer(self, "slurmd-peer")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.upgrade_charm: self._on_upgrade,
            self.on.config_changed: self._on_send_slurmd_info,
            self._slurmd_peer.on.slurmd_peer_available:
            self._on_send_slurmd_info,
            self._slurmd.on.slurm_config_available:
            self._on_check_status_and_write_config,
            self.on.set_node_state_action: self._on_set_node_state_action,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_config_changed(self, event):
        self.get_set_return_partition_name()
        self._on_send_slurmd_info(event)

    def _on_install(self, event):
        self._slurm_manager.install()
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("Slurm Installed")

    def _on_upgrade(self, event):
        self._slurm_manager.upgrade()

    def _on_set_node_state_action(self, event):
        """Set the node state."""
        self._stored.user_node_state = event.params["node-state"]
        self._on_send_slurmd_info(event)

    def _on_send_slurmd_info(self, event):
        if self.framework.model.unit.is_leader():
            if self._slurmd.is_joined:
                partition = self._assemble_partition()
                if partition:
                    self._slurmd.set_slurmd_info_on_app_relation_data(
                        partition)
                    return
            event.defer()
            return

    def _on_check_status_and_write_config(self, event):
        if not self._check_status():
            event.defer()
            return
        slurm_config = dict(self._slurmd.get_slurm_config())
        self._slurm_manager.render_config_and_restart(slurm_config)
        self.unit.status = ActiveStatus("Slurmd Available")

    def _check_status(self):
        slurm_installed = self._stored.slurm_installed
        config_available = self._stored.config_available

        if not (slurm_installed and config_available):
            self.unit.status = BlockedStatus(
                "NEED RELATION TO SLURM CONFIGURATOR")
            return False
        else:
            return True

    def _assemble_partition(self):
        """Assemble the partition info."""
        partition_name = self._stored.partition_name
        partition_config = self.model.config.get('partition-config')
        partition_state = self.model.config.get('partition-state')

        slurmd_info = self._assemble_slurmd_info()

        return {
            'inventory': slurmd_info,
            'partition_name': partition_name,
            'partition_state': partition_state,
            'partition_config': partition_config,
        }

    def _assemble_slurmd_info(self):
        """Apply mutations to nodes in the partition, return slurmd nodes."""
        slurmd_info = self._slurmd_peer.get_slurmd_info()
        if not slurmd_info:
            return None

        # If the user has set a custom state for nodes, ensure we update the
        # state for the targeted nodes.
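        # The stored value is a comma-separated list of hostname=state pairs,
        # e.g. "node-1=DOWN,node-2=DRAIN" (format inferred from the parsing
        # below; hostnames and states are illustrative).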
        user_node_state = self._stored.user_node_state
        if user_node_state:
            node_states = {
                item.split("=")[0]: item.split("=")[1]
                for item in user_node_state.split(",")
            }

            # Copy the slurmd_info returned from the slurmd-peer relation
            # to a temporary variable to which we will make modifications.
            slurmd_info_tmp = copy.deepcopy(slurmd_info)

            # Iterate over the slurmd nodes in the partition and check
            # for nodes that need their state modified.
            for partition in slurmd_info:
                partition_tmp = copy.deepcopy(partition)
                for slurmd_node in partition['inventory']:
                    if slurmd_node['hostname'] in node_states.keys():
                        slurmd_node_tmp = copy.deepcopy(slurmd_node)
                        slurmd_node_tmp['state'] = \
                            node_states[slurmd_node['hostname']]
                        partition_tmp['inventory'].remove(slurmd_node)
                        partition_tmp['inventory'].append(slurmd_node_tmp)
                slurmd_info_tmp.remove(partition)
                slurmd_info_tmp.append(partition_tmp)
        else:
            slurmd_info_tmp = slurmd_info

        return slurmd_info_tmp

    def get_set_return_partition_name(self):
        """Set the partition name."""
        # Determine if a partition-name config exists, if so
        # ensure the partition_name known by the charm is consistent.
        # If no partition name has been specified then generate one.
        partition_name = self.model.config.get('partition-name')
        if partition_name:
            if partition_name != self._stored.partition_name:
                self._stored.partition_name = partition_name
        elif not self._stored.partition_name:
            self._stored.partition_name = f"juju-compute-{random_string()}"
        return self._stored.partition_name

    def get_slurm_component(self):
        """Return the slurm component."""
        return self._slurm_manager.slurm_component

    def get_hostname(self):
        """Return the hostname."""
        return self._slurm_manager.hostname

    def get_port(self):
        """Return the port."""
        return self._slurm_manager.port
Example 8
class SlurmdCharm(CharmBase):
    """Slurmd lifecycle events."""

    _stored = StoredState()

    def __init__(self, *args):
        """Init _stored attributes and interfaces, observe events."""
        super().__init__(*args)

        self._stored.set_default(
            munge_key_available=False,
            slurm_installed=False,
            slurmd_restarted=False,
            user_node_state=str(),
            partition_name=str(),
        )

        self._nrpe = Nrpe(self, "nrpe-external-master")

        self._slurm_manager = SlurmManager(self, "slurmd")

        self._slurmd = Slurmd(self, "slurmd")
        self._slurmd_peer = SlurmdPeer(self, "slurmd-peer")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.upgrade_charm: self._on_upgrade,
            self.on.start: self._on_check_status_and_write_config,
            self.on.config_changed: self._on_config_changed,
            self._slurmd_peer.on.slurmd_peer_available:
            self._on_set_partition_info_on_app_relation_data,
            self._slurmd_peer.on.slurmd_peer_departed:
            self._on_set_partition_info_on_app_relation_data,
            self._slurmd.on.slurm_config_available:
            self._on_check_status_and_write_config,
            self._slurmd.on.slurm_config_unavailable:
            self._on_check_status_and_write_config,
            self._slurmd.on.restart_slurmd:
            self._on_restart_slurmd,
            self._slurmd.on.munge_key_available: self._on_write_munge_key,
            self.on.set_node_state_action: self._on_set_node_state_action,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        self._slurm_manager.install(self.config["snapstore-channel"])

        if self.model.unit.is_leader():
            self._get_set_partition_name()
            logger.debug(f"PARTITION_NAME: {self._stored.partition_name}")
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("Slurm installed")

    def _on_upgrade(self, event):
        slurm_config = self._check_status()
        if not slurm_config:
            event.defer()
            return

        self._slurm_manager.upgrade(
            slurm_config,
            self.config["snapstore-channel"]
        )

    def _on_config_changed(self, event):
        if self.model.unit.is_leader():
            self._get_set_partition_name()
            if self._check_status():
                self._on_set_partition_info_on_app_relation_data(
                    event
                )

    def _on_write_munge_key(self, event):
        if not self._stored.slurm_installed:
            event.defer()
            return
        munge_key = self._slurmd.get_stored_munge_key()
        self._slurm_manager.configure_munge_key(munge_key)
        self._slurm_manager.restart_munged()
        self._stored.munge_key_available = True

    def _on_check_status_and_write_config(self, event):
        slurm_config = self._check_status()
        if not slurm_config:
            event.defer()
            return

        # if slurm_config['configless']:
        #    slurmctld_hostname = slurm_config['active_controller_hostname']
        #    self._slurm_manager.configure_slurmctld_hostname(
        #        slurmctld_hostname
        #    )
        #    self._slurm_manager.restart_slurm_component()
        # else:

        # Ensure we aren't dealing with a StoredDict before trying
        # to render the slurm.conf.
        slurm_config = dict(slurm_config)
        self._slurm_manager.render_slurm_configs(slurm_config)

        # Only restart slurmd the first time the node is brought up.
        if not self._stored.slurmd_restarted:
            self._slurm_manager.restart_slurm_component()
            self._stored.slurmd_restarted = True

        self.unit.status = ActiveStatus("slurmd available")

    def _on_restart_slurmd(self, event):
        self._slurm_manager.restart_slurm_component()

    def _check_status(self):
        munge_key_available = self._stored.munge_key_available
        slurm_installed = self._stored.slurm_installed
        slurm_config = self._slurmd.get_stored_slurm_config()

        slurmd_joined = self._slurmd.is_joined

        if not slurmd_joined:
            self.unit.status = BlockedStatus(
                "Needed relations: slurm-configurator"
            )
            return None

        elif not (munge_key_available and slurm_config and slurm_installed):
            self.unit.status = WaitingStatus(
                "Waiting on: configuration"
            )
            return None

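        # slurm_config comes from stored state (likely an ops StoredDict), so
        # convert it to a plain dict before handing it to the caller.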
        return dict(slurm_config)

    def _on_set_node_state_action(self, event):
        """Set the node state."""
        self._stored.user_node_state = event.params["node-state"]
        self._on_set_partition_info_on_app_relation_data(event)

    def _on_set_partition_info_on_app_relation_data(self, event):
        """Set the slurm partition info on the application relation data."""
        # Only the leader can set data on the relation.
        if self.framework.model.unit.is_leader():
            # If the relation with slurm-configurator exists then set our
            # partition info on the application relation data.
            # This handler shouldn't fire if the relation isn't made,
            # but add this extra check here just in case.
            if self._slurmd.is_joined:
                partition = self._assemble_partition()
                if partition:
                    self._slurmd.set_partition_info_on_app_relation_data(
                        partition
                    )
                    return
            event.defer()
            return

    def _assemble_partition(self):
        """Assemble the partition info."""
        partition_name = self._stored.partition_name
        partition_config = self.config.get("partition-config")
        partition_state = self.config.get("partition-state")

        slurmd_inventory = self._assemble_slurmd_inventory()

        return {
            "inventory": slurmd_inventory,
            "partition_name": partition_name,
            "partition_state": partition_state,
            "partition_config": partition_config,
        }

    def _assemble_slurmd_inventory(self):
        """Apply mutations to nodes in the partition, return slurmd nodes."""
        slurmd_inventory = self._slurmd_peer.get_slurmd_inventory()
        if not slurmd_inventory:
            return None

        # If the user has set custom state for nodes
        # ensure we update the state for the targeted nodes.
        user_node_state = self._stored.user_node_state
        if user_node_state:
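            # user_node_state is a comma-separated list of hostname=state
            # pairs, e.g. "node-1=DOWN,node-2=DRAIN", which parses to
            # {"node-1": "DOWN", "node-2": "DRAIN"}.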
            node_states = {
                item.split("=")[0]: item.split("=")[1]
                for item in user_node_state.split(",")
            }

            # Copy the slurmd_inventory returned from the slurmd-peer
            # relation to a temporary variable that we will use to
            # iterate over while we conditionally make modifications to the
            # original inventory.
            slurmd_inventory_tmp = copy.deepcopy(slurmd_inventory)

            # Iterate over the slurmd nodes in the partition and check
            # for nodes that need their state modified.
            for partition in slurmd_inventory_tmp:
                partition_tmp = copy.deepcopy(partition)
                for slurmd_node in partition["inventory"]:
                    if slurmd_node["hostname"] in node_states.keys():
                        slurmd_node_tmp = copy.deepcopy(slurmd_node)
                        slurmd_node_tmp["state"] = \
                            node_states[slurmd_node["hostname"]]
                        partition_tmp["inventory"].remove(slurmd_node)
                        partition_tmp["inventory"].append(slurmd_node_tmp)
                slurmd_inventory.remove(partition)
                slurmd_inventory.append(partition_tmp)

        return slurmd_inventory

    def _get_set_partition_name(self):
        """Set the partition name."""
        # Determine if a partition-name config exists, if so
        # ensure the self._stored.partition_name is consistent with the
        # supplied config.
        # If no partition name has been specified then generate one.
        partition_name = self.config.get("partition-name")
        if partition_name:
            if partition_name != self._stored.partition_name:
                self._stored.partition_name = partition_name
        elif not self._stored.partition_name:
            self._stored.partition_name = f"juju-compute-{random_string()}"
        return

    def get_partition_name(self):
        """Return the partition_name."""
        return self._stored.partition_name

    def get_slurm_component(self):
        """Return the slurm component."""
        return self._slurm_manager.slurm_component

    def get_hostname(self):
        """Return the hostname."""
        return self._slurm_manager.hostname

    def get_port(self):
        """Return the port."""
        return self._slurm_manager.port
Example n. 9
class SlurmdCharm(CharmBase):
    """Slurmd lifecycle events."""

    _stored = StoredState()
    on = SlurmdCharmEvents()

    def __init__(self, *args):
        """Init _stored attributes and interfaces, observe events."""
        super().__init__(*args)

        self._stored.set_default(nhc_conf=str(),
                                 slurm_installed=False,
                                 slurmctld_available=False,
                                 slurmctld_started=False,
                                 cluster_name=str())

        self._slurm_manager = SlurmManager(self, "slurmd")
        self._fluentbit = FluentbitClient(self, "fluentbit")

        # interface to slurmctld, should only have one slurmctld per slurmd app
        self._slurmd = Slurmd(self, "slurmd")
        self._slurmd_peer = SlurmdPeer(self, "slurmd-peer")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.upgrade_charm: self._on_upgrade,
            self.on.update_status: self._on_update_status,
            self.on.config_changed: self._on_config_changed,
            self.on.slurmctld_started: self._on_slurmctld_started,
            self.on.slurmd_start: self._on_slurmd_start,
            self.on.check_etcd: self._on_check_etcd,
            self._slurmd.on.slurmctld_available: self._on_slurmctld_available,
            self._slurmd.on.slurmctld_unavailable:
            self._on_slurmctld_unavailable,
            # fluentbit
            self.on["fluentbit"].relation_created:
            self._on_configure_fluentbit,
            # actions
            self.on.version_action: self._on_version_action,
            self.on.node_configured_action: self._on_node_configured_action,
            self.on.get_node_inventory_action:
            self._on_get_node_inventory_action,
            self.on.show_nhc_config_action: self._on_show_nhc_config,
            # infiniband actions
            self.on.get_infiniband_repo_action: self.get_infiniband_repo,
            self.on.set_infiniband_repo_action: self.set_infiniband_repo,
            self.on.install_infiniband_action: self.install_infiniband,
            self.on.uninstall_infiniband_action: self.uninstall_infiniband,
            self.on.start_infiniband_action: self.start_infiniband,
            self.on.enable_infiniband_action: self.enable_infiniband,
            self.on.stop_infiniband_action: self.stop_infiniband,
            self.on.is_active_infiniband_action: self.is_active_infiniband,
            # nvidia actions
            self.on.nvidia_repo_action: self.nvidia_repo,
            self.on.nvidia_package_action: self.nvidia_package,
            self.on.nvidia_install_action: self.nvidia_install,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        """Perform installation operations for slurmd."""
        self.unit.set_workload_version(Path("version").read_text().strip())
        self.unit.status = WaitingStatus("Installing slurmd")

        custom_repo = self.config.get("custom-slurm-repo")
        successful_installation = self._slurm_manager.install(custom_repo)
        logger.debug(f"### slurmd installed: {successful_installation}")

        if successful_installation:
            self._stored.slurm_installed = True
        else:
            self.unit.status = BlockedStatus("Error installing slurmd")
            event.defer()

        self._check_status()

    def _on_configure_fluentbit(self, event):
        """Set up Fluentbit log forwarding."""
        self._configure_fluentbit()

    def _configure_fluentbit(self):
        logger.debug("## Configuring fluentbit")
        cfg = list()
        cfg.extend(self._slurm_manager.fluentbit_config_nhc)
        cfg.extend(self._slurm_manager.fluentbit_config_slurm)
        self._fluentbit.configure(cfg)

    def _on_upgrade(self, event):
        """Perform upgrade operations."""
        self.unit.set_workload_version(Path("version").read_text().strip())

    def _on_update_status(self, event):
        """Handle update status."""
        self._check_status()

    def _check_status(self) -> bool:
        """Check if we heve all needed components.

        - partition name
        - slurm installed
        - slurmctld available and working
        - munge key configured and working
        """
        if self._slurm_manager.needs_reboot:
            self.unit.status = BlockedStatus("Machine needs reboot")
            return False

        if not self.get_partition_name():
            self.unit.status = WaitingStatus("Waiting on charm configuration")
            return False

        if not self._stored.slurm_installed:
            self.unit.status = BlockedStatus("Error installing slurmd")
            return False

        if not self._slurmd.is_joined:
            self.unit.status = BlockedStatus("Need relations: slurmctld")
            return False

        if not self._stored.slurmctld_available:
            self.unit.status = WaitingStatus("Waiting on: slurmctld")
            return False

        if not self._slurm_manager.check_munged():
            self.unit.status = BlockedStatus("Error configuring munge key")
            return False

        if not self._stored.slurmctld_started:
            self.unit.status = WaitingStatus("Waiting slurmctld to start")
            return False

        self.unit.status = ActiveStatus("slurmd available")
        return True

    def ensure_slurmd_starts(self, max_attempts=10) -> bool:
        """Ensure slurmd is up and running."""
        logger.debug("## Stopping slurmd")
        self._slurm_manager.slurm_systemctl('stop')

        for i in range(max_attempts):
            if self._slurm_manager.slurm_is_active():
                logger.debug("## Slurmd running")
                break
            else:
                logger.warning("## Slurmd not running, trying to start it")
                self.unit.status = WaitingStatus("Starting slurmd")
                self._slurm_manager.restart_slurm_component()
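                # back off a little longer on each retry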
                sleep(2 + i)

        if self._slurm_manager.slurm_is_active():
            return True
        else:
            self.unit.status = BlockedStatus("Cannot start slurmd")
            return False

    def _set_slurmctld_available(self, flag: bool):
        """Change stored value for slurmctld availability."""
        self._stored.slurmctld_available = flag

    def _set_slurmctld_started(self, flag: bool):
        """Change stored value for slurmctld started."""
        self._stored.slurmctld_started = flag

    def _on_slurmctld_available(self, event):
        """Get data from slurmctld and send inventory."""
        if not self._stored.slurm_installed:
            event.defer()
            return

        logger.debug(
            '#### Slurmctld available - setting overrides for configless')
        # get slurmctld host:port from relation and override systemd services
        host = self._slurmd.slurmctld_hostname
        port = self._slurmd.slurmctld_port
        self._slurm_manager.create_configless_systemd_override(host, port)
        self._slurm_manager.daemon_reload()

        self._write_munge_key_and_restart_munge()

        self._set_slurmctld_available(True)
        self._on_set_partition_info_on_app_relation_data(event)
        self._check_status()

        # check etcd for hostnames
        self.on.check_etcd.emit()

    def _on_check_etcd(self, event):
        """Check if node is accounted for.

        Check whether slurmctld has accounted for this node's inventory for
        the first time; if so, emit the slurmctld_started event so the node
        can start the daemon.
        """
        host = self._slurmd.slurmctld_address
        port = self._slurmd.etcd_port
        logger.debug(f"## Connecting to etcd3 in {host}:{port}")
        client = Etcd3Client(host=host, port=port, api_path="/v3/")

        logger.debug("## Querying etcd3 for node list")
        try:
            v = client.get(key="all_nodes")
            logger.debug(f"## Got: {v}")
        except Exception as e:
            logger.error(
                f"## Unable to connect to {host} to get list of nodes: {e}")
            event.defer()
            return

        node_accounted = False
        if v:
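            # slurmctld is expected to publish a JSON-encoded list of node
            # hostnames under the "all_nodes" key, e.g. '["node-1", "node-2"]'.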
            hostnames = json.loads(v[0])
            logger.debug(f"### etcd3 node list: {hostnames}")
            if self.hostname in hostnames:
                self.on.slurmctld_started.emit()
                node_accounted = True

        if not node_accounted:
            logger.debug("## Node not accounted for. Deferring.")
            event.defer()

    def _on_slurmctld_unavailable(self, event):
        logger.debug("## Slurmctld unavailable")
        self._set_slurmctld_available(False)
        self._set_slurmctld_started(False)
        self._slurm_manager.slurm_systemctl('stop')
        self._check_status()

    def _on_slurmctld_started(self, event):
        """Set flag to True and emit slurmd_start event."""
        self._set_slurmctld_started(True)
        self.on.slurmd_start.emit()

    def _on_slurmd_start(self, event):
        if not self._check_status():
            event.defer()
            return

        # only set up fluentbit if we have a relation to it
        if self._fluentbit._relation is not None:
            self._configure_fluentbit()

        # at this point, we have slurm installed, munge configured, and we know
        # slurmctld accounted for this node. It should be safe to start slurmd
        if self.ensure_slurmd_starts():
            logger.debug("## slurmctld started and slurmd is running")
        else:
            event.defer()
        self._check_status()

    def _on_config_changed(self, event):
        """Handle charm configuration changes."""
        if self.model.unit.is_leader():
            logger.debug("## slurmd config changed - leader")
            self._on_set_partition_info_on_app_relation_data(event)

        nhc_conf = self.model.config.get('nhc-conf')
        if nhc_conf:
            if nhc_conf != self._stored.nhc_conf:
                self._stored.nhc_conf = nhc_conf
                self._slurm_manager.render_nhc_config(nhc_conf)

    def get_partition_name(self) -> str:
        """Return the partition_name in the slurmd relation."""
        # Determine if a user-supplied partition-name config exists, if so
        # ensure the partition_name is consistent with the supplied config.
        # If no partition name has been specified then generate one.
        partition_name = self._slurmd_peer.partition_name
        partition_name_from_config = self.config.get("partition-name")
        if partition_name:
            if partition_name_from_config:
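                # Spaces are not usable in Slurm partition names, so the
                # user-supplied value is normalized to use dashes instead.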
                partition_name_from_config = partition_name_from_config.replace(
                    ' ', '-')
                if partition_name != partition_name_from_config:
                    self._set_partition_name(partition_name_from_config)
                    partition_name = partition_name_from_config
                else:
                    logger.debug("Partition name unchanged.")
            else:
                logger.debug("Partition name unchanged.")
        else:
            partition_name = f"osd-{self.app.name}"
            logger.debug(f"Partition name: {partition_name}")
            self._set_partition_name(partition_name)

        return partition_name

    def _set_partition_name(self, name: str):
        """Set the partition_name in the slurmd relation."""
        if self.model.unit.is_leader():
            self._slurmd_peer.partition_name = name

    def _write_munge_key_and_restart_munge(self):
        logger.debug('#### slurmd charm - writing munge key')

        self._slurm_manager.configure_munge_key(
            self._slurmd.get_stored_munge_key())

        if self._slurm_manager.restart_munged():
            logger.debug("## Munge restarted succesfully")
        else:
            logger.error("## Unable to restart munge")

    def _on_version_action(self, event):
        """Return version of installed components.

        - Slurm
        - munge
        - NHC
        - infiniband
        """
        version = {}
        version['slurm'] = self._slurm_manager.slurm_version()
        version['munge'] = self._slurm_manager.munge_version()
        version['nhc'] = self._slurm_manager.nhc_version()
        version['infiniband'] = self._slurm_manager.infiniband_version()

        event.set_results(version)

    def _on_node_configured_action(self, event):
        """Remove node from DownNodes."""
        # trigger reconfig
        self._slurmd.configure_new_node()
        logger.debug('### This node is not new anymore')

    def _on_get_node_inventory_action(self, event):
        """Return node inventory."""
        inventory = self._slurmd.node_inventory
        event.set_results({'inventory': inventory})

    def get_infiniband_repo(self, event):
        """Return the currently used infiniband repository."""
        repo = self._slurm_manager.infiniband.repository
        event.set_results({'infiniband-repo': repo})

    def set_infiniband_repo(self, event):
        """Set the infiniband repository."""
        repo = event.params["repo"]
        logger.debug(f"#### setting custom infiniband repo: {repo}")
        repo = base64.b64decode(repo).decode()
        self._slurm_manager.infiniband.repository = repo

    def install_infiniband(self, event):
        """Install infiniband."""
        logger.debug("#### Installing Infiniband")
        self._slurm_manager.infiniband.install()
        event.set_results({'installation': 'Successful. Please reboot node.'})
        self.unit.status = BlockedStatus("Need reboot for Infiniband")

    def uninstall_infiniband(self, event):
        """Install infiniband."""
        logger.debug("#### Uninstalling Infiniband")
        self._slurm_manager.infiniband.uninstall()

    def start_infiniband(self, event):
        """Start Infiniband systemd service."""
        logger.debug("#### Starting Infiniband service")
        self._slurm_manager.infiniband.start()

    def enable_infiniband(self, event):
        """Enable Infiniband systemd service."""
        logger.debug("#### Enabling Infiniband service")
        self._slurm_manager.infiniband.enable()

    def stop_infiniband(self, event):
        """Stop Infiniband systemd service."""
        logger.debug("#### Stoping Infiniband service")
        self._slurm_manager.infiniband.stop()

    def is_active_infiniband(self, event):
        """Check if Infiniband systemd service is arctive."""
        status = self._slurm_manager.infiniband.is_active()
        logger.debug(f"#### Infiniband service is-active: {status}")
        event.set_results({'infiniband-is-active': status})

    def nvidia_repo(self, event):
        """Set or get the used nvidia repository."""
        repo = event.params.get("repo", None)
        if repo:
            self._slurm_manager.nvidia.repository = base64.b64decode(
                repo).decode()

        event.set_results(
            {'nvidia-repo': self._slurm_manager.nvidia.repository})

    def nvidia_package(self, event):
        """Set or get the used nvidia package."""
        package = event.params.get("package", None)
        if package or package == "":
            # user supplied a package name (possibly "" to clear it) -> store it
            self._slurm_manager.nvidia.package = package

        event.set_results(
            {'nvidia-package': self._slurm_manager.nvidia.package})

    def nvidia_install(self, event):
        """Install nvidia drivers."""
        logger.debug("#### Installing nvidia drivers: %s",
                     self._slurm_manager.nvidia.package)
        self._slurm_manager.nvidia.install()
        event.set_results({'installation': 'Successful. Please reboot node.'})
        self.unit.status = BlockedStatus("Need reboot for nvidia")

    def _on_show_nhc_config(self, event):
        """Show current nhc.conf."""
        nhc_conf = self._slurm_manager.get_nhc_config()
        event.set_results({"nhc.conf": nhc_conf})

    def _on_set_partition_info_on_app_relation_data(self, event):
        """Set the slurm partition info on the application relation data."""
        # Only the leader can set data on the relation.
        if self.model.unit.is_leader():
            # If the relation with slurmctld exists then set our
            # partition info on the application relation data.
            # This handler shouldn't fire if the relation isn't made,
            # but add this extra check here just in case.
            if self._slurmd.is_joined:
                partition = self._assemble_partition()
                if partition:
                    self._slurmd.set_partition_info_on_app_relation_data(
                        partition)
                else:
                    event.defer()
            else:
                event.defer()

    def _assemble_partition(self):
        """Assemble the partition info."""
        partition_name = self.get_partition_name()
        partition_config = self.config.get("partition-config")
        partition_state = self.config.get("partition-state")
        logger.debug(f"## partition_name: {partition_name}")

        return {
            "partition_name": partition_name,
            "partition_state": partition_state,
            "partition_config": partition_config,
        }

    @property
    def hostname(self) -> str:
        """Return the hostname."""
        return self._slurm_manager.hostname

    @property
    def cluster_name(self) -> str:
        """Return the cluster-name."""
        return self._stored.cluster_name

    @cluster_name.setter
    def cluster_name(self, name: str):
        """Set the cluster-name."""
        self._stored.cluster_name = name
Example n. 10
class SlurmConfiguratorCharm(CharmBase):
    """Facilitate slurm configuration operations."""

    _stored = StoredState()

    def __init__(self, *args):
        """Init charm, _stored defaults, interfaces and observe events."""
        super().__init__(*args)

        self._stored.set_default(
            munge_key=str(),
            override_slurm_conf=None,
            slurm_installed=False,
            slurmd_restarted=False,
            slurmctld_available=False,
            slurmdbd_available=False,
            slurmd_available=False,
            slurmrestd_available=False,
        )

        self._elasticsearch = Elasticsearch(self, "elasticsearch")
        self._grafana = GrafanaSource(self, "grafana-source")
        self._influxdb = InfluxDB(self, "influxdb-api")
        self._nhc = Nhc(self, "nhc")
        self._slurmrestd = Slurmrestd(self, "slurmrestd")
        self._slurm_manager = SlurmManager(self, "slurmd")
        self._slurmctld = Slurmctld(self, "slurmctld")
        self._slurmdbd = Slurmdbd(self, "slurmdbd")
        self._slurmd = Slurmd(self, "slurmd")
        self._prolog_epilog = PrologEpilog(self, "prolog-epilog")

        # #### Charm lifecycle events #### #
        event_handler_bindings = {
            # #### Juju lifecycle events #### #
            self.on.install:
            self._on_install,
            self.on.config_changed:
            self._on_check_status_and_write_config,
            self.on.upgrade_charm:
            self._on_upgrade,
            # ######## Addons lifecycle events ######## #
            self._elasticsearch.on.elasticsearch_available:
            self._on_check_status_and_write_config,
            self._elasticsearch.on.elasticsearch_unavailable:
            self._on_check_status_and_write_config,
            self._grafana.on.grafana_available:
            self._on_grafana_available,
            self._influxdb.on.influxdb_available:
            self._on_influxdb_available,
            self._influxdb.on.influxdb_unavailable:
            self._on_check_status_and_write_config,
            self._nhc.on.nhc_bin_available:
            self._on_check_status_and_write_config,
            # ######## Slurm component lifecycle events ######## #
            self._slurmctld.on.slurmctld_available:
            self._on_check_status_and_write_config,
            self._slurmctld.on.slurmctld_unavailable:
            self._on_check_status_and_write_config,
            self._slurmdbd.on.slurmdbd_available:
            self._on_check_status_and_write_config,
            self._slurmdbd.on.slurmdbd_unavailable:
            self._on_check_status_and_write_config,
            self._slurmd.on.slurmd_available:
            self._on_check_status_and_write_config,
            self._slurmd.on.slurmd_unavailable:
            self._on_check_status_and_write_config,
            self._slurmrestd.on.slurmrestd_available:
            self._on_slurmrestd_available,
            self._slurmrestd.on.slurmrestd_unavailable:
            self._on_check_status_and_write_config,
            self._prolog_epilog.on.prolog_epilog_available:
            self._on_check_status_and_write_config,
            self._prolog_epilog.on.prolog_epilog_unavailable:
            self._on_check_status_and_write_config,
            # Actions
            self.on.scontrol_reconfigure_action:
            self._on_scontrol_reconfigure,
            self.on.get_slurm_conf_action:
            self._on_get_slurm_conf,
            self.on.set_slurm_conf_action:
            self._on_set_slurm_conf,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_scontrol_reconfigure(self, event):
        """Run 'scontrol reconfigure' on slurmctld."""
        self._slurmctld.scontrol_reconfigure()

    def _on_get_slurm_conf(self, event):
        """Return the slurm.conf."""
        # Determine if we have an override config.
        override_slurm_conf = self._stored.override_slurm_conf
        if override_slurm_conf:
            slurm_conf = override_slurm_conf
        else:
            slurm_conf = self._slurm_manager.get_slurm_conf()

        # Return the slurm.conf as the result of the action.
        event.set_results({"slurm.conf": slurm_conf})

    def _on_set_slurm_conf(self, event):
        """Set the override slurm.conf."""
        self._stored.override_slurm_conf = event.params["slurm-conf"]

    def _on_install(self, event):
        """Install the slurm snap and capture the munge key."""
        self._slurm_manager.install(self.config["snapstore-channel"])
        self._stored.munge_key = self._slurm_manager.get_munge_key()
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("slurm installed")

    def _on_upgrade(self, event):
        """Upgrade the charm."""
        slurm_config = \
            self._stored.override_slurm_conf or self._assemble_slurm_config()

        if not slurm_config:
            self.unit.status = BlockedStatus(
                "Cannot generate slurm_config, defering upgrade.")
            event.defer()
            return

        self._slurm_manager.upgrade(slurm_config,
                                    self.config["snapstore-channel"])

    def _on_grafana_available(self, event):
        """Create the grafana-source if we are the leader and have influxdb."""
        leader = self._is_leader()
        influxdb_info = self._get_influxdb_info()
        grafana = self._grafana

        if leader and influxdb_info:
            grafana.set_grafana_source_info(influxdb_info)

    def _on_influxdb_available(self, event):
        """Create the grafana-source if we have all the things."""
        grafana = self._grafana
        influxdb_info = self._get_influxdb_info()
        leader = self._is_leader()

        if leader and grafana.is_joined and influxdb_info:
            grafana.set_grafana_source_info(influxdb_info)

        self._on_check_status_and_write_config(event)

    def _on_slurmrestd_available(self, event):
        """Set slurm_config on the relation when slurmrestd available."""
        if not self._check_status():
            event.defer()
            return

        # Generate the slurm_config
        slurm_config = self._assemble_slurm_config()

        if not slurm_config:
            self.unit.status = BlockedStatus(
                "Cannot generate slurm_config - defering event.")
            event.defer()
            return

        if self._stored.slurmrestd_available:
            self._slurmrestd.set_slurm_config_on_app_relation_data(
                slurm_config)
            self._slurmrestd.restart_slurmrestd()

    def _on_check_status_and_write_config(self, event):
        """Check that we have what we need before we proceed."""
        if not self._check_status():
            event.defer()
            return

        # Generate the slurm_config
        slurm_config = self._assemble_slurm_config()

        if not slurm_config:
            self.unit.status = BlockedStatus(
                "Cannot generate slurm_config - defering event.")
            event.defer()
            return

        self._slurmctld.set_slurm_config_on_app_relation_data(slurm_config)
        self._slurmctld.restart_slurmctld()

        self._slurmd.set_slurm_config_on_app_relation_data(slurm_config)

        if self._stored.slurmrestd_available:
            self._slurmrestd.set_slurm_config_on_app_relation_data(
                slurm_config)
            self._slurmrestd.restart_slurmrestd()

        self._slurm_manager.render_slurm_configs(slurm_config)

        if not self._stored.slurmd_restarted:
            self._slurm_manager.restart_slurm_component()
            self._stored.slurmd_restarted = True

        self._slurmctld.scontrol_reconfigure()

    def _assemble_slurm_config(self):
        """Assemble and return the slurm config."""
        slurmctld_info = self._slurmctld.get_slurmctld_info()
        slurmdbd_info = self._slurmdbd.get_slurmdbd_info()
        slurmd_info = self._slurmd.get_slurmd_info()

        if not (slurmd_info and slurmctld_info and slurmdbd_info):
            return {}

        addons_info = self._assemble_addons()
        partitions_info = self._assemble_partitions(slurmd_info)

        logger.debug(addons_info)
        logger.debug(partitions_info)
        logger.debug(slurmctld_info)
        logger.debug(slurmdbd_info)

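        # Note: with dict unpacking, later entries win on duplicate keys, so
        # values from self.config take precedence over the gathered info.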
        return {
            "partitions": partitions_info,
            **slurmctld_info,
            **slurmdbd_info,
            **addons_info,
            **self.config,
        }

    def _assemble_partitions(self, slurmd_info):
        """Make any needed modifications to partition data."""
        slurmd_info_tmp = copy.deepcopy(slurmd_info)
        default_partition_from_config = self.config.get("default_partition")

        for partition in slurmd_info:
            # Deep copy the partition to a tmp var so we can modify it as
            # needed whilst not modifying the object we are iterating over.
            partition_tmp = copy.deepcopy(partition)
            # Extract the partition_name from the partition.
            partition_name = partition["partition_name"]

            # Check whether a default_partition is defined in the charm
            # config. If the user hasn't provided one, fall back to marking
            # the "configurator" partition as the default.
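            # e.g. with default_partition="compute" in the charm config, the
            # partition named "compute" is marked partition_default="YES".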
            if not default_partition_from_config:
                if partition["partition_name"] == "configurator":
                    partition_tmp["partition_default"] = "YES"
            else:
                if default_partition_from_config == partition_name:
                    partition_tmp["partition_default"] = "YES"

            slurmd_info_tmp.remove(partition)
            slurmd_info_tmp.append(partition_tmp)

        return slurmd_info_tmp

    def _assemble_addons(self):
        """Assemble any addon components."""
        acct_gather = self._get_influxdb_info()
        elasticsearch_ingress = self._elasticsearch.get_elasticsearch_ingress()
        nhc_info = self._nhc.get_nhc_info()
        prolog_epilog = self._prolog_epilog.get_prolog_epilog()

        ctxt = dict()

        if prolog_epilog:
            ctxt["prolog_epilog"] = prolog_epilog

        if acct_gather:
            ctxt["acct_gather"] = acct_gather
            acct_gather_custom = self.config.get("acct_gather_custom")
            if acct_gather_custom:
                ctxt["acct_gather"]["custom"] = acct_gather_custom

        if nhc_info:
            ctxt["nhc"] = {
                "nhc_bin": nhc_info["nhc_bin"],
                "health_check_interval": nhc_info["health_check_interval"],
                "health_check_node_state": nhc_info["health_check_node_state"],
            }

        if elasticsearch_ingress:
            ctxt["elasticsearch_address"] = elasticsearch_ingress

        return ctxt

    def _check_status(self):
        """Check that the core components we need exist."""
        slurm_component_statuses = {
            "slurmctld": {
                "available": self._stored.slurmctld_available,
                "joined": self._slurmctld.is_joined,
            },
            "slurmd": {
                "available": self._stored.slurmd_available,
                "joined": self._slurmd.is_joined,
            },
            "slurmdbd": {
                "available": self._stored.slurmdbd_available,
                "joined": self._slurmdbd.is_joined,
            },
        }

        relations_needed = []
        waiting_on = []

        msg = str()

        for slurm_component in slurm_component_statuses.keys():
            if not slurm_component_statuses[slurm_component]["joined"]:
                relations_needed.append(slurm_component)
            elif not slurm_component_statuses[slurm_component]["available"]:
                waiting_on.append(slurm_component)

        relations_needed_len = len(relations_needed)
        waiting_on_len = len(waiting_on)

        if relations_needed_len > 0:
            msg += f"Needed relations: {','.join(relations_needed)} "

        if waiting_on_len > 0:
            msg += f"Waiting on: {','.join(waiting_on)}"

        # Using what we have gathered about the status of each slurm component,
        # determine the application status.
        if relations_needed_len > 0:
            self.unit.status = BlockedStatus(msg)
        elif waiting_on_len > 0:
            self.unit.status = WaitingStatus(msg)
        else:
            self.unit.status = ActiveStatus("slurm-configurator available")
            return True
        return False

    def _get_influxdb_info(self):
        """Return influxdb info."""
        return self._influxdb.get_influxdb_info()

    def _is_leader(self):
        return self.model.unit.is_leader()

    def get_munge_key(self):
        """Return the slurmdbd_info from stored state."""
        return self._stored.munge_key

    def is_slurm_installed(self):
        """Return true/false based on whether or not slurm is installed."""
        return self._stored.slurm_installed

    def set_slurmctld_available(self, slurmctld_available):
        """Set slurmctld_available."""
        self._stored.slurmctld_available = slurmctld_available

    def set_slurmdbd_available(self, slurmdbd_available):
        """Set slurmdbd_available."""
        self._stored.slurmdbd_available = slurmdbd_available

    def set_slurmd_available(self, slurmd_available):
        """Set slurmd_available."""
        self._stored.slurmd_available = slurmd_available

    def set_slurmrestd_available(self, slurmrestd_available):
        """Set slurmrestd_available."""
        self._stored.slurmrestd_available = slurmrestd_available