def configure_node(cluster_changed, cluster_joined):
    status_set('maintenance', 'Configuring slurm-node')

    controller_data = cluster_changed.active_data
    create_spool_dir(context=controller_data)
    render_munge_key(context=controller_data)
    # If the munge.key has been changed on the controller and munge is
    # running, the service must be restarted to use the new key
    if (flags.is_flag_set('endpoint.slurm-cluster.changed.munge_key')
            and service_running(MUNGE_SERVICE)):
        log('Restarting munge due to key change on slurm-controller')
        service_restart(MUNGE_SERVICE)
    render_slurm_config(context=controller_data)

    # Make sure munge is running
    if not service_running(MUNGE_SERVICE):
        service_start(MUNGE_SERVICE)
    # Make sure slurmd is running
    if not service_running(SLURMD_SERVICE):
        service_start(SLURMD_SERVICE)

    flags.set_flag('slurm-node.configured')
    log('Set {} flag'.format('slurm-node.configured'))

    flags.clear_flag('endpoint.slurm-cluster.active.changed')
    log('Cleared {} flag'.format('endpoint.slurm-cluster.active.changed'))
    # Clear this flag to be able to signal munge_key changed if it occurs from
    # a controller.
    flags.clear_flag('endpoint.slurm-cluster.changed.munge_key')
    log('Cleared {} flag'.format('endpoint.slurm-cluster.changed.munge_key'))

def configure_node(cluster_changed, cluster_joined):
    status_set('maintenance', 'Configuring slurm-node')

    controller_data = cluster_changed.active_data
    create_spool_dir(context=controller_data)
    render_munge_key(context=controller_data)
    render_slurm_config(context=controller_data)

    # Make sure slurmd is running
    if not service_running(SLURMD_SERVICE):
        service_start(SLURMD_SERVICE)

    flags.set_flag('slurm-node.configured')
    log('Set {} flag'.format('slurm-node.configured'))
    flags.clear_flag('endpoint.slurm-cluster.active.changed')
    log('Cleared {} flag'.format('endpoint.slurm-cluster.active.changed'))

def configure_node(cluster_changed, cluster_joined):
    status_set('maintenance', 'Configuring slurm-node')

    controller_data = cluster_changed.active_data
    gres_context = get_inventory()
    gres_context.update({key: controller_data[key] for key in ['slurm_user']})
    create_spool_dir(context=controller_data)
    render_munge_key(context=controller_data)
    render_slurm_config(context=controller_data)
    render_gres_config(context=gres_context)

    # Make sure slurmd is running
    if not service_running(SLURMD_SERVICE):
        service_start(SLURMD_SERVICE)

    flags.set_flag('slurm-node.configured')
    flags.clear_flag('endpoint.slurm-cluster.active.changed')

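
# A minimal sketch (not taken from the charm) of how a handler such as
# configure_node() above is typically registered with charms.reactive.
# The decorator flags here are an assumption inferred from the flags the
# handler bodies read and clear; the real charm may gate on other states.
from charms.reactive import when, when_not

@when('endpoint.slurm-cluster.active.changed',
      'endpoint.slurm-cluster.joined')
@when_not('slurm-node.configured')
def configure_node(cluster_changed, cluster_joined):
    # The two positional arguments presumably correspond to the endpoint
    # instances associated with the matched flags, which would explain why
    # the bodies above can read cluster_changed.active_data directly.
    ...
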
def configure_controller(cluster):
    status_set('maintenance', 'Configuring slurm-controller')

    # Get node configs
    nodes = cluster.get_nodes()
    partitions = cluster.get_partitions()

    config().update({
        'nodes': nodes,
        'partitions': partitions,
        'control_machine': gethostname(),
        'control_addr': unit_private_ip(),
    })

    # Setup slurm dirs and config
    create_state_save_location(config=config())
    render_slurm_config(config=config())

    # Make sure slurmctld is running
    if not service_running(SLURMCTLD_SERVICE):
        service_start(SLURMCTLD_SERVICE)

    # Send config to nodes
    cluster.send_controller_config(config=config())

    # Update states
    remove_state('slurm-controller.changed')
    set_state('slurm-controller.configured')

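
# A sketch of the imports the early configure_controller() version above
# appears to rely on; this is an assumption reconstructed from the bare
# names it uses, not a verbatim excerpt from the charm.
# create_state_save_location, render_slurm_config and SLURMCTLD_SERVICE
# would come from the charm's own helpers module.
from socket import gethostname

from charmhelpers.core.hookenv import config, status_set, unit_private_ip
from charmhelpers.core.host import service_running, service_start
from charms.reactive import remove_state, set_state
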
def configure_controller(*args):
    ''' A controller is only configured after leader election is
    performed. Cluster endpoint must be present for a controller to
    proceed with initial configuration'''
    hookenv.status_set('maintenance', 'Configuring slurm-controller')

    # need to have a role determined here so that a controller context can
    # be uniformly prepared for consumption on the worker side as controller
    # and node layers share a common layer with a slurm.conf template
    # mostly identical on all nodes
    is_active = controller.is_active_controller()
    role = controller.ROLES[is_active]
    peer_role = controller.ROLES[not is_active]

    # the endpoint is present as joined is required for this handler
    cluster_endpoint = relations.endpoint_from_flag(
        'endpoint.slurm-cluster.joined')

    # Get node configs
    nodes = cluster_endpoint.get_node_data()
    partitions = controller.get_partitions(nodes)

    # relation-changed does not necessarily mean that data will be provided
    if not partitions:
        flags.clear_flag('endpoint.slurm-cluster.changed')
        return

    # the whole charm config will be sent to related nodes
    # with some additional options added via dict update
    controller_conf = copy.deepcopy(hookenv.config())
    controller_conf.update({
        'nodes': nodes,
        'partitions': partitions,
        # for worker nodes
        'munge_key': hookenv.leader_get('munge_key'),
    })

    net_details = controller.add_key_prefix(
        cluster_endpoint.network_details(), role)
    # update the config dict used as a context in rendering to have prefixed
    # keys for network details based on a current unit role (active or backup)
    controller_conf.update(net_details)

    ha_endpoint = relations.endpoint_from_flag(
        'endpoint.slurm-controller-ha.joined')
    if ha_endpoint:
        # add prefixed peer data
        peer_data = controller.add_key_prefix(
            ha_endpoint.peer_data, peer_role)
        controller_conf.update(peer_data)
    else:
        peer_data = None

    # a controller service is configurable if it is an active controller
    # or a backup controller that knows about an active controller
    is_configurable = is_active or (not is_active and peer_data)
    if is_configurable:
        hookenv.log('The controller is configurable ({})'.format(role))
        # Setup slurm dirs and config
        helpers.create_state_save_location(context=controller_conf)
        helpers.render_slurm_config(context=controller_conf)
        flags.set_flag('slurm-controller.configured')
        # restart controller process on any changes
        # TODO: this could be optimized via goal-state hook by
        # observing "joining" node units
        host.service_restart(helpers.SLURMCTLD_SERVICE)
    else:
        hookenv.log('The controller is NOT configurable ({})'.format(role))
        if not is_active:
            hookenv.status_set('maintenance',
                               'Backup controller is waiting for peer data')

    # Send config to nodes
    if is_active:
        # TODO: wait until a peer acknowledges that it has cleared
        # its side of a node-facing relation - this needs to be done
        # in case an active controller is changed to a different one
        # to avoid split-brain conditions on node units
        cluster_endpoint.send_controller_config(controller_conf)
    else:
        # otherwise make sure that all keys are cleared
        # this is relevant for a former active controller
        cluster_endpoint.send_controller_config({
            k: None for k in controller_conf.keys()
        })

    # clear the changed flag as it is not cleared automatically
    flags.clear_flag('endpoint.slurm-cluster.changed')

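
# A hypothetical illustration (not the charm's code) of the controller
# module helpers the two newer configure_controller() versions lean on.
# Only the behaviour implied by the call sites above is captured; the
# actual role strings and implementation may differ.

# ROLES is indexed with a boolean, so something along these lines:
ROLES = {True: 'active_controller', False: 'backup_controller'}

def add_key_prefix(data, prefix):
    # Namespace every key in data with the unit's role, e.g.
    # {'hostname': 'ctl1'} -> {'active_controller_hostname': 'ctl1'},
    # so the slurm.conf template can tell active and backup data apart.
    return {'{}_{}'.format(prefix, key): value
            for key, value in (data or {}).items()}
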
def configure_controller(*args):
    ''' A controller is only configured after leader election is
    performed. Cluster endpoint must be present for a controller to
    proceed with initial configuration'''
    hookenv.status_set('maintenance', 'Configuring slurm-controller')
    flags.clear_flag('slurm-controller.configured')

    # need to have a role determined here so that a controller context can
    # be uniformly prepared for consumption on the worker side as controller
    # and node layers share a common layer with a slurm.conf template
    # mostly identical on all nodes
    is_active = controller.is_active_controller()
    role = controller.ROLES[is_active]
    peer_role = controller.ROLES[not is_active]

    # the endpoint is present as joined is required for this handler
    cluster_endpoint = relations.endpoint_from_flag(
        'endpoint.slurm-cluster.joined')

    # Get node configs
    nodes = cluster_endpoint.get_node_data()
    partitions = controller.get_partitions(nodes)

    # Implementation of automatic node weights
    node_weight_criteria = hookenv.config().get('node_weight_criteria')
    if node_weight_criteria != 'none':
        weightres = controller.set_node_weight_criteria(
            node_weight_criteria, nodes)
        # If the weight configuration is incorrect, abort reconfiguration.
        # Status will be set to blocked with an informative message.
        # The controller charm will keep running.
        if not weightres:
            return

    # relation-changed does not necessarily mean that data will be provided
    if not partitions:
        flags.clear_flag('endpoint.slurm-cluster.changed')
        return

    # the whole charm config will be sent to related nodes
    # with some additional options added via dict update
    controller_conf = copy.deepcopy(hookenv.config())

    # if a controller cluster config include file exists, add its contents
    # to the controller_conf dict
    slurmconf_include = '%s/slurm-%s.conf' % (
        helpers.SLURM_CONFIG_DIR, hookenv.config().get('clustername'))
    if os.path.exists(slurmconf_include):
        with open(slurmconf_include, 'r') as f:
            controller_conf.update({'include': f.read()})

    controller_conf.update({
        'nodes': nodes,
        'partitions': partitions,
        # for worker nodes
        'munge_key': hookenv.leader_get('munge_key'),
    })

    net_details = controller.add_key_prefix(
        cluster_endpoint.network_details(), role)
    # update the config dict used as a context in rendering to have prefixed
    # keys for network details based on a current unit role (active or backup)
    controller_conf.update(net_details)

    ha_endpoint = relations.endpoint_from_flag(
        'endpoint.slurm-controller-ha.joined')
    if ha_endpoint:
        # add prefixed peer data
        peer_data = controller.add_key_prefix(
            ha_endpoint.peer_data, peer_role)
        controller_conf.update(peer_data)
    else:
        peer_data = None

    # If we have a DBD relation, extract endpoint data and configure the DBD
    # setup directly, regardless of whether the clustername gets accepted in
    # the DBD or not
    if (flags.is_flag_set('endpoint.slurm-dbd-consumer.joined')
            and leadership.leader_get('dbd_host')):
        controller_conf.update({
            'dbd_host': leadership.leader_get('dbd_host'),
            'dbd_port': leadership.leader_get('dbd_port'),
            'dbd_ipaddr': leadership.leader_get('dbd_ipaddr'),
        })

    es_endpoint = relations.endpoint_from_flag('elasticsearch.available')
    if es_endpoint:
        for unit in es_endpoint.list_unit_data():
            elastic_host = unit['host']
            elastic_port = unit['port']
            controller_conf.update({
                'elastic_host': elastic_host,
                'elastic_port': elastic_port,
            })
            hookenv.log('elasticsearch available, using %s:%s from endpoint '
                        'relation.' % (elastic_host, elastic_port))
    else:
        hookenv.log('No endpoint for elasticsearch available')

    # In case we are here due to a DBD join or a charm config change,
    # announce this to the nodes by changing the value of
    # slurm_config_updated
    if (flags.is_flag_set('slurm.dbd_host_updated')
            or flags.is_flag_set('config.changed')):
        ts = time.time()
        hookenv.log('Slurm configuration on controller was updated on %s, '
                    'announcing to nodes' % ts)
        controller_conf.update({'slurm_config_updated': ts})
        flags.clear_flag('slurm.dbd_host_updated')

    # a controller service is configurable if it is an active controller
    # or a backup controller that knows about an active controller
    is_configurable = is_active or (not is_active and peer_data)
    if is_configurable:
        hookenv.log('The controller is configurable ({})'.format(role))
        # Setup slurm dirs and config
        helpers.create_state_save_location(context=controller_conf)
        helpers.render_slurm_config(context=controller_conf,
                                    active_controller=is_active)
        flags.set_flag('slurm-controller.configured')
        flags.clear_flag('slurm-controller.reconfigure')
        flags.clear_flag('slurm-controller.munge_updated')
        # restart controller process on any changes
        # TODO: this could be optimized via goal-state hook by
        # observing "joining" node units
        host.service_restart(helpers.SLURMCTLD_SERVICE)
    else:
        hookenv.log('The controller is NOT configurable ({})'.format(role))
        if not is_active:
            hookenv.status_set('maintenance',
                               'Backup controller is waiting for peer data')

    # Send config to nodes
    if is_active:
        # TODO: wait until a peer acknowledges that it has cleared
        # its side of a node-facing relation - this needs to be done
        # in case an active controller is changed to a different one
        # to avoid split-brain conditions on node units
        cluster_endpoint.send_controller_config(controller_conf)
    else:
        # otherwise make sure that all keys are cleared
        # this is relevant for a former active controller
        cluster_endpoint.send_controller_config({
            k: None for k in controller_conf.keys()
        })

    # clear the changed flag as it is not cleared automatically
    flags.clear_flag('endpoint.slurm-cluster.changed')

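
# A sketch of the imports the two later configure_controller() versions
# appear to assume. The charmhelpers and charms.reactive modules are real;
# treating `leadership` as the charms.leadership layer is an assumption,
# and `controller` / `helpers` are charm-local modules whose import path
# depends on the charm's layout.
import copy
import os
import time

from charmhelpers.core import hookenv, host
from charms import leadership
from charms.reactive import flags, relations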