def configure_maas_stonith_resource(stonith_hostnames):
    """Create the MAAS stonith resource for the given hostnames.

    :param stonith_hostnames: Hostnames the stonith management system uses to
                              refer to the remote nodes.
    :type stonith_hostnames: List
    :returns: Mapping of stonith resource name to stonith plugin name.
    :rtype: dict
    """
    # The resource_params value is a template; _configure_stonith_resource
    # fills in url/apikey/hostnames from the same context.
    params_template = (
        "params url='{url}' apikey='{apikey}' hostnames='{hostnames}' "
        "op monitor interval=25 start-delay=25 "
        "timeout=25")
    ctxt = dict(
        stonith_plugin='stonith:external/maas',
        stonith_hostnames=stonith_hostnames,
        stonith_resource_name='st-maas',
        url=config('maas_url'),
        apikey=config('maas_credentials'),
        resource_params=params_template)
    _configure_stonith_resource(ctxt)
    # A stonith resource is inert unless the global property is enabled.
    pcmk.commit("crm configure property stonith-enabled=true",
                failure_is_fatal=True)
    return {ctxt['stonith_resource_name']: ctxt['stonith_plugin']}
def configure_stonith():
    """Enable STONITH when remote stonith resources exist, else disable it."""
    if configure_pacemaker_remote_stonith_resource():
        # Remote stonith resources were set up; finish enabling fencing and
        # record that stonith configuration is complete.
        configure_peer_stonith_resource()
        enable_stonith()
        set_stonith_configured(True)
        return
    log('Disabling STONITH', level=INFO)
    pcmk.commit("crm configure property stonith-enabled=false")
def configure_stonith():
    """Disable STONITH unless pacemaker remotes require it."""
    if not configure_pacemaker_remote_stonith_resource():
        log('Disabling STONITH', level=INFO)
        pcmk.commit("crm configure property stonith-enabled=false")
        return
    configure_peer_stonith_resource()
    log('Not disabling STONITH as pacemaker remotes are present',
        level=INFO)
def configure_monitor_host():
    """Configure extra monitor host for better network failure detection.

    Creates (or updates) an ocf:pacemaker:ping primitive plus a clone so
    every node pings the configured monitor_host; removes both when the
    monitor_host config option is unset.
    """
    log('Checking monitor host configuration', level=DEBUG)
    monitor_host = config('monitor_host')
    if monitor_host:
        if not pcmk.crm_opt_exists('ping'):
            log('Implementing monitor host configuration (host: %s)' %
                monitor_host, level=DEBUG)
            monitor_interval = config('monitor_interval')
            cmd = ('crm -w -F configure primitive ping '
                   'ocf:pacemaker:ping params host_list="%s" '
                   'multiplier="100" op monitor interval="%s" ' %
                   (monitor_host, monitor_interval))
            pcmk.commit(cmd)
            # Clone the primitive so the ping runs on every cluster node.
            cmd = ('crm -w -F configure clone cl_ping ping '
                   'meta interleave="true"')
            pcmk.commit(cmd)
        else:
            log('Reconfiguring monitor host configuration (host: %s)' %
                monitor_host, level=DEBUG)
            cmd = ('crm -w -F resource param ping set host_list="%s"' %
                   monitor_host)
            # BUG FIX: cmd was built but never committed, so changing
            # monitor_host after initial deploy silently had no effect.
            pcmk.commit(cmd)
    else:
        if pcmk.crm_opt_exists('ping'):
            log('Disabling monitor host configuration', level=DEBUG)
            pcmk.commit('crm -w -F resource stop ping')
            pcmk.commit('crm -w -F configure delete ping')
def configure_cluster_global():
    """Configure global cluster options"""
    log('Applying global cluster configuration', level=DEBUG)
    # NOTE(lathiat) quorum in a two-node scenario is handled by
    # corosync two_node=1. In this case quorum is required for
    # initial cluster startup but not if a node was previously in
    # contact with the full cluster.
    log('Configuring no-quorum-policy to stop', level=DEBUG)
    for cmd in ("crm configure property no-quorum-policy=stop",
                'crm configure rsc_defaults $id="rsc-options" '
                'resource-stickiness="100"'):
        pcmk.commit(cmd)
def add_location_rules_for_local_nodes(res_name):
    """Add location rules for running resource on local nodes.

    Add location rules allowing the given resource to run on local nodes (eg
    not remote nodes).

    :param res_name: Resource name to create location rules for.
    :type res_name: str
    """
    for node in pcmk.list_nodes():
        constraint = 'loc-{}-{}'.format(res_name, node)
        if pcmk.crm_opt_exists(constraint):
            # Rule already present; nothing to do for this node.
            continue
        cmd = 'crm -w -F configure location {} {} 0: {}'.format(
            constraint, res_name, node)
        pcmk.commit(cmd, failure_is_fatal=True)
        log('%s' % cmd, level=DEBUG)
def _trigger_corosync_update():
    """Ask all hanode peer units to re-emit corosync.conf and reload it."""
    # Trigger emit_corosync_conf() and corosync-cfgtool -R
    # for all the hanode peer units to run
    relids = relation_ids('hanode')
    if not relids:
        action_fail('no peer ha nodes')
        return

    update_uuid = uuid.uuid1().hex
    relation_set(relation_id=relids[0],
                 relation_settings={'trigger-corosync-update': update_uuid})

    # Trigger the same logic in the leader (no hanode-relation-changed
    # hook will be received by self)
    if (is_update_ring_requested(update_uuid) and
            emit_corosync_conf()):
        pcmk.commit('corosync-cfgtool -R')
def configure_pacemaker_remote(remote_hostname, remote_ip):
    """Create a resource corresponding to the pacemaker remote node.

    :param remote_hostname: Remote hostname used for registering remote node.
    :type remote_hostname: str
    :param remote_ip: Remote IP used for registering remote node.
    :type remote_ip: str
    :returns: Name of resource for pacemaker remote node.
    :rtype: str
    """
    resource_name = remote_hostname
    if pcmk.is_resource_present(resource_name):
        return resource_name
    pcmk.commit(
        ("crm configure primitive {} ocf:pacemaker:remote "
         "params server={} reconnect_interval=60 "
         "op monitor interval=30s").format(resource_name, remote_ip),
        failure_is_fatal=True)
    return resource_name
def configure_resources_on_remotes(resources=None, clones=None, groups=None):
    """Add location rules as needed for resources, clones and groups.

    If remote nodes should not run resources then add location rules
    restricting them to local nodes; ocf:pacemaker:remote primitives get
    their own dedicated rules instead.

    :param resources: Resource definitions (name -> resource type)
    :type resources: dict
    :param clones: Clone definitions
    :type clones: dict
    :param groups: Group definitions
    :type groups: dict
    """
    clones = clones or {}
    groups = groups or {}
    try:
        resources_on_remote = need_resources_on_remotes()
    except ValueError:
        msg = 'Unable to calculate whether resources should run on remotes'
        log(msg, level=WARNING)
        return
    if resources_on_remote:
        msg = ('Resources are permitted to run on remotes, no need to create '
               'location constraints')
        log(msg, level=WARNING)
        return
    # Resources that are members of a clone or group are constrained via
    # their parent; pacemaker remote primitives are handled separately.
    pacemaker_remotes = []
    for res_name, res_type in resources.items():
        if res_name not in list(clones.values()) + list(groups.values()):
            if res_type == 'ocf:pacemaker:remote':
                pacemaker_remotes.append(res_name)
            else:
                add_location_rules_for_local_nodes(res_name)
    add_location_rules_for_pacemaker_remotes(pacemaker_remotes)
    for cl_name in clones:
        add_location_rules_for_local_nodes(cl_name)
        # Limit clone resources to only running on X number of nodes where X
        # is the number of local nodes. Otherwise they will show as offline
        # on the remote nodes.
        node_count = len(pcmk.list_nodes())
        cmd = ('crm_resource --resource {} --set-parameter clone-max '
               '--meta --parameter-value {}').format(cl_name, node_count)
        pcmk.commit(cmd, failure_is_fatal=True)
        log('%s' % cmd, level=DEBUG)
    for grp_name in groups:
        add_location_rules_for_local_nodes(grp_name)
def configure_cluster_global():
    """Configure global cluster options"""
    log('Applying global cluster configuration', level=DEBUG)
    quorum_possible = int(config('cluster_count')) >= 3
    if quorum_possible:
        # NOTE(jamespage) if 3 or more nodes, then quorum can be
        # managed effectively, so stop if quorum lost
        log('Configuring no-quorum-policy to stop', level=DEBUG)
        cmd = "crm configure property no-quorum-policy=stop"
    else:
        # NOTE(jamespage) if less that 3 nodes, quorum not possible
        # so ignore
        log('Configuring no-quorum-policy to ignore', level=DEBUG)
        cmd = "crm configure property no-quorum-policy=ignore"
    pcmk.commit(cmd)
    pcmk.commit('crm configure rsc_defaults $id="rsc-options" '
                'resource-stickiness="100"')
def configure_maas_stonith_resource(stonith_hostnames):
    """Create stonith resource for the given hostname.

    :param stonith_hostnames: The hostnames that the stonith management system
                              refers to the remote node as.
    :type stonith_hostnames: List
    :returns: Mapping of stonith resource name to plugin name.
    :rtype: dict
    :raises: ValueError if maas_url/maas_credentials config or hostnames
             are missing.
    """
    # Register both the FQDN and the short hostname with the fence agent.
    hostnames = []
    for host in stonith_hostnames:
        hostnames.append(host)
        if '.' in host:
            hostnames.append(host.split('.')[0])
    hostnames = list(set(hostnames))
    ctxt = {
        'url': config('maas_url'),
        'apikey': config('maas_credentials'),
        'hostnames': ' '.join(sorted(hostnames))}
    if all(ctxt.values()):
        # Name the resource after a checksum of the MAAS login parameters so
        # a credentials/url change yields a new resource name.
        maas_login_params = "url='{url}' apikey='{apikey}'".format(**ctxt)
        maas_rsc_hash = pcmk.resource_checksum(
            'st',
            'stonith:external/maas',
            res_params=maas_login_params)[:7]
        ctxt['stonith_resource_name'] = 'st-maas-{}'.format(maas_rsc_hash)
        ctxt['resource_params'] = (
            "params url='{url}' apikey='{apikey}' hostnames='{hostnames}' "
            "op monitor interval=25 start-delay=25 "
            "timeout=25").format(**ctxt)
        # Update in place when the resource already exists, otherwise create.
        if pcmk.is_resource_present(ctxt['stonith_resource_name']):
            pcmk.crm_update_resource(
                ctxt['stonith_resource_name'],
                'stonith:external/maas',
                ctxt['resource_params'])
        else:
            cmd = (
                "crm configure primitive {stonith_resource_name} "
                "stonith:external/maas {resource_params}").format(**ctxt)
            pcmk.commit(cmd, failure_is_fatal=True)
        pcmk.commit(
            "crm configure property stonith-enabled=true",
            failure_is_fatal=True)
    else:
        raise ValueError("Missing configuration: {}".format(ctxt))
    return {ctxt['stonith_resource_name']: 'stonith:external/maas'}
def set_cluster_symmetry():
    """Set the cluster symmetry.

    By default the cluster is an Opt-out cluster (equivalent to
    symmetric-cluster=true) this means that any resource can run anywhere
    unless a node explicitly Opts-out. When using pacemaker-remotes there may
    be hundreds of nodes and if they are not prepared to run resources the
    cluster should be switched to an Opt-in cluster.
    """
    try:
        symmetric = need_resources_on_remotes()
    except ValueError:
        # Fix: message previously read 'Unable to calculated desired ...'.
        msg = 'Unable to calculate desired symmetric-cluster setting'
        log(msg, level=WARNING)
        return
    log('Configuring symmetric-cluster: {}'.format(symmetric), level=DEBUG)
    # crm expects a lowercase true/false literal.
    cmd = "crm configure property symmetric-cluster={}".format(
        str(symmetric).lower())
    pcmk.commit(cmd, failure_is_fatal=True)
def _configure_stonith_resource(ctxt):
    """Create or update a stonith resource described by the context.

    :param ctxt: Context holding stonith_hostnames, stonith_resource_name,
                 stonith_plugin and a resource_params template.
    :type ctxt: dict
    :raises: ValueError when any context value is missing or empty.
    """
    # Register both the FQDN and the short hostname with the fence agent.
    names = set()
    for host in ctxt['stonith_hostnames']:
        names.add(host)
        if '.' in host:
            names.add(host.split('.')[0])
    ctxt['hostnames'] = ' '.join(sorted(names))
    if not all(ctxt.values()):
        raise ValueError("Missing configuration: {}".format(ctxt))
    ctxt['resource_params'] = ctxt['resource_params'].format(**ctxt)
    resource_name = ctxt['stonith_resource_name']
    if pcmk.is_resource_present(resource_name):
        pcmk.crm_update_resource(resource_name,
                                 ctxt['stonith_plugin'],
                                 ctxt['resource_params'])
    else:
        pcmk.commit(("crm configure primitive {stonith_resource_name} "
                     "{stonith_plugin} {resource_params}").format(**ctxt),
                    failure_is_fatal=True)
def configure_pacemaker_remote(remote_hostname, remote_ip):
    """Create a resource corresponding to the pacemaker remote node.

    The resource is named after the short hostname (FQDN truncated at the
    first dot).

    :param remote_hostname: Remote hostname used for registering remote node.
    :type remote_hostname: str
    :param remote_ip: Remote IP used for registering remote node.
    :type remote_ip: str
    :returns: Name of resource for pacemaker remote node.
    :rtype: str
    """
    short_name = remote_hostname.split('.')[0]
    if pcmk.is_resource_present(short_name):
        return short_name
    cmd = ("crm configure primitive {} ocf:pacemaker:remote "
           "params server={} reconnect_interval=60 "
           "op monitor interval=30s").format(short_name, remote_ip)
    pcmk.commit(cmd, failure_is_fatal=True)
    return short_name
def configure_stonith():
    """Configure STONITH cluster-wide using the MAAS provider.

    When ``stonith_enabled`` config is unset/false, STONITH is disabled
    unless pacemaker remote stonith resources were configured. Otherwise a
    MAAS-backed stonith primitive is created for every cluster node using
    the MAAS endpoint and credentials from config.

    :raises: Exception if MAAS config is incomplete, the MAAS node
             inventory cannot be obtained, or a STONITH primitive cannot
             be determined for a node.
    """
    if config('stonith_enabled') not in ['true', 'True', True]:
        if configure_pacemaker_remote_stonith_resource():
            log('Not disabling STONITH as pacemaker remotes are present',
                level=INFO)
        else:
            log('Disabling STONITH', level=INFO)
            cmd = "crm configure property stonith-enabled=false"
            pcmk.commit(cmd)
    else:
        log('Enabling STONITH for all nodes in cluster.', level=INFO)
        # configure stonith resources for all nodes in cluster.
        # note: this is totally provider dependent and requires
        # access to the MAAS API endpoint, using endpoint and credentials
        # set in config.
        url = config('maas_url')
        creds = config('maas_credentials')
        if None in [url, creds]:
            msg = 'maas_url and maas_credentials must be set ' \
                  'in config to enable STONITH.'
            status_set('blocked', msg)
            raise Exception(msg)

        nodes = maas.MAASHelper(url, creds).list_nodes()
        if not nodes:
            msg = 'Could not obtain node inventory from ' \
                  'MAAS @ %s.' % url
            status_set('blocked', msg)
            raise Exception(msg)

        cluster_nodes = pcmk.list_nodes()
        for node in cluster_nodes:
            rsc, constraint = pcmk.maas_stonith_primitive(nodes, node)
            if not rsc:
                msg = 'Failed to determine STONITH primitive for ' \
                      'node %s' % node
                status_set('blocked', msg)
                raise Exception(msg)

            # The primitive name is the second token of the crm snippet.
            rsc_name = str(rsc).split(' ')[1]
            if not pcmk.is_resource_present(rsc_name):
                log('Creating new STONITH primitive %s.' % rsc_name,
                    level=DEBUG)
                cmd = 'crm -F configure %s' % rsc
                pcmk.commit(cmd)
                if constraint:
                    cmd = 'crm -F configure %s' % constraint
                    pcmk.commit(cmd)
            else:
                log('STONITH primitive already exists for node.',
                    level=DEBUG)

        pcmk.commit("crm configure property stonith-enabled=true")
def configure_legacy_stonith():
    """Legacy STONITH configuration driven by the ``stonith_enabled`` option.

    When ``stonith_enabled`` is unset/false, STONITH is disabled unless
    pacemaker remote stonith resources were configured. Otherwise a
    MAAS-backed stonith primitive is created for every cluster node using
    the MAAS endpoint and credentials from config.

    :raises: Exception if MAAS config is incomplete, the MAAS node
             inventory cannot be obtained, or a STONITH primitive cannot
             be determined for a node.
    """
    if config('stonith_enabled') not in ['true', 'True', True]:
        if configure_pacemaker_remote_stonith_resource():
            log('Not disabling STONITH as pacemaker remotes are present',
                level=INFO)
        else:
            log('Disabling STONITH', level=INFO)
            cmd = "crm configure property stonith-enabled=false"
            pcmk.commit(cmd)
    else:
        log('Enabling STONITH for all nodes in cluster.', level=INFO)
        # configure stonith resources for all nodes in cluster.
        # note: this is totally provider dependent and requires
        # access to the MAAS API endpoint, using endpoint and credentials
        # set in config.
        url = config('maas_url')
        creds = config('maas_credentials')
        if None in [url, creds]:
            msg = 'maas_url and maas_credentials must be set ' \
                  'in config to enable STONITH.'
            status_set('blocked', msg)
            raise Exception(msg)

        nodes = maas.MAASHelper(url, creds).list_nodes()
        if not nodes:
            msg = 'Could not obtain node inventory from ' \
                  'MAAS @ %s.' % url
            status_set('blocked', msg)
            raise Exception(msg)

        cluster_nodes = pcmk.list_nodes()
        for node in cluster_nodes:
            rsc, constraint = pcmk.maas_stonith_primitive(nodes, node)
            if not rsc:
                msg = 'Failed to determine STONITH primitive for ' \
                      'node %s' % node
                status_set('blocked', msg)
                raise Exception(msg)

            # The primitive name is the second token of the crm snippet.
            rsc_name = str(rsc).split(' ')[1]
            if not pcmk.is_resource_present(rsc_name):
                log('Creating new STONITH primitive %s.' % rsc_name,
                    level=DEBUG)
                cmd = 'crm -F configure %s' % rsc
                pcmk.commit(cmd)
                if constraint:
                    cmd = 'crm -F configure %s' % constraint
                    pcmk.commit(cmd)
            else:
                log('STONITH primitive already exists for node.',
                    level=DEBUG)

        pcmk.commit("crm configure property stonith-enabled=true")
def configure_resources_on_remotes(resources=None, clones=None, groups=None):
    """Add location rules as needed for resources, clones and groups.

    If remote nodes should not run resources then add location rules
    restricting resources, clones and groups to the local nodes.

    :param resources: Resource definitions
    :type resources: dict
    :param clones: Clone definitions
    :type clones: dict
    :param groups: Group definitions
    :type groups: dict
    """
    clones = clones or {}
    groups = groups or {}
    try:
        resources_on_remote = need_resources_on_remotes()
    except ValueError:
        log('Unable to calculate whether resources should run on remotes',
            level=WARNING)
        return
    if resources_on_remote:
        log('Resources are permitted to run on remotes, no need to create '
            'location constraints', level=WARNING)
        return
    # Members of clones/groups are constrained via their parent resource.
    members = list(clones.values()) + list(groups.values())
    for res_name in [name for name in resources if name not in members]:
        add_location_rules_for_local_nodes(res_name)
    for cl_name in clones:
        add_location_rules_for_local_nodes(cl_name)
        # Limit clone resources to only running on X number of nodes where X
        # is the number of local nodes. Otherwise they will show as offline
        # on the remote nodes.
        node_count = len(pcmk.list_nodes())
        cmd = ('crm_resource --resource {} --set-parameter clone-max '
               '--meta --parameter-value {}').format(cl_name, node_count)
        pcmk.commit(cmd, failure_is_fatal=True)
        log('%s' % cmd, level=DEBUG)
    for grp_name in groups:
        add_location_rules_for_local_nodes(grp_name)
def cleanup_remote_nodes(remote_nodes):
    """Cleanup pacemaker remote resources

    Remove all status records of the resource and
    probe the node afterwards.
    :param remote_nodes: List of resource names associated with remote nodes
    :type remote_nodes: list
    """
    for res_name in remote_nodes:
        cmd = 'crm resource cleanup {}'.format(res_name)
        # Resource cleanups seem to fail occasionally even on healthy nodes
        # Bug #1822962. Given this cleanup task is just housekeeping log
        # the message if a failure occurs and move on.
        rc = pcmk.commit(cmd, failure_is_fatal=False)
        if rc == 0:
            log('Cleanup of resource {} succeeded'.format(res_name),
                level=DEBUG)
        else:
            log('Cleanup of resource {} failed'.format(res_name),
                level=WARNING)
def cleanup_remote_nodes(remote_nodes):
    """Cleanup pacemaker remote resources

    Remove all status records of the resource and
    probe the node afterwards.
    :param remote_nodes: List of resource names associated with remote nodes
    :type remote_nodes: list
    """
    for res_name in remote_nodes:
        # Resource cleanups seem to fail occasionally even on healthy nodes
        # Bug #1822962. Given this cleanup task is just housekeeping log
        # the message if a failure occurs and move on.
        succeeded = pcmk.commit(
            'crm resource cleanup {}'.format(res_name),
            failure_is_fatal=False) == 0
        outcome, level = (('succeeded', DEBUG) if succeeded
                          else ('failed', WARNING))
        log('Cleanup of resource {} {}'.format(res_name, outcome),
            level=level)
def configure_cluster_global():
    """Configure global cluster options"""
    log('Applying global cluster configuration', level=DEBUG)
    # NOTE(lathiat) quorum in a two-node scenario is handled by
    # corosync two_node=1. In this case quorum is required for
    # initial cluster startup but not if a node was previously in
    # contact with the full cluster.
    log('Configuring no-quorum-policy to stop', level=DEBUG)
    pcmk.commit("crm configure property no-quorum-policy=stop")
    pcmk.commit('crm configure rsc_defaults $id="rsc-options" '
                'resource-stickiness="100"')
    log('Configuring cluster-recheck-interval to 60 seconds', level=DEBUG)
    pcmk.commit("crm configure property cluster-recheck-interval=60")
def configure_cluster_global(failure_timeout):
    """Configure global cluster options

    :param failure_timeout: Duration in seconds (measured from the most recent
                            failure) to wait before resetting failcount to 0.
    :type failure_timeout: int
    """
    log('Applying global cluster configuration', level=DEBUG)
    # NOTE(lathiat) quorum in a two-node scenario is handled by
    # corosync two_node=1. In this case quorum is required for
    # initial cluster startup but not if a node was previously in
    # contact with the full cluster.
    log('Configuring no-quorum-policy to stop', level=DEBUG)
    pcmk.commit("crm configure property no-quorum-policy=stop")
    rsc_defaults = ('crm configure rsc_defaults $id="rsc-options" '
                    'resource-stickiness="100" '
                    'failure-timeout={}'.format(failure_timeout))
    pcmk.commit(rsc_defaults)
    log('Configuring cluster-recheck-interval to 60 seconds', level=DEBUG)
    pcmk.commit("crm configure property cluster-recheck-interval=60")
def disable_stonith():
    """Disable stonith via the global property stonith-enabled."""
    cmd = "crm configure property stonith-enabled=false"
    pcmk.commit(cmd, failure_is_fatal=True)
def remove_legacy_maas_stonith_resources():
    """Remove maas stoniths resources using the old name."""
    # Stop each legacy resource before deleting its configuration.
    for resource_name in pcmk.crm_maas_stonith_resource_list():
        pcmk.commit('crm -w -F resource stop {}'.format(resource_name))
        pcmk.commit('crm -w -F configure delete {}'.format(resource_name))
def stop():
    """Remove this unit's node from the cluster and purge HA packages."""
    pcmk.commit('crm -w -F node delete %s' % socket.gethostname())
    apt_purge(['corosync', 'pacemaker'], fatal=True)
def ha_relation_changed():
    """Configure the corosync/pacemaker cluster from 'ha' relation data.

    Defers (returns early) until corosync configuration, hanode peers and
    the configured cluster_count of nodes are all available.  Parses the
    resource definitions provided by the principle unit, configures
    corosync, global cluster options, the monitor host and STONITH, and —
    on the oldest peer unit only — deletes/creates the crm resources before
    flagging every 'ha' relation as clustered.
    """
    # Check that we are related to a principle and that
    # it has already provided the required corosync configuration
    if not get_corosync_conf():
        log('Unable to configure corosync right now, deferring configuration',
            level=INFO)
        return

    if relation_ids('hanode'):
        log('Ready to form cluster - informing peers', level=DEBUG)
        relation_set(relation_id=relation_ids('hanode')[0], ready=True)
    else:
        log('Ready to form cluster, but not related to peers just yet',
            level=INFO)
        return

    # Check that there's enough nodes in order to perform the
    # configuration of the HA cluster
    if len(get_cluster_nodes()) < int(config('cluster_count')):
        log('Not enough nodes in cluster, deferring configuration',
            level=INFO)
        return

    relids = relation_ids('ha')
    if len(relids) == 1:  # Should only ever be one of these
        # Obtain relation information
        relid = relids[0]
        units = related_units(relid)
        if len(units) < 1:
            log('No principle unit found, deferring configuration',
                level=INFO)
            return

        unit = units[0]
        log('Parsing cluster configuration using rid: %s, unit: %s' %
            (relid, unit), level=DEBUG)
        resources = parse_data(relid, unit, 'resources')
        delete_resources = parse_data(relid, unit, 'delete_resources')
        resource_params = parse_data(relid, unit, 'resource_params')
        groups = parse_data(relid, unit, 'groups')
        ms = parse_data(relid, unit, 'ms')
        orders = parse_data(relid, unit, 'orders')
        colocations = parse_data(relid, unit, 'colocations')
        clones = parse_data(relid, unit, 'clones')
        locations = parse_data(relid, unit, 'locations')
        init_services = parse_data(relid, unit, 'init_services')
    else:
        log('Related to %s ha services' % (len(relids)), level=DEBUG)
        return

    # Install extra resource agent packages when the principle requests
    # openstack/ceph/maas OCF resource types.
    if True in [ra.startswith('ocf:openstack')
                for ra in resources.itervalues()]:
        apt_install('openstack-resource-agents')

    if True in [ra.startswith('ocf:ceph')
                for ra in resources.itervalues()]:
        apt_install('ceph-resource-agents')

    if True in [ra.startswith('ocf:maas') for ra in resources.values()]:
        if validate_dns_ha():
            log('Setting up access to MAAS API', level=INFO)
            setup_maas_api()
            # Update resource_parms for DNS resources to include MAAS URL and
            # credentials
            for resource in resource_params.keys():
                if resource.endswith("_hostname"):
                    resource_params[resource] += (
                        ' maas_url="{}" maas_credentials="{}"'
                        ''.format(config('maas_url'),
                                  config('maas_credentials')))
        else:
            msg = ("DNS HA is requested but maas_url "
                   "or maas_credentials are not set")
            status_set('blocked', msg)
            raise ValueError(msg)

    # NOTE: this should be removed in 15.04 cycle as corosync
    # configuration should be set directly on subordinate
    configure_corosync()
    pcmk.wait_for_pcmk()
    configure_cluster_global()
    configure_monitor_host()
    configure_stonith()

    # Only configure the cluster resources
    # from the oldest peer unit.
    if oldest_peer(peer_units()):
        # NOTE(review): 'Deleting Resources' contains no %s placeholder, so
        # this '%' formatting looks like it would raise TypeError for a
        # non-empty delete_resources; presumably 'Deleting Resources: %s'
        # was intended — confirm.
        log('Deleting Resources' % (delete_resources), level=DEBUG)
        for res_name in delete_resources:
            if pcmk.crm_opt_exists(res_name):
                if ocf_file_exists(res_name, resources):
                    log('Stopping and deleting resource %s' % res_name,
                        level=DEBUG)
                    if pcmk.crm_res_running(res_name):
                        pcmk.commit('crm -w -F resource stop %s' % res_name)
                else:
                    log('Cleanuping and deleting resource %s' % res_name,
                        level=DEBUG)
                    pcmk.commit('crm resource cleanup %s' % res_name)
                # Daemon process may still be running after the upgrade.
                kill_legacy_ocf_daemon_process(res_name)
                pcmk.commit('crm -w -F configure delete %s' % res_name)

        log('Configuring Resources: %s' % (resources), level=DEBUG)
        for res_name, res_type in resources.iteritems():
            # disable the service we are going to put in HA
            if res_type.split(':')[0] == "lsb":
                disable_lsb_services(res_type.split(':')[1])
                if service_running(res_type.split(':')[1]):
                    service_stop(res_type.split(':')[1])
            elif (len(init_services) != 0 and
                  res_name in init_services and
                  init_services[res_name]):
                disable_upstart_services(init_services[res_name])
                if service_running(init_services[res_name]):
                    service_stop(init_services[res_name])
            # Put the services in HA, if not already done so
            # if not pcmk.is_resource_present(res_name):
            if not pcmk.crm_opt_exists(res_name):
                if res_name not in resource_params:
                    cmd = 'crm -w -F configure primitive %s %s' % (res_name,
                                                                   res_type)
                else:
                    cmd = ('crm -w -F configure primitive %s %s %s' %
                           (res_name, res_type, resource_params[res_name]))

                pcmk.commit(cmd)
                log('%s' % cmd, level=DEBUG)
                if config('monitor_host'):
                    cmd = ('crm -F configure location Ping-%s %s rule '
                           '-inf: pingd lte 0' % (res_name, res_name))
                    pcmk.commit(cmd)

        log('Configuring Groups: %s' % (groups), level=DEBUG)
        for grp_name, grp_params in groups.iteritems():
            if not pcmk.crm_opt_exists(grp_name):
                cmd = ('crm -w -F configure group %s %s' %
                       (grp_name, grp_params))
                pcmk.commit(cmd)
                log('%s' % cmd, level=DEBUG)

        log('Configuring Master/Slave (ms): %s' % (ms), level=DEBUG)
        for ms_name, ms_params in ms.iteritems():
            if not pcmk.crm_opt_exists(ms_name):
                cmd = 'crm -w -F configure ms %s %s' % (ms_name, ms_params)
                pcmk.commit(cmd)
                log('%s' % cmd, level=DEBUG)

        log('Configuring Orders: %s' % (orders), level=DEBUG)
        for ord_name, ord_params in orders.iteritems():
            if not pcmk.crm_opt_exists(ord_name):
                cmd = 'crm -w -F configure order %s %s' % (ord_name,
                                                           ord_params)
                pcmk.commit(cmd)
                log('%s' % cmd, level=DEBUG)

        log('Configuring Colocations: %s' % colocations, level=DEBUG)
        for col_name, col_params in colocations.iteritems():
            if not pcmk.crm_opt_exists(col_name):
                cmd = 'crm -w -F configure colocation %s %s' % (col_name,
                                                                col_params)
                pcmk.commit(cmd)
                log('%s' % cmd, level=DEBUG)

        log('Configuring Clones: %s' % clones, level=DEBUG)
        for cln_name, cln_params in clones.iteritems():
            if not pcmk.crm_opt_exists(cln_name):
                cmd = 'crm -w -F configure clone %s %s' % (cln_name,
                                                           cln_params)
                pcmk.commit(cmd)
                log('%s' % cmd, level=DEBUG)

        log('Configuring Locations: %s' % locations, level=DEBUG)
        for loc_name, loc_params in locations.iteritems():
            if not pcmk.crm_opt_exists(loc_name):
                cmd = 'crm -w -F configure location %s %s' % (loc_name,
                                                              loc_params)
                pcmk.commit(cmd)
                log('%s' % cmd, level=DEBUG)

        for res_name, res_type in resources.iteritems():
            if len(init_services) != 0 and res_name in init_services:
                # Checks that the resources are running and started.
                # Ensure that clones are excluded as the resource is
                # not directly controllable (dealt with below)
                # Ensure that groups are cleaned up as a whole rather
                # than as individual resources.
                if (res_name not in clones.values() and
                        res_name not in groups.values() and
                        not pcmk.crm_res_running(res_name)):
                    # Just in case, cleanup the resources to ensure they get
                    # started in case they failed for some unrelated reason.
                    cmd = 'crm resource cleanup %s' % res_name
                    pcmk.commit(cmd)

        for cl_name in clones:
            # Always cleanup clones
            cmd = 'crm resource cleanup %s' % cl_name
            pcmk.commit(cmd)

        for grp_name in groups:
            # Always cleanup groups
            cmd = 'crm resource cleanup %s' % grp_name
            pcmk.commit(cmd)

    for rel_id in relation_ids('ha'):
        relation_set(relation_id=rel_id, clustered="yes")
def ha_relation_changed():
    """Configure the corosync/pacemaker cluster from 'ha' relation data.

    Defers (returns early) until corosync configuration, hanode peers and
    the configured cluster_count of nodes are all available.  Parses the
    resource definitions provided by the principle unit, configures
    corosync, global cluster options, the monitor host and STONITH, and —
    on the leader only — deletes/creates/updates the crm resources before
    flagging every 'ha' relation as clustered.
    """
    # Check that we are related to a principle and that
    # it has already provided the required corosync configuration
    if not get_corosync_conf():
        log('Unable to configure corosync right now, deferring configuration',
            level=INFO)
        return

    if relation_ids('hanode'):
        log('Ready to form cluster - informing peers', level=DEBUG)
        relation_set(relation_id=relation_ids('hanode')[0], ready=True)
    else:
        log('Ready to form cluster, but not related to peers just yet',
            level=INFO)
        return

    # Check that there's enough nodes in order to perform the
    # configuration of the HA cluster
    if len(get_cluster_nodes()) < int(config('cluster_count')):
        log('Not enough nodes in cluster, deferring configuration',
            level=INFO)
        return

    relids = relation_ids('ha')
    if len(relids) == 1:  # Should only ever be one of these
        # Obtain relation information
        relid = relids[0]
        units = related_units(relid)
        if len(units) < 1:
            log('No principle unit found, deferring configuration',
                level=INFO)
            return

        unit = units[0]
        log('Parsing cluster configuration using rid: %s, unit: %s' %
            (relid, unit), level=DEBUG)
        resources = parse_data(relid, unit, 'resources')
        delete_resources = parse_data(relid, unit, 'delete_resources')
        resource_params = parse_data(relid, unit, 'resource_params')
        groups = parse_data(relid, unit, 'groups')
        ms = parse_data(relid, unit, 'ms')
        orders = parse_data(relid, unit, 'orders')
        colocations = parse_data(relid, unit, 'colocations')
        clones = parse_data(relid, unit, 'clones')
        locations = parse_data(relid, unit, 'locations')
        init_services = parse_data(relid, unit, 'init_services')
    else:
        log('Related to %s ha services' % (len(relids)), level=DEBUG)
        return

    # Install extra resource agent packages when the principle requests
    # openstack/ceph/maas OCF resource types.
    if True in [ra.startswith('ocf:openstack')
                for ra in resources.values()]:
        apt_install('openstack-resource-agents')

    if True in [ra.startswith('ocf:ceph') for ra in resources.values()]:
        apt_install('ceph-resource-agents')

    if True in [ra.startswith('ocf:maas') for ra in resources.values()]:
        try:
            validate_dns_ha()
        except MAASConfigIncomplete as ex:
            log(ex.args[0], level=ERROR)
            status_set('blocked', ex.args[0])
            # if an exception is raised the hook will end up in error state
            # which will obfuscate the workload status and message.
            return

        log('Setting up access to MAAS API', level=INFO)
        setup_maas_api()
        # Update resource_parms for DNS resources to include MAAS URL and
        # credentials
        for resource in resource_params.keys():
            if resource.endswith("_hostname"):
                res_ipaddr = get_ip_addr_from_resource_params(
                    resource_params[resource])
                resource_params[resource] += (
                    ' maas_url="{}" maas_credentials="{}"'
                    ''.format(config('maas_url'),
                              config('maas_credentials')))
                write_maas_dns_address(resource, res_ipaddr)

    # NOTE: this should be removed in 15.04 cycle as corosync
    # configuration should be set directly on subordinate
    configure_corosync()
    try_pcmk_wait()
    configure_cluster_global()
    configure_monitor_host()
    configure_stonith()

    # Only configure the cluster resources
    # from the oldest peer unit.
    if is_leader():
        # NOTE(review): 'Deleting Resources' contains no %s placeholder, so
        # this '%' formatting looks like it would raise TypeError for a
        # non-empty delete_resources; presumably 'Deleting Resources: %s'
        # was intended — confirm.
        log('Deleting Resources' % (delete_resources), level=DEBUG)
        for res_name in delete_resources:
            if pcmk.crm_opt_exists(res_name):
                if ocf_file_exists(res_name, resources):
                    log('Stopping and deleting resource %s' % res_name,
                        level=DEBUG)
                    if pcmk.crm_res_running(res_name):
                        pcmk.commit('crm -w -F resource stop %s' % res_name)
                else:
                    log('Cleanuping and deleting resource %s' % res_name,
                        level=DEBUG)
                    pcmk.commit('crm resource cleanup %s' % res_name)
                # Daemon process may still be running after the upgrade.
                kill_legacy_ocf_daemon_process(res_name)
                pcmk.commit('crm -w -F configure delete %s' % res_name)

        log('Configuring Resources: %s' % (resources), level=DEBUG)
        for res_name, res_type in resources.items():
            # disable the service we are going to put in HA
            if res_type.split(':')[0] == "lsb":
                disable_lsb_services(res_type.split(':')[1])
                if service_running(res_type.split(':')[1]):
                    service_stop(res_type.split(':')[1])
            elif (len(init_services) != 0 and
                  res_name in init_services and
                  init_services[res_name]):
                disable_upstart_services(init_services[res_name])
                if service_running(init_services[res_name]):
                    service_stop(init_services[res_name])
            # Put the services in HA, if not already done so
            # if not pcmk.is_resource_present(res_name):
            if not pcmk.crm_opt_exists(res_name):
                if res_name not in resource_params:
                    cmd = 'crm -w -F configure primitive %s %s' % (res_name,
                                                                   res_type)
                else:
                    cmd = ('crm -w -F configure primitive %s %s %s' %
                           (res_name, res_type, resource_params[res_name]))

                pcmk.commit(cmd)
                log('%s' % cmd, level=DEBUG)
                if config('monitor_host'):
                    cmd = ('crm -F configure location Ping-%s %s rule '
                           '-inf: pingd lte 0' % (res_name, res_name))
                    pcmk.commit(cmd)
            else:
                # the resource already exists so it will be updated.
                code = pcmk.crm_update_resource(res_name, res_type,
                                                resource_params.get(res_name))
                if code != 0:
                    msg = "Cannot update pcmkr resource: {}".format(res_name)
                    status_set('blocked', msg)
                    raise Exception(msg)

        log('Configuring Groups: %s' % (groups), level=DEBUG)
        for grp_name, grp_params in groups.items():
            if not pcmk.crm_opt_exists(grp_name):
                cmd = ('crm -w -F configure group %s %s' %
                       (grp_name, grp_params))
                pcmk.commit(cmd)
                log('%s' % cmd, level=DEBUG)

        log('Configuring Master/Slave (ms): %s' % (ms), level=DEBUG)
        for ms_name, ms_params in ms.items():
            if not pcmk.crm_opt_exists(ms_name):
                cmd = 'crm -w -F configure ms %s %s' % (ms_name, ms_params)
                pcmk.commit(cmd)
                log('%s' % cmd, level=DEBUG)

        log('Configuring Orders: %s' % (orders), level=DEBUG)
        for ord_name, ord_params in orders.items():
            if not pcmk.crm_opt_exists(ord_name):
                cmd = 'crm -w -F configure order %s %s' % (ord_name,
                                                           ord_params)
                pcmk.commit(cmd)
                log('%s' % cmd, level=DEBUG)

        log('Configuring Clones: %s' % clones, level=DEBUG)
        for cln_name, cln_params in clones.items():
            if not pcmk.crm_opt_exists(cln_name):
                cmd = 'crm -w -F configure clone %s %s' % (cln_name,
                                                           cln_params)
                pcmk.commit(cmd)
                log('%s' % cmd, level=DEBUG)

        # Ordering is important here, colocation and location constraints
        # reference resources. All resources referenced by the constraints
        # need to exist otherwise constraint creation will fail.
        log('Configuring Colocations: %s' % colocations, level=DEBUG)
        for col_name, col_params in colocations.items():
            if not pcmk.crm_opt_exists(col_name):
                cmd = 'crm -w -F configure colocation %s %s' % (col_name,
                                                                col_params)
                pcmk.commit(cmd)
                log('%s' % cmd, level=DEBUG)

        log('Configuring Locations: %s' % locations, level=DEBUG)
        for loc_name, loc_params in locations.items():
            if not pcmk.crm_opt_exists(loc_name):
                cmd = 'crm -w -F configure location %s %s' % (loc_name,
                                                              loc_params)
                pcmk.commit(cmd)
                log('%s' % cmd, level=DEBUG)

        for res_name, res_type in resources.items():
            if len(init_services) != 0 and res_name in init_services:
                # Checks that the resources are running and started.
                # Ensure that clones are excluded as the resource is
                # not directly controllable (dealt with below)
                # Ensure that groups are cleaned up as a whole rather
                # than as individual resources.
                if (res_name not in clones.values() and
                        res_name not in groups.values() and
                        not pcmk.crm_res_running(res_name)):
                    # Just in case, cleanup the resources to ensure they get
                    # started in case they failed for some unrelated reason.
                    cmd = 'crm resource cleanup %s' % res_name
                    pcmk.commit(cmd)

        for cl_name in clones:
            # Always cleanup clones
            cmd = 'crm resource cleanup %s' % cl_name
            pcmk.commit(cmd)

        for grp_name in groups:
            # Always cleanup groups
            cmd = 'crm resource cleanup %s' % grp_name
            pcmk.commit(cmd)

    for rel_id in relation_ids('ha'):
        relation_set(relation_id=rel_id, clustered="yes")
def ha_relation_changed():
    """Configure the pacemaker/corosync cluster from 'ha' relation data.

    Hook entry point. Waits until corosync configuration, peer relations
    and the required node count are all available, then (on the leader
    only) creates/updates/deletes the pacemaker resources, groups, ms,
    orders, clones, colocations and locations described by the principle
    charm, and finally signals readiness on the 'ha' and 'hanode'
    relations.

    Returns early (deferring configuration) whenever a precondition is
    not yet met. Raises Exception if an existing pcmk resource cannot
    be updated.
    """
    # Check that we are related to a principle and that
    # it has already provided the required corosync configuration
    if not get_corosync_conf():
        log('Unable to configure corosync right now, deferring '
            'configuration', level=INFO)
        return

    if relation_ids('hanode'):
        log('Ready to form cluster - informing peers', level=DEBUG)
        relation_set(relation_id=relation_ids('hanode')[0], ready=True)
    else:
        log('Ready to form cluster, but not related to peers just yet',
            level=INFO)
        return

    # Check that there's enough nodes in order to perform the
    # configuration of the HA cluster
    if len(get_cluster_nodes()) < int(config('cluster_count')):
        log('Not enough nodes in cluster, deferring configuration',
            level=INFO)
        return

    relids = relation_ids('ha') or relation_ids('juju-info')
    if len(relids) == 1:  # Should only ever be one of these
        # Obtain relation information
        relid = relids[0]
        units = related_units(relid)
        if len(units) < 1:
            log('No principle unit found, deferring configuration',
                level=INFO)
            return

        unit = units[0]
        log('Parsing cluster configuration using rid: %s, unit: %s' %
            (relid, unit), level=DEBUG)
        # parse_data presumably returns a dict of the JSON-encoded
        # relation setting (empty when unset) -- confirm in its definition.
        resources = parse_data(relid, unit, 'resources')
        delete_resources = parse_data(relid, unit, 'delete_resources')
        resource_params = parse_data(relid, unit, 'resource_params')
        groups = parse_data(relid, unit, 'groups')
        ms = parse_data(relid, unit, 'ms')
        orders = parse_data(relid, unit, 'orders')
        colocations = parse_data(relid, unit, 'colocations')
        clones = parse_data(relid, unit, 'clones')
        locations = parse_data(relid, unit, 'locations')
        init_services = parse_data(relid, unit, 'init_services')
    else:
        log('Related to %s ha services' % (len(relids)), level=DEBUG)
        return

    # Install any extra resource agent packages implied by the requested
    # resource agents. any() short-circuits on the first match.
    if any(ra.startswith('ocf:openstack') for ra in resources.values()):
        apt_install('openstack-resource-agents')
    if any(ra.startswith('ocf:ceph') for ra in resources.values()):
        apt_install('ceph-resource-agents')
    if any(ra.startswith('ocf:maas') for ra in resources.values()):
        try:
            validate_dns_ha()
        except MAASConfigIncomplete as ex:
            log(ex.args[0], level=ERROR)
            status_set('blocked', ex.args[0])
            # if an exception is raised the hook will end up in error state
            # which will obfuscate the workload status and message.
            return

        log('Setting up access to MAAS API', level=INFO)
        setup_maas_api()
        # Update resource_parms for DNS resources to include MAAS URL and
        # credentials
        for resource in resource_params.keys():
            if resource.endswith("_hostname"):
                res_ipaddr = get_ip_addr_from_resource_params(
                    resource_params[resource])
                resource_params[resource] += (
                    ' maas_url="{}" maas_credentials="{}"'
                    ''.format(config('maas_url'),
                              config('maas_credentials')))
                write_maas_dns_address(resource, res_ipaddr)

    # NOTE: this should be removed in 15.04 cycle as corosync
    # configuration should be set directly on subordinate
    configure_corosync()
    try_pcmk_wait()
    failure_timeout = config('failure_timeout')
    configure_cluster_global(failure_timeout)
    configure_monitor_host()
    configure_stonith()

    # Only configure the cluster resources
    # from the oldest peer unit.
    if is_leader():
        log('Setting cluster symmetry', level=INFO)
        set_cluster_symmetry()
        # FIX: the original format string had no %s placeholder, so the
        # resource list was never rendered (and a non-empty list argument
        # raises TypeError: not all arguments converted).
        log('Deleting Resources: %s' % (delete_resources,), level=DEBUG)
        for res_name in delete_resources:
            if pcmk.crm_opt_exists(res_name):
                if ocf_file_exists(res_name, resources):
                    log('Stopping and deleting resource %s' % res_name,
                        level=DEBUG)
                    if pcmk.crm_res_running(res_name):
                        pcmk.commit('crm -w -F resource stop %s' % res_name)
                else:
                    log('Cleanuping and deleting resource %s' % res_name,
                        level=DEBUG)
                    pcmk.commit('crm resource cleanup %s' % res_name)
                    # Daemon process may still be running after the upgrade.
                    kill_legacy_ocf_daemon_process(res_name)
                pcmk.commit('crm -w -F configure delete %s' % res_name)

        log('Configuring Resources: %s' % (resources), level=DEBUG)
        for res_name, res_type in resources.items():
            # disable the service we are going to put in HA
            if res_type.split(':')[0] == "lsb":
                disable_lsb_services(res_type.split(':')[1])
                if service_running(res_type.split(':')[1]):
                    service_stop(res_type.split(':')[1])
            elif (len(init_services) != 0 and
                    res_name in init_services and
                    init_services[res_name]):
                disable_upstart_services(init_services[res_name])
                if service_running(init_services[res_name]):
                    service_stop(init_services[res_name])
            # Put the services in HA, if not already done so
            # if not pcmk.is_resource_present(res_name):
            if not pcmk.crm_opt_exists(res_name):
                if res_name not in resource_params:
                    cmd = 'crm -w -F configure primitive %s %s' % (res_name,
                                                                   res_type)
                else:
                    cmd = ('crm -w -F configure primitive %s %s %s' %
                           (res_name, res_type, resource_params[res_name]))
                pcmk.commit(cmd)
                log('%s' % cmd, level=DEBUG)
                if config('monitor_host'):
                    cmd = ('crm -F configure location Ping-%s %s rule '
                           '-inf: pingd lte 0' % (res_name, res_name))
                    pcmk.commit(cmd)
            else:
                # the resource already exists so it will be updated.
                code = pcmk.crm_update_resource(
                    res_name, res_type, resource_params.get(res_name))
                if code != 0:
                    msg = "Cannot update pcmkr resource: {}".format(res_name)
                    status_set('blocked', msg)
                    raise Exception(msg)

        log('Configuring Groups: %s' % (groups), level=DEBUG)
        for grp_name, grp_params in groups.items():
            if not pcmk.crm_opt_exists(grp_name):
                cmd = ('crm -w -F configure group %s %s' %
                       (grp_name, grp_params))
                pcmk.commit(cmd)
                log('%s' % cmd, level=DEBUG)

        log('Configuring Master/Slave (ms): %s' % (ms), level=DEBUG)
        for ms_name, ms_params in ms.items():
            if not pcmk.crm_opt_exists(ms_name):
                cmd = 'crm -w -F configure ms %s %s' % (ms_name, ms_params)
                pcmk.commit(cmd)
                log('%s' % cmd, level=DEBUG)

        log('Configuring Orders: %s' % (orders), level=DEBUG)
        for ord_name, ord_params in orders.items():
            if not pcmk.crm_opt_exists(ord_name):
                cmd = 'crm -w -F configure order %s %s' % (ord_name,
                                                           ord_params)
                pcmk.commit(cmd)
                log('%s' % cmd, level=DEBUG)

        log('Configuring Clones: %s' % clones, level=DEBUG)
        for cln_name, cln_params in clones.items():
            if not pcmk.crm_opt_exists(cln_name):
                cmd = 'crm -w -F configure clone %s %s' % (cln_name,
                                                           cln_params)
                pcmk.commit(cmd)
                log('%s' % cmd, level=DEBUG)

        # Ordering is important here, colocation and location constraints
        # reference resources. All resources referenced by the constraints
        # need to exist otherwise constraint creation will fail.
        log('Configuring Colocations: %s' % colocations, level=DEBUG)
        for col_name, col_params in colocations.items():
            if not pcmk.crm_opt_exists(col_name):
                cmd = 'crm -w -F configure colocation %s %s' % (col_name,
                                                                col_params)
                pcmk.commit(cmd)
                log('%s' % cmd, level=DEBUG)

        log('Configuring Locations: %s' % locations, level=DEBUG)
        for loc_name, loc_params in locations.items():
            if not pcmk.crm_opt_exists(loc_name):
                cmd = 'crm -w -F configure location %s %s' % (loc_name,
                                                              loc_params)
                pcmk.commit(cmd)
                log('%s' % cmd, level=DEBUG)

        for res_name, res_type in resources.items():
            if len(init_services) != 0 and res_name in init_services:
                # Checks that the resources are running and started.
                # Ensure that clones are excluded as the resource is
                # not directly controllable (dealt with below)
                # Ensure that groups are cleaned up as a whole rather
                # than as individual resources.
                if (res_name not in clones.values() and
                        res_name not in groups.values() and
                        not pcmk.crm_res_running(res_name)):
                    # Just in case, cleanup the resources to ensure they get
                    # started in case they failed for some unrelated reason.
                    cmd = 'crm resource cleanup %s' % res_name
                    pcmk.commit(cmd)

        for cl_name in clones:
            # Always cleanup clones
            cmd = 'crm resource cleanup %s' % cl_name
            pcmk.commit(cmd)

        for grp_name in groups:
            # Always cleanup groups
            cmd = 'crm resource cleanup %s' % grp_name
            pcmk.commit(cmd)

        # All members of the cluster need to be registered before resources
        # that reference them can be created.
        if len(get_member_ready_nodes()) >= int(config('cluster_count')):
            log('Configuring any remote nodes', level=INFO)
            remote_resources = configure_pacemaker_remote_resources()
            stonith_resource = configure_pacemaker_remote_stonith_resource()
            resources.update(remote_resources)
            resources.update(stonith_resource)
            configure_resources_on_remotes(
                resources=resources,
                clones=clones,
                groups=groups)
        else:
            log('Deferring configuration of any remote nodes', level=INFO)

    for rel_id in relation_ids('ha'):
        relation_set(relation_id=rel_id, clustered="yes")

    # Inform peers that local configuration is complete and this member
    # is ready
    for rel_id in relation_ids('hanode'):
        relation_set(relation_id=rel_id, member_ready=True)