def get_member_id(nodename=None): ''' Return the member ID (different from the node ID) for an etcd member of the cluster. Arguments: * `nodename`: (optional) the nodename for the member we want the ID for. If no name is provided (or empty), the local node will be used. ''' command = ["etcdctl"] + get_etcdctl_args() + ["member", "list"] target_nodename = nodename or __salt__['caasp_net.get_nodename']() debug("getting etcd member ID with: %s", command) members_output = '' try: target_url = 'https://{}:{}'.format(target_nodename, ETCD_CLIENT_PORT) members_output = subprocess.check_output(command) for member_line in members_output.splitlines(): if target_url in member_line: return member_line.split(':')[0] except Exception as e: error('cannot get member ID for "%s": %s', target_nodename, e) error('output: %s', members_output) return ''
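The parsing above assumes the etcdctl v2 `member list` output, where each line starts with the member ID followed by a colon. A minimal illustration of that assumption (the sample line and node name below are hypothetical):

# Hypothetical `etcdctl member list` line (v2-style output):
sample_line = ('8e9e05c52164694d: name=master0 '
               'peerURLs=https://master0:2380 '
               'clientURLs=https://master0:2379 isLeader=true')
# get_member_id() matches the node by its client URL and returns the
# leading hex ID, i.e. everything before the first ':':
if 'https://master0:2379' in sample_line:
    member_id = sample_line.split(':')[0]  # -> '8e9e05c52164694d'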
def get_cluster_size(**kwargs): ''' Determines the optimal/desired (but attainable) etcd cluster size. Determines the desired number of cluster members, defaulting to the value supplied in the etcd:masters pillar, falling back to the number of nodes with the kube-master role, and if this is less than 3, it will bump it to 3 (or the number of nodes available if there are fewer than 3 nodes). Optional arguments: * `masters`: list of current kubernetes masters * `minions`: list of current kubernetes minions ''' member_count = __salt__['pillar.get']('etcd:masters', None) masters = __salt__['caasp_nodes.get_from_args_or_with_expr']( 'masters', kwargs, 'G@roles:kube-master') minions = __salt__['caasp_nodes.get_from_args_or_with_expr']( 'minions', kwargs, 'G@roles:kube-minion') if not member_count: # A value has not been set in the pillar, calculate a "good" number # for the user. num_masters = len(masters) member_count = _optimal_etcd_number(num_masters) if member_count < MIN_RECOMMENDED_MEMBER_COUNT: # Attempt to increase the number of etcd masters to 3, # however, if we don't have 3 nodes in total, # then match the number of nodes we have. increased_member_count = len(masters) + len(minions) increased_member_count = min(MIN_RECOMMENDED_MEMBER_COUNT, increased_member_count) # ... but make sure we are using an odd number # (otherwise we could have some leader election problems) member_count = _optimal_etcd_number(increased_member_count) warn("etcd member count too low (%d), increasing to %d", num_masters, increased_member_count) # TODO: go deeper and look for candidates in nodes with # no role (as get_replacement_for_member() does) else: # A value has been set in the pillar, respect the user's choice # even if it's not a "good" number. member_count = int(member_count) if member_count < MIN_RECOMMENDED_MEMBER_COUNT: warn( "etcd member count too low (%d), consider increasing " "to %d", member_count, MIN_RECOMMENDED_MEMBER_COUNT) member_count = max(1, member_count) debug("using member count = %d", member_count) return member_count
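`_optimal_etcd_number()` is defined elsewhere in this module and is not shown in this section; a minimal sketch of what it plausibly does, assuming it simply rounds the node count down to an odd number (the real helper may also cap the value):

def _optimal_etcd_number(num_nodes):
    # Hypothetical sketch only: keep the member count odd so leader
    # election always has a clear majority, and never go below 1.
    if num_nodes <= 1:
        return 1
    return num_nodes if num_nodes % 2 == 1 else num_nodes - 1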
def get_additional_etcd_members(num_wanted=None, **kwargs): ''' Taking into account 1) the current number of etcd members, and 2) the number of etcd nodes we should be running in the cluster (obtained with `get_cluster_size()`), get a list of additional nodes (IDs) that should run `etcd` too. Optional arguments: * `etcd_members`: list of current etcd members * `excluded`: list of nodes to exclude ''' excluded = kwargs.get('excluded', []) current_etcd_members = __salt__['caasp_nodes.get_from_args_or_with_expr']( 'etcd_members', kwargs, 'G@roles:etcd') num_current_etcd_members = len(current_etcd_members) # the number of etcd masters that should be in the cluster num_wanted_etcd_members = num_wanted or get_cluster_size(**kwargs) # ... and the number we are missing num_additional_etcd_members = num_wanted_etcd_members - num_current_etcd_members if num_additional_etcd_members <= 0: debug('get_additional_etcd_members: we do not need more etcd members') return [] debug('get_additional_etcd_members: curr:%d wanted:%d -> %d missing', num_current_etcd_members, num_wanted_etcd_members, num_additional_etcd_members) # Get a list of `num_additional_etcd_members` nodes that could be used # for running etcd. A valid node is a node that: # # 1) is not the `admin` or `ca` # 2) has no `etcd` role (bootstrapped or not) # 3) is not being removed/added/updated # 4) (in preference order, first for non-bootstrapped nodes) # 1) has no role assigned # 2) is a master # 3) is a minion # new_etcd_members = __salt__['caasp_nodes.get_with_prio_for_role']( num_additional_etcd_members, 'etcd', excluded=current_etcd_members + excluded) if len(new_etcd_members) < num_additional_etcd_members: error( 'get_additional_etcd_members: cannot satisfy the %d members missing', num_additional_etcd_members) return new_etcd_members
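Illustrative call only (the node IDs are hypothetical): roughly how an orchestration step could ask for extra etcd nodes while excluding a node that is being removed.

# Hypothetical usage: current members are passed explicitly, so no
# grains query is needed for them; 'worker7' is excluded from candidates.
new_members = get_additional_etcd_members(
    etcd_members=['master0', 'master1'],
    excluded=['worker7'])
# new_members contains the IDs of the nodes that should be promoted to
# run etcd (possibly empty if the cluster is already large enough).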
def get_from_args_or_with_expr(arg_name, args_dict, *args, **kwargs): """ Utility function for getting a list of nodes either from the kwargs or from an expression. """ if arg_name in args_dict: debug('using argument "%s": %s', arg_name, args_dict[arg_name]) return _sanitize_list(args_dict[arg_name]) else: return get_with_expr(*args, **kwargs)
def _get_hostname_and_port(url, default_port=None): parsed = urlparse(url) if parsed.hostname: hostname = parsed.hostname port = parsed.port else: splitted_url = url.split(':') hostname = splitted_url[0] if len(splitted_url) > 1: port = int(splitted_url[1]) else: port = None res = (hostname, port or default_port) debug("%s parsed as %s", url, res) return res
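Some illustrative expectations for the parser above (not part of the module); they rely on `urlparse` only reporting a hostname when the URL includes a `//` netloc, so bare "host:port" strings take the manual split path:

# URLs with a scheme are handled by urlparse:
assert _get_hostname_and_port('https://my-registry:5000') == ('my-registry', 5000)
# Bare "host:port" strings fall back to the manual split:
assert _get_hostname_and_port('my-registry:5000') == ('my-registry', 5000)
# Without a port, the default (if any) is used:
assert _get_hostname_and_port('my-registry', default_port=5000) == ('my-registry', 5000)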
def get_with_prio(num, description, prio_rules, **kwargs): """ Get a list of `num` nodes that could be used for running some role. A valid node is a node that: 1) is not the `admin` or `ca` 2) does not currently have that role 3) is not being removed/added/updated """ new_nodes = [] remaining = num for expr in prio_rules: debug('trying to find candidates for "%s" with "%s"', description, expr) # get all the nodes matching the priority expression, # but filtering out all the nodes we already have candidates = get_with_expr( expr, exclude_admin=True, exclude_in_progress=True, **kwargs ) debug("... %d candidates", len(candidates)) ids = [x for x in candidates if x not in new_nodes] if len(ids) > 0: debug("... new candidates: %s (we need %d)", ids, remaining) new_ids = ids[:remaining] new_nodes = new_nodes + new_ids remaining -= len(new_ids) debug( "... %d new candidates (%s) for %s: %d remaining", len(ids), str(ids), description, remaining, ) else: debug('... no new candidates found with "%s"', expr) if remaining <= 0: break info( "we were looking for %d candidates for %s and found %d", num, description, len(new_nodes), ) return new_nodes[:num]
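For context, the `caasp_nodes.get_with_prio_for_role` helper used by `get_additional_etcd_members()` above is not shown in this section; a hypothetical sketch of such a wrapper, assuming the preference order described there (role-less nodes first, then masters, then minions):

def get_with_prio_for_role(num, role, **kwargs):
    # Hypothetical sketch: the real helper lives elsewhere in this module.
    # Preference order for candidates, per get_additional_etcd_members():
    prio_rules = [
        'not P@roles:(kube-master|kube-minion|etcd)',  # no role assigned yet
        'G@roles:kube-master',
        'G@roles:kube-minion',
    ]
    # never pick nodes that already run `role`
    kwargs['excluded_roles'] = kwargs.get('excluded_roles', []) + [role]
    return get_with_prio(num, role, prio_rules, **kwargs)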
def get_with_expr(expr, **kwargs): ''' Get all the nodes that match some expression `expr` Optional arguments: * `booted`: exclude non-bootstrapped nodes * `exclude_admin`: exclude the Admin and CA nodes * `exclude_in_progress`: exclude any node with *_in_progress grains * `excluded`: list of nodes to exclude * `excluded_roles`: list of roles to exclude * `excluded_grains`: list of grains to exclude * `grain`: return a map of <id>:<grain> items instead of a list of <id>s ''' expr_items = [expr] grain = kwargs.get('grain', DEFAULT_GRAIN) excluded = _sanitize_list(kwargs.get('excluded', [])) excluded_grains = _sanitize_list(kwargs.get('excluded_grains', [])) excluded_roles = _sanitize_list(kwargs.get('excluded_roles', [])) if kwargs.get('booted', False): expr_items.append('G@bootstrap_complete:true') if kwargs.get('exclude_admin', False): excluded_roles += ['admin', 'ca'] if kwargs.get('exclude_in_progress', False): excluded_grains += IN_PROGRESS_GRAINS if excluded: expr_items.append('not L@' + '|'.join(excluded)) excluded_roles = _sanitize_list(excluded_roles) if excluded_roles: expr_items.append('not P@roles:(' + '|'.join(excluded_roles) + ')') excluded_grains = _sanitize_list(excluded_grains) if excluded_grains: expr_items += ['not G@{}:true'.format(g) for g in excluded_grains] res = __salt__['caasp_grains.get'](' and '.join(expr_items), grain=grain) res = res if ('grain' in kwargs) else res.keys() debug('%s: %s', expr, res) return res
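Illustrative only: tracing a call through the function above (the node and role names are hypothetical), the compound matcher handed to `caasp_grains.get` would look roughly like this:

# get_with_expr('G@roles:kube-minion', booted=True,
#               exclude_admin=True, excluded=['node3'])
# builds, in this order: the base expression, the bootstrap filter,
# the excluded-nodes filter and the excluded-roles filter:
expected_expr = ('G@roles:kube-minion'
                 ' and G@bootstrap_complete:true'
                 ' and not L@node3'
                 ' and not P@roles:(admin|ca)')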
def stop_container_and_wait(name, namespace, timeout=60, **kwargs): ''' Stop the running container named ``name`` inside the specified ``namespace``, then wait for kubelet to bring up a new instance of the same container. name Name of the container. This is checked against the ``metadata.name`` field of a kubernetes pod. namespace Name of the namespace to search for the container in. timeout If the container has not been restarted after ``timeout`` seconds, return with a failure. By default, a 60-second timeout is applied. .. code-block:: yaml kube_system_haproxy: caasp_cri.stop_container_and_wait: name: haproxy namespace: kube-system timeout: 120 ''' stopped = __salt__['caasp_cri.stop_container'](name, namespace, **kwargs) if not stopped: debug( 'CaaS: {namespace}.{name} container was not found running'.format( namespace=namespace, name=name)) return wait_for_container(name, namespace, timeout, **kwargs)
def get_expr_affected_by(target, **kwargs): """ Get an expression for matching nodes that are affected by the addition/removal of `target`. Those affected nodes should be highstated in order to update their configuration. Some notes: * we only consider bootstrapped nodes. * we ignore nodes where some other operation is in progress (ie, an update) Optional arguments: * `exclude_in_progress`: (default=True) exclude any node with *_in_progress grains * `excluded`: list of nodes to exclude * `excluded_roles`: list of roles to exclude """ affected_items = [] affected_roles = [] etcd_members = get_from_args_or_with_expr("etcd_members", kwargs, "G@roles:etcd") masters = get_from_args_or_with_expr("masters", kwargs, "G@roles:kube-master") minions = get_from_args_or_with_expr("minions", kwargs, "G@roles:kube-minion") if target in etcd_members: # we must highstate: # * etcd members (ie, peers list in /etc/sysconfig/etcd) affected_roles.append("etcd") # * api servers (ie, etcd endpoints in /etc/kubernetes/apiserver) affected_roles.append("kube-master") if target in masters: # we must highstate: # * admin (ie, haproxy) affected_roles.append("admin") # * minions (ie, haproxy) affected_roles.append("kube-minion") if target in minions: # ok, ok, /etc/hosts will contain the old node, but who cares! pass if not affected_roles: debug("no roles affected by the removal/addition of %s", target) return "" affected_items.append("G@bootstrap_complete:true") affected_roles.sort() affected_items.append("P@roles:(" + "|".join(affected_roles) + ")") # exclude some roles affected_items.append("not G@roles:ca") if kwargs.get("exclude_in_progress", True): affected_items.append("not G@bootstrap_in_progress:true") affected_items.append("not G@update_in_progress:true") affected_items.append("not G@node_removal_in_progress:true") affected_items.append("not G@node_addition_in_progress:true") excluded_nodes = _sanitize_list([target] + kwargs.get("excluded", [])) if excluded_nodes: affected_items.append("not L@" + ",".join(excluded_nodes)) excluded_roles = _sanitize_list(kwargs.get("excluded_roles", [])) if excluded_roles: affected_items.append("not P@roles:(" + "|".join(excluded_roles) + ")") return " and ".join(affected_items)
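Illustrative only (the node name is hypothetical): for a `target` that has only the etcd role, and with the default arguments above, the returned expression would be along these lines:

# get_expr_affected_by('node1') with node1 only in the etcd role:
expected_expr = ('G@bootstrap_complete:true'
                 ' and P@roles:(etcd|kube-master)'
                 ' and not G@roles:ca'
                 ' and not G@bootstrap_in_progress:true'
                 ' and not G@update_in_progress:true'
                 ' and not G@node_removal_in_progress:true'
                 ' and not G@node_addition_in_progress:true'
                 ' and not L@node1')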
def get_replacement_for(target, replacement="", **kwargs): """ When removing a node `target`, try to get a `replacement` (and the new roles that must be assigned) for all the roles that were running there. If the user provides an explicit `replacement`, verify that the replacement is valid. In case the user-provided one is not valid, raise an exception (aborting the execution). If no replacement can be found, we are fine as long as we have a minimum number of nodes with that role (ie, for masters, we are fine as long as we have at least one master). """ assert target excluded = kwargs.get("excluded", []) replacement_provided = replacement != "" replacement_roles = [] def warn_or_abort_on_replacement_provided(msg, *args): if replacement_provided: abort("the user-provided replacement cannot be used: " + msg, *args) else: warn(msg, *args) # preparations # check: we cannot try to remove some 'virtual' nodes forbidden = get_from_args_or_with_expr("forbidden", kwargs, "P@roles:(admin|ca)") if target in forbidden: abort('%s cannot be removed: it has a "ca" or "admin" role', target) elif replacement_provided and replacement in forbidden: abort( '%s cannot be replaced by %s: the replacement has a "ca" or "admin" role', target, replacement, ) elif replacement_provided and replacement in excluded: abort( "%s cannot be replaced by %s: the replacement is in the list of excluded nodes", target, replacement, ) masters = get_from_args_or_with_expr("masters", kwargs, "G@roles:kube-master") minions = get_from_args_or_with_expr("minions", kwargs, "G@roles:kube-minion") etcd_members = get_from_args_or_with_expr("etcd_members", kwargs, "G@roles:etcd") # # replacement for etcd members # if target in etcd_members: etcd_replacement = replacement if not etcd_replacement: debug("looking for replacement for etcd at %s", target) # we must choose another node and promote it to be an etcd member etcd_replacement = _get_one_for_role("etcd", excluded=excluded) # check if the replacement provided is valid if etcd_replacement: bootstrapped_etcd_members = get_from_args_or_with_expr( "booted_etcd_members", kwargs, "G@roles:etcd", booted=True ) if etcd_replacement in bootstrapped_etcd_members: warn_or_abort_on_replacement_provided( "the replacement for the etcd server %s cannot be %s: another etcd server is already running there", target, etcd_replacement, ) etcd_replacement = "" # the etcd replacement can be run in bootstrapped masters/minions, # so we are done with the incompatibility checks... if etcd_replacement: debug( "setting %s as the replacement for the etcd member %s", etcd_replacement, target, ) replacement = etcd_replacement replacement_roles.append("etcd") if "etcd" not in replacement_roles: if len(etcd_members) <= _MIN_ETCD_MEMBERS_AFTER_REMOVAL: # we need at least one etcd server abort( "cannot remove etcd member %s: too few etcd members, and no replacement found or provided", target, ) else: warn( "number of etcd members will be reduced to %d, as no replacement for the etcd server in %s has been found (or provided)", len(etcd_members), target, ) # # replacement for k8s masters # if target in masters: master_replacement = replacement if not master_replacement: # NOTE: even if no `replacement` was provided in the pillar, # we probably have one at this point: if the master was # running etcd as well, we have already tried to find # a replacement in the previous step... # however, we must verify that the etcd replacement # is a valid k8s master replacement too. 
# (ideally we should find the union of etcd and # masters candidates) debug("looking for replacement for kubernetes master at %s", target) master_replacement = _get_one_for_role("kube-master", excluded=excluded) # check if the replacement provided/found is valid if master_replacement: bootstrapped_masters = get_from_args_or_with_expr( "booted_masters", kwargs, "G@roles:kube-master", booted=True ) if master_replacement in bootstrapped_masters: warn_or_abort_on_replacement_provided( "will not replace the k8s master %s: the replacement %s is already running a k8s master", target, master_replacement, ) master_replacement = "" elif master_replacement in minions: warn_or_abort_on_replacement_provided( "will not replace the k8s master at %s: the replacement found/provided is the k8s minion %s", target, master_replacement, ) master_replacement = "" if master_replacement: # so far we do not support having two replacements for two roles, # so we check if the new replacement is compatible with any previous # replacement found so far. If it is not, keep the previous one and # warn the user if not replacement: replacement = master_replacement assert len(replacement) > 0 if replacement == master_replacement: debug( "setting %s as replacement for the kubernetes master %s", replacement, target, ) replacement_roles.append("kube-master") else: warn( "the k8s master replacement (%s) is not the same as the current replacement (%s) " + "(it will run %s) so we cannot use it for running the k8s master too", master_replacement, replacement, ",".join(replacement_roles), ) if "kube-master" not in replacement_roles: # stability check: check if it is ok not to run the k8s master in the replacement if len(masters) <= _MIN_MASTERS_AFTER_REMOVAL: # we need at least one master (for running the k8s API at all times) abort( "cannot remove k8s master %s: too few k8s masters, and no replacement found or provided", target, ) else: warn( "number of k8s masters will be reduced to %d, as no replacement for the k8s master in %s has been found (or provided)", len(masters), target, ) # # replacement for k8s minions # if target in minions: minion_replacement = replacement if not minion_replacement: debug("looking for replacement for kubernetes minion at %s", target) minion_replacement = _get_one_for_role("kube-minion", excluded=excluded) # check if the replacement provided/found is valid # NOTE: maybe the new role has already been assigned in Velum... 
if minion_replacement: bootstrapped_minions = get_from_args_or_with_expr( "booted_minions", kwargs, "G@roles:kube-minion", booted=True ) if minion_replacement in bootstrapped_minions: warn_or_abort_on_replacement_provided( "will not replace minion %s: the replacement %s is already running a k8s minion", target, minion_replacement, ) minion_replacement = "" elif minion_replacement in masters: warn_or_abort_on_replacement_provided( "will not replace the k8s minion %s: the replacement %s is already a k8s master", target, minion_replacement, ) minion_replacement = "" elif "kube-master" in replacement_roles: warn_or_abort_on_replacement_provided( "will not replace the k8s minion %s: the replacement found/provided, %s, is already scheduled for being a new k8s master", target, minion_replacement, ) minion_replacement = "" if minion_replacement: # once again, check if the new replacement is compatible with any previous one if not replacement: replacement = minion_replacement assert len(replacement) > 0 if replacement == minion_replacement: debug( "setting %s as replacement for the k8s minion %s", replacement, target, ) replacement_roles.append("kube-minion") else: warn( "the k8s minion replacement (%s) is not the same as the current replacement (%s) " + "(it will run %s) so we cannot use it for running the k8s minion too", minion_replacement, replacement, ",".join(replacement_roles), ) if "kube-minion" not in replacement_roles: # stability check: check if it is ok not to run the k8s minion in the replacement if len(minions) <= _MIN_MINIONS_AFTER_REMOVAL: # we need at least one minion (for running dex, kube-dns, etc.) abort( "cannot remove k8s minion %s: too few k8s minions, and no replacement found or provided", target, ) else: warn( "number of k8s minions will be reduced to %d, as no replacement for the k8s minion in %s has been found (or provided)", len(minions), target, ) # other consistency checks... if replacement: # consistency check: if there is a replacement, it must have some (new) role(s) if not replacement_roles: abort("internal error: replacement %s has no roles assigned", replacement) else: # if no valid replacement has been found, clear the roles too replacement_roles = [] return replacement, replacement_roles
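Illustrative only (the node IDs are hypothetical): the function returns a `(replacement, roles)` pair, which a removal orchestration could consume like this:

# Hypothetical usage in a removal orchestration step:
replacement, new_roles = get_replacement_for('master1',
                                             excluded=['node-being-updated'])
if replacement:
    # e.g. replacement == 'worker5', new_roles == ['etcd', 'kube-master']
    debug('assigning roles %s to %s', new_roles, replacement)
else:
    # no replacement: the cluster shrinks, within the minimums checked above
    debug('removing %s without a replacement', 'master1')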
def get_registries_certs(lst, default_port=5000): ''' Given a list of "valid" items, return a dictionary of "<HOST>[:<PORT>]" -> <CERT> "valid" items must be get'able objects, with attributes "url", "cert" and (optional) "mirrors" "url"s can be [<PROTO>://]<HOST>[:<PORT>] ''' certs = {} debug('Finding certificates in: %s', lst) for registry in lst: try: url = registry.get('url') cert = registry.get('cert', '') if cert: # parse the name as a URL or "host:port", and return <HOST>[:<PORT>] hostname, port = _get_hostname_and_port(url) host_port = hostname if port: host_port += ":" + str(port) debug('Adding certificate for: %s', host_port) certs[host_port] = cert if port: if port == default_port: # When using the standard port (5000), if the user introduces # "my-registry:5000" as a trusted registry, they will be able # to do "docker pull my-registry:5000/some/image" but not # "docker pull my-registry/some/image". # So we must also create the "ca.crt" for "my-registry" # as they could just run "docker pull my-registry/some/image", # and Docker would fail to find "my-registry/ca.crt" name = hostname debug( 'Using default port: adding certificate for "%s" too', name) certs[name] = cert else: # the same happens if the user introduced a certificate for # "my-registry": we must fix the "docker pull my-registry:5000/some/image" case... name = hostname + ':' + str(default_port) debug( 'Adding certificate for default port, "%s", too', name) certs[name] = cert except Exception as e: error('Could not parse certificate: %s', e) try: mirrors = registry.get('mirrors', []) if mirrors: debug('Looking recursively for certificates in mirrors') certs_mirrors = get_registries_certs(mirrors, default_port=default_port) certs.update(certs_mirrors) except Exception as e: error('Could not parse mirrors: %s', e) return certs
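Illustrative only (the registry names and certificate contents are hypothetical): given a pillar-style list, the resulting map covers both the explicit "host:port" form and the default-port alias discussed in the comments above.

# Hypothetical input, e.g. from a `registries` pillar entry:
registries = [{
    'url': 'https://my-registry:5000',
    'cert': '-----BEGIN CERTIFICATE-----...',
    'mirrors': [{'url': 'mirror.local',
                 'cert': '-----BEGIN CERTIFICATE-----...'}],
}]
certs = get_registries_certs(registries, default_port=5000)
# certs maps 'my-registry:5000' and its default-port alias 'my-registry'
# to the certificate; the mirror entry is processed recursively the same way.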