def create_security_group(self, knp, project_id):
    sg_name = ("sg-" + knp['metadata']['namespace'] + "-" +
               knp['metadata']['name'])
    desc = ("Kuryr-Kubernetes Network Policy %s SG" %
            utils.get_res_unique_name(knp))
    try:
        # Create initial security group
        sg = self.os_net.create_security_group(name=sg_name,
                                               project_id=project_id,
                                               description=desc)
        driver_utils.tag_neutron_resources([sg])
        # NOTE(dulek): Neutron populates every new SG with two rules
        #              allowing egress on IPv4 and IPv6. This collides with
        #              how network policies are supposed to work, because
        #              initially even egress traffic should be blocked.
        #              To work around this we will delete those two SG
        #              rules just after creation.
        for sgr in sg.security_group_rules:
            self.os_net.delete_security_group_rule(sgr['id'])
    except (os_exc.SDKException, exceptions.ResourceNotReady) as exc:
        np = utils.get_referenced_object(knp, 'NetworkPolicy')
        if np:
            self.kubernetes.add_event(np, 'FailedToAddSecurityGroup',
                                      f'Adding new security group or '
                                      f'security group rules for '
                                      f'corresponding network policy has '
                                      f'failed: {exc}',
                                      'Warning')
        LOG.exception("Error creating security group for network policy "
                      "%s", knp['metadata']['name'])
        raise

    return sg.id
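# Illustrative sketch (not part of the driver): the approximate shape of the
# two default egress-allow rules Neutron attaches to every new security
# group, which the loop above deletes so the policy SG starts from
# "deny all". The field values below are hypothetical examples.
_default_sg_rules_example = [
    {'id': 'rule-ipv4-egress', 'direction': 'egress', 'ethertype': 'IPv4',
     'protocol': None, 'remote_ip_prefix': None},
    {'id': 'rule-ipv6-egress', 'direction': 'egress', 'ethertype': 'IPv6',
     'protocol': None, 'remote_ip_prefix': None},
]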
def add(self, params):
    kp_name = self._get_obj_name(params)
    timeout = CONF.cni_daemon.vif_annotation_timeout

    # In order to fight race conditions when pods get recreated with the
    # same name (think StatefulSet), we're trying to get pod UID either
    # from the request or the API in order to use it as the ID to compare.
    if 'K8S_POD_UID' not in params.args:
        # CRI doesn't pass K8S_POD_UID, get it from the API.
        pod = self._get_pod(params)
        if not pod:
            raise exceptions.CNIPodGone(kp_name)
        params.args.K8S_POD_UID = pod['metadata']['uid']

    vifs = self._do_work(params, b_base.connect, timeout)

    # NOTE(dulek): Saving containerid to be able to distinguish old DEL
    #              requests that we should ignore. We need a lock to
    #              prevent race conditions and replace whole object in the
    #              dict for multiprocessing.Manager to notice that.
    with lockutils.lock(kp_name, external=True):
        d = self.registry[kp_name]
        d['containerid'] = params.CNI_CONTAINERID
        self.registry[kp_name] = d
        LOG.debug('Saved containerid = %s for CRD %s',
                  params.CNI_CONTAINERID, kp_name)

    # Wait for timeout sec, 1 sec between tries, retry when even one
    # vif is not active.
    @retrying.retry(stop_max_delay=timeout * 1000, wait_fixed=RETRY_DELAY,
                    retry_on_result=utils.any_vif_inactive)
    def wait_for_active(kp_name):
        return self.registry[kp_name]['vifs']

    data = {'metadata': {'name': params.args.K8S_POD_NAME,
                         'namespace': params.args.K8S_POD_NAMESPACE}}
    pod = k_utils.get_referenced_object(data, 'Pod')

    try:
        self.k8s.add_event(pod, 'CNIWaitingForVIFs',
                           f'Waiting for Neutron ports of {kp_name} to '
                           f'become ACTIVE after binding.',
                           component='kuryr-daemon')
        vifs = wait_for_active(kp_name)
    except retrying.RetryError:
        self.k8s.add_event(pod, 'CNITimedOutWaitingForVIFs',
                           f'Timed out waiting for Neutron ports of '
                           f'{kp_name} to become ACTIVE after binding.',
                           'Warning', 'kuryr-daemon')
        raise exceptions.CNINeutronPortActivationTimeout(
            kp_name, self.registry[kp_name]['vifs'])

    return vifs[k_const.DEFAULT_IFNAME]
def _get_vifs_from_registry(self, params, timeout):
    kp_name = self._get_obj_name(params)

    # In case of KeyError retry for `timeout` s, wait 1 s between tries.
    @retrying.retry(stop_max_delay=timeout * 1000, wait_fixed=RETRY_DELAY,
                    retry_on_exception=lambda e: isinstance(
                        e, (KeyError, exceptions.CNIPodUidMismatch)))
    def find():
        d = self.registry[kp_name]
        if d == k_const.CNI_DELETED_POD_SENTINEL:
            # Pod got deleted meanwhile
            raise exceptions.CNIPodGone(kp_name)

        static = d['kp']['spec'].get('podStatic', None)
        uid = d['kp']['spec']['podUid']
        # FIXME(dulek): This is weirdly structured for upgrades support.
        #               If podStatic is not set (KuryrPort created by old
        #               Kuryr version), then on uid mismatch we're fetching
        #               pod from API and check if it's static here. Pods
        #               are quite ephemeral, so will gradually get replaced
        #               after the upgrade and in a while all should have
        #               the field set and the performance penalty should
        #               be resolved. Remove in the future.
        if 'K8S_POD_UID' in params.args and uid != params.args.K8S_POD_UID:
            if static is None:
                pod = self._get_pod(params)
                static = k_utils.is_pod_static(pod)

            # Static pods have mirror pod UID in API, so it's always
            # mismatched. We don't raise in that case. See [1] for more.
            # [1] https://github.com/k8snetworkplumbingwg/multus-cni/
            #     issues/773
            if not static:
                raise exceptions.CNIPodUidMismatch(
                    kp_name, params.args.K8S_POD_UID, uid)
        return d

    try:
        d = find()
        return d['kp'], d['vifs']
    except KeyError:
        data = {'metadata': {'name': params.args.K8S_POD_NAME,
                             'namespace': params.args.K8S_POD_NAMESPACE}}
        pod = k_utils.get_referenced_object(data, 'Pod')
        self.k8s.add_event(pod, 'CNITimeoutKuryrPortRegistry',
                           f'Timed out waiting for Neutron ports to be '
                           f'created for {kp_name}. Check '
                           f'kuryr-controller logs.', 'Warning',
                           'kuryr-daemon')
        raise exceptions.CNIKuryrPortTimeout(kp_name)
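# Minimal standalone sketch of the retry pattern used by find() above,
# assuming only the `retrying` library. `registry_example` and
# `timeout_example` are stand-ins for the plugin's state; RETRY_DELAY in the
# plugin is the per-try wait in milliseconds (1 s between tries, per the
# comment above).
import retrying

registry_example = {}
timeout_example = 5  # seconds

@retrying.retry(stop_max_delay=timeout_example * 1000, wait_fixed=1000,
                retry_on_exception=lambda e: isinstance(e, KeyError))
def find_example(name):
    # Raises KeyError until another process populates the registry;
    # retrying re-calls it every second until the deadline passes and then
    # re-raises the last KeyError, which the caller catches.
    return registry_example[name]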
def create_security_group_rule(body, knp):
    os_net = clients.get_network_client()
    k8s = clients.get_kubernetes_client()

    try:
        params = dict(body)
        if 'ethertype' in params:
            # NOTE(gryf): in openstacksdk, there is ether_type attribute in
            # the security_group_rule object, in CRD we have 'ethertype'
            # instead, just like it was returned by the neutron client.
            params['ether_type'] = params['ethertype']
            del params['ethertype']
        sgr = os_net.create_security_group_rule(**params)
        return sgr.id
    except os_exc.ConflictException as ex:
        if 'quota' in ex.details.lower():
            np = utils.get_referenced_object(knp, 'NetworkPolicy')
            k8s.add_event(np, 'FailedToCreateSecurityGroupRule',
                          f'Creating security group rule for corresponding '
                          f'Network Policy has failed: {ex}',
                          'Warning')
            LOG.error("Failed to create security group rule %s: %s", body,
                      ex.details)
            raise
        else:
            LOG.debug("Failed to create already existing security group "
                      "rule %s", body)
            # Get existent sg rule id from exception message
            return str(ex).split()[-1][:-1]
    except os_exc.SDKException as exc:
        np = utils.get_referenced_object(knp, 'NetworkPolicy')
        k8s.add_event(np, 'FailedToCreateSecurityGroupRule',
                      f'Creating security group rule for corresponding '
                      f'Network Policy has failed: {exc}',
                      'Warning')
        LOG.debug("Error creating security group rule")
        raise
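# Hypothetical usage sketch (values are illustrative, not from the source):
# shows the 'ethertype' -> 'ether_type' key translation performed above
# before calling openstacksdk, and how the existing-rule id is recovered
# from a Neutron conflict message of the form
# "Security group rule already exists. Rule id is <uuid>.", which is what
# the `str(ex).split()[-1][:-1]` expression relies on.
body_example = {'security_group_id': 'SG_ID', 'direction': 'ingress',
                'ethertype': 'IPv4', 'protocol': 'tcp',
                'port_range_min': 80, 'port_range_max': 80}
params_example = dict(body_example)
if 'ethertype' in params_example:
    params_example['ether_type'] = params_example.pop('ethertype')
# params_example is now suitable for os_net.create_security_group_rule(**...)

msg_example = ('Security group rule already exists. '
               'Rule id is 2bc0accf-312e-429a-956e-e4407625eb62.')
existing_id = msg_example.split()[-1][:-1]  # strips the trailing '.'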
def delete_security_group_rule(security_group_rule_id, knp):
    os_net = clients.get_network_client()
    k8s = clients.get_kubernetes_client()

    try:
        LOG.debug("Deleting sg rule with ID: %s", security_group_rule_id)
        os_net.delete_security_group_rule(security_group_rule_id)
    except os_exc.SDKException as exc:
        np = utils.get_referenced_object(knp, 'NetworkPolicy')
        k8s.add_event(np, 'FailedToDeleteSecurityGroupRule',
                      f'Deleting security group rule for corresponding '
                      f'Network Policy has failed: {exc}',
                      'Warning')
        LOG.debug("Error deleting security group rule: %s",
                  security_group_rule_id)
        raise
def _do_work(self, params, fn, timeout):
    kp_name = self._get_obj_name(params)

    # In case of KeyError retry for `timeout` s, wait 1 s between tries.
    @retrying.retry(stop_max_delay=timeout * 1000, wait_fixed=RETRY_DELAY,
                    retry_on_exception=lambda e: isinstance(e, KeyError))
    def find():
        return self.registry[kp_name]

    try:
        d = find()
        kp = d['kp']
        vifs = d['vifs']
    except KeyError:
        data = {'metadata': {'name': params.args.K8S_POD_NAME,
                             'namespace': params.args.K8S_POD_NAMESPACE}}
        pod = k_utils.get_referenced_object(data, 'Pod')
        self.k8s.add_event(pod, 'CNITimeoutKuryrPortRegistry',
                           f'Timed out waiting for Neutron ports to be '
                           f'created for {kp_name}. Check '
                           f'kuryr-controller logs.', 'Warning')
        raise exceptions.CNIKuryrPortTimeout(kp_name)

    for ifname, vif in vifs.items():
        is_default_gateway = (ifname == k_const.DEFAULT_IFNAME)
        if is_default_gateway:
            # NOTE(ygupta): if this is the default interface, we should
            # use the ifname supplied in the CNI ADD request
            ifname = params.CNI_IFNAME

        fn(vif, self._get_inst(kp), ifname, params.CNI_NETNS,
           report_health=self.report_drivers_health,
           is_default_gateway=is_default_gateway,
           container_id=params.CNI_CONTAINERID)
    return vifs
def _patch_kuryrnetworkpolicy_crd(self, knp, field, data, action='replace'):
    name = knp['metadata']['name']
    LOG.debug('Patching KuryrNetworkPolicy CRD %s', name)
    try:
        status = self.k8s.patch_crd(field, utils.get_res_link(knp), data,
                                    action=action)
    except exceptions.K8sResourceNotFound:
        LOG.debug('KuryrNetworkPolicy CRD not found %s', name)
        return None
    except exceptions.K8sClientException as exc:
        np = utils.get_referenced_object(knp, 'NetworkPolicy')
        self.k8s.add_event(np, 'FailedToPatchKuryrNetworkPolicy',
                           f'Failed to update KuryrNetworkPolicy CRD: '
                           f'{exc}', 'Warning')
        LOG.exception('Error updating KuryrNetworkPolicy CRD %s', name)
        raise

    knp['status'] = status
    return knp
def on_finalize(self, knp, *args, **kwargs):
    LOG.debug("Finalizing KuryrNetworkPolicy %s", knp)
    project_id = self._drv_project.get_project(knp)
    pods_to_update = self._drv_policy.affected_pods(knp)
    crd_sg = knp['status'].get('securityGroupId')
    try:
        policy = self._get_networkpolicy(knp['metadata']['annotations']
                                         ['networkPolicyLink'])
    except exceptions.K8sResourceNotFound:
        # NP is already gone, let's just try to clean up.
        policy = None

    if crd_sg:
        for pod in pods_to_update:
            if (utils.is_host_network(pod) or
                    not driver_utils.is_pod_scheduled(pod)):
                continue
            pod_sgs = self._drv_pod_sg.get_security_groups(pod, project_id)
            if crd_sg in pod_sgs:
                pod_sgs.remove(crd_sg)
            if not pod_sgs:
                pod_sgs = CONF.neutron_defaults.pod_security_groups
                if not pod_sgs:
                    raise cfg.RequiredOptError(
                        'pod_security_groups',
                        cfg.OptGroup('neutron_defaults'))
            try:
                self._drv_vif_pool.update_vif_sgs(pod, pod_sgs)
            except os_exc.NotFoundException:
                # Pod got deleted in the meanwhile, safe to ignore.
                pass

        # ensure ports at the pool don't have the NP sg associated
        try:
            net_id = self._get_policy_net_id(knp)
            self._drv_vif_pool.remove_sg_from_pools(crd_sg, net_id)
        except exceptions.K8sResourceNotFound:
            # Probably the network got removed already, we can ignore it.
            pass

        try:
            self._drv_policy.delete_np_sg(crd_sg)
        except os_exc.SDKException as exc:
            np = utils.get_referenced_object(knp, 'NetworkPolicy')
            if np:
                self.k8s.add_event(np, 'FailedToRemoveSecurityGroup',
                                   f'Deleting security group for '
                                   f'corresponding Network Policy has '
                                   f'failed: {exc}', 'Warning')
            raise

        if (CONF.octavia_defaults.enforce_sg_rules and policy and
                not self._is_egress_only_policy(policy)):
            services = driver_utils.get_services(
                knp['metadata']['namespace'])
            for svc in services.get('items'):
                if (not svc['spec'].get('selector') or not
                        self._is_service_affected(svc, pods_to_update)):
                    continue

                sgs = self._drv_svc_sg.get_security_groups(svc, project_id)

                if crd_sg in sgs:
                    # Remove our crd_sg out of service groups since we
                    # don't have it anymore
                    sgs.remove(crd_sg)

                try:
                    self._drv_lbaas.update_lbaas_sg(svc, sgs)
                except exceptions.ResourceNotReady:
                    # We can ignore LB that's being created - its SGs will
                    # get handled when members will be getting created.
                    pass

    LOG.debug("Removing finalizers from KuryrNetworkPolicy and "
              "NetworkPolicy.")
    if policy:
        self.k8s.remove_finalizer(policy, constants.NETWORKPOLICY_FINALIZER)
    self.k8s.remove_finalizer(knp, constants.NETWORKPOLICY_FINALIZER)
def add(self, params):
    kp_name = self._get_obj_name(params)
    timeout = CONF.cni_daemon.vif_annotation_timeout

    # Try to confirm if CRD in the registry is not stale cache. If it is,
    # remove it.
    with lockutils.lock(kp_name, external=True):
        if kp_name in self.registry:
            cached_kp = self.registry[kp_name]['kp']
            try:
                kp = self.k8s.get(k_utils.get_res_link(cached_kp))
            except Exception:
                LOG.exception('Error when getting KuryrPort %s', kp_name)
                raise exceptions.ResourceNotReady(kp_name)

            if kp['metadata']['uid'] != cached_kp['metadata']['uid']:
                LOG.warning('Stale KuryrPort %s detected in cache. (API '
                            'uid=%s, cached uid=%s). Removing it from '
                            'cache.', kp_name, kp['metadata']['uid'],
                            cached_kp['metadata']['uid'])
                del self.registry[kp_name]

    vifs = self._do_work(params, b_base.connect, timeout)

    # NOTE(dulek): Saving containerid to be able to distinguish old DEL
    #              requests that we should ignore. We need a lock to
    #              prevent race conditions and replace whole object in the
    #              dict for multiprocessing.Manager to notice that.
    with lockutils.lock(kp_name, external=True):
        d = self.registry[kp_name]
        d['containerid'] = params.CNI_CONTAINERID
        self.registry[kp_name] = d
        LOG.debug('Saved containerid = %s for CRD %s',
                  params.CNI_CONTAINERID, kp_name)

    # Wait for timeout sec, 1 sec between tries, retry when even one
    # vif is not active.
    @retrying.retry(stop_max_delay=timeout * 1000, wait_fixed=RETRY_DELAY,
                    retry_on_result=utils.any_vif_inactive)
    def wait_for_active(kp_name):
        return self.registry[kp_name]['vifs']

    data = {'metadata': {'name': params.args.K8S_POD_NAME,
                         'namespace': params.args.K8S_POD_NAMESPACE}}
    pod = k_utils.get_referenced_object(data, 'Pod')

    try:
        self.k8s.add_event(pod, 'CNIWaitingForVIFs',
                           f'Waiting for Neutron ports of {kp_name} to '
                           f'become ACTIVE after binding.')
        vifs = wait_for_active(kp_name)
    except retrying.RetryError:
        self.k8s.add_event(pod, 'CNITimedOutWaitingForVIFs',
                           f'Timed out waiting for Neutron ports of '
                           f'{kp_name} to become ACTIVE after binding.',
                           'Warning')
        raise exceptions.CNINeutronPortActivationTimeout(
            kp_name, self.registry[kp_name]['vifs'])

    return vifs[k_const.DEFAULT_IFNAME]