def validate_running_pod(self) -> None:
    """
    validate_running_pod

    Check whether the Pod we previously started is still running. If
    not, assume the job was killed without being processed by the
    operator (or was never started) and clean up. Mark as failed.

    If Pod is still running, update the status details.

    Returns:
    - None if no pod is expected
    - ProcessingComplete exception if pod is expected but not running
    - ProcessingComplete exception if pod is expected and is running
    """
    # TODO: what if a pod is running, but the operator doesn't expect one?
    curpod = self.get_status('pod')
    curitem = self.get_status('currently_running')
    if curpod:
        try:
            pod = Pod.objects(
                self.api,
                namespace=self.namespace).get_by_name(curpod).obj
        except pykube.exceptions.ObjectDoesNotExist:
            # pod is gone: clear the running state and mark the item
            # as failed so it can be retried later
            self.info(f'pod {curpod} missing/deleted, cleaning up')
            self.set_status('currently_running')
            self.set_status('pod')
            self.set_status('state', 'missing')
            self.items.mark_failed(curitem)
            self.items.set_item_status(curitem, 'pod_detail')
            raise ProcessingComplete(
                info='Cleaned up missing/deleted item')

        podphase = pod.get('status', {}).get('phase', 'unknown')
        self.info(f'validated that pod {curpod} is '
                  f'still running (phase={podphase})')

        recorded_phase = self.items.status(curitem, 'podphase', 'unknown')

        # valid phases are Pending, Running, Succeeded, Failed, Unknown
        # 'started' is the phase the pods start with when created by
        # operator.
        if recorded_phase in ('started', 'Pending', 'Running', 'Failed'):
            self.info(f'item {curitem} status for '
                      f'{curpod}: {recorded_phase}')
            # fix: pure f-string (original mixed an f-string with
            # %-formatting: f'item {curitem} %s' % ...)
            raise ProcessingComplete(
                message=f'item {curitem} {recorded_phase.lower()}')
        if recorded_phase == 'Succeeded':
            # pod finished but the success handler has not yet
            # acknowledged it
            self.info(f'item {curitem} podphase={recorded_phase} but '
                      f'not yet acknowledged: {curpod}')
            raise ProcessingComplete(message=f'item {curitem} succeeded, '
                                     'awaiting acknowledgement')
        raise ProcessingComplete(
            error=f'item {curitem} unexpected state: '
                  f'recorded_phase={recorded_phase}, '
                  f'status={str(self.status)}',
            message=f'item {curitem} unexpected state')
def get_kubeobj(self, reason: str = None): """Get the kube object for the overseen object.""" namespace = self.namespace if self.namespace else pykube.all if self.my_pykube_objtype is None: raise ProcessingComplete( message='inheriting class must set self.my_pykube_objtype') try: return (self.my_pykube_objtype.objects( self.api, namespace=namespace).get_by_name(self.name)) except pykube.exceptions.ObjectDoesNotExist as exc: raise ProcessingComplete( error=f'cannot find Object {self.name} ' + f'to {reason}' if reason else '' + f': {exc}', message=f'cannot retrieve "{self.name}" object')
def validate_expected_pod_is_running(self) -> None: """ validate_expected_pod_is_running Validate that the pod which we expect should be running (based on `oaatgroup` status `pod` and `currently_running`) Check whether the Pod we previously started is still running. If not, assume the job was killed without being processed by the operator (or was never started) and clean up. Mark as failed. Returns: - ProcessingComplete exception: - Cleaned up missing/deleted item - Pod exists and is in state: <state> """ curpod = self.get_status('pod') curitem = self.get_status('currently_running') try: pod = Pod.objects(self.api, namespace=self.namespace).get_by_name(curpod).obj except pykube.exceptions.ObjectDoesNotExist: self.info(f'pod {curpod} missing/deleted, cleaning up') self.set_status('currently_running') self.set_status('pod') self.set_status('state', 'missing') self.items.mark_failed(curitem) self.items.set_item_status(curitem, 'pod_detail') raise ProcessingComplete( message=f'item {curitem} failed during validation', info='Cleaned up missing/deleted item') podphase = pod.get('status', {}).get('phase', 'unknown') self.info(f'validated that pod {curpod} exists ' f'(phase={podphase})') recorded_phase = self.items.status(curitem, 'podphase', 'unknown') # if there is a mismatch in phase, then the pod phase handlers # have not yet picked it up and updated the oaatgroup phase. # Note it here, but take no further action if podphase != recorded_phase: self.info(f'mismatch in phase for pod {curpod}: ' f'pod={podphase}, oaatgroup={recorded_phase}') # valid phases are Pending, Running, Succeeded, Failed, Unknown # 'started' is the phase the pods start with when created by # operator. raise ProcessingComplete( message=f'Pod {curpod} exists and is in state {podphase}')
def validate_state(self) -> None: """ validate_state "pod" and "currently_running" should both be None or both be set. If they are out of sync, then our state is inconsistent. This should only happen in unusual situations such as the oaat-operator being killed while starting a pod. TODO: currently just resets both to None, effectively ignoring the result of a running pod. Ideally, we should validate the status of the pod and clean up. """ curpod = self.get_status('pod') curitem = self.get_status('currently_running') if curpod is None and curitem is None: return None if curpod is not None and curitem is not None: return None self.set_status('currently_running') self.set_status('pod') raise ProcessingComplete(state='inconsistent state', message='internal error', error=(f'inconsistent state detected. ' f'pod ({curpod}) is inconsistent ' f'with currently_running ({curitem})'))
def podspec(self) -> dict: """Retrieve Pod specification from this OaatType.""" if not self.valid: raise ProcessingComplete(message='OaatType invalid', error=f'cannot find OaatType {self.name}') msg = 'error in OaatType definition' spec = self.obj.get('spec') if spec is None: raise ProcessingComplete( message=msg, error='missing spec in OaatType definition') if spec.get('type', '') not in ('pod',): raise ProcessingComplete(message=msg, error='spec.type must be "pod"') podspec = spec.get('podspec') if not podspec: raise ProcessingComplete(message=msg, error='spec.podspec is missing') if podspec.get('containers'): raise ProcessingComplete( message=msg, error='currently only support a single container, ' 'please do not use "spec.podspec.containers"') if not podspec.get('container'): raise ProcessingComplete( message=msg, error='spec.podspec.container is missing') if podspec.get('restartPolicy'): raise ProcessingComplete( message=msg, error='for spec.type="pod", you cannot specify ' 'a restartPolicy') return podspec
def delete(self) -> None: myobj = self.get_kubeobj('delete it') try: myobj.delete(propagation_policy='Background') self.debug(f'delete of {self.name} successful') except pykube.exceptions.KubernetesError as exc: raise ProcessingComplete( error=f'cannot delete Object {self.name}: {exc}', message=f'cannot delete "{self.name}" object')
def run_item(self, item_name) -> dict: """ run_item Execute an item job Pod with the spec details from the appropriate OaatType object. """ # TODO: check oaatType spec = self.oaattype.podspec() contspec = spec['container'] del spec['container'] contspec.setdefault('env', []).append({ 'name': 'OAAT_ITEM', 'value': item_name }) for idx in range(len(contspec.get('command', []))): contspec['command'][idx] = (contspec['command'][idx].replace( '%%oaat_item%%', item_name)) for idx in range(len(contspec.get('args', []))): contspec['args'][idx] = (contspec['args'][idx].replace( '%%oaat_item%%', item_name)) for env in contspec['env']: env['value'] = (env.get('value', '').replace('%%oaat_item%%', item_name)) # TODO: currently only supports a single container. Do we want # multi-container? doc = { 'apiVersion': 'v1', 'kind': 'Pod', 'metadata': { 'generateName': self.name + '-' + item_name + '-', 'labels': { 'parent-name': self.name, 'oaat-name': item_name, 'app': 'oaat-operator' } }, 'spec': { 'containers': [contspec], **spec, 'restartPolicy': 'Never' }, } kopf.adopt(doc) pod = Pod(self.api, doc) try: pod.create() except pykube.exceptions.KubernetesError as exc: self.items.mark_failed(item_name) raise ProcessingComplete( error=f'could not create pod {doc}: {exc}', message=f'error creating pod for {item_name}') return pod
def validate_oaat_type(self) -> None: """ validate_oaat_type Ensure the group refers to an appropriate OaatType object. """ if self.oaattype.valid: self.info('found valid oaat type') return None self.set_annotation('operator-status', 'missingOaatType') raise ProcessingComplete( message='error in OaatGroup definition', error=f'unknown oaat type {self.oaattypename}')
def get_oaattype(self) -> KubeOaatType: """Retrieve the OaatType object.""" if self.name is None: return None try: return ( KubeOaatType .objects(self.api, namespace=self.namespace) .get_by_name(self.name) .obj) except pykube.exceptions.ObjectDoesNotExist as exc: raise ProcessingComplete( error=( f'cannot find OaatType {self.namespace}/{self.name}: ' f'{exc}'), message=f'error retrieving "{self.name}" OaatType object')
def validate_no_rogue_pods_are_running(self) -> None: found_rogue = 0 for pod in Pod.objects(self.api, namespace=self.namespace).iterator(): if pod.name == self.get_status('pod'): continue if pod.labels.get('parent-name', '') == self.name: if pod.labels.get('app', '') == 'oaat-operator': podphase = (pod.obj['status'].get('phase', 'unknown')) if podphase in ['Running', 'Pending']: self.warning( f'rogue pod {pod.name} found (phase={podphase})') found_rogue += 1 if found_rogue > 0: raise ProcessingComplete( message='rogue pods running', error=f'found {found_rogue} rogue pods running')
def validate_items(self, status_annotation=None, count_annotation=None) -> None: """ validate_items Ensure there are oaatItems to process. """ if not len(self.items): if status_annotation: self.set_annotation(status_annotation, 'missingItems') raise ProcessingComplete(state='nothing to do', error='error in OaatGroup definition', message=f'no items found. ' f'Please set "oaatItems" in {self.name}') # we have oaatItems, so mark the object as "active" (via annotation) if status_annotation: self.set_annotation(status_annotation, 'active') if count_annotation: self.set_annotation(count_annotation, value=len(self.items))
def create_action(**kwargs):
    """Exercise the Overseer API against a live object (kopf handler).

    Runs a numbered sequence of checks against an Overseer constructed
    from the kopf handler kwargs; each check logs '[N] successful' via
    the kopf logger. Returns a summary string on success.
    """
    # [1] Overseer should raise ValueError if kwargs are not passed
    try:
        Overseer()
    except ValueError as exc:
        assert re.search('Overseer must be called with full kopf kwargs',
                         str(exc)), exc
    kwargs['logger'].debug('[1] successful')

    pov = Overseer(**kwargs)
    # [2] error
    pov.error('[2] error message')
    # [3] warning
    pov.warning('[3] warning message')
    # [4] info
    pov.info('[4] info message')
    # [5] debug
    pov.debug('[5] debug message')

    # [6] get_status — unset keys return None or the supplied default
    assert pov.get_status('unset_status') is None
    assert pov.get_status('unset_status', 'empty') == 'empty'
    # set_status
    pov.set_status('new_status')
    pov.set_status('new_status2', 'new_state')

    # [7] get_label — missing labels return None/default; existing
    # labels return their value regardless of default
    assert pov.get_label('nolabel') is None
    assert pov.get_label('nolabel', 'empty') == 'empty'
    assert pov.get_label('testlabel') == 'labelvalue'
    assert pov.get_label('testlabel', 'empty') == 'labelvalue'

    # [8] get_kubeobj without my_pykube_objtype
    try:
        pov.get_kubeobj()
    except ProcessingComplete as exc:
        assert (str(exc) ==
                'inheriting class must set self.my_pykube_objtype'), exc
    kwargs['logger'].debug('[8] successful')

    # [9] get_kubeobj missing object
    savename = pov.name
    pov.name = 'badname'
    pov.my_pykube_objtype = Pod
    try:
        pov.get_kubeobj()
    except ProcessingComplete as exc:
        assert str(exc) == 'cannot retrieve "badname" object', exc
    kwargs['logger'].debug('[9] successful')
    pov.name = savename

    # [10] get_kubeobj sunny day
    kobj = pov.get_kubeobj('examine it')
    kwargs['logger'].debug(f'kubeobj.metadata: {kobj.metadata}')
    assert kobj.metadata['name'] == kwargs['name']
    kwargs['logger'].debug('[10] successful')

    # [11] set_annotation
    pov.set_annotation('testannotation')
    pov.set_annotation('new_annotation', 'annotation_value')

    # [12] handle_processing_complete — all fields set; the returned
    # dict carries the message through
    try:
        raise ProcessingComplete(state='retstate',
                                 info='retinfo',
                                 error='reterror',
                                 warning='retwarning',
                                 message='retmessage')
    except ProcessingComplete as exc:
        assert (pov.handle_processing_complete(exc).get('message') ==
                'retmessage'), exc
    kwargs['logger'].debug('[12] successful')

    # [13] handle_processing_complete with no fields returns None
    try:
        raise ProcessingComplete()
    except ProcessingComplete as exc:
        assert pov.handle_processing_complete(exc) is None, exc
    kwargs['logger'].debug('[13] successful')

    pov.debug('about to complete')
    return 'all overseer tests successful'
def find_job_to_run(self) -> str:
    """
    find_job_to_run

    Find the best item job to run based on last success and failure
    times.

    Basic algorithm:
    - phase one: choose valid item candidates:
        - start with a list of all possible items to run
        - remove from the list items which have been successful within
          the period in the 'frequency' setting
        - remove from the list items which have failed within the
          period in the 'failureCoolOff' setting
    - phase two: choose the item to run from the valid item candidates:
        - if there is just one item, choose it
        - find the item with the oldest success (or has never succeeded)
        - if there is just one item that is 'oldest', choose it
        - of the items with the oldest success, find the item with the
          oldest failure
        - if there is just one item that has both the oldest success
          and the oldest failure, choose it
        - choose at random (this is likely to occur if no items have
          been run - i.e. first iteration)

    Returns:
        The name of the item to run.

    Raises:
        ProcessingComplete: if there are no items, or no item is due.
    """
    now = oaatoperator.utility.now()

    # Phase One: Choose valid item candidates
    oaat_items = self.items.list()
    item_status = {item['name']: 'candidate' for item in oaat_items}
    if not oaat_items:
        raise ProcessingComplete(
            message='error in OaatGroup definition',
            error='no items found. please set "oaatItems"')
    self.debug('oaat_items: ' +
               ', '.join([i['name'] for i in oaat_items]))

    # Filter out items which have been recently successful
    self.debug(f'frequency: {self.freq}s')
    self.debug(f'now: {now}')
    self.debug(f'cool_off: {self.cool_off}')
    candidates = []
    for item in oaat_items:
        if now > item['success'] + self.freq:
            candidates.append(item)
            item_status[item['name']] = (
                f'not successful within last freq ({self.freq})')
        else:
            item_status[item['name']] = (
                f'successful within last freq ({self.freq})')
    self.debug('Valid, based on success: ' +
               ', '.join([i['name'] for i in candidates]))

    # Filter out items which have failed within the cool off period.
    # BUG FIX: the original had no loop here — it only examined the
    # 'item' variable leaked from the success-filter loop above, so at
    # most the last-listed item was ever cool-off filtered. Iterate a
    # copy of candidates because we remove from the list as we go.
    if self.cool_off is not None:
        for item in candidates.copy():
            self.debug(f'testing {item["name"]} - '
                       f'now: {now}, '
                       f'failure: {item["failure"]}, '
                       f'cool_off: {self.cool_off}'
                       f'test: {now < item["failure"] + self.cool_off}')
            if now < item['failure'] + self.cool_off:
                candidates.remove(item)
                item_status[item['name']] = (
                    f'cool_off ({self.cool_off}) not expired since '
                    f'last failure')
    self.debug('Valid, based on success and failure cool off: ' +
               ', '.join([i['name'] for i in candidates]))

    self.info('item status (* = candidate):\n' +
              '\n'.join([('* ' if i in candidates else '- ') +
                         f'{i["name"]} ' +
                         f'{item_status[i["name"]]} ' +
                         f'success={i["success"].isoformat()}, ' +
                         f'failure={i["failure"].isoformat()}, ' +
                         f'numfails={i["numfails"]}'
                         for i in oaat_items]))

    if not candidates:
        self.set_status('state', 'idle')
        raise ProcessingComplete(message='not time to run next item')

    # return single candidate if there is only one left
    if len(candidates) == 1:
        return candidates[0]['name']

    # Phase 2: Choose the item to run from the valid item candidates
    # Get all items which are "oldest"
    oldest_success_time = min([t['success'] for t in candidates])
    oldest_success_items = [
        item for item in candidates
        if item['success'] == oldest_success_time
    ]

    # FIX: the original string was missing the 'f' prefix, so
    # {oldest_success_time} was logged literally
    self.debug(f'oldest_items {oldest_success_time}: ' +
               ', '.join([i['name'] for i in oldest_success_items]))

    if len(oldest_success_items) == 1:
        return oldest_success_items[0]['name']

    # More than one item "equally old" success. Choose based on
    # last failure (but only if there has been a failure for the item)
    failure_items = [
        item for item in oldest_success_items if item['numfails'] > 0
    ]
    if len(failure_items) == 0:
        # nothing has failed
        remaining_items = oldest_success_items
    else:
        oldest_failure_time = min(
            [item['failure'] for item in failure_items])
        self.debug(f'oldest_failure_time: {oldest_failure_time}')
        oldest_failure_items = [
            item for item in oldest_success_items
            if item['failure'] == oldest_failure_time
        ]
        self.debug('oldest_failure_items: ' +
                   ', '.join([i['name'] for i in oldest_failure_items]))
        if len(oldest_failure_items) == 1:
            return oldest_failure_items[0]['name']
        remaining_items = oldest_failure_items

    # more than one "equally old" failure. Choose at random
    return remaining_items[randrange(
        len(remaining_items))]['name']  # nosec