def create_infrastructure_impl(self, infrastructure_id, k8s):
    """Create the storage, networks and pods described by ``k8s``.

    ``k8s`` is read with ``.get`` for the optional keys ``storage`` and
    ``networks`` (mappings), ``pods`` (a list) and ``outputs``.

    Nothing is returned. Any exception raised while creating resources is
    logged and reported as a STATUS_FAILED InfrastructureTask through
    ``self.inf_messaging_service``; a K8sApiException with status 409 is
    reported as "Resource already exists".
    """

    def report_failure(failure_details):
        # All failure paths publish the same task shape; only the details differ.
        self.inf_messaging_service.send_infrastructure_task(
            InfrastructureTask(infrastructure_id, infrastructure_id,
                               STATUS_FAILED, failure_details, {}))

    try:
        logger.info('storage=' + str(k8s.get('storage')))
        for storage_name, storage_def in k8s.get('storage', {}).items():
            size = storage_def.get('size', None)
            class_name = storage_def.get('storageClassName', None)
            extra_properties = {}
            if class_name == "hostpath":
                # hostpath-backed storage carries its path as an extra property
                extra_properties['hostpath'] = storage_def.get('hostpath', None)
            self.create_storage(storage_name, size, class_name,
                                infrastructure_id, extra_properties)

        for network_def in k8s.get('networks', {}).values():
            self.create_network(
                infrastructure_id,
                network_def.get('name', None),
                network_def.get('bridge', None),
                network_def.get('subnet', None),
                network_def.get('range_start', None),
                network_def.get('range_end', None))

        # TODO mapping storageClassName to pods - just have one storage class?
        for pod_def in k8s.get('pods', []):
            pod_name = pod_def.get('name', None)
            logger.info('pod_name=' + pod_name)
            self.create_pod(
                pod_name,
                pod_def.get('image', None),
                pod_def.get('container_port', None),
                infrastructure_id,
                # storage_name, storageClassName, storageSize
                pod_def.get('storage', []),
                pod_def.get('network', []))
            # NOTE(review): the nesting of this call was not recoverable from the
            # source layout; it is kept per-pod because it consumes pod_name.
            # Confirm it should not run once after the loop instead.
            self.create_config_map_for_outputs(pod_name, infrastructure_id,
                                               k8s.get('outputs', {}))
    except K8sApiException as e:
        if e.status == 409:
            logger.error('K8s exception1' + str(e))
            report_failure(FailureDetails(FAILURE_CODE_RESOURCE_ALREADY_EXISTS,
                                          "Resource already exists"))
        else:
            logger.error('K8s exception2' + str(e))
            report_failure(FailureDetails(FAILURE_CODE_INTERNAL_ERROR, str(e)))
    except Exception as e:
        logger.error('K8s exception2' + str(e))
        report_failure(FailureDetails(FAILURE_CODE_INTERNAL_ERROR, str(e)))
def handle_request(self, request):
    """Validate a lifecycle request, run its playbook and publish the outcome.

    A request missing ``request_id``, ``lifecycle_name`` or ``driver_files``
    is answered with a single STATUS_FAILED LifecycleExecution and is NOT
    passed to the Ansible client. Otherwise the result returned by
    ``self.ansible_client.run_lifecycle_playbook`` is sent through
    ``self.messaging_service`` (an empty result is only logged).

    Exceptions are caught, logged and reported as a failed execution so the
    worker keeps running. Nothing is returned.
    """

    def reject(request_id, reason):
        self.messaging_service.send_lifecycle_execution(
            LifecycleExecution(
                request_id, STATUS_FAILED,
                FailureDetails(FAILURE_CODE_INTERNAL_ERROR, reason), {}))

    try:
        if request is None:
            logger.warning('Null lifecycle request from request queue')
            return

        if request.get('logging_context', None) is not None:
            logging_context.set_from_dict(request['logging_context'])

        # Each validation failure ends processing: the request cannot be run,
        # and the caller must receive exactly one response for it.
        if 'request_id' not in request:
            reject(None, "Request must have a request_id")
            return
        if 'lifecycle_name' not in request:
            reject(request['request_id'], "Request must have a lifecycle_name")
            return
        if 'driver_files' not in request:
            reject(request['request_id'], "Request must have a driver_files")
            return

        # run the playbook and send the response to the response queue
        logger.debug('Ansible worker running request {0}'.format(request))
        result = self.ansible_client.run_lifecycle_playbook(request)
        if result is not None:
            logger.debug('Ansible worker finished for request {0}: {1}'.format(
                request, result))
            self.messaging_service.send_lifecycle_execution(result)
        else:
            logger.warning(
                "Empty response from Ansible worker for request {0}".format(
                    request))
    except Exception as e:
        logger.error('Unexpected exception {0}'.format(e))
        traceback.print_exc(file=sys.stderr)
        # don't want the worker to die without knowing the cause, so catch all exceptions
        if request is not None:
            # .get: the request may be the very thing that is malformed, and a
            # KeyError raised here would escape the handler.
            reject(request.get('request_id'),
                   "Unexpected exception: {0}".format(e))
    finally:
        # clean up zombie processes (Ansible can leave these behind)
        for p in active_children():
            logger.debug("removed zombie process {0}".format(p.name))
def get_infrastructure(self, infrastructure_id, request_type):
    """Summarise the state of the pods and PVCs labelled with ``infrastructure_id``.

    Args:
        infrastructure_id: value matched against the ``infrastructure_id`` label.
        request_type: ``'CREATE'`` or ``'DELETE'``.

    Returns:
        An InfrastructureTask whose status is derived from the per-resource
        statuses built by ``__build_pod_status`` / ``__build_pvc_status``:

        * CREATE - IN_PROGRESS if any resource is in progress, otherwise
          FAILED if any resource failed, otherwise COMPLETE.
        * DELETE - FAILED if any resource failed, otherwise IN_PROGRESS while
          any matching resource still exists, otherwise COMPLETE.

    Raises:
        ValueError: if ``request_type`` is neither ``'CREATE'`` nor ``'DELETE'``.
    """
    # Fail fast, before any cluster call is made for a request we cannot answer.
    if request_type not in ('CREATE', 'DELETE'):
        raise ValueError("Invalid request_type {0}".format(request_type))

    outputs = {}
    selector = 'infrastructure_id={}'.format(infrastructure_id)

    # The list calls return a list object whose resources live under `.items`.
    pods = self.coreV1Api().list_namespaced_pod(
        namespace=self.namespace(), label_selector=selector)
    statuses = [
        self.__build_pod_status(request_type, pod, outputs)
        for pod in pods.items
    ]
    pvcs = self.coreV1Api().list_namespaced_persistent_volume_claim(
        namespace=self.namespace(), label_selector=selector)
    # extend (not append) so `statuses` stays a flat list of status objects
    statuses.extend(
        self.__build_pvc_status(request_type, pvc, outputs)
        for pvc in pvcs.items)

    # NOTE(review): statuses are read both by key (['status']) and by attribute
    # (.status_reason), as in the original - confirm the status type supports both.
    failed = [s for s in statuses if s['status'] == STATUS_FAILED]
    in_progress = [s for s in statuses if s['status'] == STATUS_IN_PROGRESS]

    status = STATUS_COMPLETE
    failure_details = None
    if failed:
        status = STATUS_FAILED
        failure_details = FailureDetails(FAILURE_CODE_INFRASTRUCTURE_ERROR,
                                         failed[0].status_reason)

    if request_type == 'CREATE':
        # an in-progress resource takes precedence over a failed one
        if in_progress:
            status = STATUS_IN_PROGRESS
    elif not failed and statuses:
        # DELETE: any resource that still exists means deletion has not finished
        # (this also covers every in-progress resource)
        status = STATUS_IN_PROGRESS

    return InfrastructureTask(infrastructure_id, infrastructure_id, status,
                              failure_details, outputs)
def v2_runner_on_failed(self, result, *args, **kwargs):
    """
    Ansible callback invoked when a task fails.

    Note: with a loop (where v2_runner_item_on_failed/v2_runner_item_on_ok fire
    per item) this is still called once at the end, after every item has been
    attempted and at least one has failed.

    A privilege-escalation prompt timeout or an ssh "Host is unreachable" error
    is handed to __handle_unreachable; anything else is recorded as a host
    failure and marks the playbook as failed.
    """
    task_result = result._result
    logger.debug(
        "v2_runner_on_failed: task={0}, result={1}, task_fields={2}".format(
            result._task, task_result, result._task_fields))

    # TODO: handle ignore_errors?
    self.failed_task = result._task.get_name()

    def looks_unreachable():
        if 'msg' in task_result:
            message = task_result['msg']
            if ('Timeout' in message and
                    'waiting for privilege escalation prompt' in message):
                return True
        if 'module_stderr' in task_result:
            stderr = task_result['module_stderr']
            return stderr.startswith('ssh:') and 'Host is unreachable' in stderr
        return False

    if looks_unreachable():
        logger.info('Failure to be treated as unreachable: task ' +
                    str(self.failed_task) + ' failed: ' + str(task_result))
        self.__handle_unreachable(result)
    else:
        self.host_failed = True
        self.failure_reason = ('task ' + str(self.failed_task) + ' failed: ' +
                               str(task_result))
        self.host_failed_log.append(
            dict(task=self.failed_task, result=task_result))
        self.failure_details = FailureDetails(
            FAILURE_CODE_INFRASTRUCTURE_ERROR, self.failure_reason)
        self.playbook_failed = True

    # NOTE(review): placed at function level (runs for both branches); the
    # nesting was not recoverable from the source layout - confirm.
    self._log_event_for_failed_task(result)
def __build_execution_response(self, stack, request_id):
    """Translate a stack description into a LifecycleExecution.

    Args:
        stack: mapping read for ``stack_status``, ``stack_status_reason`` and
            ``outputs``.
        request_id: identifier split by ``__split_request_id`` into a request
            type, stack id and operation id.

    Returns:
        LifecycleExecution carrying the status computed by
        ``__determine_create_status`` / ``__determine_delete_status``, failure
        details when that status is STATUS_FAILED, and translated outputs for
        create requests (``None`` otherwise).
    """
    # the operation id is part of the request id but is not needed here
    request_type, stack_id, _ = self.__split_request_id(request_id)
    stack_status = stack.get('stack_status', None)
    is_create = request_type == CREATE_REQUEST_PREFIX

    if is_create:
        status = self.__determine_create_status(request_id, stack_id,
                                                stack_status)
    else:
        status = self.__determine_delete_status(request_id, stack_id,
                                                stack_status)

    failure_details = None
    if status == STATUS_FAILED:
        failure_details = FailureDetails(
            FAILURE_CODE_INFRASTRUCTURE_ERROR,
            stack.get('stack_status_reason', None))

    outputs = None
    if is_create:
        outputs = self.__translate_outputs_to_values_dict(
            stack.get('outputs', []))

    return LifecycleExecution(request_id,
                              status,
                              failure_details=failure_details,
                              outputs=outputs)
def v2_runner_on_failed(self, result, *args, **kwargs):
    """
    Ansible callback for a failed task.

    Failures that look like connectivity problems (privilege-escalation prompt
    timeout, ssh "Host is unreachable") go to __handle_unreachable; every other
    failure is recorded as a host failure and marks the playbook as failed.
    """
    outcome = result._result
    logger.debug("v2_runner_on_failed {0} {1} {2}".format(
        result._task, outcome, result._task_fields))
    self.failed_task = result._task.get_name()

    escalation_timeout = (
        'msg' in outcome
        and 'Timeout' in outcome['msg']
        and 'waiting for privilege escalation prompt' in outcome['msg'])
    # `or` short-circuits, so module_stderr is only inspected when needed
    unreachable = escalation_timeout or (
        'module_stderr' in outcome
        and outcome['module_stderr'].startswith('ssh:')
        and 'Host is unreachable' in outcome['module_stderr'])

    if unreachable:
        logger.debug('Failure to be treated as unreachable: task ' +
                     str(self.failed_task) + ' failed: ' + str(outcome))
        self.__handle_unreachable(result)
        return

    self.host_failed = True
    self.failure_reason = ('task ' + str(self.failed_task) + ' failed: ' +
                           str(outcome))
    self.host_failed_log.append(dict(task=self.failed_task, result=outcome))
    self.failure_details = FailureDetails(FAILURE_CODE_INFRASTRUCTURE_ERROR,
                                          self.failure_reason)
    self.playbook_failed = True
def __build_infrastructure_response(self, stack):
    """Translate a stack description into an InfrastructureTask.

    Args:
        stack: mapping read for ``id``, ``stack_status``,
            ``stack_status_reason`` and ``outputs``.

    Returns:
        InfrastructureTask whose status is IN_PROGRESS / COMPLETE / FAILED
        according to the create/delete stack status, or STATUS_UNKNOWN for any
        other value. Failure details are set only for failed stacks. Outputs
        are translated unless the stack status is one of the DELETE statuses,
        in which case they are ``None``.
    """
    infrastructure_id = stack.get('id')
    stack_status = stack.get('stack_status', None)
    failure_details = None

    if stack_status in (OS_STACK_STATUS_CREATE_IN_PROGRESS,
                        OS_STACK_STATUS_DELETE_IN_PROGRESS):
        status = STATUS_IN_PROGRESS
    elif stack_status in (OS_STACK_STATUS_CREATE_COMPLETE,
                          OS_STACK_STATUS_DELETE_COMPLETE):
        status = STATUS_COMPLETE
    elif stack_status in (OS_STACK_STATUS_CREATE_FAILED,
                          OS_STACK_STATUS_DELETE_FAILED):
        status = STATUS_FAILED
        failure_details = FailureDetails(
            FAILURE_CODE_INFRASTRUCTURE_ERROR,
            stack.get('stack_status_reason', None))
    else:
        status = STATUS_UNKNOWN
    logger.debug(
        'Stack %s has stack_status %s, setting status in response to %s',
        infrastructure_id, stack_status, status)

    # Anything that is not a delete status (including unknown ones) is treated
    # as a create, so outputs are still reported.
    is_create = stack_status not in (OS_STACK_STATUS_DELETE_IN_PROGRESS,
                                     OS_STACK_STATUS_DELETE_COMPLETE,
                                     OS_STACK_STATUS_DELETE_FAILED)
    outputs = None
    if is_create:
        logger.debug('Stack %s last process is a create', infrastructure_id)
        outputs = self.__translate_outputs_to_values_dict(
            stack.get('outputs', []))

    return InfrastructureTask(infrastructure_id, infrastructure_id, status,
                              failure_details, outputs)
def test_run_lifecycle_missing_lifecycle_name(self):
    """A request without ``lifecycle_name`` yields only a failed execution."""
    # this is needed to ensure logging output appears in test context - see https://stackoverflow.com/questions/7472863/pydev-unittesting-how-to-capture-text-logged-to-a-logging-logger-in-captured-o
    stream_handler.stream = sys.stdout

    request_id = uuid.uuid4().hex
    # deliberately has no 'lifecycle_name' entry
    request_without_lifecycle = {
        'request_id': request_id,
        'driver_files': DirectoryTree(self.tmp_workspace),
        'system_properties': PropValueMap({}),
        'resource_properties': PropValueMap({}),
        'deployment_location': PropValueMap({}),
    }

    handler = AnsibleRequestHandler(self.mock_messaging_service,
                                    self.mock_ansible_client)
    handler.handle_request(request_without_lifecycle)

    expected = LifecycleExecution(
        request_id, STATUS_FAILED,
        FailureDetails(FAILURE_CODE_INTERNAL_ERROR,
                       "Request must have a lifecycle_name"), {})
    self.check_response_only(expected)
def v2_runner_on_ok(self, result, *args, **kwargs):
    """
    Called when task execution completes (called for each host the task executes against)

    Note: even when a loop is used (so v2_runner_item_on_ok is called for each successful item)
    this function is called at the end, when all items have succeeded

    Collects ``ansible_facts`` from the result (or from each entry of its
    ``results`` list). Facts whose key starts with the configured output prefix
    are stored in ``self.properties`` with the prefix stripped; an
    ``associated_topology`` fact is parsed into ``self.associated_topology``,
    and a parse error marks the playbook as failed.
    """
    logger.debug('v2_runner_on_ok: {0}'.format(result))

    def fail(reason):
        self.failure_reason = reason
        self.failure_details = FailureDetails(
            FAILURE_CODE_INFRASTRUCTURE_ERROR, reason)
        self.playbook_failed = True

    if 'results' in result._result:
        # looped task: one entry per item, each may carry its own facts
        self.facts = result._result['results']
        props = [
            item['ansible_facts'] for item in self.facts
            if 'ansible_facts' in item
        ]
    else:
        self.facts = result._result
        props = ([self.facts['ansible_facts']]
                 if 'ansible_facts' in self.facts else [])

    prefix = self.ansible_properties.output_prop_prefix
    for prop in props:
        for key, value in prop.items():
            if key.startswith(prefix):
                output_facts = {key[len(prefix):]: value}
                logger.debug('output props = {0}'.format(output_facts))
                self.properties.update(output_facts)
            elif key == 'associated_topology':
                try:
                    # log the fact value itself; the name used here before was
                    # undefined and made every such fact fail with a NameError
                    logger.info('associated_topology = {0}'.format(value))
                    self.associated_topology = AssociatedTopology.from_dict(
                        value)
                except ValueError as ve:
                    fail(f'An error has occurred while parsing the ansible fact \'{key}\'. {ve}')
                except Exception as e:
                    fail(f'An internal error has occurred. {e}')

    self._log_event_for_ok_task(result)
def job_handler(self, job_definition):
    """Check the status of a monitored lifecycle request and publish it when final.

    Args:
        job_definition: mapping that must contain non-None ``request_id`` and
            ``deployment_location`` entries.

    Returns:
        ``True`` when the job is finished with (discarded as malformed, request
        not found, final status sent, or an unexpected error reported as a
        failure); ``False`` when it should be re-queued (temporary error, or
        the request is not yet COMPLETE/FAILED).
    """
    for required in ('request_id', 'deployment_location'):
        if required not in job_definition or job_definition[required] is None:
            logger.warning(
                'Job with {0} job type is missing {1}. This job has been discarded'
                .format(LIFECYCLE_EXECUTION_MONITOR_JOB_TYPE, required))
            return True

    request_id = job_definition['request_id']
    deployment_location = job_definition['deployment_location']
    try:
        lifecycle_execution_task = self.handler.get_lifecycle_execution(
            request_id, deployment_location)
    except RequestNotFoundError:
        logger.debug(
            'Request with ID {0} not found, the request will no longer be monitored'
            .format(request_id))
        return True
    except TemporaryResourceDriverError as e:
        logger.exception(
            'Temporary error occurred checking status of request with ID {0}. The job will be re-queued: {1}'
            .format(request_id, str(e)))
        return False
    except Exception as e:
        logger.exception(
            'Unexpected error occurred checking status of request with ID {0}. A failure response will be posted and the job will NOT be re-queued: {1}'
            .format(request_id, str(e)))
        lifecycle_execution_task = LifecycleExecution(
            request_id, STATUS_FAILED,
            FailureDetails(FAILURE_CODE_INTERNAL_ERROR, str(e)))
        self.lifecycle_messaging_service.send_lifecycle_execution(
            lifecycle_execution_task)
        return True

    if lifecycle_execution_task.status not in [STATUS_COMPLETE, STATUS_FAILED]:
        return False

    self.lifecycle_messaging_service.send_lifecycle_execution(
        lifecycle_execution_task)
    # post_lifecycle_response is an optional hook on the handler
    if hasattr(self.handler, 'post_lifecycle_response'):
        try:
            # plain str.format: with an f-prefix, "{0}" is evaluated to the
            # literal 0 and the request id is never logged
            logger.debug(
                'Calling post_lifecycle_response for request with ID: {0}'
                .format(request_id))
            self.handler.post_lifecycle_response(request_id,
                                                 deployment_location)
        except Exception as e:
            logger.exception(
                'Unexpected error occurred on post_lifecycle_response for request with ID {0}. This error has no impact on the response: {1}'
                .format(request_id, str(e)))
    return True
def __handle_unreachable(self, result):
    """Record a task failure as an unreachable host and mark the playbook failed."""
    # TODO do not overwrite if already set
    task_name = result._task.get_name()
    self.failed_task = task_name
    self.host_unreachable_log.append(
        dict(task=task_name, result=result._result))
    self.host_unreachable = True

    reason = 'Resource unreachable (task {0} failed: {1})'.format(
        str(task_name), str(result._result))
    self.failure_reason = reason
    self.failure_details = FailureDetails(FAILURE_CODE_RESOURCE_NOT_FOUND,
                                          reason)
    self.playbook_failed = True
def pod_watcher_worker(self):
    """Watch pod events in all namespaces and publish infrastructure updates.

    Runs forever. Each event for a pod carrying an ``infrastructure_id`` label
    is translated into a status; COMPLETE and FAILED statuses are sent as an
    InfrastructureTask through ``self.inf_messaging_service``.

    Any exception is logged and the watch is re-initialised. The restart is a
    loop rather than a recursive call so repeated failures cannot exhaust the
    call stack.
    """
    # TODO loop until close condition is set
    while True:
        try:
            logger.info('Monitoring pods')
            while True:
                # NOTE(review): this is reset whenever the stream is re-opened,
                # so the version recorded below is never reused - confirm
                # whether resuming from it was intended.
                last_seen_version = 0
                # poll forever (timeout == 0)
                for pod_event in self.watcher.stream(
                        self.coreV1Api().list_pod_for_all_namespaces,
                        resource_version=last_seen_version,
                        timeout_seconds=0):
                    if last_seen_version == 0:
                        # track where we are up to in the pod events stream in case we have to restart
                        last_seen_version = pod_event[
                            'object'].metadata.resource_version
                    self._handle_pod_event(pod_event)
        except Exception:
            logger.exception(
                "Unexpected exception watching pods, re-initializing")


def _handle_pod_event(self, pod_event):
    """Publish an InfrastructureTask for one pod event, if the pod is ours and final."""
    event_type = pod_event['type']
    pod = pod_event['object']
    # labels is None for a pod without labels; such pods are simply not ours
    labels = pod.metadata.labels or {}
    infrastructure_id = labels.get('infrastructure_id', None)
    if infrastructure_id is None:
        return

    logging_context.set_from_dict(labels)
    try:
        logger.debug('Got pod event {0}'.format(pod_event))
        pod_status = self.__build_pod_status(event_type, pod, {})
        status, failure_details = self._resolve_pod_status(pod, pod_status)
        if status not in [STATUS_COMPLETE, STATUS_FAILED]:
            return

        outputs = {"host": pod.metadata.name}
        if status == STATUS_COMPLETE:
            self._add_mapped_outputs(pod, infrastructure_id, outputs)

        inf_task = InfrastructureTask(infrastructure_id, infrastructure_id,
                                      status, failure_details, outputs)
        logger.info('Sending infrastructure response {0}'.format(
            str(inf_task)))
        self.inf_messaging_service.send_infrastructure_task(inf_task)
    finally:
        logging_context.clear()


def _resolve_pod_status(self, pod, pod_status):
    """Map the pod phase to a ``(status, failure_details)`` pair."""
    phase = pod.status.phase
    if phase == 'Pending':
        container_statuses = pod.status.container_statuses
        if container_statuses:
            # only the first container is inspected
            waiting = container_statuses[0].state.waiting
            if waiting is not None and waiting.reason in [
                    'ErrImagePull', 'ImagePullBackOff'
            ]:
                return STATUS_FAILED, FailureDetails(
                    FAILURE_CODE_INFRASTRUCTURE_ERROR, 'ErrImagePull')
        return STATUS_IN_PROGRESS, None
    if phase == 'Running':
        return STATUS_COMPLETE, None
    if phase == 'Failed':
        return STATUS_FAILED, FailureDetails(
            FAILURE_CODE_INFRASTRUCTURE_ERROR, pod_status.status_reason)
    # None or any other phase
    return STATUS_UNKNOWN, None


def _add_mapped_outputs(self, pod, infrastructure_id, outputs):
    """Fill ``outputs`` from the ConfigMap named after ``infrastructure_id``.

    Each ConfigMap entry maps an output property name to a key; keys of the
    form ``network.<name>`` are resolved to the first IP reported for that
    network in the pod's ``k8s.v1.cni.cncf.io/networks-status`` annotation.
    A missing ConfigMap is not an error.
    """
    try:
        # try to find the ConfigMap that contains information on output property mappings
        cm = self.coreV1Api().read_namespaced_config_map(
            infrastructure_id, self.namespace())
        logger.info("Got ConfigMap {0} for infrastructure_id {1}".format(
            str(cm), infrastructure_id))
        if cm is None:
            return
        # data is None for a ConfigMap without entries
        for output_prop_name, k8s_key in (cm.data or {}).items():
            logger.info("Output: {0}={1}".format(output_prop_name, k8s_key))
            if not k8s_key.startswith('network.'):
                continue
            k8s_prop_name = k8s_key[len('network.'):]
            logger.info("k8s_prop_name: {0}".format(k8s_prop_name))
            # annotations is None for a pod without annotations
            annotations = pod.metadata.annotations or {}
            networks_status_str = annotations.get(
                'k8s.v1.cni.cncf.io/networks-status', None)
            logger.info('networks_status_str: {0}'.format(
                str(networks_status_str)))
            if networks_status_str is not None:
                for network_status in json.loads(networks_status_str):
                    net_name = network_status.get('name', None)
                    net_ips = network_status.get('ips', {})
                    logger.info('net_name {0}, net_ips {1}'.format(
                        net_name, str(net_ips)))
                    if net_name is not None and len(net_ips) > 0:
                        if net_name == k8s_prop_name:
                            outputs[output_prop_name] = net_ips[0]
            else:
                # NOTE(review): attached to the missing-annotation case; the
                # nesting was not recoverable from the source layout - confirm.
                logger.info(
                    'network status not found for output property {0}'.
                    format(output_prop_name))
    except K8sApiException as e:
        if e.status == 404:
            # ok
            logger.info("Unable to find cm for infrastructure id {0}".format(
                infrastructure_id))
        else:
            # still best-effort, but no longer silent
            logger.warning(
                "Unable to read cm for infrastructure id {0}: {1}".format(
                    infrastructure_id, str(e)))
def run_lifecycle_playbook(self, request):
    """Run the playbook for the requested lifecycle and return its result.

    Args:
        request: mapping with ``driver_files``, ``request_id``,
            ``lifecycle_name``, ``resource_properties``, ``system_properties``
            and ``deployment_location`` (a dict); ``keep_files`` is optional.

    Returns:
        The LifecycleExecution produced by the playbook run, or a
        STATUS_FAILED LifecycleExecution when the deployment location is not a
        dict, the playbook is missing, the request is invalid or an unexpected
        exception occurs.

    The playbook is re-run while the host is reported unreachable, up to
    ``max_unreachable_retries`` attempts (always at least one). Key files are
    cleared afterwards, and the driver files are removed unless
    ``keep_files`` is set.
    """
    driver_files = request['driver_files']
    key_property_processor = None
    # Bound before the try so the except handlers can always build a response,
    # even when the request has no request_id.
    request_id = request.get('request_id')
    try:
        request_id = request['request_id']
        lifecycle = request['lifecycle_name']
        properties = request['resource_properties']
        system_properties = request['system_properties']
        deployment_location = request['deployment_location']
        if not isinstance(deployment_location, dict):
            return LifecycleExecution(
                request_id, STATUS_FAILED,
                FailureDetails(FAILURE_CODE_INTERNAL_ERROR,
                               "Deployment Location must be an object"), {})
        dl_properties = PropValueMap(deployment_location.get('properties', {}))

        config_path = driver_files.get_directory_tree('config')
        scripts_path = driver_files.get_directory_tree('scripts')

        key_property_processor = KeyPropertyProcessor(
            properties, system_properties, dl_properties)

        playbook_path = get_lifecycle_playbook_path(scripts_path, lifecycle)
        if playbook_path is None:
            msg = "No playbook to run at {0} for lifecycle {1} for request {2}".format(
                playbook_path, lifecycle, request_id)
            logger.debug(msg)
            return LifecycleExecution(
                request_id, STATUS_FAILED,
                FailureDetails(FAILURE_CODE_INTERNAL_ERROR, msg), {})

        if not os.path.exists(playbook_path):
            return LifecycleExecution(
                request_id, STATUS_FAILED,
                FailureDetails(FAILURE_CODE_INTERNAL_ERROR,
                               "Playbook path does not exist"), {})

        if deployment_location.get('type') == 'Kubernetes':
            dl_properties['kubeconfig_path'] = self.create_kube_config(
                deployment_location)
            connection_type = "k8s"
            inventory_path = config_path.get_file_path(INVENTORY_K8S)
        else:
            connection_type = "ssh"
            inventory_path = config_path.get_file_path(INVENTORY)

        # process key properties by writing them out to a temporary file and adding an
        # entry to the property dictionary that maps the "[key_name].path" to the key file path
        key_property_processor.process_key_properties()

        logger.debug('config_path = ' + config_path.get_path())
        logger.debug('driver_files = ' + scripts_path.get_path())
        logger.debug("playbook_path=" + playbook_path)
        logger.debug("inventory_path=" + inventory_path)

        all_properties = {
            'properties': properties,
            'system_properties': system_properties,
            'dl_properties': dl_properties
        }

        process_templates(config_path, all_properties)

        # always retry on unreachable
        # At least one attempt is made so there is always a result to return,
        # even if the configured retry count is zero.
        num_attempts = max(1, self.ansible_properties.max_unreachable_retries)
        for i in range(num_attempts):
            if i > 0:
                logger.debug(
                    'Playbook {0}, unreachable retry attempt {1}/{2}'.format(
                        playbook_path, i + 1, num_attempts))
            start_time = datetime.now()
            ret = self.run_playbook(request_id, connection_type,
                                    inventory_path, playbook_path, lifecycle,
                                    all_properties)
            if not ret.host_unreachable:
                break
            if i == num_attempts - 1:
                # no further attempt follows; sleeping would only delay the response
                break
            end_time = datetime.now()
            if self.ansible_properties.unreachable_sleep_seconds > 0:
                # Factor in that the playbook may have taken some time to determine is was unreachable
                # by using the unreachable_sleep_seconds value as a minimum amount of time for the delay
                delta = end_time - start_time
                retry_seconds = max(
                    0, self.ansible_properties.unreachable_sleep_seconds -
                    int(delta.total_seconds()))
                time.sleep(retry_seconds)

        return ret.get_result()
    except InvalidRequestException as ire:
        return LifecycleExecution(
            request_id, STATUS_FAILED,
            FailureDetails(FAILURE_CODE_INTERNAL_ERROR, ire.msg), {})
    except Exception as e:
        logger.exception("Unexpected exception running playbook")
        return LifecycleExecution(
            request_id, STATUS_FAILED,
            FailureDetails(FAILURE_CODE_INTERNAL_ERROR,
                           "Unexpected exception: {0}".format(e)), {})
    finally:
        if key_property_processor is not None:
            key_property_processor.clear_key_files()

        keep_files = request.get('keep_files', False)
        if not keep_files and driver_files is not None:
            try:
                logger.debug(
                    'Attempting to remove lifecycle scripts at {0}'.format(
                        driver_files.root_path))
                driver_files.remove_all()
            except Exception as e:
                logger.exception(
                    'Encountered an error whilst trying to clear out lifecycle scripts directory {0}: {1}'
                    .format(driver_files.root_path, str(e)))
def run_lifecycle_playbook(self, request):
    """Run the playbook for the requested lifecycle and return its result.

    Args:
        request: mapping with ``driver_files``, ``request_id`` and
            ``lifecycle_name``; ``resource_properties``, ``system_properties``,
            ``request_properties``, ``associated_topology`` and ``keep_files``
            are optional. The deployment location is built from the request
            with ``DeploymentLocation.from_request``.

    Returns:
        The LifecycleExecution produced by the playbook run, or a
        STATUS_FAILED LifecycleExecution when the playbook is missing, the
        request is invalid or an unexpected exception occurs.

    The playbook is re-run while the host is reported unreachable, up to
    ``max_unreachable_retries`` attempts (always at least one). The location
    and key files are cleaned up afterwards, and the driver files are removed
    unless ``keep_files`` is set.
    """
    driver_files = request['driver_files']
    key_property_processor = None
    location = None
    # Bound before the try so the except handlers can always build a response,
    # even when the request has no request_id.
    request_id = request.get('request_id')

    try:
        request_id = request['request_id']
        lifecycle = request['lifecycle_name']
        resource_properties = request.get('resource_properties', {})
        system_properties = request.get('system_properties', {})
        request_properties = request.get('request_properties', {})
        associated_topology = request.get('associated_topology', None)

        location = DeploymentLocation.from_request(request)

        config_path = driver_files.get_directory_tree('config')
        scripts_path = driver_files.get_directory_tree('scripts')

        key_property_processor = KeyPropertyProcessor(
            resource_properties, system_properties, location.properties())

        playbook_path = get_lifecycle_playbook_path(scripts_path, lifecycle)
        if playbook_path is None:
            msg = "No playbook to run at {0} for lifecycle {1} for request {2}".format(
                playbook_path, lifecycle, request_id)
            logger.debug(msg)
            return LifecycleExecution(
                request_id, STATUS_FAILED,
                FailureDetails(FAILURE_CODE_INTERNAL_ERROR, msg), {})

        if not os.path.exists(playbook_path):
            return LifecycleExecution(
                request_id, STATUS_FAILED,
                FailureDetails(FAILURE_CODE_INTERNAL_ERROR,
                               "Playbook path does not exist"), {})

        inventory = Inventory(driver_files, location.infrastructure_type)

        # process key properties by writing them out to a temporary file and adding an
        # entry to the property dictionary that maps the "[key_name].path" to the key file path
        key_property_processor.process_key_properties()

        logger.debug(
            f'Handling request {request_id} with config_path: {config_path.get_path()} driver files path: {scripts_path.get_path()} resource properties: {resource_properties} system properties {system_properties} request properties {request_properties}'
        )

        all_properties = self.render_context_service.build(
            system_properties, resource_properties, request_properties,
            location.deployment_location(), associated_topology)

        process_templates(config_path, self.templating, all_properties)

        # always retry on unreachable
        # At least one attempt is made so there is always a result to return,
        # even if the configured retry count is zero.
        num_attempts = max(1, self.ansible_properties.max_unreachable_retries)
        for i in range(num_attempts):
            if i > 0:
                logger.debug(
                    'Playbook {0}, unreachable retry attempt {1}/{2}'.format(
                        playbook_path, i + 1, num_attempts))
            start_time = datetime.now()
            ret = self.run_playbook(request_id, location.connection_type,
                                    inventory.get_inventory_path(),
                                    playbook_path, lifecycle, all_properties)
            if not ret.host_unreachable:
                break
            if i == num_attempts - 1:
                # no further attempt follows; sleeping would only delay the response
                break
            end_time = datetime.now()
            if self.ansible_properties.unreachable_sleep_seconds > 0:
                # Factor in that the playbook may have taken some time to determine is was unreachable
                # by using the unreachable_sleep_seconds value as a minimum amount of time for the delay
                delta = end_time - start_time
                retry_seconds = max(
                    0, self.ansible_properties.unreachable_sleep_seconds -
                    int(delta.total_seconds()))
                time.sleep(retry_seconds)

        return ret.get_result()
    except InvalidRequestException as ire:
        return LifecycleExecution(
            request_id, STATUS_FAILED,
            FailureDetails(FAILURE_CODE_INTERNAL_ERROR, ire.msg), {})
    except Exception as e:
        logger.exception("Unexpected exception running playbook")
        return LifecycleExecution(
            request_id, STATUS_FAILED,
            FailureDetails(FAILURE_CODE_INTERNAL_ERROR,
                           "Unexpected exception: {0}".format(e)), {})
    finally:
        if location is not None:
            location.cleanup()

        if key_property_processor is not None:
            key_property_processor.clear_key_files()

        keep_files = request.get('keep_files', False)
        if not keep_files and driver_files is not None:
            try:
                logger.debug(
                    'Attempting to remove lifecycle scripts at {0}'.format(
                        driver_files.root_path))
                driver_files.remove_all()
            except Exception as e:
                logger.exception(
                    'Encountered an error whilst trying to clear out lifecycle scripts directory {0}: {1}'
                    .format(driver_files.root_path, str(e)))
def handle_request(self, request):
    """Validate a queued lifecycle request and hand it to the lifecycle request handler.

    ``request`` exposes ``partition``, ``offset``, ``request_id`` and
    ``as_new_dict()``. Every field listed below must be present and not None;
    the first one that is missing is logged and reported as a STATUS_FAILED
    LifecycleExecution and processing stops. Otherwise the driver files are
    built into a tree, the property maps and associated topology are wrapped
    in their model types, and the request is passed on.

    Any exception is reported as a failed execution; a failure while sending
    that report is only logged.
    """
    # checked in this order; the first missing field wins
    required_fields = (
        'lifecycle_name',
        'driver_files',
        'system_properties',
        'resource_properties',
        'request_properties',
        'associated_topology',
        'deployment_location',
    )
    try:
        partition = request.partition
        offset = request.offset
        request_as_dict = request.as_new_dict()
        request_id = request_as_dict.get('request_id', None)

        for field in required_fields:
            if field in request_as_dict and request_as_dict[field] is not None:
                continue
            msg = 'Lifecycle request for partition {0} offset {1} is missing {2}.'.format(
                partition, offset, field)
            logger.warning(msg)
            self.messaging_service.send_lifecycle_execution(
                LifecycleExecution(
                    request_id, STATUS_FAILED,
                    FailureDetails(FAILURE_CODE_INTERNAL_ERROR, msg), {}))
            return

        file_name = '{0}'.format(str(uuid.uuid4()))
        request_as_dict['driver_files'] = self.driver_files_manager.build_tree(
            file_name, request_as_dict['driver_files'])
        for props_key in ('resource_properties', 'system_properties',
                          'request_properties'):
            request_as_dict[props_key] = PropValueMap(
                request_as_dict[props_key])
        request_as_dict['associated_topology'] = AssociatedTopology.from_dict(
            request_as_dict['associated_topology'])

        self.lifecycle_request_handler.handle_request(request_as_dict)
    except Exception as e:
        try:
            self.messaging_service.send_lifecycle_execution(
                LifecycleExecution(
                    request.request_id, STATUS_FAILED,
                    FailureDetails(FAILURE_CODE_INTERNAL_ERROR, str(e)), {}))
        except Exception as send_error:
            # just log this and carry on
            logger.exception(
                'Caught exception sending lifecycle response for driver request {0} for topic {1} : {2}'
                .format(request.request_id,
                        self.request_queue_config.topic.name,
                        str(send_error)))
def test_run_lifecycle_with_malformed_associated_topology_in_playbook(self):
    """A playbook referencing an undefined variable fails the 'adopt' lifecycle."""
    # configure so that we can see logging from the code under test
    stream_handler = logging.StreamHandler(sys.stdout)
    logger.addHandler(stream_handler)
    try:
        request_id = uuid.uuid4().hex
        infrastructure_id_1 = uuid.uuid4().hex
        infrastructure_id_2 = uuid.uuid4().hex
        infrastructure_osp_type = 'Openstack'
        infrastructure_k8s_type = 'Kubernetes'

        def string_prop(text):
            return {'value': text, 'type': 'string'}

        properties = PropValueMap({
            'hello_world_private_ip': string_prop('10.220.217.113'),
            'ansible_ssh_user': string_prop('accanto'),
            'ansible_ssh_pass': string_prop('accanto'),
            'ansible_become_pass': string_prop('accanto'),
        })
        system_properties = PropValueMap({})

        associated_topology = AssociatedTopology.from_dict({
            'apache1': {
                'id': infrastructure_id_1,
                'type': infrastructure_osp_type
            },
            'apache2': {
                'id': infrastructure_id_2,
                'type': infrastructure_k8s_type
            }
        })

        resources_dir = (
            str(pathlib.Path(__file__).parent.absolute()) +
            '/../../resources/ansible_with_malformed_associated_topology_in_playbook'
        )
        dst = self.__copy_directory_tree(resources_dir)

        request = {
            'lifecycle_name': 'adopt',
            'driver_files': DirectoryTree(dst),
            'system_properties': system_properties,
            'resource_properties': properties,
            'deployment_location': {
                'name': 'winterfell',
                'type': "Kubernetes",
                'properties': PropValueMap({})
            },
            'associated_topology': associated_topology,
            'keep_files': True,
            'request_id': request_id
        }
        resp = self.ansible_client.run_lifecycle_playbook(request)

        expected = LifecycleExecution(
            request_id, STATUS_FAILED,
            FailureDetails(
                FAILURE_CODE_INFRASTRUCTURE_ERROR,
                "task debug failed: {'msg': \"The task includes an option with an undefined variable. The error was: 'dict object' has no attribute 'wrong'"
            ), {})
        self.assertLifecycleExecutionMatches(resp, expected)
        # keep_files was requested, so the copied tree must still be there
        self.assertTrue(os.path.exists(dst))
    finally:
        logger.removeHandler(stream_handler)