def _create_nodes_async(self, *args, cluster_name, cluster_vdc_href,
                        cluster_vapp_href, cluster_id, template_name,
                        template_revision, num_workers, network_name,
                        num_cpu, mb_memory, storage_profile_name,
                        ssh_key_filepath, enable_nfs, rollback):
    """Create worker or NFS node(s) and add them to the cluster vApp.

    Worker nodes are joined to the cluster after creation. On
    NodeCreationError, the newly created nodes are deleted if rollback
    is True. Task progress is reported through self._update_task.
    """
    org = vcd_utils.get_org(self.tenant_client)
    vdc = VDC(self.tenant_client, href=cluster_vdc_href)
    vapp = VApp(self.tenant_client, href=cluster_vapp_href)
    template = get_template(name=template_name, revision=template_revision)
    msg = f"Creating {num_workers} node(s) from template " \
          f"'{template_name}' (revision {template_revision}) and " \
          f"adding to {cluster_name} ({cluster_id})"
    LOGGER.debug(msg)
    try:
        self._update_task(TaskStatus.RUNNING, message=msg)

        node_type = NodeType.WORKER
        if enable_nfs:
            node_type = NodeType.NFS

        server_config = utils.get_server_runtime_config()
        catalog_name = server_config['broker']['catalog']
        new_nodes = add_nodes(client=self.tenant_client,
                              num_nodes=num_workers,
                              node_type=node_type,
                              org=org,
                              vdc=vdc,
                              vapp=vapp,
                              catalog_name=catalog_name,
                              template=template,
                              network_name=network_name,
                              num_cpu=num_cpu,
                              memory_in_mb=mb_memory,
                              storage_profile=storage_profile_name,
                              ssh_key_filepath=ssh_key_filepath)

        if node_type == NodeType.NFS:
            self._update_task(
                TaskStatus.SUCCESS,
                message=f"Created {num_workers} node(s) for "
                        f"{cluster_name}({cluster_id})")
        elif node_type == NodeType.WORKER:
            self._update_task(
                TaskStatus.RUNNING,
                message=f"Adding {num_workers} node(s) to cluster "
                        f"{cluster_name}({cluster_id})")
            target_nodes = [spec['target_vm_name']
                            for spec in new_nodes['specs']]
            vapp.reload()
            join_cluster(vapp, template[LocalTemplateKey.NAME],
                         template[LocalTemplateKey.REVISION], target_nodes)
            self._update_task(
                TaskStatus.SUCCESS,
                message=f"Added {num_workers} node(s) to cluster "
                        f"{cluster_name}({cluster_id})")
    except NodeCreationError as e:
        if rollback:
            msg = f"Error adding nodes to {cluster_name} {cluster_id}." \
                  f" Deleting nodes: {e.node_names} (rollback=True)"
            self._update_task(TaskStatus.RUNNING, message=msg)
            LOGGER.info(msg)
            try:
                self._delete_nodes(cluster_name=cluster_name,
                                   cluster_vapp_href=cluster_vapp_href,
                                   node_names_list=e.node_names)
            except Exception:
                LOGGER.error(f"Failed to delete nodes {e.node_names} "
                             f"from cluster {cluster_name}",
                             exc_info=True)
        LOGGER.error(f"Error adding nodes to {cluster_name}",
                     exc_info=True)
        error_obj = error_to_json(e)
        LOGGER.error(str(e), exc_info=True)
        stack_trace = ''.join(error_obj[ERROR_MESSAGE_KEY][ERROR_STACKTRACE_KEY])  # noqa: E501
        self._update_task(
            TaskStatus.ERROR,
            error_message=error_obj[ERROR_MESSAGE_KEY][ERROR_DESCRIPTION_KEY],  # noqa: E501
            stack_trace=stack_trace)
        # raising an exception here prints a stacktrace to server console
    except Exception as e:
        error_obj = error_to_json(e)
        LOGGER.error(str(e), exc_info=True)
        stack_trace = ''.join(error_obj[ERROR_MESSAGE_KEY][ERROR_STACKTRACE_KEY])  # noqa: E501
        self._update_task(
            TaskStatus.ERROR,
            error_message=error_obj[ERROR_MESSAGE_KEY][ERROR_DESCRIPTION_KEY],  # noqa: E501
            stack_trace=stack_trace)
    finally:
        self.logout_sys_admin_client()
def create_nodes(self, data):
    """Start the create nodes operation.

    Validates data for the 'node create' operation. Creating nodes is an
    asynchronous task, so the returned `result['task_href']` can be
    polled to get updates on task progress.

    Required data: cluster_name, network_name
    Optional data and default values: org_name=None, ovdc_name=None,
        num_workers=1, num_cpu=None, mb_memory=None,
        storage_profile_name=None, ssh_key_filepath=None,
        template_name=default, template_revision=default,
        enable_nfs=False, rollback=True
    """
    required = [
        RequestKey.CLUSTER_NAME,
        RequestKey.NETWORK_NAME
    ]
    utils.ensure_keys_in_dict(required, data, dict_name='data')
    cluster_name = data[RequestKey.CLUSTER_NAME]

    # check that requested/default template is valid
    template = get_template(
        name=data.get(RequestKey.TEMPLATE_NAME),
        revision=data.get(RequestKey.TEMPLATE_REVISION))

    defaults = {
        RequestKey.ORG_NAME: None,
        RequestKey.OVDC_NAME: None,
        RequestKey.NUM_WORKERS: 1,
        RequestKey.NUM_CPU: None,
        RequestKey.MB_MEMORY: None,
        RequestKey.STORAGE_PROFILE_NAME: None,
        RequestKey.SSH_KEY_FILEPATH: None,
        RequestKey.TEMPLATE_NAME: template[LocalTemplateKey.NAME],
        RequestKey.TEMPLATE_REVISION: template[LocalTemplateKey.REVISION],
        RequestKey.ENABLE_NFS: False,
        RequestKey.ROLLBACK: True,
    }
    validated_data = {**defaults, **data}

    # TODO HACK default dictionary combining needs to be fixed
    validated_data[RequestKey.TEMPLATE_NAME] = validated_data[RequestKey.TEMPLATE_NAME] or template[LocalTemplateKey.NAME]  # noqa: E501
    validated_data[RequestKey.TEMPLATE_REVISION] = validated_data[RequestKey.TEMPLATE_REVISION] or template[LocalTemplateKey.REVISION]  # noqa: E501

    template_name = validated_data[RequestKey.TEMPLATE_NAME]
    template_revision = validated_data[RequestKey.TEMPLATE_REVISION]

    num_workers = validated_data[RequestKey.NUM_WORKERS]
    if num_workers < 1:
        raise CseServerError(f"Worker node count must be > 0 "
                             f"(received {num_workers}).")

    cluster = get_cluster(self.tenant_client, cluster_name,
                          org_name=validated_data[RequestKey.ORG_NAME],
                          ovdc_name=validated_data[RequestKey.OVDC_NAME])
    cluster_id = cluster['cluster_id']

    # must _update_task here or else self.task_resource is None
    # do not logout of sys admin, or else in pyvcloud's session.request()
    # call, session becomes None
    self._update_task(
        TaskStatus.RUNNING,
        message=f"Creating {num_workers} node(s) from template "
                f"'{template_name}' (revision {template_revision}) and "
                f"adding to {cluster_name} ({cluster_id})")
    self._create_nodes_async(
        cluster_name=cluster_name,
        cluster_vdc_href=cluster['vdc_href'],
        cluster_vapp_href=cluster['vapp_href'],
        cluster_id=cluster_id,
        template_name=template_name,
        template_revision=template_revision,
        num_workers=validated_data[RequestKey.NUM_WORKERS],
        network_name=validated_data[RequestKey.NETWORK_NAME],
        num_cpu=validated_data[RequestKey.NUM_CPU],
        mb_memory=validated_data[RequestKey.MB_MEMORY],
        storage_profile_name=validated_data[RequestKey.STORAGE_PROFILE_NAME],  # noqa: E501
        ssh_key_filepath=validated_data[RequestKey.SSH_KEY_FILEPATH],
        enable_nfs=validated_data[RequestKey.ENABLE_NFS],
        rollback=validated_data[RequestKey.ROLLBACK])

    return {
        'cluster_name': cluster_name,
        'task_href': self.task_resource.get('href')
    }
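# Illustrative usage sketch for create_nodes (kept as comments so it is not
# executed on import). The broker instance name 'broker' and the example
# payload values are assumptions chosen only to show the request keys that
# create_nodes validates; they are not defined in this module.
#
#     data = {
#         RequestKey.CLUSTER_NAME: 'my-cluster',
#         RequestKey.NETWORK_NAME: 'ovdc-network',
#         RequestKey.NUM_WORKERS: 2,      # overrides the default of 1
#         RequestKey.ENABLE_NFS: False,   # True would create NFS nodes instead
#     }
#     result = broker.create_nodes(data)
#     # The call returns immediately; poll result['task_href'] to track the
#     # asynchronous node-creation task.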
def _create_cluster_async(self, *args, org_name, ovdc_name, cluster_name,
                          cluster_id, template_name, template_revision,
                          num_workers, network_name, num_cpu, mb_memory,
                          storage_profile_name, ssh_key_filepath,
                          enable_nfs, rollback):
    """Create the cluster vApp and all of its nodes.

    Creates the vApp, tags it with cluster metadata, adds the master
    node, initializes the cluster, adds and joins the worker nodes, and
    optionally adds an NFS node. On known creation errors the cluster is
    deleted if rollback is True. Task progress is reported through
    self._update_task.
    """
    org = vcd_utils.get_org(self.tenant_client, org_name=org_name)
    vdc = vcd_utils.get_vdc(
        self.tenant_client, vdc_name=ovdc_name, org=org)

    LOGGER.debug(f"About to create cluster {cluster_name} on {ovdc_name}"
                 f" with {num_workers} worker nodes, "
                 f"storage profile={storage_profile_name}")
    try:
        self._update_task(
            TaskStatus.RUNNING,
            message=f"Creating cluster vApp {cluster_name}({cluster_id})")
        try:
            vapp_resource = \
                vdc.create_vapp(cluster_name,
                                description=f"cluster {cluster_name}",
                                network=network_name,
                                fence_mode='bridged')
        except Exception as e:
            msg = f"Error while creating vApp: {e}"
            LOGGER.debug(str(e))
            raise ClusterOperationError(msg)
        self.tenant_client.get_task_monitor().wait_for_status(vapp_resource.Tasks.Task[0])  # noqa: E501

        template = get_template(template_name, template_revision)

        tags = {
            ClusterMetadataKey.CLUSTER_ID: cluster_id,
            ClusterMetadataKey.CSE_VERSION: pkg_resources.require('container-service-extension')[0].version,  # noqa: E501
            ClusterMetadataKey.TEMPLATE_NAME: template[LocalTemplateKey.NAME],  # noqa: E501
            ClusterMetadataKey.TEMPLATE_REVISION: template[LocalTemplateKey.REVISION]  # noqa: E501
        }
        vapp = VApp(self.tenant_client, href=vapp_resource.get('href'))
        task = vapp.set_multiple_metadata(tags)
        self.tenant_client.get_task_monitor().wait_for_status(task)

        self._update_task(
            TaskStatus.RUNNING,
            message=f"Creating master node for "
                    f"{cluster_name} ({cluster_id})")
        vapp.reload()
        server_config = utils.get_server_runtime_config()
        catalog_name = server_config['broker']['catalog']
        try:
            add_nodes(client=self.tenant_client,
                      num_nodes=1,
                      node_type=NodeType.MASTER,
                      org=org,
                      vdc=vdc,
                      vapp=vapp,
                      catalog_name=catalog_name,
                      template=template,
                      network_name=network_name,
                      num_cpu=num_cpu,
                      memory_in_mb=mb_memory,
                      storage_profile=storage_profile_name,
                      ssh_key_filepath=ssh_key_filepath)
        except Exception as e:
            raise MasterNodeCreationError("Error adding master node:",
                                          str(e))

        self._update_task(
            TaskStatus.RUNNING,
            message=f"Initializing cluster {cluster_name} ({cluster_id})")
        vapp.reload()
        init_cluster(vapp, template[LocalTemplateKey.NAME],
                     template[LocalTemplateKey.REVISION])
        master_ip = get_master_ip(vapp)
        task = vapp.set_metadata('GENERAL', 'READWRITE', 'cse.master.ip',
                                 master_ip)
        self.tenant_client.get_task_monitor().wait_for_status(task)

        self._update_task(
            TaskStatus.RUNNING,
            message=f"Creating {num_workers} node(s) for "
                    f"{cluster_name}({cluster_id})")
        try:
            add_nodes(client=self.tenant_client,
                      num_nodes=num_workers,
                      node_type=NodeType.WORKER,
                      org=org,
                      vdc=vdc,
                      vapp=vapp,
                      catalog_name=catalog_name,
                      template=template,
                      network_name=network_name,
                      num_cpu=num_cpu,
                      memory_in_mb=mb_memory,
                      storage_profile=storage_profile_name,
                      ssh_key_filepath=ssh_key_filepath)
        except Exception as e:
            raise WorkerNodeCreationError("Error creating worker node:",
                                          str(e))

        self._update_task(
            TaskStatus.RUNNING,
            message=f"Adding {num_workers} node(s) to "
                    f"{cluster_name}({cluster_id})")
        vapp.reload()
        join_cluster(vapp, template[LocalTemplateKey.NAME],
                     template[LocalTemplateKey.REVISION])

        if enable_nfs:
            self._update_task(
                TaskStatus.RUNNING,
                message=f"Creating NFS node for "
                        f"{cluster_name} ({cluster_id})")
            try:
                add_nodes(client=self.tenant_client,
                          num_nodes=1,
                          node_type=NodeType.NFS,
                          org=org,
                          vdc=vdc,
                          vapp=vapp,
                          catalog_name=catalog_name,
                          template=template,
                          network_name=network_name,
                          num_cpu=num_cpu,
                          memory_in_mb=mb_memory,
                          storage_profile=storage_profile_name,
                          ssh_key_filepath=ssh_key_filepath)
            except Exception as e:
                raise NFSNodeCreationError("Error creating NFS node:",
                                           str(e))

        self._update_task(
            TaskStatus.SUCCESS,
            message=f"Created cluster {cluster_name} ({cluster_id})")
    except (MasterNodeCreationError, WorkerNodeCreationError,
            NFSNodeCreationError, ClusterJoiningError,
            ClusterInitializationError, ClusterOperationError) as e:
        if rollback:
            msg = f"Error creating cluster {cluster_name}. " \
                  f"Deleting cluster (rollback=True)"
            self._update_task(TaskStatus.RUNNING, message=msg)
            LOGGER.info(msg)
            try:
                cluster = get_cluster(self.tenant_client, cluster_name,
                                      cluster_id=cluster_id,
                                      org_name=org_name,
                                      ovdc_name=ovdc_name)
                self._delete_cluster(cluster_name=cluster_name,
                                     cluster_vdc_href=cluster['vdc_href'])
            except Exception:
                LOGGER.error(f"Failed to delete cluster {cluster_name}",
                             exc_info=True)
        LOGGER.error(f"Error creating cluster {cluster_name}",
                     exc_info=True)
        error_obj = error_to_json(e)
        stack_trace = ''.join(error_obj[ERROR_MESSAGE_KEY][ERROR_STACKTRACE_KEY])  # noqa: E501
        self._update_task(
            TaskStatus.ERROR,
            error_message=error_obj[ERROR_MESSAGE_KEY][ERROR_DESCRIPTION_KEY],  # noqa: E501
            stack_trace=stack_trace)
        # raising an exception here prints a stacktrace to server console
    except Exception as e:
        LOGGER.error(f"Unknown error creating cluster {cluster_name}",
                     exc_info=True)
        error_obj = error_to_json(e)
        stack_trace = ''.join(error_obj[ERROR_MESSAGE_KEY][ERROR_STACKTRACE_KEY])  # noqa: E501
        self._update_task(
            TaskStatus.ERROR,
            error_message=error_obj[ERROR_MESSAGE_KEY][ERROR_DESCRIPTION_KEY],  # noqa: E501
            stack_trace=stack_trace)
    finally:
        self.logout_sys_admin_client()
def create_cluster(self, data):
    """Start the cluster creation operation.

    Common broker function that validates data for the 'create cluster'
    operation and returns a dictionary with cluster detail and task
    information. Calls the asynchronous cluster create function that
    actually performs the work. The returned `result['task_href']` can
    be polled to get updates on task progress.

    Required data: cluster_name, org_name, ovdc_name, network_name
    Optional data and default values: num_workers=2, num_cpu=None,
        mb_memory=None, storage_profile_name=None, ssh_key_filepath=None,
        template_name=default, template_revision=default,
        enable_nfs=False, rollback=True
    """
    required = [
        RequestKey.CLUSTER_NAME,
        RequestKey.ORG_NAME,
        RequestKey.OVDC_NAME,
        RequestKey.NETWORK_NAME
    ]
    utils.ensure_keys_in_dict(required, data, dict_name='data')
    cluster_name = data[RequestKey.CLUSTER_NAME]

    # check that cluster name is syntactically valid
    if not is_valid_cluster_name(cluster_name):
        raise CseServerError(f"Invalid cluster name '{cluster_name}'")

    # check that cluster name doesn't already exist
    try:
        get_cluster(self.tenant_client, cluster_name,
                    org_name=data[RequestKey.ORG_NAME],
                    ovdc_name=data[RequestKey.OVDC_NAME])
        raise ClusterAlreadyExistsError(f"Cluster {cluster_name} "
                                        f"already exists.")
    except ClusterNotFoundError:
        pass

    # check that requested/default template is valid
    template = get_template(
        name=data.get(RequestKey.TEMPLATE_NAME),
        revision=data.get(RequestKey.TEMPLATE_REVISION))

    defaults = {
        RequestKey.NUM_WORKERS: 2,
        RequestKey.NUM_CPU: None,
        RequestKey.MB_MEMORY: None,
        RequestKey.STORAGE_PROFILE_NAME: None,
        RequestKey.SSH_KEY_FILEPATH: None,
        RequestKey.TEMPLATE_NAME: template[LocalTemplateKey.NAME],
        RequestKey.TEMPLATE_REVISION: template[LocalTemplateKey.REVISION],
        RequestKey.ENABLE_NFS: False,
        RequestKey.ROLLBACK: True,
    }
    validated_data = {**defaults, **data}

    # TODO HACK default dictionary combining needs to be fixed
    validated_data[RequestKey.TEMPLATE_NAME] = validated_data[RequestKey.TEMPLATE_NAME] or template[LocalTemplateKey.NAME]  # noqa: E501
    validated_data[RequestKey.TEMPLATE_REVISION] = validated_data[RequestKey.TEMPLATE_REVISION] or template[LocalTemplateKey.REVISION]  # noqa: E501

    template_name = validated_data[RequestKey.TEMPLATE_NAME]
    template_revision = validated_data[RequestKey.TEMPLATE_REVISION]

    # check that the requested number of worker nodes is at least 1
    num_workers = validated_data[RequestKey.NUM_WORKERS]
    if num_workers < 1:
        raise CseServerError(f"Worker node count must be > 0 "
                             f"(received {num_workers}).")

    cluster_id = str(uuid.uuid4())

    # must _update_task or else self.task_resource is None
    # do not logout of sys admin, or else in pyvcloud's session.request()
    # call, session becomes None
    self._update_task(
        TaskStatus.RUNNING,
        message=f"Creating cluster vApp '{cluster_name}' ({cluster_id})"
                f" from template '{template_name}' "
                f"(revision {template_revision})")
    self._create_cluster_async(
        org_name=validated_data[RequestKey.ORG_NAME],
        ovdc_name=validated_data[RequestKey.OVDC_NAME],
        cluster_name=cluster_name,
        cluster_id=cluster_id,
        template_name=template_name,
        template_revision=template_revision,
        num_workers=validated_data[RequestKey.NUM_WORKERS],
        network_name=validated_data[RequestKey.NETWORK_NAME],
        num_cpu=validated_data[RequestKey.NUM_CPU],
        mb_memory=validated_data[RequestKey.MB_MEMORY],
        storage_profile_name=validated_data[RequestKey.STORAGE_PROFILE_NAME],  # noqa: E501
        ssh_key_filepath=validated_data[RequestKey.SSH_KEY_FILEPATH],
        enable_nfs=validated_data[RequestKey.ENABLE_NFS],
        rollback=validated_data[RequestKey.ROLLBACK])

    return {
        'name': cluster_name,
        'cluster_id': cluster_id,
        'task_href': self.task_resource.get('href')
    }
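# Illustrative usage sketch for create_cluster (kept as comments so it is not
# executed on import). The broker instance name 'broker' and the example
# payload values are assumptions chosen only to show the required and
# optional request keys documented in the docstring above.
#
#     data = {
#         RequestKey.CLUSTER_NAME: 'dev-cluster',
#         RequestKey.ORG_NAME: 'my-org',
#         RequestKey.OVDC_NAME: 'my-ovdc',
#         RequestKey.NETWORK_NAME: 'ovdc-network',
#         RequestKey.NUM_WORKERS: 2,
#         RequestKey.ENABLE_NFS: True,    # also creates one NFS node
#     }
#     result = broker.create_cluster(data)
#     # Returns immediately with the generated cluster_id; poll
#     # result['task_href'] to track the asynchronous creation task.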