def deploy_goldimage_image_service(self, goldimages_directory, goldimage_name):
  """
  Deploy a gold image to the image service.

  Args:
    goldimages_directory (str): Directory containing the gold images.
    goldimage_name (str): Name of the gold image to deploy.

  Returns:
    str: ID of the created disk image.
  """
  arch = self.get_cluster_architecture()
  # Select a vdisk format to use. Currently PPC64LE goldimages are only built
  # in qcow2 format and the x86_64 ones in vmdk. We could have the manager
  # perform a conversion, but acropolis can already do the image conversion
  # for us.
  if arch == GoldImageManager.ARCH_PPC64LE:
    disk_format = GoldImageManager.FORMAT_QCOW2
  else:
    disk_format = GoldImageManager.FORMAT_VMDK
  # Use the GoldImageManager to get the path to the appropriate gold image.
  goldimage_manager = GoldImageManager(goldimages_directory)
  goldimage_path = goldimage_manager.get_goldimage_path(
    goldimage_name, format_str=disk_format, arch=arch)
  log.debug("Deploying %s to cluster", goldimage_path)

  # Deploy the image to the image service.
  disk_name = os.path.splitext(os.path.basename(goldimage_path))[0]
  img_uuid, tid, _ = self._prism_client.images_create(
    NameUtil.goldimage_vmdisk_name(disk_name, "os"),
    goldimage_path, self._container_id)
  TaskPoller.execute_parallel_tasks(
    tasks=PrismTask.from_task_id(self._prism_client, tid),
    timeout_secs=3600)

  # NB: Required due to possible AHV bug. See XRAY-225.
  num_images_get_retries = 5
  for attempt_num in xrange(num_images_get_retries):
    images_get_data = self._prism_client.images_get(image_id=img_uuid)
    image_state = images_get_data["image_state"]
    if image_state.lower() == "active":
      # Return the ID of the created disk image.
      return images_get_data["vm_disk_id"]
    else:
      log.info("Waiting for created image to become active "
               "(imageState: %s, retry %d of %d)",
               image_state, attempt_num + 1, num_images_get_retries)
      log.debug(images_get_data)
      time.sleep(1)
  else:
    raise CurieException(CurieError.kInternalError,
                         "Created image failed to become active within "
                         "%d attempts" % num_images_get_retries)
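# Usage sketch (illustrative, not part of the original module): deploying a
# gold image and reusing the returned disk image ID. Assumes `cluster` is an
# instance of the class defining deploy_goldimage_image_service above; the
# directory and image name below are hypothetical placeholders.
def example_deploy_goldimage(cluster):
  """Sketch: deploy a gold image and log the resulting vmdisk ID."""
  # The returned value is the vm_disk_id of the now-active image.
  vmdisk_id = cluster.deploy_goldimage_image_service(
    "/opt/goldimages", "ubuntu1604")  # Hypothetical path and image name.
  log.info("Gold image deployed; vmdisk ID: %s", vmdisk_id)
  return vmdisk_id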
def create_vm(self, goldimages_directory, goldimage_name, vm_name, vcpus=1,
              ram_mb=1024, node_id=None, datastore_name=None, data_disks=()):
  """
  See 'Cluster.create_vm' for documentation.
  """
  log.info("Creating VM %s based on %s with %d vCPUs, %d MB RAM and %s "
           "disks on node %s in datastore %s",
           vm_name, goldimage_name, vcpus, ram_mb, str(data_disks),
           str(node_id), datastore_name)
  image_uuid = self.deploy_goldimage_image_service(goldimages_directory,
                                                   goldimage_name)

  # This namedtuple hackery is to handle the expectations in vm.py, which
  # expects information directly parsed from an OVF file.
  Units = namedtuple("Units", ["multiplier"])
  Disk = namedtuple("Disk", ["capacity", "units"])
  attach_disks = [Disk(gb, Units(1024 * 1024 * 1024)) for gb in data_disks]

  vm_desc = VmDescriptor(name=vm_name, memory_mb=ram_mb, num_vcpus=vcpus,
                         vmdisk_uuid_list=[image_uuid],
                         attached_disks=attach_disks,
                         container_uuid=self._container_id)
  # Create the VM.
  log.info("Creating VM '%s' with %s MB RAM and %s vCPUs",
           vm_desc.name, vm_desc.memory_mb, vm_desc.num_vcpus)
  nic_specs = \
    [vm_desc.to_ahv_vm_nic_create_spec(self._network_id)["specList"][0]]
  resp = self._prism_client.vms_create(vm_desc, nic_specs)
  tid = resp.get("taskUuid")
  if not tid:
    raise CurieException(CurieError.kManagementServerApiError,
                         "Failed to deploy VM: %s" % resp)

  TaskPoller.execute_parallel_tasks(
    tasks=PrismTask.from_task_id(self._prism_client, tid), timeout_secs=60)

  task_json = self._prism_client.tasks_get_by_id(tid)
  vm_uuid = task_json["entityList"][0]["uuid"]

  # Make a Curie VM descriptor and assign it to the requested node.
  vm = self.__vm_json_to_curie_vm(self._prism_client.vms_get_by_id(vm_uuid))
  vm._node_id = node_id
  return vm
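# Usage sketch (illustrative, not part of the original module): creating a VM
# from a gold image with two data disks. The image name, VM name, and sizes
# below are hypothetical placeholders; `cluster` is assumed to be an instance
# of the class defining create_vm above.
def example_create_vm(cluster, node_id):
  """Sketch: create a 2-vCPU, 2 GB VM backed by a deployed gold image."""
  vm = cluster.create_vm("/opt/goldimages", "ubuntu1604", "curie_example_vm",
                         vcpus=2, ram_mb=2048, node_id=node_id,
                         data_disks=(16, 16))  # Two 16 GB data disks.
  return vm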
def test_prism_task_from_id(self, mock_tasks_get_by_id):
  dummy_task = DummyPrismTask()
  mock_tasks_get_by_id.side_effect = MockTasksGetById()
  poller = TaskPoller(10, poll_interval_secs=0)
  prism_task = PrismTask.from_task_id(self.prism, dummy_task.id())
  mock_tasks_get_by_id.side_effect.add_task(dummy_task)
  poller.add_task(prism_task)
  poller.start()
  ret = poller.wait_for()
  self.assertIsNotNone(ret)
def cleanup_images(self):
  """
  Cleans up the image service, removing any images associated with curie.
  """
  images = self._prism_client.images_get().get("entities", [])
  to_delete_image_uuids = []
  for image in images:
    if image["name"].startswith(CURIE_GOLDIMAGE_VM_DISK_PREFIX):
      to_delete_image_uuids.append(image["uuid"])
  log.info("Deleting images %s", ", ".join(to_delete_image_uuids))
  task_map = self._prism_client.images_delete(to_delete_image_uuids)
  image_id_tid_map = {}
  for image_id, tid in task_map.iteritems():
    image_id_tid_map[image_id] = PrismTask.from_task_id(self._prism_client,
                                                        tid)
  TaskPoller.execute_parallel_tasks(tasks=image_id_tid_map.values(),
                                    timeout_secs=300)
def migrate_vms(self, vms, nodes, max_parallel_tasks=None):
  """Move 'vms' to 'nodes'.

  For each VM 'vms[xx]', move it to the corresponding node 'nodes[xx]'.

  Args:
    vms (list<Vm>): List of VMs to migrate.
    nodes (list<Node>): List of nodes to which 'vms' should be migrated.
      Must be the same length as 'vms'. Each VM in 'vms' will be moved to
      the corresponding node in 'nodes'.
    max_parallel_tasks (int): The number of VMs to migrate in parallel.
  """
  cutoff = self._prism_client.get_cluster_timestamp_usecs()
  # TODO (jklein): Max parallel tasks won't work unless this is converted
  # to a descriptor.
  log.info("Migrating VMs")
  if len(vms) != len(nodes):
    raise CurieException(CurieError.kInvalidParameter,
                         "Must provide a destination node for each VM")
  ret = {}
  for ii, vm in enumerate(vms):
    ret[vm.vm_id()] = self._prism_client.vms_migrate(vm.vm_id(),
                                                     nodes[ii].node_id())

  return PrismTaskPoller.execute_parallel_tasks(
    tasks=[PrismTask.from_task_id(self._prism_client, tid)
           for tid in ret.values()],
    max_parallel=self._get_max_parallel_tasks(max_parallel_tasks),
    timeout_secs=len(vms) * 1200,
    prism_client=self._prism_client,
    cutoff_usecs=cutoff)
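# Usage sketch (illustrative, not part of the original module): migrating each
# VM to a distinct node. The pairing below simply matches VMs to nodes by
# index, mirroring the vms[xx] -> nodes[xx] contract documented above;
# `cluster` is assumed to be an instance of the class defining migrate_vms.
def example_migrate_round_robin(cluster, vms):
  """Sketch: spread 'vms' across the cluster's nodes in round-robin order."""
  nodes = cluster.nodes()
  # The destination list must be the same length as 'vms'.
  destinations = [nodes[ii % len(nodes)] for ii in xrange(len(vms))]
  return cluster.migrate_vms(vms, destinations)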
def __set_power_state_for_vms(self, vms, state, wait_for_ip=False,
                              max_parallel_tasks=None, power_on_retries=10,
                              timeout_secs=900):
  max_parallel_tasks = self._get_max_parallel_tasks(max_parallel_tasks)
  t0 = time.time()
  cutoff = self._prism_client.get_cluster_timestamp_usecs()
  vm_host_map = dict((vm.vm_id(), vm.node_id()) for vm in vms)
  power_op_vm_ids = vm_host_map.keys()
  # TODO (jklein): Why are these APIs broken :(
  for ii in xrange(power_on_retries):
    vm_id_task_id_map = self._prism_client.vms_set_power_state_for_vms(
      power_op_vm_ids, state,
      host_ids=[vm_host_map[vm] for vm in power_op_vm_ids])
    task_id_vm_id_map = dict(
      (v, k) for k, v in vm_id_task_id_map.iteritems())

    tasks = []
    failed_for_vm_ids = []
    # Filter out tasks which immediately failed.
    for vm_id, tid in vm_id_task_id_map.iteritems():
      # TODO (jklein): Don't use literal True to indicate a state.
      if tid is True:
        continue
      if tid is None:
        failed_for_vm_ids.append(vm_id)
        continue
      tasks.append(PrismTask.from_task_id(self._prism_client, tid))

    PrismTaskPoller.execute_parallel_tasks(tasks=tasks,
                                           max_parallel=max_parallel_tasks,
                                           timeout_secs=timeout_secs,
                                           prism_client=self._prism_client,
                                           cutoff_usecs=cutoff,
                                           raise_on_failure=False)
    failed_for_vm_ids.extend(
      task_id_vm_id_map[t.id()] for t in tasks
      if TaskStatus.cannot_succeed(t._state.status))
    if not failed_for_vm_ids:
      break

    power_op_vm_ids = failed_for_vm_ids
    log.warning("Failed to perform power op %s on %d VMs (attempt %d of %d)",
                state, len(power_op_vm_ids), ii + 1, power_on_retries)
  else:
    raise CurieTestException("Failed to power %s VMs %s" %
                             (state, ", ".join(failed_for_vm_ids)))

  # TODO (jklein): Fix the terrible handling of time here when brain is more
  # functional.
  timeout_secs -= time.time() - t0
  t0 = time.time()
  while timeout_secs > 0:
    vm_id_status_map = dict(
      (vm["uuid"], vm) for vm in self._prism_client.vms_get()["entities"])
    failed_for_vm_ids = []
    for vm_id in vm_host_map.iterkeys():
      status = vm_id_status_map.get(vm_id)
      if not status or status.get("powerState") != state:
        failed_for_vm_ids.append(vm_id)
    if failed_for_vm_ids:
      log.info("Waiting for %d of %d VMs to transition to state %s",
               len(failed_for_vm_ids), len(power_op_vm_ids), state)
      timeout_secs -= time.time() - t0
      t0 = time.time()
      time.sleep(1)
    else:
      break

  if failed_for_vm_ids:
    raise CurieTestException("Failed to power %s VMs %s" %
                             (state, ", ".join(failed_for_vm_ids)))

  if not wait_for_ip:
    return

  if state != "on":
    raise CurieTestException(
      "Cannot wait for IPs to be assigned to powered off VMs")

  timeout_secs -= time.time() - t0
  t0 = time.time()
  needs_ip_vm_ids = set(vm.vm_id() for vm in vms)
  has_ip_vm_ids = set()
  while timeout_secs > 0:
    vm_id_status_map = dict(
      (vm["uuid"], vm) for vm in self._prism_client.vms_get()["entities"])
    for vm_id in (needs_ip_vm_ids - has_ip_vm_ids):
      if vm_id not in vm_id_status_map:
        # NB: Prism API may temporarily return an incomplete list of VMs.
        continue
      ip_addr = vm_id_status_map[vm_id].get("ipAddresses", [])
      if ip_addr:
        log.debug("VM %r has IP addresses: %r", vm_id, ip_addr)
        has_ip_vm_ids.add(vm_id)
      else:
        log.debug("VM %r IP addresses are %r, retrying", vm_id, ip_addr)
    timeout_secs -= time.time() - t0
    t0 = time.time()
    if needs_ip_vm_ids - has_ip_vm_ids:
      log.info("Waiting for %d of %d VMs to acquire IPs "
               "(%d seconds remaining)",
               len(needs_ip_vm_ids - has_ip_vm_ids), len(needs_ip_vm_ids),
               timeout_secs)
      time.sleep(1)
    else:
      return

  raise CurieTestException(
    "Timed out waiting for %d of %d VMs to acquire IPs" %
    (len(needs_ip_vm_ids - has_ip_vm_ids), len(needs_ip_vm_ids)))
def clone_vms(self, vm, vm_names, node_ids=(), datastore_name=None,
              max_parallel_tasks=None, linked_clone=False):
  """
  Clones 'vm' and creates the VMs with names 'vm_names'.

  Args:
    vm (CurieVM): Base VM that clones will be created from.
    vm_names (list<str>): One clone will be created for each name in the
      list.
    node_ids (list<str>): If provided, must be the same length as
      'vm_names'; 'vm_names[xx]' will be cloned to 'node_ids[xx]'.
      Otherwise, VMs will be cloned to random nodes on the cluster.
    datastore_name (str): If provided, name of the datastore the VMs will
      be cloned to. Otherwise, the VMs will be created on the datastore
      associated with the curie server's settings for this cluster.
    max_parallel_tasks (int): The number of VMs to power on in parallel.
      The default value is FLAGS.prism_max_parallel_tasks.
    linked_clone (bool): Whether the clones should be linked clones rather
      than "normal" full clones.

  Returns:
    List of cloned VMs.
  """
  # TODO (jklein): Max parallel tasks
  if not node_ids:
    nodes = self.nodes()
    node_ids = []
    for _ in range(len(vm_names)):
      node_ids.append(random.choice(nodes).node_id())
  vm_desc = VmDescriptor.from_prism_entity_json(
    self._prism_client.vms_get_by_id(vm.vm_id()))

  if datastore_name is None:
    target_ctr_uuid = self._container_id
  else:
    target_ctr_uuid = self.__identifier_to_container_uuid(datastore_name)

  clone_spec = vm_desc.to_ahv_vm_clone_spec(vm_names,
                                            ctr_uuid=target_ctr_uuid)
  cutoff = self._prism_client.get_cluster_timestamp_usecs()
  task = PrismTask.from_task_id(
    self._prism_client,
    self._prism_client.vms_clone(vm.vm_id(), clone_spec))
  PrismTaskPoller.execute_parallel_tasks(tasks=[task, ],
                                         timeout_secs=len(vm_names) * 900,
                                         prism_client=self._prism_client,
                                         cutoff_usecs=cutoff)
  log.info("Clone task complete")
  task_json = AcropolisTaskInfo(
    **self._prism_client.tasks_get_by_id(task.id(), True))
  created_uuids = set(e.uuid for e in task_json.entity_list
                      if e.entity_type.strip().upper() == "VM")
  # Block until all VMs are found via the /vms API.
  vms = self.__wait_for_vms(created_uuids)

  vm_name_map = {vm["vmName"]: vm for vm in vms}
  sorted_vms = [vm_name_map[vm_name] for vm_name in vm_names]

  # Create the placement map which controls where VMs are placed when
  # powered on.
  for node_id, vm in zip(node_ids, sorted_vms):
    self.__vm_uuid_host_uuid_map[vm["uuid"]] = node_id
  return sorted_vms
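# Usage sketch (illustrative, not part of the original module): cloning a base
# VM into several uniquely named clones. The clone name prefix is a
# hypothetical placeholder; `cluster` is assumed to be an instance of the
# class defining clone_vms above.
def example_clone_vms(cluster, base_vm, count):
  """Sketch: create 'count' clones of 'base_vm' on random nodes."""
  clone_names = ["curie_clone_%d" % ii for ii in xrange(count)]
  # The returned list is ordered to match 'clone_names'.
  return cluster.clone_vms(base_vm, clone_names)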
def delete_vms(self, vms, ignore_errors=False, max_parallel_tasks=None,
               timeout_secs=None):
  """Delete VMs.

  Acropolis DELETE requests for /vms/{vm_id} are async. This method collects
  all taskUuids and polls until completion.

  Args:
    vms (list<CurieVM>): List of VMs to delete.
    ignore_errors (bool): Optional. Whether to allow individual tasks to
      fail. Default False.
    max_parallel_tasks (int): Max number of requests to have in-flight at
      any given time. (Currently ignored)
    timeout_secs (int): If provided, overall timeout for VM deletion tasks.

  Raises:
    CurieTestException:
      - If any VM is not already powered off.
      - All VMs are not destroyed within the timeout.
      - Destroy task fails and ignore_errors is False.
  """
  # TODO (jklein): max_parallel_tasks won't work unless this is changed to
  # use task descriptors.
  if timeout_secs is None:
    timeout_secs = len(vms) * 60

  task_t0 = self._prism_client.get_cluster_timestamp_usecs()
  vm_id_task_map = {}
  for vm_id, tid in self._prism_client.vms_delete(
      [vm.vm_id() for vm in vms]).iteritems():
    if tid is None:
      raise CurieTestException("Failed to delete VM %s" % vm_id)
    vm_id_task_map[vm_id] = PrismTask.from_task_id(self._prism_client, tid)

  try:
    PrismTaskPoller.execute_parallel_tasks(
      tasks=vm_id_task_map.values(),
      max_parallel=self._get_max_parallel_tasks(max_parallel_tasks),
      timeout_secs=timeout_secs,
      prism_client=self._prism_client,
      cutoff_usecs=task_t0)
  except CurieTestException:
    if not ignore_errors:
      raise
    log.debug("Ignoring exception in delete_vms", exc_info=True)

  failed_to_delete_vm_ids = []
  for vm_id, task in vm_id_task_map.iteritems():
    if task.get_status() != TaskStatus.kSucceeded:
      failed_to_delete_vm_ids.append(vm_id)
  if failed_to_delete_vm_ids:
    msg = "Failed to delete VMs: %s" % ", ".join(failed_to_delete_vm_ids)
    if ignore_errors:
      log.error(msg)
    else:
      raise CurieTestException(msg)
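# Usage sketch (illustrative, not part of the original module): best-effort
# deletion of a set of VMs, tolerating individual task failures. `cluster` is
# assumed to be an instance of the class defining delete_vms above, and the
# timeout shown is an arbitrary illustrative choice.
def example_delete_vms_best_effort(cluster, vms):
  """Sketch: delete 'vms', logging rather than raising on per-VM failures."""
  # With ignore_errors=True, per-VM failures are logged via log.error instead
  # of raising CurieTestException.
  cluster.delete_vms(vms, ignore_errors=True, timeout_secs=len(vms) * 120)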
def create_curie_task_instance(self):
  return PrismTask(self._prism,
                   PrismTaskDescriptor(create_task_func=self))