Exemple #1
0
    async def wait_till_provisioned(self, resource):
        """Poll a container until it runs or fails, then install the SSH key.

        ``resource`` is the container id handed to ``self.podman``. The
        container is inspected once per second for at most 20 minutes; polling
        stops as soon as its state reports ``Running`` or ``Error``.
        Afterwards the content of the ``self.ssh_key`` file is appended to
        ``/root/.ssh/authorized_keys`` inside the container and sshd is
        restarted.

        Returns the first entry of the last inspect result.

        Raises:
            ServerNotFoundError: inspecting the container raised
                ProvisioningError.
            ProvisioningError: a command executed in the container returned a
                falsy result.
        """
        cont_id = resource
        timeout = 20  # minutes

        start = datetime.now()
        timeout_time = start + timedelta(minutes=timeout)

        while datetime.now() < timeout_time:
            try:
                inspected = await self.podman.inspect(cont_id)
            except ProvisioningError as err:
                logger.error(f"{self.dsp_name}: {object2json(err)}")
                raise ServerNotFoundError(cont_id) from err

            server = inspected[0]
            state = server["State"]
            running = state["Running"]
            is_error = state["Error"]
            if not (running or is_error):
                await asyncio.sleep(1)
                continue
            break

        prov_duration = (datetime.now() - start).total_seconds()

        if datetime.now() < timeout_time:
            logger.info(
                f"{self.dsp_name}: {cont_id} was provisioned in {prov_duration:.1f}s"
            )
        else:
            logger.warning(
                f"{self.dsp_name}: {cont_id} was not provisioned within a timeout of"
                f" {timeout} mins")

        logger.debug(f"{self.dsp_name}: Resource: {object2json(server)}")

        with open(os.path.expanduser(self.ssh_key), "r") as key_file:
            key_content = key_file.read()

        # (command executed in the container, error message when it fails)
        key_setup = (
            ("mkdir -p /root/.ssh/",
             f"Could not copy public key to container {cont_id}"),
            (f'echo "{key_content}" >> /root/.ssh/authorized_keys',
             f"Could not copy public key to container {cont_id}"),
            ("systemctl restart sshd",
             f"Failed restarting sshd service in container {cont_id}"),
        )
        for command, failure in key_setup:
            if not await self.podman.exec_command(cont_id, command):
                raise ProvisioningError(failure)

        return server
Exemple #2
0
    def prov_result_to_host_data(self, prov_result):
        """Build the host data dictionary from a podman provisioning result.

        The returned dict has the keys ``id``, ``name``, ``addresses`` (the
        ``IPAddress`` of every network found under ``NetworkSettings``),
        ``fault`` (the ``State`` entry when the status is STATUS_ERROR,
        otherwise None) and ``status`` (as computed by ``self.get_status``).

        Raises:
            ProvisioningError: the network settings lack an expected key.
        """
        host_id = prov_result.get("Id")
        hostname = prov_result["Config"]["Hostname"]

        addresses = []
        network_set = prov_result.get("NetworkSettings")
        if network_set:
            try:  # TODO self.network_name
                networks = network_set["Networks"]
                addresses = [networks[net]["IPAddress"] for net in networks]
            except KeyError as kerror:
                raise ProvisioningError(
                    f"{self.dsp_name}: Container state improper") from kerror

        state = prov_result.get("State")
        status = self.get_status(state)
        # the container state doubles as the fault description on error
        fault = state if status == STATUS_ERROR else None

        return {
            "id": host_id,
            "name": hostname,
            "addresses": addresses,
            "fault": fault,
            "status": status,
        }
Exemple #3
0
    async def abort_and_delete(self, hosts_to_delete, error_hosts):
        """Log the host errors, delete the given hosts and abort.

        Every entry of ``error_hosts`` has its ``error`` attribute logged,
        then ``hosts_to_delete`` is passed to ``self.delete_hosts``.

        Raises:
            ProvisioningError: always, carrying ``error_hosts``.
        """
        prefix = self.dsp_name
        logger.info(f"{prefix}: Aborting provisioning due to error.")
        for failed in error_hosts:
            logger.error(f"{prefix}: Error: {failed.error!s}")

        logger.info(f"{prefix}: Given the error, will delete hosts")
        await self.delete_hosts(hosts_to_delete)
        raise ProvisioningError(error_hosts)
Exemple #4
0
    async def wait_till_provisioned(self, aws_id):
        """Wait until the AWS instance runs and return its description.

        Returns the first instance of the first reservation found in the
        ``describe_instances`` response for ``aws_id``.

        Raises:
            ProvisioningError: the response does not have the expected layout.
        """
        # NOTE(review): called without await - if this is a blocking waiter it
        # stalls the event loop while the instance starts; confirm.
        self.ec2.Instance(aws_id).wait_until_running()
        described = self.client.describe_instances(InstanceIds=[aws_id])
        try:  # dict with aws instance information
            return described["Reservations"][0]["Instances"][0]
        except (KeyError, IndexError):
            raise ProvisioningError(
                "Unexpected data format in response of provisioned instance.")
Exemple #5
0
    async def create_server(self, req):
        """Request creation of a server and return the created server data.

        req - dict of server requirements - can contain values defined in
              POST /servers official OpenStack API
              https://docs.openstack.org/api-ref/compute/?expanded=create-server-detail#create-server

        Additional attributes understood in req:
        * 'flavor': translated by ``_translate_flavor`` into 'flavorRef'
        * 'image': translated by ``_translate_image`` into 'imageRef'
        * 'network': translated by ``_translate_networks`` into 'networks'

        A ServerError from the API is retried after SERVER_ERROR_SLEEP
        seconds, at most SERVER_ERROR_RETRY times.

        Raises:
            ProvisioningError: the API kept failing with ServerError.
        """
        name = req.get("name")
        logger.info(f"{self.dsp_name}: Creating server {name}")
        specs = deepcopy(req)  # the caller's dictionary stays untouched

        specs["flavorRef"] = self._translate_flavor(req)["id"]
        specs["imageRef"] = self._translate_image(req)["id"]
        specs["networks"] = self._translate_networks(req, spec=True)
        # the helper attributes were translated above, drop them from request
        for helper_key in ("flavor", "image", "network"):
            if specs.get(helper_key):
                del specs[helper_key]

        error_attempts = 0
        while True:
            try:
                response = await self.nova.servers.create(server=specs)
            except ServerError as exc:
                logger.debug(exc)
                error_attempts += 1
                if error_attempts <= SERVER_ERROR_RETRY:
                    await asyncio.sleep(SERVER_ERROR_SLEEP)
                    continue
                raise ProvisioningError(
                    f"{self.dsp_name}: Fail to create server",
                    specs) from exc
            return response.get("server")
Exemple #6
0
async def exec_async_subprocess(program, args, raise_on_err=True):
    """Run a program asynchronously and capture its output.

    Parameters:
        program - executable to run
        args - iterable of command line arguments for the program
        raise_on_err (default True) - raise when the exit code is not zero

    Returns a tuple ``(stdout, stderr, process)``. Both outputs are always
    ``str`` (an empty string when the stream produced nothing) and
    ``process`` is the finished subprocess object.

    Raises:
        ProvisioningError: the exit code is not zero and ``raise_on_err`` is
            truthy; the exception carries the decoded stderr.
    """
    process = await asyncio.create_subprocess_exec(
        program,
        *args,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    raw_out, raw_err = await process.communicate()
    # Piped streams yield b"" (not None) when empty, so normalise every falsy
    # value to "" instead of leaking bytes to the caller. Undecodable bytes
    # are replaced rather than aborting on the output of a foreign tool.
    stdout = raw_out.decode(errors="replace") if raw_out else ""
    stderr = raw_err.decode(errors="replace") if raw_err else ""
    if process.returncode != 0 and raise_on_err:
        raise ProvisioningError(stderr)
    return stdout, stderr, process
Exemple #7
0
    async def create_server(self, req):
        """Create one AWS instance and return its id.

        req - dict of server requirements

        Attributes read from req:
        * 'image': passed as ImageId
        * 'flavor': passed as InstanceType
        * 'name': stored in the "name" tag of the instance

        The instance is additionally tagged with every entry of
        ``self.instance_tags``.

        Raises:
            ProvisioningError: the create call did not yield exactly one
                instance.
        """
        logger.info("Creating AWS server")
        specs = deepcopy(req)  # the caller's dictionary stays untouched

        created = self.ec2.create_instances(
            ImageId=specs.get("image"),
            MinCount=1,
            MaxCount=1,
            InstanceType=specs.get("flavor"),
            KeyName=self.ssh_key,
            SecurityGroupIds=[self.sec_group],
        )

        ids = [srv.id for srv in created]
        # one vm is provisioned at a time, so exactly one id is expected
        if len(ids) != 1:
            raise ProvisioningError(
                "Unexpected number of instances provisioned.")

        # the name tag comes first, followed by the configured instance tags
        taglist = [{"Key": "name", "Value": specs.get("name")}]
        taglist.extend({
            "Key": key,
            "Value": self.instance_tags[key]
        } for key in self.instance_tags)

        self.ec2.create_tags(Resources=ids, Tags=taglist)

        return ids[0]
Exemple #8
0
    async def create_server(self, req):
        """Start a container for the requirement and return its id.

        ``req`` must contain ``name`` and ``image``; ``network`` is expected
        to be filled in by the preparation step.

        Raises:
            ProvisioningError: ``req`` carries no (truthy) ``network`` value.
        """
        hostname = req["name"]
        logger.info(
            f"{self.dsp_name}: Creating container for host: {hostname}")

        image = req["image"]
        network = req.get("network")  # set by the preparation method
        if network:
            return await self.podman.run(
                image,
                hostname,
                network,
                extra_options=self.podman_options,
                remove_at_stop=True,
            )

        logger.error(
            f"{self.dsp_name}: Failed to load network requirement from: {req}"
        )
        raise ProvisioningError(
            "Could not set up podman network for some host(s)")
Exemple #9
0
    async def _provision_base(self,
                              reqs,
                              res_check_timeout=60,
                              res_busy_sleep=10):  # pylint: disable=too-many-locals, too-many-branches
        """Validate the requirements and provision the requested hosts.

        Parameters:
            reqs - list of host requirements for this provider
            res_check_timeout (default 60) - minutes to keep waiting for
                                             resources to become available
            res_busy_sleep (default 10) - minutes to sleep between two
                                          resource availability checks

        Returns a tuple ``(success_hosts, error_hosts, missing_reqs)`` where
        ``missing_reqs`` are the requirements whose name matches an error
        host.

        Raises:
            ProvisioningError: ``reqs`` is empty.
            ValidationError: resources did not become available in time.
        """
        logger.info(f"{self.dsp_name}: Validating hosts definitions")
        if not reqs:
            raise ProvisioningError(
                f"{self.dsp_name}: Can not continue with empty requirement for provider"
            )

        await self.validate_hosts(reqs)
        logger.info(f"{self.dsp_name}: Host definitions valid")

        logger.info(f"{self.dsp_name}: Checking available resources")

        res_check_start = datetime.now()
        while not await self.can_provision(reqs):
            await asyncio.sleep(res_busy_sleep * 60)
            waited = datetime.now() - res_check_start
            if waited < timedelta(minutes=res_check_timeout):
                continue
            raise ValidationError(
                f"{self.dsp_name}: Not enough resources to provision")
        logger.info(f"{self.dsp_name}: Resource availability: OK")
        started = datetime.now()

        logger.info(
            f"{self.dsp_name}: Issuing provisioning of {len(reqs)} host(s)")
        create_servers = [self.create_server(req) for req in reqs]

        # exceptions are returned in the result list and sorted out below
        create_resps = await asyncio.gather(*create_servers,
                                            return_exceptions=True)

        logger.info(f"{self.dsp_name}: Provisioning issued")

        logger.info(f"{self.dsp_name}: Waiting for all hosts to be active")

        error_hosts = []
        wait_servers = []
        for response in create_resps:
            if isinstance(response, ProvisioningError):
                # no Host object exists for a failed creation, build one from
                # the exception arguments so it can be reported later
                failed = Host(
                    provider=self,
                    host_id=None,
                    name=response.args[SPECS]["name"],
                    ip_addrs=[],
                    status=STATUS_OTHER,
                    rawdata=response.args,
                    error_obj=response.args,
                )
                error_hosts.append(failed)
                continue
            # the response might be okay, wait for the provisioning result
            wait_servers.append(self.wait_till_provisioned(response))

        server_results = await asyncio.gather(*wait_servers)
        provisioned = datetime.now()

        logger.info(
            f"{self.dsp_name}: "
            "All hosts reached provisioning final state (ACTIVE or ERROR)")
        logger.info(
            f"{self.dsp_name}: Provisioning duration: {provisioned - started}")

        hosts = [self.to_host(srv) for srv in server_results if srv]

        error_hosts.extend(await self.parse_error_hosts(hosts))
        active_hosts = [h for h in hosts if h not in error_hosts]
        success_hosts = []

        if not global_context["config"].get("post_provisioning_ssh_check",
                                            True):
            # the ssh connection to VMs is not checked
            success_hosts = active_hosts
        else:
            # check ssh connectivity to succeeded hosts
            wait_ssh = [self._wait_for_ssh(host) for host in active_hosts]
            ssh_results = await asyncio.gather(*wait_ssh)
            # every result from self._wait_for_ssh() is indexed by RET_CODE
            # (outcome of the check) and HOST_OBJ (the checked host object)
            for res in ssh_results:
                if res[RET_CODE]:
                    success_hosts.append(res[HOST_OBJ])
                    continue
                host = res[HOST_OBJ]
                host.error = ("Could not establish ssh connection to host "
                              f"{host.host_id} with IP {host.ip_addr}")
                error_hosts.append(host)

        failed_names = [host.name for host in error_hosts]
        missing_reqs = [req for req in reqs if req["name"] in failed_names]

        return (success_hosts, error_hosts, missing_reqs)
Exemple #10
0
    async def wait_till_provisioned(self, resource):
        """
        Wait till server is provisioned.

        Provisioned means that server is in ACTIVE or ERROR state.

        State is checked by polling. Polling can be controlled via `poll_sleep`
        and `poll_sleep_initial` options. This is useful when provisioning a
        lot of machines as it is better to increase the initial poll to not
        ask too often, as provisioning resources takes some time.

        A ServerError while polling is tolerated up to SERVER_ERROR_RETRY
        times; the failed poll is skipped and repeated after the regular poll
        delay.

        Waits at most ``self.timeout`` minutes. When the timeout passes a
        warning is logged and the last known server state is returned.

        Return information about provisioned server.

        Raises:
            ServerNotFoundError: the server does not exist.
            ProvisioningError: polling failed with ServerError more than
                SERVER_ERROR_RETRY times, or no server state at all could be
                obtained before the timeout.
        """
        uuid = resource.get("id")

        poll_sleep_initial = self.poll_sleep_initial + self.poll_init_adj
        # randomize the first delay to 0.5x - 2x of the configured value
        # (presumably to spread out the first requests of many hosts)
        poll_sleep_initial = (poll_sleep_initial / 2 +
                              poll_sleep_initial * random() * 1.5)
        poll_sleep = self.poll_sleep + self.poll_adj
        timeout = self.timeout

        start = datetime.now()
        timeout_time = start + timedelta(minutes=timeout)

        # do not check the state immediately, it will take some time
        logger.debug(f"{uuid}: sleeping for {poll_sleep_initial} seconds")
        await asyncio.sleep(poll_sleep_initial)

        server = None  # last successfully fetched server state
        reached_final = False
        logger.debug(f"Waiting for: {uuid}")
        error_attempts = 0
        while datetime.now() < timeout_time:
            try:
                resp = await self.nova.servers.get(uuid)
            except NotFoundError as nf_err:
                raise ServerNotFoundError(uuid) from nf_err
            except ServerError as err:
                logger.debug(f"{self.dsp_name}: {err}")
                error_attempts += 1
                if error_attempts > SERVER_ERROR_RETRY:
                    raise ProvisioningError(uuid) from err
                # there is no fresh response to evaluate, only wait and retry
            else:
                server = resp["server"]
                if server["status"] in ("ACTIVE", "ERROR"):
                    reached_final = True
                    break

            poll_sleep += 0.5  # increase delays to check the longer it takes
            logger.debug(f"{uuid}: sleeping for {poll_sleep} seconds")
            await asyncio.sleep(poll_sleep)

        prov_duration = (datetime.now() - start).total_seconds()

        # decide by what was observed, not by reading the clock again
        if reached_final:
            logger.info(
                f"{self.dsp_name}: Host {uuid} was provisioned in {prov_duration:.1f}s"
            )
        else:
            logger.warning(f"{self.dsp_name}: Host {uuid} was not provisioned "
                           f"within a timeout of {timeout} mins")

        if server is None:
            # not a single poll succeeded, there is nothing to return
            raise ProvisioningError(uuid)

        return server
Exemple #11
0
    async def create_server(self, req):
        """Issue creation of a server.

        req - dict of server requirements - can contain values defined in
              POST /servers official OpenStack API
              https://docs.openstack.org/api-ref/compute/?expanded=create-server-detail#create-server

        The req object can contain following additional attributes:
        * 'flavor': uuid or name of flavor to use
        * 'network': uuid or name of network to use. Will be added to networks
                     list if present

        Two kinds of failures are retried and share one attempt counter
        limited by SERVER_ERROR_RETRY:
        * ServerError raised by the API - retried after SERVER_ERROR_SLEEP
          seconds
        * a response carrying a fault with code 500 - retried after
          SERVER_RES_SLEEP minutes

        Returns the 'server' part of the creation response.

        Raises:
            ProvisioningError: the retry limit was exceeded. The requirement
                dictionary is passed as the second exception argument.
        """
        name = req.get("name")
        logger.info(f"{self.dsp_name}: Creating server {name}")
        specs = deepcopy(req)  # work with own copy, do not modify the input

        flavor = self._translate_flavor(req)
        specs["flavorRef"] = flavor["id"]
        if specs.get("flavor"):
            del specs["flavor"]

        image = self._translate_image(req)
        specs["imageRef"] = image["id"]
        if specs.get("image"):
            del specs["image"]

        network_specs = self._translate_networks(req, spec=True)
        specs["networks"] = network_specs
        if specs.get("network"):
            del specs["network"]

        failure_msg = f"{self.dsp_name}: Failed to create server {name}"

        error_attempts = 0
        while True:
            try:
                response = await self.nova.servers.create(server=specs)
            except ServerError as exc:
                logger.debug(exc)
                error_attempts += 1
                if error_attempts > SERVER_ERROR_RETRY:
                    # the requirement dictionary is added for later usage
                    raise ProvisioningError(failure_msg, req) from exc
                await asyncio.sleep(SERVER_ERROR_SLEEP)
                continue  # Try again due to ServerError

            fault = response["server"].get("fault", {})
            if fault.get("code") != 500:
                # provisioning seems to pass correctly, return the result
                break

            # In such scenario OpenStack might run out of hosts to provision
            # This is not related to reaching OpenStack quota but to OpenStack
            # itself being fully loaded and without free resources to provide
            logger.info(
                f"{self.dsp_name}: Unable to allocate resources for the required "
                "server (all available resources busy)")
            error_attempts += 1
            # The limit is checked right where the failure is counted, so a
            # later successful response is never thrown away and no extra
            # request is sent once the retry budget is used up.
            if error_attempts > SERVER_ERROR_RETRY:
                raise ProvisioningError(failure_msg, req)

            logger.info(
                f"{self.dsp_name}: Retrying request in {SERVER_RES_SLEEP} minutes"
            )
            # We should wait for OpenStack for reasonable time to try to
            # reprovision. This sleep time should be longer for higher
            # probability of OpenStack having freed some resources for us even
            # when we are not reaching quota
            await asyncio.sleep(SERVER_RES_SLEEP * 60)  # sleep for minutes

        return response.get("server")
Exemple #12
0
    async def provision_hosts(self, hosts):
        """Provision hosts described by a list of host requirements.

        The requirements are validated first and the provider is asked
        whether it can provision them. Then creation of all servers is issued
        concurrently and the method waits until each of them reaches its
        final state. When ``parse_errors`` reports any error, all servers are
        deleted and the provisioning is aborted.

        Returns the list of provisioned hosts (results of ``to_host``).

        Raises:
            ValidationError: ``can_provision`` reported insufficient
                resources.
            ProvisioningError: some server did not start properly; carries
                the parsed errors.
        """
        logger.info("Validating hosts definitions")
        await self.validate_hosts(hosts)
        logger.info("Host definitions valid")

        logger.info("Checking available resources")
        if not await self.can_provision(hosts):
            raise ValidationError("Not enough resources to provision")
        logger.info("Resource availability: OK")

        started = datetime.now()

        logger.info(f"Issuing provisioning of {len(hosts)} hosts")
        create_servers = [self.create_server(req) for req in hosts]
        create_resps = await asyncio.gather(*create_servers)
        logger.info("Provisioning issued")

        logger.info("Waiting for all hosts to be available")
        wait_servers = [
            self.wait_till_provisioned(created) for created in create_resps
        ]
        server_results = await asyncio.gather(*wait_servers)
        provi_duration = datetime.now() - started

        logger.info(
            "All hosts reached provisioning final state (ACTIVE or ERROR)")
        logger.info(f"Provisioning duration: {provi_duration}")

        errors = self.parse_errors(server_results)
        if errors:
            logger.info("Some host did not start properly")
            for err in errors:
                self.print_basic_info(err)
            # a partial result is of no use, clean up everything
            logger.info("Given the error, will delete all hosts")
            await self.delete_hosts(server_results)
            raise ProvisioningError(errors)

        provisioned_hosts = [self.to_host(srv) for srv in server_results]
        for host in provisioned_hosts:
            logger.info(host)
        return provisioned_hosts
Exemple #13
0
    async def _provision_base(self, reqs):  # pylint: disable=too-many-locals
        """Validate the requirements and provision the requested hosts.

        The requirements are validated and resource availability is checked
        once. Then all servers are created concurrently, the method waits for
        their final state and (unless disabled by the
        ``post_provisioning_ssh_check`` config option) verifies ssh
        connectivity to the hosts which did not end up in error.

        Returns a tuple ``(success_hosts, error_hosts, missing_reqs)`` where
        ``missing_reqs`` are the requirements whose name matches an error
        host.

        Raises:
            ProvisioningError: ``reqs`` is empty.
            ValidationError: ``can_provision`` reported insufficient
                resources.
        """
        logger.info(f"{self.dsp_name}: Validating hosts definitions")
        if not reqs:
            raise ProvisioningError(
                f"{self.dsp_name}: Can not continue with empty requirement for provider"
            )

        await self.validate_hosts(reqs)
        logger.info(f"{self.dsp_name}: Host definitions valid")

        logger.info(f"{self.dsp_name}: Checking available resources")

        enough_resources = await self.can_provision(reqs)
        if not enough_resources:
            raise ValidationError(
                f"{self.dsp_name}: Not enough resources to provision")
        logger.info(f"{self.dsp_name}: Resource availability: OK")
        started = datetime.now()

        logger.info(
            f"{self.dsp_name}: Issuing provisioning of {len(reqs)} host(s)")
        create_servers = [self.create_server(req) for req in reqs]
        create_resps = await asyncio.gather(*create_servers)
        logger.info(f"{self.dsp_name}: Provisioning issued")

        logger.info(f"{self.dsp_name}: Waiting for all hosts to be active")
        wait_servers = [
            self.wait_till_provisioned(created) for created in create_resps
        ]
        server_results = await asyncio.gather(*wait_servers)
        provisioned = datetime.now()

        logger.info(
            f"{self.dsp_name}: "
            "All hosts reached provisioning final state (ACTIVE or ERROR)")
        logger.info(
            f"{self.dsp_name}: Provisioning duration: {provisioned - started}")

        hosts = [self.to_host(srv) for srv in server_results]
        error_hosts = await self.parse_error_hosts(hosts)
        active_hosts = [h for h in hosts if h not in error_hosts]
        success_hosts = []

        if not global_context["config"].get("post_provisioning_ssh_check",
                                            True):
            # the ssh connection to VMs is not checked
            success_hosts = active_hosts
        else:
            # check ssh connectivity to succeeded hosts
            wait_ssh = [self._wait_for_ssh(host) for host in active_hosts]
            ssh_results = await asyncio.gather(*wait_ssh)
            # every result from self._wait_for_ssh() is indexed by RET_CODE
            # (outcome of the check) and HOST_OBJ (the checked host object)
            for res in ssh_results:
                if res[RET_CODE]:
                    success_hosts.append(res[HOST_OBJ])
                    continue
                host = res[HOST_OBJ]
                host.error = ("Could not establish ssh connection to host "
                              f"{host.host_id} with IP {host.ip_addr}")
                error_hosts.append(host)

        failed_names = [host.name for host in error_hosts]
        missing_reqs = [req for req in reqs if req["name"] in failed_names]

        return (success_hosts, error_hosts, missing_reqs)