Beispiel #1
0
 async def _delete_security_groups(self):
     """Delete the security group named after this cluster, retrying on failure.

     Calls ``delete_security_group`` on the ``ec2`` client with
     ``GroupName=self.cluster_name`` until the call succeeds or the
     30 second ``Timeout`` ends the loop. A failed attempt is followed by
     a 2 second pause before the next try.

     NOTE(review): ``warn=True`` presumably makes the timeout emit a warning
     instead of raising when it expires -- confirm against ``Timeout``.
     """
     timeout = Timeout(
         30, "Unable to delete AWS security group " + self.cluster_name, warn=True
     )
     while timeout.run():
         try:
             await self._clients["ec2"].delete_security_group(
                 GroupName=self.cluster_name, DryRun=False
             )
         except Exception:
             # Best-effort cleanup: the group may still be in use (presumably
             # by resources that are shutting down), so wait and try again.
             await asyncio.sleep(2)
         else:
             # Only stop once the deletion actually succeeded; leaving the
             # loop after a failure would defeat the retry/timeout logic.
             break
Beispiel #2
0
 async def _set_address_from_logs(self):
     """Scan the task logs for the advertised address and store it.

     Re-reads ``self.logs()`` until a line containing ``"worker at:"`` or
     ``"Scheduler at:"`` appears, or the 30 second ``Timeout`` stops the
     loop. The text after the marker becomes ``self.address``; when
     ``self._use_public_ip`` is set, ``self.external_address`` is the same
     text with ``self.private_ip`` replaced by ``self.public_ip``.

     Raises:
         RuntimeError: if a full pass over the logs finds no address and
             ``self._task_is_running()`` reports the task is not running.
     """
     timeout = Timeout(
         30, "Failed to find %s ip address after 30 seconds." % self.task_type
     )
     markers = ("worker at:", "Scheduler at:")
     while timeout.run():
         async for log_line in self.logs():
             for marker in markers:
                 if marker not in log_line:
                     continue
                 found = log_line.split(marker)[1].strip()
                 if self._use_public_ip:
                     self.external_address = found.replace(
                         self.private_ip, self.public_ip
                     )
                 logger.debug("%s", log_line)
                 self.address = found
                 return
         # Reached only when the whole log stream was read without a match:
         # make sure the task is still alive before polling the logs again.
         if not await self._task_is_running():
             raise RuntimeError("%s exited unexpectedly!" % type(self).__name__)
Beispiel #3
0
    async def start(self):
        """Launch the ECS task, wait for it to run and record its addresses.

        Steps:

        1. Call ``run_task`` on the ``ecs`` client, retrying once per second
           while the 60 second ``Timeout`` allows. Every failure is handed
           to ``timeout.set_exception``.
        2. Poll ``self._update_task()`` once per second while the task's
           ``lastStatus`` is ``PENDING`` or ``PROVISIONING``.
        3. Look up the task's elastic network interface through the ``ec2``
           client and store ``self.private_ip`` (and ``self.public_ip`` when
           ``self._use_public_ip`` is set).
        4. Read the service address from the logs and set ``self.status`` to
           ``"running"``.

        Raises:
            RuntimeError: if the task is not running once it has left the
                pending states.
            ValueError: if the task does not have exactly one
                ``ElasticNetworkInterface`` attachment, or that attachment
                does not have exactly one ``networkInterfaceId`` detail.

        NOTE(review): what happens when the 60 second timeout expires is
        decided by ``Timeout.run`` -- presumably it raises the last recorded
        exception; confirm against ``Timeout``.
        """
        timeout = Timeout(60, "Unable to start %s after 60 seconds" % self.task_type)
        while timeout.run():
            try:
                # Tags are only supported if you opt into long arn format so
                # we need to check for that
                if await self._is_long_arn_format_enabled():
                    tag_kwargs = {"tags": dict_to_aws(self.tags)}
                else:
                    tag_kwargs = {}

                container_override = {
                    "name": "dask-{}".format(self.task_type),
                    "environment": dict_to_aws(self.environment, key_string="name"),
                    **self._overrides,
                }
                vpc_configuration = {
                    "subnets": self._vpc_subnets,
                    "securityGroups": self._security_groups,
                    "assignPublicIp": "ENABLED" if self._use_public_ip else "DISABLED",
                }

                response = await self._clients["ecs"].run_task(
                    cluster=self.cluster_arn,
                    taskDefinition=self.task_definition_arn,
                    overrides={"containerOverrides": [container_override]},
                    count=1,
                    launchType="FARGATE" if self.fargate else "EC2",
                    networkConfiguration={"awsvpcConfiguration": vpc_configuration},
                    **tag_kwargs
                )

                if not response.get("tasks"):
                    # Surface the entire response so the failure reason is visible
                    raise RuntimeError(response)

                [self.task] = response["tasks"]
                break
            except Exception as err:
                # Remember the most recent failure, then retry shortly
                timeout.set_exception(err)
                await asyncio.sleep(1)

        self.task_arn = self.task["taskArn"]

        # Wait for the task to leave its startup states
        while self.task["lastStatus"] in ("PENDING", "PROVISIONING"):
            await asyncio.sleep(1)
            await self._update_task()
        if not await self._task_is_running():
            raise RuntimeError("%s failed to start" % type(self).__name__)

        # Exactly one ENI attachment is expected; the single-target unpacking
        # raises ValueError otherwise.
        [eni_attachment] = [
            candidate
            for candidate in self.task["attachments"]
            if candidate["type"] == "ElasticNetworkInterface"
        ]
        [network_interface_id] = [
            entry["value"]
            for entry in eni_attachment["details"]
            if entry["name"] == "networkInterfaceId"
        ]

        described = await self._clients["ec2"].describe_network_interfaces(
            NetworkInterfaceIds=[network_interface_id]
        )
        [interface] = described["NetworkInterfaces"]
        if self._use_public_ip:
            self.public_ip = interface["Association"]["PublicIp"]
        self.private_ip = interface["PrivateIpAddresses"][0]["PrivateIpAddress"]

        await self._set_address_from_logs()
        self.status = "running"
Beispiel #4
0
    async def create_vm(self):
        """Launch one EC2 instance for this process and return its public IP.

        Any of ``self.vpc``, ``self.subnet_id``, ``self.security_groups`` and
        ``self.ami`` that are unset are filled in first (default VPC, first
        subnet of that VPC, a security group for that VPC, and the latest
        Ubuntu 20.04 image owned by Canonical). The instance is then started
        with ``run_instances``, tagged with ``Name`` and ``Dask Cluster``, and
        polled with ``describe_instances`` until it reports a
        ``PublicIpAddress`` or the 300 second ``Timeout`` intervenes.

        When ``self.availability_zone`` is a list, one entry is picked at
        random and stored back on ``self.availability_zone``.

        Returns:
            The value of ``self.instance["PublicIpAddress"]``.

        See:
            https://botocore.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#EC2.Client.run_instances
        """
        # TODO Enable Spot support
        async with self.cluster.boto_session.create_client(
            "ec2", region_name=self.region
        ) as client:
            self.vpc = self.vpc or await get_default_vpc(client)
            self.subnet_id = (
                self.subnet_id or (await get_vpc_subnets(client, self.vpc))[0]
            )
            self.security_groups = self.security_groups or [
                await get_security_group(client, self.vpc)
            ]
            self.ami = self.ami or await get_latest_ami_id(
                client,
                "ubuntu/images/hvm-ssd/ubuntu-focal-20.04-amd64-server-*",
                "099720109477",  # Canonical
            )

            vm_kwargs = {
                "BlockDeviceMappings": [
                    {
                        "DeviceName": "/dev/sda1",
                        "VirtualName": "sda1",
                        "Ebs": {
                            "DeleteOnTermination": True,
                            "VolumeSize": self.filesystem_size,
                            "VolumeType": "gp2",
                            "Encrypted": False,
                        },
                    }
                ],
                "ImageId": self.ami,
                "InstanceType": self.instance_type,
                "MaxCount": 1,
                "MinCount": 1,
                "Monitoring": {"Enabled": False},
                "UserData": self.cluster.render_process_cloud_init(self),
                "InstanceInitiatedShutdownBehavior": "terminate",
                "NetworkInterfaces": [
                    {
                        "AssociatePublicIpAddress": True,
                        "DeleteOnTermination": True,
                        "Description": "public",
                        "DeviceIndex": 0,
                        "Groups": self.security_groups,
                        "SubnetId": self.subnet_id,
                    }
                ],
            }

            # Optional settings are only sent when they were configured
            if self.key_name:
                vm_kwargs["KeyName"] = self.key_name

            if self.iam_instance_profile:
                vm_kwargs["IamInstanceProfile"] = self.iam_instance_profile

            if self.availability_zone:
                if isinstance(self.availability_zone, list):
                    self.availability_zone = random.choice(self.availability_zone)
                vm_kwargs["Placement"] = {"AvailabilityZone": self.availability_zone}

            response = await client.run_instances(**vm_kwargs)
            [self.instance] = response["Instances"]
            await client.create_tags(
                Resources=[self.instance["InstanceId"]],
                Tags=[
                    {"Key": "Name", "Value": self.name},
                    {"Key": "Dask Cluster", "Value": self.cluster.uuid},
                ],
            )
            self.cluster._log(
                f"Created instance {self.instance['InstanceId']} as {self.name}"
            )

            timeout = Timeout(
                300,
                f"Failed Public IP for instance {self.instance['InstanceId']}",
            )
            # Seeded once, before the loop, so that the doubling at the end of
            # each iteration accumulates instead of being reset every pass.
            backoff = 0.1
            while (
                "PublicIpAddress" not in self.instance
                or self.instance["PublicIpAddress"] is None
            ) and timeout.run():
                # Exponential backoff capped at 10 seconds; the fractional
                # part of the uncapped value is added on top as a small offset.
                await asyncio.sleep(min(backoff, 10) + backoff % 1)
                try:
                    response = await client.describe_instances(
                        InstanceIds=[self.instance["InstanceId"]], DryRun=False
                    )
                    [reservation] = response["Reservations"]
                    [self.instance] = reservation["Instances"]
                except botocore.exceptions.ClientError as e:
                    # Keep the latest API error on the timeout and keep polling
                    timeout.set_exception(e)
                backoff = backoff * 2
            return self.instance["PublicIpAddress"]