Example #1
    def __get_max_number_of_instances(self, instance_type):
        """
        Get the maximum number of requestable instances according to the scheduler type and other configuration params.

        :param instance_type: The instance type to use in the awsbatch case
        :return: the max number of instances requestable by the user
        """
        try:
            max_size = int(self.parameters.get("MaxSize"))
            if self.parameters.get("Scheduler") == "awsbatch":
                vcpus = get_instance_vcpus(self.region, instance_type)
                # awsbatch MaxSize is expressed in vcpus; ceiling-divide to get whole instances
                max_size = -(-max_size // vcpus)
        except ValueError:
            self.__fail("Unable to convert max size parameter to an integer")
        return max_size
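
The expression -(-max_size // vcpus) is integer ceiling division: for awsbatch, MaxSize counts vcpus, so it is rounded up to whole instances. A minimal standalone illustration of the idiom (values are made up):

import math

max_vcpus = 10
vcpus_per_instance = 4

# -(-a // b) is ceiling division without the float round-trip of math.ceil(a / b)
instances = -(-max_vcpus // vcpus_per_instance)
assert instances == math.ceil(max_vcpus / vcpus_per_instance) == 3
print(instances)  # 3
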
Example #2
def compute_instance_type_validator(param_key, param_value, pcluster_config):
    """Validate compute instance type, calling ec2_instance_type_validator if the scheduler is not awsbatch."""
    errors = []
    warnings = []

    cluster_config = pcluster_config.get_section("cluster")
    scheduler = cluster_config.get_param_value("scheduler")
    if scheduler == "awsbatch":
        supported_instances = get_supported_compute_instance_types(scheduler)
        if supported_instances:
            for instance in param_value.split(","):
                if instance.strip() not in supported_instances:
                    errors.append(
                        "compute_instance_type '{0}' is not supported by awsbatch in region '{1}'".format(
                            instance, pcluster_config.region
                        )
                    )
        else:
            warnings.append(
                "Unable to get instance types supported by awsbatch. Skipping compute_instance_type validation"
            )

        if "," not in param_value and "." in param_value:
            # if the value is a single instance type (not a comma-separated list, not "optimal",
            # and not an instance family), validate it against the max_vcpus limit
            vcpus = get_instance_vcpus(pcluster_config.region, param_value)
            if vcpus <= 0:
                warnings.append(
                    "Unable to get the number of vcpus for the compute_instance_type '{0}'. "
                    "Skipping instance type against max_vcpus validation".format(param_value)
                )
            else:
                if cluster_config.get_param_value("max_vcpus") < vcpus:
                    errors.append(
                        "max_vcpus must be greater than or equal to {0}, that is the number of vcpus "
                        "available for the {1} that you selected as compute_instance_type".format(vcpus, param_value)
                    )
    else:
        errors, warnings = ec2_instance_type_validator(param_key, param_value, pcluster_config)

    return errors, warnings
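
In the awsbatch branch above, the essential check is comparing the cluster's max_vcpus against the vcpus of the selected instance type. The sketch below is a self-contained version of that logic; the dictionaries are hypothetical stand-ins for the real get_supported_compute_instance_types and get_instance_vcpus lookups:

# Illustrative stand-ins for the real AWS lookups used by the validator.
SUPPORTED_BATCH_INSTANCES = {"optimal", "c5.xlarge", "m5.2xlarge"}
INSTANCE_VCPUS = {"c5.xlarge": 4, "m5.2xlarge": 8}


def validate_batch_compute_instance_type(param_value, max_vcpus):
    errors, warnings = [], []
    for instance in param_value.split(","):
        if instance.strip() not in SUPPORTED_BATCH_INSTANCES:
            errors.append("compute_instance_type '{0}' is not supported by awsbatch".format(instance))
    if "," not in param_value and "." in param_value:
        # single instance type (not a list, not "optimal", not a family): check it against max_vcpus
        vcpus = INSTANCE_VCPUS.get(param_value, 0)
        if vcpus <= 0:
            warnings.append("Unable to get the number of vcpus for '{0}'".format(param_value))
        elif max_vcpus < vcpus:
            errors.append("max_vcpus must be greater than or equal to {0}".format(vcpus))
    return errors, warnings


print(validate_batch_compute_instance_type("m5.2xlarge", max_vcpus=4))
# -> (['max_vcpus must be greater than or equal to 8'], [])
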
Example #3
    def validate(self, resource_type, resource_value):  # noqa: C901 FIXME
        """
        Validate the given resource. Print an error message and exit if validation fails.

        :param resource_type: Resource type
        :param resource_value: Resource value
        """
        # Dispatch on the resource type and run the corresponding check
        if resource_type == "EC2KeyPair":
            try:
                ec2 = boto3.client(
                    "ec2",
                    region_name=self.region,
                    aws_access_key_id=self.aws_access_key_id,
                    aws_secret_access_key=self.aws_secret_access_key,
                )
                ec2.describe_key_pairs(KeyNames=[resource_value])
            except ClientError as e:
                self.__fail(resource_type,
                            e.response.get("Error").get("Message"))
        if resource_type == "EC2IAMRoleName":
            try:
                iam = boto3.client(
                    "iam",
                    region_name=self.region,
                    aws_access_key_id=self.aws_access_key_id,
                    aws_secret_access_key=self.aws_secret_access_key,
                )

                arn = iam.get_role(
                    RoleName=resource_value).get("Role").get("Arn")
                account_id = (boto3.client(
                    "sts",
                    region_name=self.region,
                    aws_access_key_id=self.aws_access_key_id,
                    aws_secret_access_key=self.aws_secret_access_key,
                ).get_caller_identity().get("Account"))

                partition = self.__get_partition()

                iam_policy = [
                    (
                        [
                            "ec2:DescribeVolumes",
                            "ec2:AttachVolume",
                            "ec2:DescribeInstanceAttribute",
                            "ec2:DescribeInstanceStatus",
                            "ec2:DescribeInstances",
                        ],
                        "*",
                    ),
                    (["dynamodb:ListTables"], "*"),
                    (
                        [
                            "sqs:SendMessage",
                            "sqs:ReceiveMessage",
                            "sqs:ChangeMessageVisibility",
                            "sqs:DeleteMessage",
                            "sqs:GetQueueUrl",
                        ],
                        "arn:%s:sqs:%s:%s:parallelcluster-*" %
                        (partition, self.region, account_id),
                    ),
                    (
                        [
                            "autoscaling:DescribeAutoScalingGroups",
                            "autoscaling:TerminateInstanceInAutoScalingGroup",
                            "autoscaling:SetDesiredCapacity",
                            "autoscaling:DescribeTags",
                            "autoScaling:UpdateAutoScalingGroup",
                        ],
                        "*",
                    ),
                    (
                        [
                            "dynamodb:PutItem",
                            "dynamodb:Query",
                            "dynamodb:GetItem",
                            "dynamodb:DeleteItem",
                            "dynamodb:DescribeTable",
                        ],
                        "arn:%s:dynamodb:%s:%s:table/parallelcluster-*" %
                        (partition, self.region, account_id),
                    ),
                    (
                        ["cloudformation:DescribeStacks"],
                        "arn:%s:cloudformation:%s:%s:stack/parallelcluster-*" %
                        (partition, self.region, account_id),
                    ),
                    (["s3:GetObject"], "arn:%s:s3:::%s-aws-parallelcluster/*" %
                     (partition, self.region)),
                    (["sqs:ListQueues"], "*"),
                ]

                for actions, resource_arn in iam_policy:
                    response = iam.simulate_principal_policy(
                        PolicySourceArn=arn,
                        ActionNames=actions,
                        ResourceArns=[resource_arn])
                    for decision in response.get("EvaluationResults"):
                        if decision.get("EvalDecision") != "allowed":
                            print(
                                "IAM role error on user provided role %s: action %s is %s"
                                % (resource_value,
                                   decision.get("EvalActionName"),
                                   decision.get("EvalDecision")))
                            print(
                                "See https://aws-parallelcluster.readthedocs.io/en/latest/iam.html"
                            )
                            sys.exit(1)
            except ClientError as e:
                self.__fail(resource_type,
                            e.response.get("Error").get("Message"))
        # VPC Id
        elif resource_type == "VPC":
            try:
                ec2 = boto3.client(
                    "ec2",
                    region_name=self.region,
                    aws_access_key_id=self.aws_access_key_id,
                    aws_secret_access_key=self.aws_secret_access_key,
                )
                ec2.describe_vpcs(VpcIds=[resource_value])
            except ClientError as e:
                self.__fail(resource_type,
                            e.response.get("Error").get("Message"))
            # Check for DNS support in the VPC
            if (not ec2.describe_vpc_attribute(
                    VpcId=resource_value, Attribute="enableDnsSupport").get(
                        "EnableDnsSupport").get("Value")):
                self.__fail(
                    resource_type,
                    "DNS Support is not enabled in %s" % resource_value)
            if (not ec2.describe_vpc_attribute(
                    VpcId=resource_value, Attribute="enableDnsHostnames").get(
                        "EnableDnsHostnames").get("Value")):
                self.__fail(resource_type,
                            "DNS Hostnames not enabled in %s" % resource_value)
        # VPC Subnet Id
        elif resource_type == "VPCSubnet":
            try:
                ec2 = boto3.client(
                    "ec2",
                    region_name=self.region,
                    aws_access_key_id=self.aws_access_key_id,
                    aws_secret_access_key=self.aws_secret_access_key,
                )
                ec2.describe_subnets(SubnetIds=[resource_value])
            except ClientError as e:
                self.__fail(resource_type,
                            e.response.get("Error").get("Message"))
        # VPC Security Group
        elif resource_type == "VPCSecurityGroup":
            try:
                ec2 = boto3.client(
                    "ec2",
                    region_name=self.region,
                    aws_access_key_id=self.aws_access_key_id,
                    aws_secret_access_key=self.aws_secret_access_key,
                )
                ec2.describe_security_groups(GroupIds=[resource_value])
            except ClientError as e:
                self.__fail(resource_type,
                            e.response.get("Error").get("Message"))
        # EC2 AMI Id
        elif resource_type == "EC2Ami":
            try:
                ec2 = boto3.client(
                    "ec2",
                    region_name=self.region,
                    aws_access_key_id=self.aws_access_key_id,
                    aws_secret_access_key=self.aws_secret_access_key,
                )
                ec2.describe_images(ImageIds=[resource_value])
            except ClientError as e:
                self.__fail(resource_type,
                            e.response.get("Error").get("Message"))
        # EC2 Placement Group
        elif resource_type == "EC2PlacementGroup":
            if resource_value == "DYNAMIC":
                pass
            else:
                try:
                    ec2 = boto3.client(
                        "ec2",
                        region_name=self.region,
                        aws_access_key_id=self.aws_access_key_id,
                        aws_secret_access_key=self.aws_secret_access_key,
                    )
                    ec2.describe_placement_groups(GroupNames=[resource_value])
                except ClientError as e:
                    self.__fail(resource_type,
                                e.response.get("Error").get("Message"))
        # URL
        elif resource_type == "URL":
            scheme = urlparse(resource_value).scheme
            if scheme == "s3":
                pass
            else:
                try:
                    urllib.request.urlopen(resource_value)
                except urllib.error.HTTPError as e:
                    self.__fail(
                        resource_type,
                        "%s %s %s" % (resource_value, e.code, e.reason))
                except urllib.error.URLError as e:
                    self.__fail(resource_type,
                                "%s %s" % (resource_value, e.reason))
        # EC2 EBS Snapshot Id
        elif resource_type == "EC2Snapshot":
            try:
                ec2 = boto3.client(
                    "ec2",
                    region_name=self.region,
                    aws_access_key_id=self.aws_access_key_id,
                    aws_secret_access_key=self.aws_secret_access_key,
                )
                test = ec2.describe_snapshots(
                    SnapshotIds=[resource_value]).get("Snapshots")[0]
                if test.get("State") != "completed":
                    self.__fail(
                        resource_type,
                        "Snapshot %s is in state '%s' not 'completed'" %
                        (resource_value, test.get("State")),
                    )
            except ClientError as e:
                self.__fail(resource_type,
                            e.response.get("Error").get("Message"))
        # EC2 EBS Volume Id
        elif resource_type == "EC2Volume":
            try:
                ec2 = boto3.client(
                    "ec2",
                    region_name=self.region,
                    aws_access_key_id=self.aws_access_key_id,
                    aws_secret_access_key=self.aws_secret_access_key,
                )
                test = ec2.describe_volumes(
                    VolumeIds=[resource_value]).get("Volumes")[0]
                if test.get("State") != "available":
                    self.__fail(
                        resource_type,
                        "Volume %s is in state '%s' not 'available'" %
                        (resource_value, test.get("State")),
                    )
            except ClientError as e:
                if (e.response.get("Error").get("Message").endswith(
                        "parameter volumes is invalid. Expected: 'vol-...'.")):
                    self.__fail(resource_type,
                                "Volume %s does not exist." % resource_value)

                self.__fail(resource_type,
                            e.response.get("Error").get("Message"))
        # EFS file system Id
        elif resource_type == "EFSFSId":
            try:
                ec2 = boto3.client(
                    "ec2",
                    region_name=self.region,
                    aws_access_key_id=self.aws_access_key_id,
                    aws_secret_access_key=self.aws_secret_access_key,
                )
                efs = boto3.client(
                    "efs",
                    region_name=self.region,
                    aws_access_key_id=self.aws_access_key_id,
                    aws_secret_access_key=self.aws_secret_access_key,
                )
                self.__check_efs_fs_id(ec2, efs, resource_value)
            except ClientError as e:
                self.__fail(resource_type,
                            e.response.get("Error").get("Message"))
        # EFS Performance Mode check
        elif resource_type == "EFSPerfMode":
            if resource_value != "generalPurpose" and resource_value != "maxIO":
                self.__fail(
                    resource_type,
                    "Invalid value for 'performance_mode'! "
                    "Acceptable values for 'performance_mode' are generalPurpose and maxIO",
                )
        # EFS Throughput check
        elif resource_type == "EFSThroughput":
            throughput_mode = resource_value[0]
            provisioned_throughput = resource_value[1]
            if throughput_mode and (throughput_mode != "provisioned"
                                    and throughput_mode != "bursting"):
                self.__fail(
                    resource_type,
                    "Invalid value for 'throughput_mode'! "
                    "Acceptable values for 'throughput_mode' are bursting and provisioned",
                )
            if provisioned_throughput is not None:
                if throughput_mode != "provisioned":
                    self.__fail(
                        resource_type,
                        "When specifying 'provisioned_throughput', the 'throughput_mode' must be set to provisioned",
                    )
            else:
                if throughput_mode == "provisioned":
                    self.__fail(
                        resource_type,
                        "When specifying 'throughput_mode' to provisioned, "
                        "the 'provisioned_throughput' option must be specified",
                    )
        # RAID EBS IOPS
        elif resource_type == "RAIDIOPS":
            raid_iops = float(resource_value[0])
            raid_vol_size = float(resource_value[1])
            if raid_iops > raid_vol_size * 50:
                self.__fail(
                    resource_type,
                    "IOPS to volume size ratio of %s is too high; maximum is 50."
                    % (raid_iops / raid_vol_size),
                )
        # RAID Array Type
        elif resource_type == "RAIDType":
            if resource_value != "0" and resource_value != "1":
                self.__fail(
                    resource_type,
                    "Invalid raid_type, only RAID 0 and RAID 1 are currently supported."
                )
        # Number of RAID Volumes Requested
        elif resource_type == "RAIDNumVol":
            if int(resource_value) > 5 or int(resource_value) < 2:
                self.__fail(
                    resource_type,
                    "Invalid num_of_raid_volumes. "
                    "Needs min of 2 volumes for RAID and max of 5 EBS volumes are currently supported.",
                )
        # FSX FS Id check
        elif resource_type in [
                "fsx_fs_id", "FSx_storage_capacity",
                "FSx_imported_file_chunk_size", "FSx_export_path"
        ]:
            self.__validate_fsx_parameters(resource_type, resource_value)
        elif resource_type == "EFA":
            self.__validate_efa_parameters(resource_type, resource_value)

        # Batch Parameters
        elif resource_type == "AWSBatch_Parameters":
            # Check region
            if self.region in [
                    "ap-northeast-3",
                    "eu-north-1",
                    "cn-north-1",
                    "cn-northwest-1",
                    "us-gov-east-1",
                    "us-gov-west-1",
            ]:
                self.__fail(
                    resource_type,
                    "Region %s is not supported with batch scheduler" %
                    self.region)

            # Check spot bid percentage
            if "SpotPrice" in resource_value:
                spot_price = int(resource_value["SpotPrice"])
                if spot_price > 100 or spot_price < 0:
                    self.__fail(
                        resource_type,
                        "Spot bid percentage needs to be between 0 and 100")

            min_size = int(resource_value["MinSize"])
            desired_size = int(resource_value["DesiredSize"])
            max_size = int(resource_value["MaxSize"])

            if desired_size < min_size:
                self.__fail(
                    resource_type,
                    "Desired vcpus must be greater than or equal to min vcpus")

            if desired_size > max_size:
                self.__fail(
                    resource_type,
                    "Desired vcpus must be fewer than or equal to max vcpus")

            if max_size < min_size:
                self.__fail(
                    resource_type,
                    "Max vcpus must be greater than or equal to min vcpus")

            # Check compute instance types
            if "ComputeInstanceType" in resource_value:
                compute_instance_type = resource_value["ComputeInstanceType"]
                try:
                    supported_instances = get_supported_features(
                        self.region, "batch").get("instances")
                    if supported_instances:
                        for instance in compute_instance_type.split(","):
                            if not instance.strip() in supported_instances:
                                self.__fail(
                                    resource_type,
                                    "Instance type %s not supported by batch in this region"
                                    % instance)
                    else:
                        self.__warn(
                            "Unable to get instance types supported by Batch. Skipping instance type validation"
                        )

                    if "," not in compute_instance_type and "." in compute_instance_type:
                        # if the value is a single instance type (not a comma-separated list,
                        # not "optimal", and not an instance family), validate it against the max_vcpus limit
                        vcpus = get_instance_vcpus(self.region,
                                                   compute_instance_type)
                        if vcpus <= 0:
                            self.__warn(
                                "Unable to get the number of vcpus for the {0} instance type. "
                                "Skipping instance type against max_vcpus validation"
                                .format(compute_instance_type))
                        else:
                            if max_size < vcpus:
                                self.__fail(
                                    resource_type,
                                    "Max vcpus must be greater than or equal to {0}, that is the number of vcpus "
                                    "available for the {1} that you selected as compute instance type"
                                    .format(vcpus, compute_instance_type),
                                )
                except ClientError as e:
                    self.__fail(resource_type,
                                e.response.get("Error").get("Message"))

            # Check custom batch url
            if "CustomAWSBatchTemplateURL" in resource_value:
                self.validate("URL",
                              resource_value["CustomAWSBatchTemplateURL"])
Example #4
    def __check_account_capacity(self):  # noqa: C901
        """Try to launch the requested number of instances to verify Account limits."""
        cluster_section = self.get_section("cluster")
        vpc_section = self.get_section("vpc")

        if (not cluster_section
                or cluster_section.get_param_value("scheduler") == "awsbatch"
                or cluster_section.get_param_value("cluster_type") == "spot"
                or not vpc_section):
            return

        master_instance_type = cluster_section.get_param_value(
            "master_instance_type")
        compute_instance_type = cluster_section.get_param_value(
            "compute_instance_type")
        # get max size
        if cluster_section.get_param_value("scheduler") == "awsbatch":
            max_vcpus = cluster_section.get_param_value("max_vcpus")
            vcpus = get_instance_vcpus(self.region, compute_instance_type)
            max_size = -(-max_vcpus // vcpus)
        else:
            max_size = cluster_section.get_param_value("max_queue_size")
        if max_size < 0:
            warn(
                "Unable to check AWS account capacity. Skipping limits validation"
            )
            return

        # Check for insufficient Account capacity
        compute_subnet = vpc_section.get_param_value("compute_subnet_id")
        master_subnet = vpc_section.get_param_value("master_subnet_id")
        if not compute_subnet:
            compute_subnet = master_subnet

        # Initialize CpuOptions
        disable_hyperthreading = cluster_section.get_param_value(
            "disable_hyperthreading")
        master_vcpus = get_instance_vcpus(self.region, master_instance_type)
        compute_vcpus = get_instance_vcpus(self.region, compute_instance_type)
        master_cpu_options = {
            "CoreCount": master_vcpus // 2,
            "ThreadsPerCore": 1
        } if disable_hyperthreading else {}
        compute_cpu_options = {
            "CoreCount": compute_vcpus // 2,
            "ThreadsPerCore": 1
        } if disable_hyperthreading else {}

        # Initialize Placement Group Logic
        placement_group = cluster_section.get_param_value("placement_group")
        placement = cluster_section.get_param_value("placement")
        master_placement_group = (
            {"GroupName": placement_group}
            if placement_group not in [None, "NONE", "DYNAMIC"] and placement == "cluster"
            else {}
        )
        compute_placement_group = (
            {"GroupName": placement_group} if placement_group not in [None, "NONE", "DYNAMIC"] else {}
        )

        # Test Master Instance Configuration
        self.__ec2_run_instance(
            max_size,
            InstanceType=master_instance_type,
            MinCount=1,
            MaxCount=1,
            ImageId=get_latest_alinux_ami_id(),
            SubnetId=master_subnet,
            CpuOptions=master_cpu_options,
            Placement=master_placement_group,
            DryRun=True,
        )

        # Test Compute Instances Configuration
        self.__ec2_run_instance(
            max_size,
            InstanceType=compute_instance_type,
            MinCount=max_size,
            MaxCount=max_size,
            ImageId=get_latest_alinux_ami_id(),
            SubnetId=compute_subnet,
            CpuOptions=compute_cpu_options,
            Placement=compute_placement_group,
            DryRun=True,
        )
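
Both __ec2_run_instance calls above rely on EC2's DryRun mode: the request is validated end to end, including many account-level limits, without launching anything. The wrapper itself is not part of this excerpt, so the sketch below only assumes the usual DryRun handling, where a ClientError with code "DryRunOperation" means the request would have succeeded:

import boto3
from botocore.exceptions import ClientError


def dry_run_instances(region, **run_kwargs):
    """Validate a run_instances request without launching; return None on success or an error message."""
    ec2 = boto3.client("ec2", region_name=region)
    try:
        ec2.run_instances(DryRun=True, **run_kwargs)
    except ClientError as e:
        if e.response.get("Error").get("Code") == "DryRunOperation":
            # The dry run "failed" only because DryRun=True: parameters and capacity checks passed.
            return None
        return e.response.get("Error").get("Message")
    return None


# Example usage (AMI and subnet IDs are placeholders):
message = dry_run_instances(
    "us-east-1",
    InstanceType="c5.xlarge",
    MinCount=1,
    MaxCount=1,
    ImageId="ami-00000000000000000",
    SubnetId="subnet-00000000",
)
if message:
    print("Unable to launch: %s" % message)
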