def __get_max_number_of_instances(self, instance_type):
    """
    Return the maximum number of instances the user may request.

    Reads the "MaxSize" configuration parameter. When the scheduler is
    awsbatch, "MaxSize" expresses vcpus, so it is converted to an instance
    count by dividing by the vcpus of the given instance type, rounding up.

    :param instance_type The instance type to use in the awsbatch case
    :return: the max number of instances requestable by the user
    """
    try:
        requested_max = int(self.parameters.get("MaxSize"))
        if self.parameters.get("Scheduler") != "awsbatch":
            return requested_max
        instance_vcpus = get_instance_vcpus(self.region, instance_type)
        # Ceiling division via negated floor division.
        return -(-requested_max // instance_vcpus)
    except ValueError:
        self.__fail("Unable to convert max size parameter to an integer")
def compute_instance_type_validator(param_key, param_value, pcluster_config):
    """Validate compute instance type, calling ec2_instance_type_validator if the scheduler is not awsbatch."""
    cluster_section = pcluster_config.get_section("cluster")
    scheduler = cluster_section.get_param_value("scheduler")

    if scheduler != "awsbatch":
        # Non-Batch schedulers: defer entirely to the generic EC2 validator.
        return ec2_instance_type_validator(param_key, param_value, pcluster_config)

    errors = []
    warnings = []

    # Check every requested type against the instance types Batch supports
    # in this region (the value may be a comma-separated list).
    supported_instances = get_supported_compute_instance_types(scheduler)
    if not supported_instances:
        warnings.append(
            "Unable to get instance types supported by awsbatch. Skipping compute_instance_type validation"
        )
    else:
        for instance in param_value.split(","):
            if instance.strip() not in supported_instances:
                errors.append(
                    "compute_instance_type '{0}' is not supported by awsbatch in region '{1}'".format(
                        instance, pcluster_config.region
                    )
                )

    # A single concrete type — not a list and not "optimal"/a family, i.e. it
    # contains a dot — can additionally be checked against the max_vcpus limit.
    if "," not in param_value and "." in param_value:
        vcpus = get_instance_vcpus(pcluster_config.region, param_value)
        if vcpus <= 0:
            warnings.append(
                "Unable to get the number of vcpus for the compute_instance_type '{0}'. "
                "Skipping instance type against max_vcpus validation".format(param_value)
            )
        elif cluster_section.get_param_value("max_vcpus") < vcpus:
            errors.append(
                "max_vcpus must be greater than or equal to {0}, that is the number of vcpus "
                "available for the {1} that you selected as compute_instance_type".format(vcpus, param_value)
            )

    return errors, warnings
def validate(self, resource_type, resource_value):  # noqa: C901 FIXME
    """
    Validate the given resource. Print an error and exit in case of error.

    Dispatches on resource_type to the matching validation check; most checks
    call AWS APIs and report failures through self.__fail (which prints and
    exits). The meaning of resource_value depends on resource_type: usually a
    single identifier string, but a (mode, throughput) pair for EFSThroughput,
    an (iops, size) pair for RAIDIOPS, and a dict for AWSBatch_Parameters.

    :param resource_type: Resource type
    :param resource_value: Resource value
    """
    # Loop over all supported resource checks
    if resource_type == "EC2KeyPair":
        # Key pair must exist in the target region.
        try:
            ec2 = boto3.client(
                "ec2",
                region_name=self.region,
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key,
            )
            ec2.describe_key_pairs(KeyNames=[resource_value])
        except ClientError as e:
            self.__fail(resource_type, e.response.get("Error").get("Message"))
    if resource_type == "EC2IAMRoleName":
        # The user-supplied role must exist and must allow every action the
        # cluster needs; checked via the IAM policy simulator.
        try:
            iam = boto3.client(
                "iam",
                region_name=self.region,
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key,
            )
            arn = iam.get_role(RoleName=resource_value).get("Role").get("Arn")
            account_id = (
                boto3.client(
                    "sts",
                    region_name=self.region,
                    aws_access_key_id=self.aws_access_key_id,
                    aws_secret_access_key=self.aws_secret_access_key,
                )
                .get_caller_identity()
                .get("Account")
            )
            partition = self.__get_partition()
            # Each entry is (list of action names, resource ARN they must be
            # allowed on); ARNs are scoped to parallelcluster-* where possible.
            iam_policy = [
                (
                    [
                        "ec2:DescribeVolumes",
                        "ec2:AttachVolume",
                        "ec2:DescribeInstanceAttribute",
                        "ec2:DescribeInstanceStatus",
                        "ec2:DescribeInstances",
                    ],
                    "*",
                ),
                (["dynamodb:ListTables"], "*"),
                (
                    [
                        "sqs:SendMessage",
                        "sqs:ReceiveMessage",
                        "sqs:ChangeMessageVisibility",
                        "sqs:DeleteMessage",
                        "sqs:GetQueueUrl",
                    ],
                    "arn:%s:sqs:%s:%s:parallelcluster-*" % (partition, self.region, account_id),
                ),
                (
                    [
                        "autoscaling:DescribeAutoScalingGroups",
                        "autoscaling:TerminateInstanceInAutoScalingGroup",
                        "autoscaling:SetDesiredCapacity",
                        "autoscaling:DescribeTags",
                        # NOTE(review): "autoScaling" casing differs from the
                        # other entries; IAM action matching is documented as
                        # case-insensitive, but confirm this is intended.
                        "autoScaling:UpdateAutoScalingGroup",
                    ],
                    "*",
                ),
                (
                    [
                        "dynamodb:PutItem",
                        "dynamodb:Query",
                        "dynamodb:GetItem",
                        "dynamodb:DeleteItem",
                        "dynamodb:DescribeTable",
                    ],
                    "arn:%s:dynamodb:%s:%s:table/parallelcluster-*" % (partition, self.region, account_id),
                ),
                (
                    ["cloudformation:DescribeStacks"],
                    "arn:%s:cloudformation:%s:%s:stack/parallelcluster-*" % (partition, self.region, account_id),
                ),
                (["s3:GetObject"], "arn:%s:s3:::%s-aws-parallelcluster/*" % (partition, self.region)),
                (["sqs:ListQueues"], "*"),
            ]
            for actions, resource_arn in iam_policy:
                response = iam.simulate_principal_policy(
                    PolicySourceArn=arn, ActionNames=actions, ResourceArns=[resource_arn]
                )
                for decision in response.get("EvaluationResults"):
                    # Any non-"allowed" decision (implicitDeny/explicitDeny)
                    # is fatal: print diagnostics and exit.
                    if decision.get("EvalDecision") != "allowed":
                        print(
                            "IAM role error on user provided role %s: action %s is %s"
                            % (resource_value, decision.get("EvalActionName"), decision.get("EvalDecision"))
                        )
                        print("See https://aws-parallelcluster.readthedocs.io/en/latest/iam.html")
                        sys.exit(1)
        except ClientError as e:
            self.__fail(resource_type, e.response.get("Error").get("Message"))
    # VPC Id
    elif resource_type == "VPC":
        try:
            ec2 = boto3.client(
                "ec2",
                region_name=self.region,
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key,
            )
            ec2.describe_vpcs(VpcIds=[resource_value])
        except ClientError as e:
            self.__fail(resource_type, e.response.get("Error").get("Message"))
        # Check for DNS support in the VPC
        if (
            not ec2.describe_vpc_attribute(VpcId=resource_value, Attribute="enableDnsSupport")
            .get("EnableDnsSupport")
            .get("Value")
        ):
            self.__fail(resource_type, "DNS Support is not enabled in %s" % resource_value)
        if (
            not ec2.describe_vpc_attribute(VpcId=resource_value, Attribute="enableDnsHostnames")
            .get("EnableDnsHostnames")
            .get("Value")
        ):
            self.__fail(resource_type, "DNS Hostnames not enabled in %s" % resource_value)
    # VPC Subnet Id
    elif resource_type == "VPCSubnet":
        try:
            ec2 = boto3.client(
                "ec2",
                region_name=self.region,
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key,
            )
            ec2.describe_subnets(SubnetIds=[resource_value])
        except ClientError as e:
            self.__fail(resource_type, e.response.get("Error").get("Message"))
    # VPC Security Group
    elif resource_type == "VPCSecurityGroup":
        try:
            ec2 = boto3.client(
                "ec2",
                region_name=self.region,
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key,
            )
            ec2.describe_security_groups(GroupIds=[resource_value])
        except ClientError as e:
            self.__fail(resource_type, e.response.get("Error").get("Message"))
    # EC2 AMI Id
    elif resource_type == "EC2Ami":
        try:
            ec2 = boto3.client(
                "ec2",
                region_name=self.region,
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key,
            )
            ec2.describe_images(ImageIds=[resource_value])
        except ClientError as e:
            self.__fail(resource_type, e.response.get("Error").get("Message"))
    # EC2 Placement Group
    elif resource_type == "EC2PlacementGroup":
        if resource_value == "DYNAMIC":
            # "DYNAMIC" means the group is created later, nothing to check now.
            pass
        else:
            try:
                ec2 = boto3.client(
                    "ec2",
                    region_name=self.region,
                    aws_access_key_id=self.aws_access_key_id,
                    aws_secret_access_key=self.aws_secret_access_key,
                )
                ec2.describe_placement_groups(GroupNames=[resource_value])
            except ClientError as e:
                self.__fail(resource_type, e.response.get("Error").get("Message"))
    # URL
    elif resource_type == "URL":
        scheme = urlparse(resource_value).scheme
        if scheme == "s3":
            # s3:// URLs are not reachable via urllib; skipped here.
            pass
        else:
            try:
                urllib.request.urlopen(resource_value)
            except urllib.error.HTTPError as e:
                self.__fail(resource_type, "%s %s %s" % (resource_value, e.code, e.reason))
            except urllib.error.URLError as e:
                self.__fail(resource_type, "%s %s" % (resource_value, e.reason))
    # EC2 EBS Snapshot Id
    elif resource_type == "EC2Snapshot":
        try:
            ec2 = boto3.client(
                "ec2",
                region_name=self.region,
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key,
            )
            test = ec2.describe_snapshots(SnapshotIds=[resource_value]).get("Snapshots")[0]
            if test.get("State") != "completed":
                self.__fail(
                    resource_type,
                    "Snapshot %s is in state '%s' not 'completed'" % (resource_value, test.get("State")),
                )
        except ClientError as e:
            self.__fail(resource_type, e.response.get("Error").get("Message"))
    # EC2 EBS Volume Id
    elif resource_type == "EC2Volume":
        try:
            ec2 = boto3.client(
                "ec2",
                region_name=self.region,
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key,
            )
            test = ec2.describe_volumes(VolumeIds=[resource_value]).get("Volumes")[0]
            if test.get("State") != "available":
                self.__fail(
                    resource_type,
                    "Volume %s is in state '%s' not 'available'" % (resource_value, test.get("State")),
                )
        except ClientError as e:
            # Malformed volume id gets a friendlier message before the raw one.
            if (
                e.response.get("Error").get("Message").endswith("parameter volumes is invalid. Expected: 'vol-...'.")
            ):
                self.__fail(resource_type, "Volume %s does not exist." % resource_value)
            self.__fail(resource_type, e.response.get("Error").get("Message"))
    # EFS file system Id
    elif resource_type == "EFSFSId":
        try:
            ec2 = boto3.client(
                "ec2",
                region_name=self.region,
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key,
            )
            efs = boto3.client(
                "efs",
                region_name=self.region,
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key,
            )
            self.__check_efs_fs_id(ec2, efs, resource_value)
        except ClientError as e:
            self.__fail(resource_type, e.response.get("Error").get("Message"))
    # EFS Performance Mode check
    elif resource_type == "EFSPerfMode":
        if resource_value != "generalPurpose" and resource_value != "maxIO":
            self.__fail(
                resource_type,
                "Invalid value for 'performance_mode'! "
                "Acceptable values for 'performance_mode' are generalPurpose and maxIO",
            )
    # EFS Throughput check
    elif resource_type == "EFSThroughput":
        # resource_value is (throughput_mode, provisioned_throughput).
        throughput_mode = resource_value[0]
        provisioned_throughput = resource_value[1]
        if throughput_mode and (throughput_mode != "provisioned" and throughput_mode != "bursting"):
            self.__fail(
                resource_type,
                "Invalid value for 'throughput_mode'! "
                "Acceptable values for 'throughput_mode' are bursting and provisioned",
            )
        # provisioned_throughput and throughput_mode == "provisioned" must be
        # specified together.
        if provisioned_throughput is not None:
            if throughput_mode != "provisioned":
                self.__fail(
                    resource_type,
                    "When specifying 'provisioned_throughput', the 'throughput_mode' must be set to provisioned",
                )
        else:
            if throughput_mode == "provisioned":
                self.__fail(
                    resource_type,
                    "When specifying 'throughput_mode' to provisioned, "
                    "the 'provisioned_throughput' option must be specified",
                )
    # RAID EBS IOPS
    elif resource_type == "RAIDIOPS":
        # resource_value is (iops, volume size); EBS caps IOPS at 50x size.
        raid_iops = float(resource_value[0])
        raid_vol_size = float(resource_value[1])
        if raid_iops > raid_vol_size * 50:
            self.__fail(
                resource_type,
                "IOPS to volume size ratio of %s is too high; maximum is 50." % (raid_iops / raid_vol_size),
            )
    # RAID Array Type
    elif resource_type == "RAIDType":
        if resource_value != "0" and resource_value != "1":
            self.__fail(resource_type, "Invalid raid_type, only RAID 0 and RAID 1 are currently supported.")
    # Number of RAID Volumes Requested
    elif resource_type == "RAIDNumVol":
        if int(resource_value) > 5 or int(resource_value) < 2:
            self.__fail(
                resource_type,
                "Invalid num_of_raid_volumes. "
                "Needs min of 2 volumes for RAID and max of 5 EBS volumes are currently supported.",
            )
    # FSX FS Id check
    elif resource_type in ["fsx_fs_id", "FSx_storage_capacity", "FSx_imported_file_chunk_size", "FSx_export_path"]:
        self.__validate_fsx_parameters(resource_type, resource_value)
    elif resource_type == "EFA":
        self.__validate_efa_parameters(resource_type, resource_value)
    # Batch Parameters
    elif resource_type == "AWSBatch_Parameters":
        # resource_value is a dict of Batch-related config values.
        # Check region
        if self.region in [
            "ap-northeast-3",
            "eu-north-1",
            "cn-north-1",
            "cn-northwest-1",
            "us-gov-east-1",
            "us-gov-west-1",
        ]:
            self.__fail(resource_type, "Region %s is not supported with batch scheduler" % self.region)
        # Check spot bid percentage
        if "SpotPrice" in resource_value:
            spot_price = int(resource_value["SpotPrice"])
            if spot_price > 100 or spot_price < 0:
                self.__fail(resource_type, "Spot bid percentage needs to be between 0 and 100")
        # min <= desired <= max (expressed in vcpus for Batch).
        min_size = int(resource_value["MinSize"])
        desired_size = int(resource_value["DesiredSize"])
        max_size = int(resource_value["MaxSize"])
        if desired_size < min_size:
            self.__fail(resource_type, "Desired vcpus must be greater than or equal to min vcpus")
        if desired_size > max_size:
            self.__fail(resource_type, "Desired vcpus must be fewer than or equal to max vcpus")
        if max_size < min_size:
            self.__fail(resource_type, "Max vcpus must be greater than or equal to min vcpus")
        # Check compute instance types
        if "ComputeInstanceType" in resource_value:
            compute_instance_type = resource_value["ComputeInstanceType"]
            try:
                supported_instances = get_supported_features(self.region, "batch").get("instances")
                if supported_instances:
                    for instance in compute_instance_type.split(","):
                        if not instance.strip() in supported_instances:
                            self.__fail(
                                resource_type, "Instance type %s not supported by batch in this region" % instance
                            )
                else:
                    self.__warn(
                        "Unable to get instance types supported by Batch. Skipping instance type validation"
                    )
                if "," not in compute_instance_type and "." in compute_instance_type:
                    # if the type is not a list, and contains dot (nor optimal, nor a family)
                    # validate instance type against max_vcpus limit
                    vcpus = get_instance_vcpus(self.region, compute_instance_type)
                    if vcpus <= 0:
                        self.__warn(
                            "Unable to get the number of vcpus for the {0} instance type. "
                            "Skipping instance type against max_vcpus validation".format(compute_instance_type)
                        )
                    else:
                        if max_size < vcpus:
                            self.__fail(
                                resource_type,
                                "Max vcpus must be greater than or equal to {0}, that is the number of vcpus "
                                "available for the {1} that you selected as compute instance type".format(
                                    vcpus, compute_instance_type
                                ),
                            )
            except ClientError as e:
                self.__fail(resource_type, e.response.get("Error").get("Message"))
        # Check custom batch url
        if "CustomAWSBatchTemplateURL" in resource_value:
            self.validate("URL", resource_value["CustomAWSBatchTemplateURL"])
def __check_account_capacity(self):  # noqa: C901
    """
    Try to launch the requested number of instances to verify Account limits.

    Performs two DryRun run_instances calls (one master instance, max_size
    compute instances) so EC2 reports capacity/limit problems without
    actually launching anything. Skipped for awsbatch clusters, spot
    clusters, or when the cluster/vpc sections are missing.
    """
    cluster_section = self.get_section("cluster")
    vpc_section = self.get_section("vpc")

    # Capacity can only be meaningfully dry-run-checked for on-demand,
    # traditional-scheduler clusters with a vpc section configured.
    if (
        not cluster_section
        or cluster_section.get_param_value("scheduler") == "awsbatch"
        or cluster_section.get_param_value("cluster_type") == "spot"
        or not vpc_section
    ):
        return

    master_instance_type = cluster_section.get_param_value("master_instance_type")
    compute_instance_type = cluster_section.get_param_value("compute_instance_type")

    # Fix: the previous awsbatch branch deriving max_size from max_vcpus was
    # unreachable dead code — awsbatch clusters return early above — so
    # max_queue_size is the only relevant limit here.
    max_size = cluster_section.get_param_value("max_queue_size")
    if max_size < 0:
        warn("Unable to check AWS account capacity. Skipping limits validation")
        return

    # Check for insufficient Account capacity
    compute_subnet = vpc_section.get_param_value("compute_subnet_id")
    master_subnet = vpc_section.get_param_value("master_subnet_id")
    if not compute_subnet:
        # Compute instances fall back to the master subnet when no dedicated
        # compute subnet is configured.
        compute_subnet = master_subnet

    # Initialize CpuOptions: with hyperthreading disabled, request one thread
    # per core and half the vcpus as cores.
    disable_hyperthreading = cluster_section.get_param_value("disable_hyperthreading")
    master_vcpus = get_instance_vcpus(self.region, master_instance_type)
    compute_vcpus = get_instance_vcpus(self.region, compute_instance_type)
    master_cpu_options = {"CoreCount": master_vcpus // 2, "ThreadsPerCore": 1} if disable_hyperthreading else {}
    compute_cpu_options = {"CoreCount": compute_vcpus // 2, "ThreadsPerCore": 1} if disable_hyperthreading else {}

    # Initialize Placement Group Logic: a named group applies to compute
    # instances always, and to the master only with placement == "cluster".
    placement_group = cluster_section.get_param_value("placement_group")
    placement = cluster_section.get_param_value("placement")
    master_placement_group = (
        {"GroupName": placement_group}
        if placement_group not in [None, "NONE", "DYNAMIC"] and placement == "cluster"
        else {}
    )
    compute_placement_group = (
        {"GroupName": placement_group} if placement_group not in [None, "NONE", "DYNAMIC"] else {}
    )

    # Test Master Instance Configuration
    self.__ec2_run_instance(
        max_size,
        InstanceType=master_instance_type,
        MinCount=1,
        MaxCount=1,
        ImageId=get_latest_alinux_ami_id(),
        SubnetId=master_subnet,
        CpuOptions=master_cpu_options,
        Placement=master_placement_group,
        DryRun=True,
    )
    # Test Compute Instances Configuration
    self.__ec2_run_instance(
        max_size,
        InstanceType=compute_instance_type,
        MinCount=max_size,
        MaxCount=max_size,
        ImageId=get_latest_alinux_ami_id(),
        SubnetId=compute_subnet,
        CpuOptions=compute_cpu_options,
        Placement=compute_placement_group,
        DryRun=True,
    )