def _validate_fsx_lustre(fsx_lustre_dns, fsx_lustre_bucket, fsx_lustre_size):
    """Validate the FSx for Lustre job parameters.

    Returns a ``(error, bucket, size)`` tuple: ``error`` is ``None`` when the
    parameters are valid (``bucket``/``size`` possibly normalized), otherwise
    an error dict ready to be returned to the caller of ``main``.
    """
    if fsx_lustre_dns is not False:
        # User pointed the job at an existing FSx filesystem: nothing to
        # validate — bucket/size only apply to a SOCA-created filesystem.
        return None, fsx_lustre_bucket, fsx_lustre_size

    if fsx_lustre_bucket is False:
        if fsx_lustre_size is not False:
            return ({
                'success': False,
                'error': 'You must specify fsx_lustre_bucket parameter if you specify fsx_lustre_capacity'
            }, fsx_lustre_bucket, fsx_lustre_size)
        return None, fsx_lustre_bucket, fsx_lustre_size

    if not fsx_lustre_bucket.startswith("s3://"):
        return ({
            'success': False,
            'error': 'fsx_lustre_bucket must start with s3://'
        }, fsx_lustre_bucket, fsx_lustre_size)

    # remove trailing / if exist
    if fsx_lustre_bucket.endswith('/'):
        fsx_lustre_bucket = fsx_lustre_bucket[:-1]

    s3_client = boto3.client("s3")
    try:
        # SOCA's own role must already have read access to the backing bucket.
        s3_client.get_bucket_acl(Bucket=fsx_lustre_bucket.split('s3://')[-1])
    except botocore.exceptions.ClientError:
        return ({
            'success': False,
            'error': 'SOCA does not have access to this bucket. Update IAM policy as described on https://soca.dev/tutorials/job-fsx-lustre-backend/'
        }, fsx_lustre_bucket, fsx_lustre_size)

    if fsx_lustre_size is False:
        fsx_lustre_size = 1200  # default FSx Lustre capacity (GB)
    elif fsx_lustre_size not in [1200, 2400, 3600, 7200, 10800]:
        return ({
            'success': False,
            'error': 'fsx_lustre_size must be: 1200, 2400, 3600, 7200, 10800'
        }, fsx_lustre_bucket, fsx_lustre_size)

    return None, fsx_lustre_bucket, fsx_lustre_size


def _cpus_per_system(instance_type):
    """Best-effort physical-core count derived from the instance type name.

    The size digits after the dot are doubled (e.g. ``c5.18xlarge`` -> 36);
    a bare ``.xlarge`` maps to 2 and anything smaller to 1.
    NOTE(review): this is a naming heuristic, not an EC2 API lookup — it can
    be wrong for instance families that don't follow the pattern.
    """
    size_match = re.search(r'[.](\d+)', instance_type)
    if size_match:
        return int(size_match.group(1)) * 2
    # Normalized to int (original mixed int and str); the value is passed
    # through str() before reaching CloudFormation, so behavior is unchanged.
    return 2 if 'xlarge' in instance_type else 1


def main(instance_type, desired_capacity, queue, custom_ami, job_id, job_name,
         job_owner, job_project, keep_forever, scratch_size, scratch_iops,
         root_size, placement_group, spot_price, efa_support, base_os, subnet,
         ht_support, fsx_lustre_bucket, fsx_lustre_size, fsx_lustre_dns, tags):
    """Provision a compute-node CloudFormation stack for a scheduler job.

    Validates the requested capacity (FSx Lustre settings, subnet, base OS,
    EFA support), builds the stack parameters/tags and launches the
    ComputeNode.template stack.

    Returns:
        dict: ``{'success': True, 'stack_name': ..., 'compute_node': ...}``
        on success, or ``{'success': False, 'error': <message>}`` on any
        validation or launch failure.
    """
    cloudformation = boto3.client('cloudformation')
    s3 = boto3.resource('s3')
    aligo_configuration = configuration.get_aligo_configuration()

    # Note: If you change the ComputeNode, you also need to adjust the IAM
    # policy to match your new template name
    create_stack_location = s3.Object(
        aligo_configuration['S3Bucket'],
        aligo_configuration['S3InstallFolder'] + '/templates/ComputeNode.template')
    stack_template = create_stack_location.get()['Body'].read().decode('utf-8')

    soca_private_subnets = [
        aligo_configuration['PrivateSubnet1'],
        aligo_configuration['PrivateSubnet2'],
        aligo_configuration['PrivateSubnet3']
    ]

    # --- FSx for Lustre validation ------------------------------------
    error, fsx_lustre_bucket, fsx_lustre_size = _validate_fsx_lustre(
        fsx_lustre_dns, fsx_lustre_bucket, fsx_lustre_size)
    if error is not None:
        return error

    # --- Subnet selection ---------------------------------------------
    if subnet is False:
        # No subnet requested: spread capacity by picking one at random.
        subnet_id = random.choice(soca_private_subnets)
    elif subnet in soca_private_subnets:
        subnet_id = subnet
    else:
        return {
            'success': False,
            'error': 'Incorrect subnet_id. Must be one of ' + str(soca_private_subnets)
        }

    # Placement groups only apply to multi-node jobs; users can force-disable
    # them (e.g. to benchmark with/without a placement group) via 'false'.
    if int(desired_capacity) > 1 and placement_group != 'false':
        placement_group = 'true'
    else:
        placement_group = 'false'

    cpu_per_system = _cpus_per_system(instance_type)

    # Force Tags if they don't exist. DO NOT DELETE them or host won't be
    # able to be registered by nodes_manager.py
    if keep_forever is True:
        unique_id = str(uuid.uuid4())
        stack_name = aligo_configuration[
            'ClusterId'] + '-keepforever-' + queue + '-' + unique_id
        job_id = stack_name
        tags['soca:KeepForever'] = 'true'
    else:
        stack_name = aligo_configuration['ClusterId'] + '-job-' + str(job_id)
        tags['soca:KeepForever'] = 'false'
    tags.setdefault('soca:NodeType', 'soca-compute-node')
    tags.setdefault('soca:ClusterId', aligo_configuration['ClusterId'])
    tags.setdefault('soca:JobId', job_id)
    tags.setdefault('Name', stack_name.replace('_', '-'))

    job_parameters = {
        'StackUUID': str(uuid.uuid4()),
        'Version': aligo_configuration['Version'],
        'S3InstallFolder': aligo_configuration['S3InstallFolder'],
        'S3Bucket': aligo_configuration['S3Bucket'],
        'PlacementGroup': placement_group,
        'SecurityGroupId': aligo_configuration['ComputeNodeSecurityGroup'],
        'KeepForever': 'true' if keep_forever is True else 'false',  # needs to be lowercase
        'SSHKeyPair': aligo_configuration['SSHKeyPair'],
        'ComputeNodeInstanceProfile': aligo_configuration['ComputeNodeInstanceProfile'],
        'Efa': efa_support,
        'JobId': job_id,
        'ScratchSize': 0 if scratch_size is False else scratch_size,
        'RootSize': 10 if root_size is False else root_size,
        'ImageId': custom_ami if custom_ami is not None else aligo_configuration['CustomAMI'],
        'JobName': job_name,
        'JobQueue': queue,
        'JobOwner': job_owner,
        'JobProject': job_project,
        'ClusterId': aligo_configuration['ClusterId'],
        'EFSAppsDns': aligo_configuration['EFSAppsDns'],
        'EFSDataDns': aligo_configuration['EFSDataDns'],
        'SubnetId': subnet_id,
        'InstanceType': instance_type,
        'SchedulerHostname': aligo_configuration['SchedulerPrivateDnsName'],
        'DesiredCapacity': desired_capacity,
        'BaseOS': aligo_configuration['BaseOS'] if base_os is False else base_os,
        'SpotPrice': spot_price if spot_price is not None else 'false',
        'CoreCount': cpu_per_system,
        'ThreadsPerCore': 2 if ht_support == 'true' else 1,
        'SolutionMetricLambda': aligo_configuration.get('SolutionMetricLambda', 'false'),
        'VolumeTypeIops': scratch_iops,
        'FSxLustreBucket': 'false' if fsx_lustre_bucket is False else fsx_lustre_bucket,
        'FSxLustreSize': 1200 if fsx_lustre_size is False else fsx_lustre_size,
        'FSxLustreDns': 'false' if fsx_lustre_dns is False else fsx_lustre_dns,
    }

    stack_tags = [{
        'Key': str(k),
        'Value': str(v)
    } for k, v in tags.items() if v]
    # NOTE(review): falsy values (e.g. ScratchSize=0) are deliberately
    # dropped so the template defaults apply — confirm before changing.
    stack_params = [{
        'ParameterKey': str(k),
        'ParameterValue': str(v)
    } for k, v in job_parameters.items() if v]

    if job_parameters['BaseOS'] not in ['rhel7', 'centos7', 'amazonlinux2']:
        return {
            'success': False,
            'error': 'base_os must be one of the following value: centos7, amazonlinux2, rhel7'
        }

    if job_parameters['Efa'] == 'true':
        # EFA-capable instance families carry an 'n' (e.g. c5n, m5n, r5n).
        if 'n' not in job_parameters['InstanceType']:
            return {
                'success': False,
                'error': 'You have requested EFA support but your instance type does not support EFA: ' + str(job_parameters['InstanceType'])
            }

    # Dry run: verify quotas/AMI before creating the stack. On failure
    # can_launch is the error message (a string), not False.
    can_launch = can_launch_capacity(job_parameters['InstanceType'],
                                     job_parameters['DesiredCapacity'],
                                     job_parameters['ImageId'], subnet_id)
    if can_launch is True:
        try:
            cloudformation.create_stack(StackName=stack_name,
                                        TemplateBody=stack_template,
                                        Parameters=stack_params,
                                        Tags=stack_tags)
            # PBS configuration is automatically updated by nodes_manager
            return {
                'success': True,
                'stack_name': stack_name,
                'compute_node': 'job' + str(job_id)
            }
        except Exception as e:
            exc_type, _, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            # was: str(e) appended twice — fixed to report it once
            return {
                'success': False,
                'error': str(exc_type) + ' : ' + str(fname) + ' : ' + str(exc_tb.tb_lineno) + ' : ' + str(e)
            }
    else:
        return {'success': False, 'error': 'Dry Run failed: ' + can_launch}
import re import sys import uuid import boto3 from botocore.exceptions import ClientError sys.path.append(os.path.dirname(__file__)) import configuration from botocore import exceptions import cloudformation_builder cloudformation = boto3.client('cloudformation') s3 = boto3.client('s3') ec2 = boto3.client('ec2') servicequotas = boto3.client("service-quotas") aligo_configuration = configuration.get_aligo_configuration() def verify_ri_saving_availabilities(instance_type, instance_type_info): if instance_type not in instance_type_info.keys(): instance_type_info[instance_type] = { 'current_instance_in_use': 0, 'current_ri_purchased': 0 } token = True next_token = '' # List all instance from this type currently running while token is True: response = ec2.describe_instances( Filters=[{ 'Name': 'instance-type',