def main(instance_type, desired_capacity, queue, custom_ami, job_id, job_name,
         job_owner, job_project, keep_forever, scratch_size, scratch_iops,
         root_size, placement_group, spot_price, efa_support, base_os, subnet,
         ht_support, fsx_lustre_bucket, fsx_lustre_size, fsx_lustre_dns, tags):
    """Provision compute capacity for a job by creating a ComputeNode stack.

    Validates FSx/Lustre, subnet, BaseOS and EFA parameters, assembles the
    CloudFormation parameters and tags, runs a capacity dry-run, and finally
    creates the stack.

    Note: most "not supplied" optional parameters use ``False`` as sentinel.

    Returns:
        dict: ``{'success': True, 'stack_name': ..., 'compute_node': ...}``
        on success, otherwise ``{'success': False, 'error': <message>}``.
    """
    cloudformation = boto3.client('cloudformation')
    s3 = boto3.resource('s3')

    aligo_configuration = configuration.get_aligo_configuration()
    # Note: If you change the ComputeNode, you also need to adjust the IAM
    # policy to match your new template name
    create_stack_location = s3.Object(
        aligo_configuration['S3Bucket'],
        aligo_configuration['S3InstallFolder'] +
        '/templates/ComputeNode.template')
    stack_template = create_stack_location.get()['Body'].read().decode('utf-8')
    soca_private_subnets = [
        aligo_configuration['PrivateSubnet1'],
        aligo_configuration['PrivateSubnet2'],
        aligo_configuration['PrivateSubnet3']
    ]

    # FSx for Lustre: either the caller supplies an existing filesystem DNS,
    # or requests a new filesystem backed by an S3 bucket (size optional).
    if fsx_lustre_dns is False:
        if fsx_lustre_bucket is False:
            if fsx_lustre_size is not False:
                return {
                    'success': False,
                    'error': 'You must specify fsx_lustre_bucket parameter if you specify fsx_lustre_capacity'
                }
        else:
            if fsx_lustre_bucket.startswith("s3://"):
                # remove trailing / if exist
                if fsx_lustre_bucket.endswith('/'):
                    fsx_lustre_bucket = fsx_lustre_bucket[:-1]
                s3_client = boto3.client("s3")
                try:
                    # Confirm SOCA can read the backing bucket before we
                    # hand it to FSx.
                    s3_client.get_bucket_acl(
                        Bucket=fsx_lustre_bucket.split('s3://')[-1])
                except botocore.exceptions.ClientError:
                    return {
                        'success': False,
                        'error': 'SOCA does not have access to this bucket. Update IAM policy as described on https://soca.dev/tutorials/job-fsx-lustre-backend/'
                    }

                if fsx_lustre_size is False:
                    # Default to the smallest supported capacity.
                    fsx_lustre_size = 1200
                elif fsx_lustre_size not in (1200, 2400, 3600, 7200, 10800):
                    return {
                        'success': False,
                        'error': 'fsx_lustre_size must be: 1200, 2400, 3600, 7200, 10800'
                    }
            else:
                return {
                    'success': False,
                    'error': 'fsx_lustre_bucket must start with s3://'
                }

    # Pick one of the cluster's private subnets when none was requested.
    if subnet is False:
        subnet_id = random.choice(soca_private_subnets)
    elif subnet in soca_private_subnets:
        subnet_id = subnet
    else:
        return {
            'success': False,
            'error': 'Incorrect subnet_id. Must be one of ' +
                     str(soca_private_subnets)
        }

    # Placement group only applies to multi-node jobs; an explicit 'false'
    # opts out (useful for comparing performance w/ and w/o placement group).
    if int(desired_capacity) > 1 and placement_group != 'false':
        placement_group = 'true'
    else:
        placement_group = 'false'

    # Derive the core count from the size digits in the instance type
    # (e.g. "c5.18xlarge" -> 18 * 2). Sizes without digits get a fixed value.
    cpus_count_pattern = re.search(r'[.](\d+)', instance_type)
    if cpus_count_pattern:
        cpu_per_system = int(cpus_count_pattern.group(1)) * 2
    elif 'xlarge' in instance_type:
        cpu_per_system = 2  # fixed: was the string '2'; int keeps the type consistent
    else:
        cpu_per_system = 1  # fixed: was the string '1'

    # Force Tag if they don't exist. DO NOT DELETE them or host won't be able
    # to be registered by nodes_manager.py
    if keep_forever is True:
        unique_id = str(uuid.uuid4())
        stack_name = aligo_configuration[
            'ClusterId'] + '-keepforever-' + queue + '-' + unique_id
        job_id = stack_name
        tags['soca:KeepForever'] = 'true'
    else:
        stack_name = aligo_configuration['ClusterId'] + '-job-' + str(job_id)
        tags['soca:KeepForever'] = 'false'

    tags.setdefault('soca:NodeType', 'soca-compute-node')
    tags.setdefault('soca:ClusterId', aligo_configuration['ClusterId'])
    tags.setdefault('soca:JobId', job_id)
    tags.setdefault('Name', stack_name.replace('_', '-'))

    job_parameters = {
        'StackUUID': str(uuid.uuid4()),
        'Version': aligo_configuration['Version'],
        'S3InstallFolder': aligo_configuration['S3InstallFolder'],
        'S3Bucket': aligo_configuration['S3Bucket'],
        'PlacementGroup': placement_group,
        'SecurityGroupId': aligo_configuration['ComputeNodeSecurityGroup'],
        # needs to be lowercase
        'KeepForever': 'true' if keep_forever is True else 'false',
        'SSHKeyPair': aligo_configuration['SSHKeyPair'],
        'ComputeNodeInstanceProfile':
            aligo_configuration['ComputeNodeInstanceProfile'],
        'Efa': efa_support,
        'JobId': job_id,
        'ScratchSize': 0 if scratch_size is False else scratch_size,
        'RootSize': 10 if root_size is False else root_size,
        'ImageId': custom_ami
            if custom_ami is not None else aligo_configuration['CustomAMI'],
        'JobName': job_name,
        'JobQueue': queue,
        'JobOwner': job_owner,
        'JobProject': job_project,
        'ClusterId': aligo_configuration['ClusterId'],
        'EFSAppsDns': aligo_configuration['EFSAppsDns'],
        'EFSDataDns': aligo_configuration['EFSDataDns'],
        'SubnetId': subnet_id,
        'InstanceType': instance_type,
        'SchedulerHostname': aligo_configuration['SchedulerPrivateDnsName'],
        'DesiredCapacity': desired_capacity,
        'BaseOS': aligo_configuration['BaseOS'] if base_os is False else base_os,
        'SpotPrice': spot_price if spot_price is not None else 'false',
        'CoreCount': cpu_per_system,
        'ThreadsPerCore': 2 if ht_support == 'true' else 1,
        'SolutionMetricLambda':
            aligo_configuration.get('SolutionMetricLambda', 'false'),
        'VolumeTypeIops': scratch_iops,
        'FSxLustreBucket':
            'false' if fsx_lustre_bucket is False else fsx_lustre_bucket,
        'FSxLustreSize':
            1200 if fsx_lustre_size is False else fsx_lustre_size,
        'FSxLustreDns':
            'false' if fsx_lustre_dns is False else fsx_lustre_dns,
    }

    # Falsy values (None, False, '', 0) are dropped from both lists so empty
    # parameters never reach CloudFormation.
    stack_tags = [{'Key': str(k), 'Value': str(v)}
                  for k, v in tags.items() if v]
    stack_params = [{'ParameterKey': str(k), 'ParameterValue': str(v)}
                    for k, v in job_parameters.items() if v]

    if job_parameters['BaseOS'] not in ['rhel7', 'centos7', 'amazonlinux2']:
        return {
            'success': False,
            'error': 'base_os must be one of the following value: centos7, amazonlinux2, rhel7'
        }

    # Crude EFA capability check: EFA-capable types carry an 'n' in the name.
    if job_parameters['Efa'] == 'true':
        if 'n' not in job_parameters['InstanceType']:
            return {
                'success': False,
                'error': 'You have requested EFA support but your instance type does not support EFA: '
                         + str(job_parameters['InstanceType'])
            }

    # Dry run capacity/quota check before actually creating the stack.
    can_launch = can_launch_capacity(job_parameters['InstanceType'],
                                     job_parameters['DesiredCapacity'],
                                     job_parameters['ImageId'], subnet_id)

    if can_launch is not True:
        # str() guards against non-string failure reasons (original code
        # raised TypeError on concatenation in that case).
        return {'success': False, 'error': 'Dry Run failed: ' + str(can_launch)}

    try:
        cloudformation.create_stack(StackName=stack_name,
                                    TemplateBody=stack_template,
                                    Parameters=stack_params,
                                    Tags=stack_tags)
        # PBS configuration is automatically updated by nodes_manager
        return {
            'success': True,
            'stack_name': stack_name,
            'compute_node': 'job' + str(job_id)
        }
    except Exception as e:
        exc_type, _exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        # Fixed: the original appended ' : ' + str(e) twice.
        return {
            'success': False,
            'error': str(exc_type) + ' : ' + str(fname) + ' : ' +
                     str(exc_tb.tb_lineno) + ' : ' + str(e)
        }
# ---------------------------------------------------------------------------
# Example #2
# ---------------------------------------------------------------------------
import os
import re
import sys
import uuid

import boto3
from botocore import exceptions
from botocore.exceptions import ClientError

sys.path.append(os.path.dirname(__file__))
import configuration
import cloudformation_builder

# Module-level AWS clients, created once at import time and shared by all
# functions in this module.
cloudformation = boto3.client('cloudformation')
s3 = boto3.client('s3')
ec2 = boto3.client('ec2')
# Client for the AWS Service Quotas API (boto3 service name uses a dash).
servicequotas = boto3.client("service-quotas")
# Cluster-wide SOCA settings loaded from the project's configuration module.
aligo_configuration = configuration.get_aligo_configuration()


def verify_ri_saving_availabilities(instance_type, instance_type_info):
    if instance_type not in instance_type_info.keys():
        instance_type_info[instance_type] = {
            'current_instance_in_use': 0,
            'current_ri_purchased': 0
        }
        token = True
        next_token = ''
        # List all instance from this type currently running
        while token is True:
            response = ec2.describe_instances(
                Filters=[{
                    'Name': 'instance-type',