def main(**params):
    try:
        # Metadata
        t = Template()
        t.set_version("2010-09-09")
        t.set_description("(SOCA) - Base template to deploy compute nodes.")
        allow_anonymous_data_collection = params["MetricCollectionAnonymous"]
        debug = False
        mip_usage = False
        instances_list = params["InstanceType"].split("+")
        asg_lt = asg_LaunchTemplate()
        ltd = LaunchTemplateData("NodeLaunchTemplateData")
        mip = MixedInstancesPolicy()
        stack_name = Ref("AWS::StackName")

        # Begin LaunchTemplateData
        UserData = '''#!/bin/bash -xe
export PATH=$PATH:/usr/local/bin
if [[ "''' + params['BaseOS'] + '''" == "centos7" ]] || [[ "''' + params['BaseOS'] + '''" == "rhel7" ]];
    then
        EASY_INSTALL=$(which easy_install-2.7)
        $EASY_INSTALL pip
        PIP=$(which pip2.7)
        $PIP install awscli
        yum install -y nfs-utils # enforce install of nfs-utils
else
     # Upgrade awscli on ALI (do not use yum)
     EASY_INSTALL=$(which easy_install-2.7)
     $EASY_INSTALL pip
     PIP=$(which pip)
     $PIP install awscli --upgrade 
fi
if [[ "''' + params['BaseOS'] + '''" == "amazonlinux2" ]];
    then
        /usr/sbin/update-motd --disable
fi

GET_INSTANCE_TYPE=$(curl http://169.254.169.254/latest/meta-data/instance-type)
echo export "SOCA_CONFIGURATION="''' + str(params['ClusterId']) + '''"" >> /etc/environment
echo export "SOCA_BASE_OS="''' + str(params['BaseOS']) + '''"" >> /etc/environment
echo export "SOCA_JOB_QUEUE="''' + str(params['JobQueue']) + '''"" >> /etc/environment
echo export "SOCA_JOB_OWNER="''' + str(params['JobOwner']) + '''"" >> /etc/environment
echo export "SOCA_JOB_NAME="''' + str(params['JobName']) + '''"" >> /etc/environment
echo export "SOCA_JOB_PROJECT="''' + str(params['JobProject']) + '''"" >> /etc/environment
echo export "SOCA_VERSION="''' + str(params['Version']) + '''"" >> /etc/environment
echo export "SOCA_JOB_EFA="''' + str(params['Efa']).lower() + '''"" >> /etc/environment
echo export "SOCA_JOB_ID="''' + str(params['JobId']) + '''"" >> /etc/environment
echo export "SOCA_SCRATCH_SIZE=''' + str(params['ScratchSize']) + '''" >> /etc/environment
echo export "SOCA_INSTALL_BUCKET="''' + str(params['S3Bucket']) + '''"" >> /etc/environment
echo export "SOCA_INSTALL_BUCKET_FOLDER="''' + str(params['S3InstallFolder']) + '''"" >> /etc/environment
echo export "SOCA_FSX_LUSTRE_BUCKET="''' + str(params['FSxLustreConfiguration']['fsx_lustre']).lower() + '''"" >> /etc/environment
echo export "SOCA_FSX_LUSTRE_DNS="''' + str(params['FSxLustreConfiguration']['existing_fsx']).lower() + '''"" >> /etc/environment
echo export "SOCA_INSTANCE_TYPE=$GET_INSTANCE_TYPE" >> /etc/environment
echo export "SOCA_INSTANCE_HYPERTHREADING="''' + str(params['ThreadsPerCore']).lower() + '''"" >> /etc/environment
echo export "SOCA_HOST_SYSTEM_LOG="/apps/soca/''' + str(params['ClusterId']) + '''/cluster_node_bootstrap/logs/''' + str(params['JobId']) + '''/$(hostname -s)"" >> /etc/environment
echo export "AWS_STACK_ID=${AWS::StackName}" >> /etc/environment
echo export "AWS_DEFAULT_REGION=${AWS::Region}" >> /etc/environment


source /etc/environment
AWS=$(which aws)

# Give yum permission to the user on this specific machine
echo "''' + params['JobOwner'] + ''' ALL=(ALL) /bin/yum" >> /etc/sudoers

mkdir -p /apps
mkdir -p /data

# Mount EFS
echo "''' + params['EFSDataDns'] + ''':/ /data nfs4 nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 0 0" >> /etc/fstab
echo "''' + params['EFSAppsDns'] + ''':/ /apps nfs4 nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 0 0" >> /etc/fstab
mount -a 

# Configure NTP
yum remove -y ntp
yum install -y chrony
mv /etc/chrony.conf  /etc/chrony.conf.original
echo -e """
# use the local instance NTP service, if available
server 169.254.169.123 prefer iburst minpoll 4 maxpoll 4

# Use public servers from the pool.ntp.org project.
# Please consider joining the pool (http://www.pool.ntp.org/join.html).
# !!! [BEGIN] SOCA REQUIREMENT
# You will need to open UDP egress traffic on your security group if you want to enable public pool
#pool 2.amazon.pool.ntp.org iburst
# !!! [END] SOCA REQUIREMENT
# Record the rate at which the system clock gains/losses time.
driftfile /var/lib/chrony/drift

# Allow the system clock to be stepped in the first three updates
# if its offset is larger than 1 second.
makestep 1.0 3

# Specify file containing keys for NTP authentication.
keyfile /etc/chrony.keys

# Specify directory for log files.
logdir /var/log/chrony

# save data between restarts for fast re-load
dumponexit
dumpdir /var/run/chrony
""" > /etc/chrony.conf
systemctl enable chronyd

# Prepare  Log folder
mkdir -p $SOCA_HOST_SYSTEM_LOG
echo "@reboot /bin/bash /apps/soca/$SOCA_CONFIGURATION/cluster_node_bootstrap/ComputeNodePostReboot.sh >> $SOCA_HOST_SYSTEM_LOG/ComputeNodePostInstall.log 2>&1" | crontab -
$AWS s3 cp s3://$SOCA_INSTALL_BUCKET/$SOCA_INSTALL_BUCKET_FOLDER/scripts/config.cfg /root/
/bin/bash /apps/soca/$SOCA_CONFIGURATION/cluster_node_bootstrap/ComputeNode.sh ''' + params['SchedulerHostname'] + ''' >> $SOCA_HOST_SYSTEM_LOG/ComputeNode.sh.log 2>&1'''

        ltd.EbsOptimized = True
        for instance in instances_list:
            if "t2." in instance:
                ltd.EbsOptimized = False
            else:
                # t2 does not support CpuOptions
                ltd.CpuOptions = CpuOptions(
                    CoreCount=int(params["CoreCount"]),
                    ThreadsPerCore=1 if params["ThreadsPerCore"] is False else 2)

        ltd.IamInstanceProfile = IamInstanceProfile(Arn=params["ComputeNodeInstanceProfileArn"])
        ltd.KeyName = params["SSHKeyPair"]
        ltd.ImageId = params["ImageId"]
        if params["SpotPrice"] is not False and params["SpotAllocationCount"] is False:
            ltd.InstanceMarketOptions = InstanceMarketOptions(
                MarketType="spot",
                SpotOptions=SpotOptions(
                    MaxPrice=Ref("AWS::NoValue") if params["SpotPrice"] == "auto" else str(params["SpotPrice"])
                    # auto -> cap at OD price
                )
            )
        ltd.InstanceType = instances_list[0]
        ltd.NetworkInterfaces = [NetworkInterfaces(
            InterfaceType="efa" if params["Efa"] is not False else Ref("AWS::NoValue"),
            DeleteOnTermination=True,
            DeviceIndex=0,
            Groups=[params["SecurityGroupId"]]
        )]
        ltd.UserData = Base64(Sub(UserData))
        ltd.BlockDeviceMappings = [
            BlockDeviceMapping(
                DeviceName="/dev/xvda" if params["BaseOS"] == "amazonlinux2" else "/dev/sda1",
                Ebs=EBSBlockDevice(
                    VolumeSize=params["RootSize"],
                    VolumeType="gp2",
                    DeleteOnTermination="false" if params["KeepEbs"] is True else "true",
                    Encrypted=True))
        ]
        if int(params["ScratchSize"]) > 0:
            ltd.BlockDeviceMappings.append(
                BlockDeviceMapping(
                    DeviceName="/dev/xvdbx",
                    Ebs=EBSBlockDevice(
                        VolumeSize=params["ScratchSize"],
                        VolumeType="io1" if int(params["VolumeTypeIops"]) > 0 else "gp2",
                        Iops=params["VolumeTypeIops"] if int(params["VolumeTypeIops"]) > 0 else Ref("AWS::NoValue"),
                        DeleteOnTermination="false" if params["KeepEbs"] is True else "true",
                        Encrypted=True))
            )
        # End LaunchTemplateData

        # Begin Launch Template Resource
        lt = LaunchTemplate("NodeLaunchTemplate")
        lt.LaunchTemplateName = params["ClusterId"] + "-" + str(params["JobId"])
        lt.LaunchTemplateData = ltd
        t.add_resource(lt)
        # End Launch Template Resource

        asg_lt.LaunchTemplateSpecification = LaunchTemplateSpecification(
            LaunchTemplateId=Ref(lt),
            Version=GetAtt(lt, "LatestVersionNumber")
        )

        asg_lt.Overrides = []
        for instance in instances_list:
            asg_lt.Overrides.append(LaunchTemplateOverrides(
                InstanceType=instance))

        # Begin InstancesDistribution
        if params["SpotPrice"] is not False and \
                params["SpotAllocationCount"] is not False and \
                (params["DesiredCapacity"] - params["SpotAllocationCount"]) > 0:
            mip_usage = True
            idistribution = InstancesDistribution()
            idistribution.OnDemandAllocationStrategy = "prioritized"  # only supported value
            idistribution.OnDemandBaseCapacity = params["DesiredCapacity"] - params["SpotAllocationCount"]
            idistribution.OnDemandPercentageAboveBaseCapacity = "0"  # force the other instances to be SPOT
            idistribution.SpotMaxPrice = Ref("AWS::NoValue") if params["SpotPrice"] == "auto" else str(
                params["SpotPrice"])
            idistribution.SpotAllocationStrategy = params['SpotAllocationStrategy']
            mip.InstancesDistribution = idistribution

        # End MixedPolicyInstance

        # Begin FSx for Lustre
        if params["FSxLustreConfiguration"]["fsx_lustre"] is not False:
            if params["FSxLustreConfiguration"]["existing_fsx"] is False:
                fsx_lustre = FileSystem("FSxForLustre")
                fsx_lustre.FileSystemType = "LUSTRE"
                fsx_lustre.StorageCapacity = params["FSxLustreConfiguration"]["capacity"]
                fsx_lustre.SecurityGroupIds = [params["SecurityGroupId"]]
                fsx_lustre.SubnetIds = params["SubnetId"]

                if params["FSxLustreConfiguration"]["s3_backend"] is not False:
                    fsx_lustre_configuration = LustreConfiguration()
                    fsx_lustre_configuration.ImportPath = params["FSxLustreConfiguration"]["import_path"] if params["FSxLustreConfiguration"]["import_path"] is not False else params["FSxLustreConfiguration"]["s3_backend"]
                    fsx_lustre_configuration.ExportPath = params["FSxLustreConfiguration"]["import_path"] if params["FSxLustreConfiguration"]["import_path"] is not False else params["FSxLustreConfiguration"]["s3_backend"] + "/" + params["ClusterId"] + "-fsxoutput/job-" +  params["JobId"] + "/"
                    fsx_lustre.LustreConfiguration = fsx_lustre_configuration

                fsx_lustre.Tags = base_Tags(
                    # False disable PropagateAtLaunch
                    Name=str(params["ClusterId"] + "-compute-job-" + params["JobId"]),
                    _soca_JobId=str(params["JobId"]),
                    _soca_JobName=str(params["JobName"]),
                    _soca_JobQueue=str(params["JobQueue"]),
                    _soca_StackId=stack_name,
                    _soca_JobOwner=str(params["JobOwner"]),
                    _soca_JobProject=str(params["JobProject"]),
                    _soca_KeepForever=str(params["KeepForever"]).lower(),
                    _soca_FSx="true",
                    _soca_ClusterId=str(params["ClusterId"]),
                )
                t.add_resource(fsx_lustre)
        # End FSx For Lustre

        # Begin AutoScalingGroup Resource
        asg = AutoScalingGroup("AutoScalingComputeGroup")
        asg.DependsOn = "NodeLaunchTemplate"
        if mip_usage is True or instances_list.__len__() > 1:
            mip.LaunchTemplate = asg_lt
            asg.MixedInstancesPolicy = mip

        else:
            asg.LaunchTemplate = LaunchTemplateSpecification(
                LaunchTemplateId=Ref(lt),
                Version=GetAtt(lt, "LatestVersionNumber"))

        asg.MinSize = int(params["DesiredCapacity"])
        asg.MaxSize = int(params["DesiredCapacity"])
        asg.VPCZoneIdentifier = params["SubnetId"]

        if params["PlacementGroup"] is True:
            pg = PlacementGroup("ComputeNodePlacementGroup")
            pg.Strategy = "cluster"
            t.add_resource(pg)
            asg.PlacementGroup = Ref(pg)

        asg.Tags = Tags(
            Name=str(params["ClusterId"]) + "-compute-job-" + str(params["JobId"]),
            _soca_JobId=str(params["JobId"]),
            _soca_JobName=str(params["JobName"]),
            _soca_JobQueue=str(params["JobQueue"]),
            _soca_StackId=stack_name,
            _soca_JobOwner=str(params["JobOwner"]),
            _soca_JobProject=str(params["JobProject"]),
            _soca_KeepForever=str(params["KeepForever"]).lower(),
            _soca_ClusterId=str(params["ClusterId"]),
            _soca_NodeType="soca-compute-node")
        t.add_resource(asg)
        # End AutoScalingGroup Resource

        # Begin Custom Resource
        # Change Mapping to No if you want to disable this
        if allow_anonymous_data_collection is True:
            metrics = CustomResourceSendAnonymousMetrics("SendAnonymousData")
            metrics.ServiceToken = params["SolutionMetricLambda"]
            metrics.DesiredCapacity = str(params["DesiredCapacity"])
            metrics.InstanceType = str(params["InstanceType"])
            metrics.Efa = str(params["Efa"])
            metrics.ScratchSize = str(params["ScratchSize"])
            metrics.RootSize = str(params["RootSize"])
            metrics.SpotPrice = str(params["SpotPrice"])
            metrics.BaseOS = str(params["BaseOS"])
            metrics.StackUUID = str(params["StackUUID"])
            metrics.KeepForever = str(params["KeepForever"])
            metrics.FsxLustre = str(params["FSxLustreConfiguration"])
            t.add_resource(metrics)
            # End Custom Resource

        if debug is True:
            print(t.to_json())

        # Tags must use "soca:<Key>" syntax
        template_output = t.to_yaml().replace("_soca_", "soca:")
        return {'success': True,
                'output': template_output}

    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        return {'success': False,
                'output': 'cloudformation_builder.py: ' + (
                            str(e) + ': error :' + str(exc_type) + ' ' + str(fname) + ' ' + str(exc_tb.tb_lineno))}
Ejemplo n.º 2
0
def add_launch_template(template, hosts_sg):
    """Function to create a launch template.

    :param template: ECS Cluster template
    :type template: troposphere.Template
    :param hosts_sg: security group for the EC2 hosts
    :type hosts_sg: troposphere.ec2.SecurityGroup

    :return: launch_template
    :rtype: troposphere.ec2.LaunchTemplate
    """
    # from troposphere.cloudformation import (
    #     WaitCondition, WaitConditionHandle
    # )
    #  Deactivated conditions given you could run with no EC2 at all.
    # Tricky condition to do as the WaitCondition and Handler cannot be created on a CFN Update, but only at the
    # very creation of the stack.
    # wait_handle = WaitConditionHandle(
    #     'BootstrapHandle',
    #     template=template
    # )
    # WaitCondition(
    #     'BootStrapCondition',
    #     template=template,
    #     DependsOn=[hosts_role],
    #     Handle=Ref(wait_handle),
    #     Timeout='900'
    # )

    launch_template = LaunchTemplate(
        "LaunchTemplate",
        template=template,
        Metadata=cloudformation.Metadata(
            cloudformation.Init(
                cloudformation.InitConfigSets(
                    default=["awspackages", "dockerconfig", "ecsconfig", "awsservices"]
                ),
                awspackages=cloudformation.InitConfig(
                    packages={"yum": {"awslogs": [], "amazon-ssm-agent": []}},
                    commands={
                        "001-check-packages": {"command": "rpm -qa | grep amazon"},
                        "002-check-packages": {"command": "rpm -qa | grep aws"},
                    },
                ),
                awsservices=cloudformation.InitConfig(
                    services={
                        "sysvinit": {
                            "amazon-ssm-agent": {"enabled": True, "ensureRunning": True}
                        }
                    }
                ),
                dockerconfig=cloudformation.InitConfig(
                    commands={
                        "001-stop-docker": {"command": "systemctl stop docker"},
                        "098-reload-systemd": {"command": "systemctl daemon-reload"},
                    },
                    files={
                        "/etc/sysconfig/docker": {
                            "owner": "root",
                            "group": "root",
                            "mode": "644",
                            "content": Join(
                                "\n",
                                [
                                    "DAEMON_MAXFILES=1048576",
                                    Join(
                                        " ",
                                        ["OPTIONS=--default-ulimit nofile=1024:4096"],
                                    ),
                                    "DAEMON_PIDFILE_TIMEOUT=10",
                                    "#EOF",
                                    "",
                                ],
                            ),
                        }
                    },
                    services={
                        "sysvinit": {
                            "docker": {
                                "enabled": True,
                                "ensureRunning": True,
                                "files": ["/etc/sysconfig/docker"],
                                "commands": ["098-reload-systemd"],
                            }
                        }
                    },
                ),
                ecsconfig=cloudformation.InitConfig(
                    files={
                        "/etc/ecs/ecs.config": {
                            "owner": "root",
                            "group": "root",
                            "mode": "644",
                            "content": Join(
                                "\n",
                                [
                                    Sub(f"ECS_CLUSTER=${{{CLUSTER_NAME_T}}}"),
                                    "ECS_ENABLE_TASK_IAM_ROLE=true",
                                    "ECS_ENABLE_SPOT_INSTANCE_DRAINING=true",
                                    "ECS_ENABLE_TASK_IAM_ROLE_NETWORK_HOST=true",
                                    "ECS_ENABLE_CONTAINER_METADATA=true",
                                    "ECS_ENABLE_UNTRACKED_IMAGE_CLEANUP=true",
                                    "ECS_UPDATES_ENABLED=true",
                                    "ECS_ENGINE_TASK_CLEANUP_WAIT_DURATION=15m",
                                    "ECS_IMAGE_CLEANUP_INTERVAL=10m",
                                    "ECS_NUM_IMAGES_DELETE_PER_CYCLE=100",
                                    "ECS_ENABLE_TASK_ENI=true",
                                    "ECS_AWSVPC_BLOCK_IMDS=true",
                                    "ECS_TASK_METADATA_RPS_LIMIT=300,400",
                                    "ECS_ENABLE_AWSLOGS_EXECUTIONROLE_OVERRIDE=true",
                                    'ECS_AVAILABLE_LOGGING_DRIVERS=["awslogs", "json-file"]',
                                    "#EOF",
                                ],
                            ),
                        }
                    },
                    commands={
                        "0001-restartecs": {
                            "command": "systemctl --no-block restart ecs"
                        }
                    },
                ),
            )
        ),
        LaunchTemplateData=LaunchTemplateData(
            BlockDeviceMappings=[
                LaunchTemplateBlockDeviceMapping(
                    DeviceName="/dev/xvda",
                    Ebs=EBSBlockDevice(DeleteOnTermination=True, Encrypted=True),
                )
            ],
            ImageId=Ref(compute_params.ECS_AMI_ID),
            InstanceInitiatedShutdownBehavior="terminate",
            IamInstanceProfile=IamInstanceProfile(
                Arn=Sub(f"${{{HOST_PROFILE_T}.Arn}}")
            ),
            TagSpecifications=[
                TagSpecifications(
                    ResourceType="instance",
                    Tags=Tags(
                        Name=Sub(f"EcsNodes-${{{CLUSTER_NAME_T}}}"),
                        StackName=Ref("AWS::StackName"),
                        StackId=Ref("AWS::StackId"),
                    ),
                )
            ],
            InstanceType="m5a.large",
            Monitoring=Monitoring(Enabled=True),
            SecurityGroupIds=[GetAtt(hosts_sg, "GroupId")],
            UserData=Base64(
                Join(
                    "\n",
                    [
                        "#!/usr/bin/env bash",
                        "export PATH=$PATH:/opt/aws/bin",
                        "cfn-init -v || yum install aws-cfn-bootstrap -y",
                        Sub(
                            f"cfn-init --region ${{AWS::Region}} -r LaunchTemplate -s ${{AWS::StackName}}"
                        ),
                        # 'if [ $? -ne 0 ]; then',
                        # Sub(f'cfn-signal -e 1 -r "Failed to bootstrap" \'${{{wait_handle.title}}}\''),
                        # 'halt',
                        # 'else',
                        # Sub(f'cfn-signal -e 0 -r "Successfully bootstrapped" \'${{{wait_handle.title}}}\''),
                        # 'fi',
                        "# EOF",
                    ],
                )
            ),
        ),
        LaunchTemplateName=Ref(CLUSTER_NAME_T),
    )
    return launch_template
Ejemplo n.º 3
0
def main(**params):
    try:
        # Metadata
        t = Template()
        t.set_version("2010-09-09")
        t.set_description("(SOCA) - Base template to deploy compute nodes. Version 2.7.2")
        allow_anonymous_data_collection = params["MetricCollectionAnonymous"]
        debug = False
        mip_usage = False
        instances_list = params["InstanceType"] # list of instance type. Use + to specify more than one type
        asg_lt = asg_LaunchTemplate()
        ltd = LaunchTemplateData("NodeLaunchTemplateData")
        mip = MixedInstancesPolicy()
        stack_name = Ref("AWS::StackName")

        # Begin LaunchTemplateData
        UserData = '''#!/bin/bash -x
export PATH=$PATH:/usr/local/bin
if [[ "''' + params['BaseOS'] + '''" == "centos7" ]] || [[ "''' + params['BaseOS'] + '''" == "rhel7" ]];
then
     yum install -y python3-pip
     PIP=$(which pip3)
     $PIP install awscli
     yum install -y nfs-utils # enforce install of nfs-utils
else
     yum install -y python3-pip
     PIP=$(which pip3)
     $PIP install awscli
fi
if [[ "''' + params['BaseOS'] + '''" == "amazonlinux2" ]];
    then
        /usr/sbin/update-motd --disable
fi

GET_INSTANCE_TYPE=$(curl http://169.254.169.254/latest/meta-data/instance-type)
echo export "SOCA_CONFIGURATION="''' + str(params['ClusterId']) + '''"" >> /etc/environment
echo export "SOCA_BASE_OS="''' + str(params['BaseOS']) + '''"" >> /etc/environment
echo export "SOCA_JOB_QUEUE="''' + str(params['JobQueue']) + '''"" >> /etc/environment
echo export "SOCA_JOB_OWNER="''' + str(params['JobOwner']) + '''"" >> /etc/environment
echo export "SOCA_JOB_NAME="''' + str(params['JobName']) + '''"" >> /etc/environment
echo export "SOCA_JOB_PROJECT="''' + str(params['JobProject']) + '''"" >> /etc/environment
echo export "SOCA_VERSION="''' + str(params['Version']) + '''"" >> /etc/environment
echo export "SOCA_JOB_EFA="''' + str(params['Efa']).lower() + '''"" >> /etc/environment
echo export "SOCA_JOB_ID="''' + str(params['JobId']) + '''"" >> /etc/environment
echo export "SOCA_SCRATCH_SIZE=''' + str(params['ScratchSize']) + '''" >> /etc/environment
echo export "SOCA_INSTALL_BUCKET="''' + str(params['S3Bucket']) + '''"" >> /etc/environment
echo export "SOCA_INSTALL_BUCKET_FOLDER="''' + str(params['S3InstallFolder']) + '''"" >> /etc/environment
echo export "SOCA_FSX_LUSTRE_BUCKET="''' + str(params['FSxLustreConfiguration']['fsx_lustre']).lower() + '''"" >> /etc/environment
echo export "SOCA_FSX_LUSTRE_DNS="''' + str(params['FSxLustreConfiguration']['existing_fsx']).lower() + '''"" >> /etc/environment
echo export "SOCA_INSTANCE_TYPE=$GET_INSTANCE_TYPE" >> /etc/environment
echo export "SOCA_INSTANCE_HYPERTHREADING="''' + str(params['ThreadsPerCore']).lower() + '''"" >> /etc/environment
echo export "SOCA_SYSTEM_METRICS="''' + str(params['SystemMetrics']).lower() + '''"" >> /etc/environment
echo export "SOCA_ESDOMAIN_ENDPOINT="''' + str(params['ESDomainEndpoint']).lower() + '''"" >> /etc/environment
echo export "SOCA_AUTH_PROVIDER="''' + str(params['AuthProvider']).lower() + '''"" >> /etc/environment
echo export "SOCA_HOST_SYSTEM_LOG="/apps/soca/''' + str(params['ClusterId']) + '''/cluster_node_bootstrap/logs/''' + str(params['JobId']) + '''/$(hostname -s)"" >> /etc/environment
echo export "AWS_STACK_ID=${AWS::StackName}" >> /etc/environment
echo export "AWS_DEFAULT_REGION=${AWS::Region}" >> /etc/environment


source /etc/environment
AWS=$(command -v aws)

# Give yum permission to the user on this specific machine
echo "''' + params['JobOwner'] + ''' ALL=(ALL) /bin/yum" >> /etc/sudoers

# Mount File system
mkdir -p /apps
mkdir -p /data

FS_DATA_PROVIDER='''+params['FileSystemDataProvider']+'''
FS_DATA='''+params['FileSystemData']+'''
FS_APPS_PROVIDER='''+params['FileSystemAppsProvider']+'''
FS_APPS='''+params['FileSystemApps']+'''

if [[ "$FS_DATA_PROVIDER" == "fsx_lustre" ]] || [[ "$FS_APPS_PROVIDER" == "fsx_lustre" ]]; then
    if [[ -z "$(rpm -qa lustre-client)" ]]; then
        # Install FSx for Lustre Client
        if [[ "$SOCA_BASE_OS" == "amazonlinux2" ]]; then
            amazon-linux-extras install -y lustre2.10
        else
            kernel=$(uname -r)
            machine=$(uname -m)
            echo "Found kernel version: $kernel running on: $machine"
            yum -y install wget
            if [[ $kernel == *"3.10.0-957"*$machine ]]; then
                yum -y install https://downloads.whamcloud.com/public/lustre/lustre-2.10.8/el7/client/RPMS/x86_64/kmod-lustre-client-2.10.8-1.el7.x86_64.rpm
                yum -y install https://downloads.whamcloud.com/public/lustre/lustre-2.10.8/el7/client/RPMS/x86_64/lustre-client-2.10.8-1.el7.x86_64.rpm
            elif [[ $kernel == *"3.10.0-1062"*$machine ]]; then
                wget https://fsx-lustre-client-repo-public-keys.s3.amazonaws.com/fsx-rpm-public-key.asc -O /tmp/fsx-rpm-public-key.asc
                rpm --import /tmp/fsx-rpm-public-key.asc
                wget https://fsx-lustre-client-repo.s3.amazonaws.com/el/7/fsx-lustre-client.repo -O /etc/yum.repos.d/aws-fsx.repo
                sed -i 's#7#7.7#' /etc/yum.repos.d/aws-fsx.repo
                yum clean all
                yum install -y kmod-lustre-client lustre-client
            elif [[ $kernel == *"3.10.0-1127"*$machine ]]; then
                wget https://fsx-lustre-client-repo-public-keys.s3.amazonaws.com/fsx-rpm-public-key.asc -O /tmp/fsx-rpm-public-key.asc
                rpm --import /tmp/fsx-rpm-public-key.asc
                wget https://fsx-lustre-client-repo.s3.amazonaws.com/el/7/fsx-lustre-client.repo -O /etc/yum.repos.d/aws-fsx.repo
                sed -i 's#7#7.8#' /etc/yum.repos.d/aws-fsx.repo
                yum clean all
                yum install -y kmod-lustre-client lustre-client
            elif [[ $kernel == *"3.10.0-1160"*$machine ]]; then
                wget https://fsx-lustre-client-repo-public-keys.s3.amazonaws.com/fsx-rpm-public-key.asc -O /tmp/fsx-rpm-public-key.asc
                rpm --import /tmp/fsx-rpm-public-key.asc
                wget https://fsx-lustre-client-repo.s3.amazonaws.com/el/7/fsx-lustre-client.repo -O /etc/yum.repos.d/aws-fsx.repo
                yum clean all
                yum install -y kmod-lustre-client lustre-client
            elif [[ $kernel == *"4.18.0-193"*$machine ]]; then
                # FSX for Lustre on aarch64 is supported only on 4.18.0-193
                wget https://fsx-lustre-client-repo-public-keys.s3.amazonaws.com/fsx-rpm-public-key.asc -O /tmp/fsx-rpm-public-key.asc
                rpm --import /tmp/fsx-rpm-public-key.asc
                wget https://fsx-lustre-client-repo.s3.amazonaws.com/centos/7/fsx-lustre-client.repo -O /etc/yum.repos.d/aws-fsx.repo
                yum clean all
                yum install -y kmod-lustre-client lustre-client
            else
                echo "ERROR: Can't install FSx for Lustre client as kernel version: $kernel isn't matching expected versions: (x86_64: 3.10.0-957, -1062, -1127, -1160, aarch64: 4.18.0-193)!"
            fi
        fi
    fi
fi

if [[ "$FS_DATA_PROVIDER" == "efs" ]]; then
    echo "$FS_DATA:/ /data nfs4 nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport 0 0" >> /etc/fstab
elif [[ "$FS_DATA_PROVIDER" == "fsx_lustre" ]]; then
    FSX_ID=$(echo $FS_DATA | cut -d. -f1)
    FSX_DATA_MOUNT_NAME=$($AWS fsx describe-file-systems --file-system-ids $FSX_ID  --query FileSystems[].LustreConfiguration.MountName --output text)
    echo "$FS_DATA@tcp:/$FSX_DATA_MOUNT_NAME /data lustre defaults,noatime,flock,_netdev 0 0" >> /etc/fstab
fi

if [[ "$FS_APPS_PROVIDER" == "efs" ]]; then
    echo "$FS_APPS:/ /apps nfs4 nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport 0 0" >> /etc/fstab
elif [[ "$FS_APPS_PROVIDER" == "fsx_lustre" ]]; then
    FSX_ID=$(echo $FS_APPS | cut -d. -f1)
    FSX_APPS_MOUNT_NAME=$($AWS fsx describe-file-systems --file-system-ids $FSX_ID  --query FileSystems[].LustreConfiguration.MountName --output text)
    echo "$FS_APPS@tcp:/$FSX_APPS_MOUNT_NAME /apps lustre defaults,noatime,flock,_netdev 0 0" >> /etc/fstab
fi

FS_MOUNT=0
mount -a 
while [[ $? -ne 0 ]] && [[ $FS_MOUNT -lt 5 ]]
do
    SLEEP_TIME=$(( RANDOM % 60 ))
    echo "Failed to mount FS, retrying in $SLEEP_TIME seconds and Loop $FS_MOUNT/5..."
    sleep $SLEEP_TIME
    ((FS_MOUNT++))
    mount -a
done

# Configure Chrony
yum remove -y ntp
yum install -y chrony
mv /etc/chrony.conf  /etc/chrony.conf.original
echo -e """
# use the local instance NTP service, if available
server 169.254.169.123 prefer iburst minpoll 4 maxpoll 4

# Use public servers from the pool.ntp.org project.
# Please consider joining the pool (http://www.pool.ntp.org/join.html).
# !!! [BEGIN] SOCA REQUIREMENT
# You will need to open UDP egress traffic on your security group if you want to enable public pool
#pool 2.amazon.pool.ntp.org iburst
# !!! [END] SOCA REQUIREMENT
# Record the rate at which the system clock gains/losses time.
driftfile /var/lib/chrony/drift

# Allow the system clock to be stepped in the first three updates
# if its offset is larger than 1 second.
makestep 1.0 3

# Specify file containing keys for NTP authentication.
keyfile /etc/chrony.keys

# Specify directory for log files.
logdir /var/log/chrony

# save data between restarts for fast re-load
dumponexit
dumpdir /var/run/chrony
""" > /etc/chrony.conf
systemctl enable chronyd

# Prepare  Log folder
mkdir -p $SOCA_HOST_SYSTEM_LOG
echo "@reboot /bin/bash /apps/soca/$SOCA_CONFIGURATION/cluster_node_bootstrap/ComputeNodePostReboot.sh >> $SOCA_HOST_SYSTEM_LOG/ComputeNodePostReboot.log 2>&1" | crontab -
cp /apps/soca/$SOCA_CONFIGURATION/cluster_node_bootstrap/config.cfg /root/
/bin/bash /apps/soca/$SOCA_CONFIGURATION/cluster_node_bootstrap/ComputeNode.sh ''' + params['SchedulerHostname'] + ''' >> $SOCA_HOST_SYSTEM_LOG/ComputeNode.sh.log 2>&1'''

        # Specify the security groups to assign to the compute nodes. Max 5 per instance
        security_groups = [params["SecurityGroupId"]]
        if params["AdditionalSecurityGroupIds"]:
            for sg_id in params["AdditionalSecurityGroupIds"]:
                security_groups.append(sg_id)

        # Specify the IAM instance profile to use
        instance_profile = params["ComputeNodeInstanceProfileArn"] if params["CustomIamInstanceProfile"] is False else params["CustomIamInstanceProfile"]

        SpotFleet = True if ((params["SpotPrice"] is not False) and (params["SpotAllocationCount"] is False) and (int(params["DesiredCapacity"]) > 1 or len(instances_list)>1)) else False
        ltd.EbsOptimized = True
        for instance in instances_list:
            if "t2." in instance:
                ltd.EbsOptimized = False

            # metal + t2 does not support CpuOptions
            unsupported = ["t2.", "metal"]
            if all(itype not in instance for itype in unsupported) and (SpotFleet is False or len(instances_list) == 1):
                # Spotfleet with multiple instance types doesn't support CpuOptions
                # So we can't add CpuOptions if SpotPrice is specified and when multiple instances are specified
                ltd.CpuOptions = CpuOptions(
                    CoreCount=int(params["CoreCount"]),
                    ThreadsPerCore=1 if params["ThreadsPerCore"] is False else 2)

        ltd.IamInstanceProfile = IamInstanceProfile(Arn=instance_profile)
        ltd.KeyName = params["SSHKeyPair"]
        ltd.ImageId = params["ImageId"]
        if params["SpotPrice"] is not False and params["SpotAllocationCount"] is False:
            ltd.InstanceMarketOptions = InstanceMarketOptions(
                MarketType="spot",
                SpotOptions=SpotOptions(
                    MaxPrice=Ref("AWS::NoValue") if params["SpotPrice"] == "auto" else str(params["SpotPrice"])
                    # auto -> cap at OD price
                )
            )
        ltd.InstanceType = instances_list[0]
        ltd.NetworkInterfaces = [NetworkInterfaces(
            InterfaceType="efa" if params["Efa"] is not False else Ref("AWS::NoValue"),
            DeleteOnTermination=True,
            DeviceIndex=0,
            Groups=security_groups
        )]
        ltd.UserData = Base64(Sub(UserData))
        ltd.BlockDeviceMappings = [
            LaunchTemplateBlockDeviceMapping(
                DeviceName="/dev/xvda" if params["BaseOS"] == "amazonlinux2" else "/dev/sda1",
                Ebs=EBSBlockDevice(
                    VolumeSize=params["RootSize"],
                    VolumeType="gp3",
                    DeleteOnTermination="false" if params["KeepEbs"] is True else "true",
                    Encrypted=True))
        ]
        if int(params["ScratchSize"]) > 0:
            ltd.BlockDeviceMappings.append(
                BlockDeviceMapping(
                    DeviceName="/dev/xvdbx",
                    Ebs=EBSBlockDevice(
                        VolumeSize=params["ScratchSize"],
                        VolumeType="io1" if int(params["VolumeTypeIops"]) > 0 else "gp3",
                        Iops=params["VolumeTypeIops"] if int(params["VolumeTypeIops"]) > 0 else Ref("AWS::NoValue"),
                        DeleteOnTermination="false" if params["KeepEbs"] is True else "true",
                        Encrypted=True))
            )
        ltd.TagSpecifications = [ec2.TagSpecifications(
            ResourceType="instance",
            Tags = base_Tags(
                Name=str(params["ClusterId"]) + "-compute-job-" + str(params["JobId"]),
                _soca_JobId=str(params["JobId"]),
                _soca_JobName=str(params["JobName"]),
                _soca_JobQueue=str(params["JobQueue"]),
                _soca_StackId=stack_name,
                _soca_JobOwner=str(params["JobOwner"]),
                _soca_JobProject=str(params["JobProject"]),
                _soca_TerminateWhenIdle=str(params["TerminateWhenIdle"]),
                _soca_KeepForever=str(params["KeepForever"]).lower(),
                _soca_ClusterId=str(params["ClusterId"]),
                _soca_NodeType="soca-compute-node"))]
        # End LaunchTemplateData

        # Begin Launch Template Resource
        lt = LaunchTemplate("NodeLaunchTemplate")
        lt.LaunchTemplateName = params["ClusterId"] + "-" + str(params["JobId"])
        lt.LaunchTemplateData = ltd
        t.add_resource(lt)
        # End Launch Template Resource

        if SpotFleet is True:
            # SpotPrice is defined and DesiredCapacity > 1 or need to try more than 1 instance_type
            # Create SpotFleet

            # Begin SpotFleetRequestConfigData Resource
            sfrcd = ec2.SpotFleetRequestConfigData()
            sfrcd.AllocationStrategy = params["SpotAllocationStrategy"]
            sfrcd.ExcessCapacityTerminationPolicy = "noTermination"
            sfrcd.IamFleetRole = params["SpotFleetIAMRoleArn"]
            sfrcd.InstanceInterruptionBehavior = "terminate"
            if params["SpotPrice"] != "auto":
                sfrcd.SpotPrice = str(params["SpotPrice"])
            sfrcd.SpotMaintenanceStrategies = ec2.SpotMaintenanceStrategies(
                    CapacityRebalance=ec2.SpotCapacityRebalance(ReplacementStrategy="launch"))
            sfrcd.TargetCapacity = params["DesiredCapacity"]
            sfrcd.Type = "maintain"
            sfltc = ec2.LaunchTemplateConfigs()
            sflts = ec2.LaunchTemplateSpecification(
                    LaunchTemplateId=Ref(lt),
                    Version=GetAtt(lt, "LatestVersionNumber"))
            sfltc.LaunchTemplateSpecification = sflts
            sfltc.Overrides = []
            for subnet in params["SubnetId"]:
                for index, instance in enumerate(instances_list):
                    if params["WeightedCapacity"] is not False:
                        sfltc.Overrides.append(ec2.LaunchTemplateOverrides(
                                InstanceType = instance,
                                SubnetId = subnet,
                                WeightedCapacity = params["WeightedCapacity"][index]))
                    else:
                        sfltc.Overrides.append(ec2.LaunchTemplateOverrides(
                                InstanceType = instance,
                                SubnetId = subnet))
            sfrcd.LaunchTemplateConfigs = [sfltc]
            TagSpecifications = ec2.SpotFleetTagSpecification(
                ResourceType="spot-fleet-request",
                Tags=base_Tags(
                Name=str(params["ClusterId"]) + "-compute-job-" + str(params["JobId"]),
                _soca_JobId=str(params["JobId"]),
                _soca_JobName=str(params["JobName"]),
                _soca_JobQueue=str(params["JobQueue"]),
                _soca_StackId=stack_name,
                _soca_JobOwner=str(params["JobOwner"]),
                _soca_JobProject=str(params["JobProject"]),
                _soca_TerminateWhenIdle=str(params["TerminateWhenIdle"]),
                _soca_KeepForever=str(params["KeepForever"]).lower(),
                _soca_ClusterId=str(params["ClusterId"]),
                _soca_NodeType="soca-compute-node"))
            # End SpotFleetRequestConfigData Resource

            # Begin SpotFleet Resource
            spotfleet = ec2.SpotFleet("SpotFleet")
            spotfleet.SpotFleetRequestConfigData = sfrcd
            t.add_resource(spotfleet)
            # End SpotFleet Resource
        else:

            asg_lt.LaunchTemplateSpecification = LaunchTemplateSpecification(
                LaunchTemplateId=Ref(lt),
                Version=GetAtt(lt, "LatestVersionNumber")
            )

            asg_lt.Overrides = []
            for index, instance in enumerate(instances_list):
                if params["WeightedCapacity"] is not False:
                    mip_usage = True
                    asg_lt.Overrides.append(LaunchTemplateOverrides(
                        InstanceType=instance,
                        WeightedCapacity=str(params["WeightedCapacity"][index])))
                else:
                    asg_lt.Overrides.append(LaunchTemplateOverrides(
                        InstanceType=instance))

            # Begin InstancesDistribution
            if params["SpotPrice"] is not False and \
                    params["SpotAllocationCount"] is not False and \
                    (int(params["DesiredCapacity"]) - int(params["SpotAllocationCount"])) > 0:
                mip_usage = True
                idistribution = InstancesDistribution()
                idistribution.OnDemandAllocationStrategy = "prioritized"  # only supported value
                idistribution.OnDemandBaseCapacity = params["DesiredCapacity"] - params["SpotAllocationCount"]
                idistribution.OnDemandPercentageAboveBaseCapacity = "0"  # force the other instances to be SPOT
                idistribution.SpotMaxPrice = Ref("AWS::NoValue") if params["SpotPrice"] == "auto" else str(
                    params["SpotPrice"])
                idistribution.SpotAllocationStrategy = params['SpotAllocationStrategy']
                mip.InstancesDistribution = idistribution

            # End MixedPolicyInstance

            # Begin AutoScalingGroup Resource
            asg = AutoScalingGroup("AutoScalingComputeGroup")
            asg.DependsOn = "NodeLaunchTemplate"
            if mip_usage is True or instances_list.__len__() > 1:
                mip.LaunchTemplate = asg_lt
                asg.MixedInstancesPolicy = mip

            else:
                asg.LaunchTemplate = LaunchTemplateSpecification(
                    LaunchTemplateId=Ref(lt),
                    Version=GetAtt(lt, "LatestVersionNumber"))

            asg.MinSize = int(params["DesiredCapacity"])
            asg.MaxSize = int(params["DesiredCapacity"])
            asg.VPCZoneIdentifier = params["SubnetId"]
            asg.CapacityRebalance = False

            if params["PlacementGroup"] is True:
                pg = PlacementGroup("ComputeNodePlacementGroup")
                pg.Strategy = "cluster"
                t.add_resource(pg)
                asg.PlacementGroup = Ref(pg)

            asg.Tags = Tags(
                Name=str(params["ClusterId"]) + "-compute-job-" + str(params["JobId"]),
                _soca_JobId=str(params["JobId"]),
                _soca_JobName=str(params["JobName"]),
                _soca_JobQueue=str(params["JobQueue"]),
                _soca_StackId=stack_name,
                _soca_JobOwner=str(params["JobOwner"]),
                _soca_JobProject=str(params["JobProject"]),
                _soca_TerminateWhenIdle=str(params["TerminateWhenIdle"]),
                _soca_KeepForever=str(params["KeepForever"]).lower(),
                _soca_ClusterId=str(params["ClusterId"]),
                _soca_NodeType="soca-compute-node")
            t.add_resource(asg)
            # End AutoScalingGroup Resource

        # Begin FSx for Lustre
        if params["FSxLustreConfiguration"]["fsx_lustre"] is not False:
            if params["FSxLustreConfiguration"]["existing_fsx"] is False:
                fsx_lustre = FileSystem("FSxForLustre")
                fsx_lustre.FileSystemType = "LUSTRE"
                fsx_lustre.StorageCapacity = params["FSxLustreConfiguration"]["capacity"]
                fsx_lustre.SecurityGroupIds = security_groups
                fsx_lustre.SubnetIds = params["SubnetId"]
                fsx_lustre_configuration = LustreConfiguration()
                fsx_lustre_configuration.DeploymentType = params["FSxLustreConfiguration"]["deployment_type"].upper()
                if params["FSxLustreConfiguration"]["deployment_type"].upper() == "PERSISTENT_1":
                    fsx_lustre_configuration.PerUnitStorageThroughput = params["FSxLustreConfiguration"]["per_unit_throughput"]

                if params["FSxLustreConfiguration"]["s3_backend"] is not False:
                    fsx_lustre_configuration.ImportPath = params["FSxLustreConfiguration"]["import_path"] if params["FSxLustreConfiguration"]["import_path"] is not False else params["FSxLustreConfiguration"]["s3_backend"]
                    fsx_lustre_configuration.ExportPath = params["FSxLustreConfiguration"]["import_path"] if params["FSxLustreConfiguration"]["import_path"] is not False else params["FSxLustreConfiguration"]["s3_backend"] + "/" + params["ClusterId"] + "-fsxoutput/job-" +  params["JobId"] + "/"

                fsx_lustre.LustreConfiguration = fsx_lustre_configuration
                fsx_lustre.Tags = base_Tags(
                    # False disable PropagateAtLaunch
                    Name=str(params["ClusterId"] + "-compute-job-" + params["JobId"]),
                    _soca_JobId=str(params["JobId"]),
                    _soca_JobName=str(params["JobName"]),
                    _soca_JobQueue=str(params["JobQueue"]),
                    _soca_TerminateWhenIdle=str(params["TerminateWhenIdle"]),
                    _soca_StackId=stack_name,
                    _soca_JobOwner=str(params["JobOwner"]),
                    _soca_JobProject=str(params["JobProject"]),
                    _soca_KeepForever=str(params["KeepForever"]).lower(),
                    _soca_FSx="true",
                    _soca_ClusterId=str(params["ClusterId"]),
                )
                t.add_resource(fsx_lustre)
        # End FSx For Lustre

        # Begin Custom Resource
        # Change Mapping to No if you want to disable this
        if allow_anonymous_data_collection is True:
            metrics = CustomResourceSendAnonymousMetrics("SendAnonymousData")
            metrics.ServiceToken = params["SolutionMetricsLambda"]
            metrics.DesiredCapacity = str(params["DesiredCapacity"])
            metrics.InstanceType = str(params["InstanceType"])
            metrics.Efa = str(params["Efa"])
            metrics.ScratchSize = str(params["ScratchSize"])
            metrics.RootSize = str(params["RootSize"])
            metrics.SpotPrice = str(params["SpotPrice"])
            metrics.BaseOS = str(params["BaseOS"])
            metrics.StackUUID = str(params["StackUUID"])
            metrics.KeepForever = str(params["KeepForever"])
            metrics.FsxLustre = str(params["FSxLustreConfiguration"])
            metrics.TerminateWhenIdle = str(params["TerminateWhenIdle"])
            metrics.Dcv = "false"
            t.add_resource(metrics)
        # End Custom Resource

        if debug is True:
            print(t.to_json())

        # Tags must use "soca:<Key>" syntax
        template_output = t.to_yaml().replace("_soca_", "soca:")
        return {'success': True,
                'output': template_output}

    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        return {'success': False,
                'output': 'cloudformation_builder.py: ' + (
                            str(e) + ': error :' + str(exc_type) + ' ' + str(fname) + ' ' + str(exc_tb.tb_lineno))}
Ejemplo n.º 4
0
def main(**params):
    try:
        # Metadata
        t = Template()
        t.set_version("2010-09-09")
        t.set_description(
            "(SOCA) - Base template to deploy compute nodes. Version 2.6.0")
        allow_anonymous_data_collection = params["MetricCollectionAnonymous"]
        debug = False
        mip_usage = False
        instances_list = params[
            "InstanceType"]  # list of instance type. Use + to specify more than one type
        asg_lt = asg_LaunchTemplate()
        ltd = LaunchTemplateData("NodeLaunchTemplateData")
        mip = MixedInstancesPolicy()
        stack_name = Ref("AWS::StackName")

        # Begin LaunchTemplateData
        UserData = '''#!/bin/bash -ex

# Configure the proxy
value="''' + params['ProxyCACert'] + '''"
echo $value >  /etc/pki/ca-trust/source/anchors/proxyCA.pem
update-ca-trust

cat <<EOF > /etc/profile.d/proxy.sh
proxy_url="http://''' + params['ProxyPrivateDnsName'] + ''':3128/"

export HTTP_PROXY=\$proxy_url
export HTTPS_PROXY=\$proxy_url
export http_proxy=\$proxy_url
export https_proxy=\$proxy_url

# No proxy:
# Comma separated list of destinations that shouldn't go to the proxy.
# - EC2 metadata service
# - Private IP address ranges (VPC local)
export NO_PROXY="''' + params['NoProxy'] + '''"
export no_proxy=\$NO_PROXY

export REQUESTS_CA_BUNDLE=/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem
EOF

source /etc/profile.d/proxy.sh

cat <<EOF > /etc/yum.repos.d/10_proxy.conf
[main]
proxy=http://''' + params['ProxyPrivateDnsName'] + ''':3128/
EOF

if grep -q 'Amazon Linux release 2' /etc/system-release; then
    BASE_OS=amazonlinux2
elif grep -q 'CentOS Linux release 7' /etc/system-release; then
    BASE_OS=centos7
else
    BASE_OS=rhel7
fi

# Install pip and awscli
export PATH=$PATH:/usr/local/bin
if [[ "$BASE_OS" == "centos7" ]] || [[ "$BASE_OS" == "rhel7" ]];
then
     yum install -y python3-pip
     PIP=$(which pip3)
     $PIP install awscli
else
     yum install -y python3-pip
     PIP=$(which pip3)
     $PIP install awscli
fi

# Configure using ansible
# If not amazon linux then the proxy needs to be set up before ansible can be installed.
# The playbooks are downloaded from S3 using the S3 VPC endpoint so don't require the proxy.
if ! yum list installed ansible &> /dev/null; then
    if [ $BASE_OS == "amazonlinux2" ]; then
        amazon-linux-extras install -y ansible2
    else
        yum -y install ansible
    fi
fi
aws s3 cp --recursive s3://''' + params['S3Bucket'] + '''/''' + params[
            'S3InstallFolder'] + '''/playbooks/ /root/playbooks/
cd /root/playbooks
ansible-playbook computeNode.yml -e Region=''' + params[
                'Region'] + ''' -e Domain=''' + params[
                    'SocaDomain'] + ''' -e S3InstallBucket=''' + params[
                        'S3Bucket'] + ''' -e S3InstallFolder=''' + params[
                            'S3InstallFolder'] + ''' -e ClusterId=''' + params[
                                'ClusterId'] + ''' -e NoProxy=''' + params[
                                    'NoProxy'] + ''' -e NodeType=''' + params[
                                        'NodeType'] + ''' >> /root/ansible.log 2>&1

if [[ "$BASE_OS" == "centos7" ]] || [[ "$BASE_OS" == "rhel7" ]];
then
     yum install -y nfs-utils # enforce install of nfs-utils
fi

if [[ "$BASE_OS" == "amazonlinux2" ]];
    then
        /usr/sbin/update-motd --disable
fi

GET_INSTANCE_TYPE=$(curl http://169.254.169.254/latest/meta-data/instance-type)
echo export "SOCA_CONFIGURATION="''' + str(params['ClusterId']
                                           ) + '''"" >> /etc/environment
echo export "SOCA_BASE_OS="$BASE_OS"" >> /etc/environment
echo export "SOCA_JOB_QUEUE="''' + str(params['JobQueue']
                                       ) + '''"" >> /etc/environment
echo export "SOCA_JOB_OWNER="''' + str(params['JobOwner']
                                       ) + '''"" >> /etc/environment
echo export "SOCA_JOB_NAME="''' + str(params['JobName']
                                      ) + '''"" >> /etc/environment
echo export "SOCA_JOB_PROJECT="''' + str(params['JobProject']
                                         ) + '''"" >> /etc/environment
echo export "SOCA_VERSION="''' + str(params['Version']
                                     ) + '''"" >> /etc/environment
echo export "SOCA_JOB_EFA="''' + str(params['Efa']).lower(
                                     ) + '''"" >> /etc/environment
echo export "SOCA_JOB_ID="''' + str(params['JobId']
                                    ) + '''"" >> /etc/environment
echo export "SOCA_SCRATCH_SIZE=''' + str(
                                        params['ScratchSize']
                                    ) + '''" >> /etc/environment
echo export "SOCA_INSTALL_BUCKET="''' + str(
                                        params['S3Bucket']
                                    ) + '''"" >> /etc/environment
echo export "SOCA_INSTALL_BUCKET_FOLDER="''' + str(
                                        params['S3InstallFolder']
                                    ) + '''"" >> /etc/environment
echo export "SOCA_FSX_LUSTRE_BUCKET="''' + str(
                                        params['FSxLustreConfiguration']
                                        ['fsx_lustre']
                                    ).lower() + '''"" >> /etc/environment
echo export "SOCA_FSX_LUSTRE_DNS="''' + str(
                                        params['FSxLustreConfiguration']
                                        ['existing_fsx']
                                    ).lower() + '''"" >> /etc/environment
echo export "SOCA_INSTANCE_TYPE=$GET_INSTANCE_TYPE" >> /etc/environment
echo export "SOCA_INSTANCE_HYPERTHREADING="''' + str(
                                        params['ThreadsPerCore']
                                    ).lower() + '''"" >> /etc/environment
echo export "SOCA_SYSTEM_METRICS="''' + str(params['SystemMetrics']).lower(
                                    ) + '''"" >> /etc/environment
echo export "SOCA_ESDOMAIN_ENDPOINT="''' + str(
                                        params['ESDomainEndpoint']
                                    ).lower() + '''"" >> /etc/environment


echo export "SOCA_HOST_SYSTEM_LOG="/apps/soca/''' + str(
                                        params['ClusterId']
                                    ) + '''/cluster_node_bootstrap/logs/''' + str(
                                        params['JobId']
                                    ) + '''/$(hostname -s)"" >> /etc/environment
echo export "AWS_STACK_ID=${AWS::StackName}" >> /etc/environment
echo export "AWS_DEFAULT_REGION=''' + params[
                                        'Region'] + '''" >> /etc/environment


source /etc/environment
AWS=$(which aws)

# Give yum permission to the user on this specific machine
echo "''' + params['JobOwner'] + ''' ALL=(ALL) /bin/yum" >> /etc/sudoers

mkdir -p /apps
mkdir -p /data

# Mount EFS
echo "''' + params['EFSDataDns'] + ''':/ /data nfs4 nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport 0 0" >> /etc/fstab
echo "''' + params['EFSAppsDns'] + ''':/ /apps nfs4 nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport 0 0" >> /etc/fstab
EFS_MOUNT=0
mount -a 
while [[ $? -ne 0 ]] && [[ $EFS_MOUNT -lt 5 ]]
  do
    SLEEP_TIME=$(( RANDOM % 60 ))
    echo "Failed to mount EFS, retrying in $SLEEP_TIME seconds and Loop $EFS_MOUNT/5..."
    sleep $SLEEP_TIME
    ((EFS_MOUNT++))
    mount -a
  done

# Configure Chrony
yum remove -y ntp
yum install -y chrony
mv /etc/chrony.conf  /etc/chrony.conf.original
echo -e """
# use the local instance NTP service, if available
server 169.254.169.123 prefer iburst minpoll 4 maxpoll 4

# Use public servers from the pool.ntp.org project.
# Please consider joining the pool (http://www.pool.ntp.org/join.html).
# !!! [BEGIN] SOCA REQUIREMENT
# You will need to open UDP egress traffic on your security group if you want to enable public pool
#pool 2.amazon.pool.ntp.org iburst
# !!! [END] SOCA REQUIREMENT
# Record the rate at which the system clock gains/losses time.
driftfile /var/lib/chrony/drift

# Allow the system clock to be stepped in the first three updates
# if its offset is larger than 1 second.
makestep 1.0 3

# Specify file containing keys for NTP authentication.
keyfile /etc/chrony.keys

# Specify directory for log files.
logdir /var/log/chrony

# save data between restarts for fast re-load
dumponexit
dumpdir /var/run/chrony
""" > /etc/chrony.conf
systemctl enable chronyd

# Prepare  Log folder
mkdir -p $SOCA_HOST_SYSTEM_LOG
chmod +x /apps/soca/$SOCA_CONFIGURATION/cluster_node_bootstrap/ComputeNodePostReboot.sh
echo "@reboot /apps/soca/$SOCA_CONFIGURATION/cluster_node_bootstrap/ComputeNodePostReboot.sh >> $SOCA_HOST_SYSTEM_LOG/ComputeNodePostReboot.log 2>&1" | crontab -
$AWS s3 cp s3://$SOCA_INSTALL_BUCKET/$SOCA_INSTALL_BUCKET_FOLDER/scripts/config.cfg /root/
chmod +x /apps/soca/$SOCA_CONFIGURATION/cluster_node_bootstrap/ComputeNode.sh
/apps/soca/$SOCA_CONFIGURATION/cluster_node_bootstrap/ComputeNode.sh ''' + params[
                                            'SchedulerHostname'] + ''' >> $SOCA_HOST_SYSTEM_LOG/ComputeNode.sh.log 2>&1'''

        SpotFleet = True if ((params["SpotPrice"] is not False) and
                             (int(params["DesiredCapacity"]) > 1
                              or len(instances_list) > 1)) else False
        ltd.EbsOptimized = True
        for instance in instances_list:
            if "t2." in instance:
                ltd.EbsOptimized = False

            # metal + t2 does not support CpuOptions
            unsupported = ["t2.", "metal"]
            if all(itype not in instance
                   for itype in unsupported) and (SpotFleet is False
                                                  or len(instances_list) == 1):
                # Spotfleet with multiple instance types doesn't support CpuOptions
                # So we can't add CpuOptions if SpotPrice is specified and when multiple instances are specified
                ltd.CpuOptions = CpuOptions(
                    CoreCount=int(params["CoreCount"]),
                    ThreadsPerCore=1
                    if params["ThreadsPerCore"] is False else 2)

        ltd.IamInstanceProfile = IamInstanceProfile(
            Arn=params["ComputeNodeInstanceProfileArn"])
        ltd.KeyName = params["SSHKeyPair"]
        ltd.ImageId = params["ImageId"]
        if params["SpotPrice"] is not False and params[
                "SpotAllocationCount"] is False:
            ltd.InstanceMarketOptions = InstanceMarketOptions(
                MarketType="spot",
                SpotOptions=SpotOptions(
                    MaxPrice=Ref("AWS::NoValue") if params["SpotPrice"]
                    == "auto" else str(params["SpotPrice"])
                    # auto -> cap at OD price
                ))
        ltd.InstanceType = instances_list[0]
        ltd.NetworkInterfaces = [
            NetworkInterfaces(InterfaceType="efa" if params["Efa"] is not False
                              else Ref("AWS::NoValue"),
                              DeleteOnTermination=True,
                              DeviceIndex=0,
                              Groups=[params["SecurityGroupId"]])
        ]
        ltd.UserData = Base64(Sub(UserData))
        ltd.BlockDeviceMappings = [
            LaunchTemplateBlockDeviceMapping(
                DeviceName="/dev/xvda"
                if params["BaseOS"] == "amazonlinux2" else "/dev/sda1",
                Ebs=EBSBlockDevice(VolumeSize=params["RootSize"],
                                   VolumeType="gp2",
                                   DeleteOnTermination="false"
                                   if params["KeepEbs"] is True else "true",
                                   Encrypted=True))
        ]
        if int(params["ScratchSize"]) > 0:
            ltd.BlockDeviceMappings.append(
                BlockDeviceMapping(
                    DeviceName="/dev/xvdbx",
                    Ebs=EBSBlockDevice(
                        VolumeSize=params["ScratchSize"],
                        VolumeType="io1"
                        if int(params["VolumeTypeIops"]) > 0 else "gp2",
                        Iops=params["VolumeTypeIops"]
                        if int(params["VolumeTypeIops"]) > 0 else
                        Ref("AWS::NoValue"),
                        DeleteOnTermination="false"
                        if params["KeepEbs"] is True else "true",
                        Encrypted=True)))
        ltd.TagSpecifications = [
            ec2.TagSpecifications(
                ResourceType="instance",
                Tags=base_Tags(
                    Name=str(params["ClusterId"]) + "-compute-job-" +
                    str(params["JobId"]),
                    _soca_JobId=str(params["JobId"]),
                    _soca_JobName=str(params["JobName"]),
                    _soca_JobQueue=str(params["JobQueue"]),
                    _soca_StackId=stack_name,
                    _soca_JobOwner=str(params["JobOwner"]),
                    _soca_JobProject=str(params["JobProject"]),
                    _soca_TerminateWhenIdle=str(params["TerminateWhenIdle"]),
                    _soca_KeepForever=str(params["KeepForever"]).lower(),
                    _soca_ClusterId=str(params["ClusterId"]),
                    _soca_NodeType="soca-compute-node"))
        ]
        # End LaunchTemplateData

        # Begin Launch Template Resource
        lt = LaunchTemplate("NodeLaunchTemplate")
        lt.LaunchTemplateName = params["ClusterId"] + "-" + str(
            params["JobId"])
        lt.LaunchTemplateData = ltd
        t.add_resource(lt)
        # End Launch Template Resource

        if SpotFleet is True:
            # SpotPrice is defined and DesiredCapacity > 1 or need to try more than 1 instance_type
            # Create SpotFleet

            # Begin SpotFleetRequestConfigData Resource
            sfrcd = ec2.SpotFleetRequestConfigData()
            sfrcd.AllocationStrategy = params["SpotAllocationStrategy"]
            sfrcd.ExcessCapacityTerminationPolicy = "noTermination"
            sfrcd.IamFleetRole = params["SpotFleetIAMRoleArn"]
            sfrcd.InstanceInterruptionBehavior = "terminate"
            if params["SpotPrice"] != "auto":
                sfrcd.SpotPrice = str(params["SpotPrice"])
            sfrcd.TargetCapacity = params["DesiredCapacity"]
            sfrcd.Type = "maintain"
            sfltc = ec2.LaunchTemplateConfigs()
            sflts = ec2.LaunchTemplateSpecification(LaunchTemplateId=Ref(lt),
                                                    Version=GetAtt(
                                                        lt,
                                                        "LatestVersionNumber"))
            sfltc.LaunchTemplateSpecification = sflts
            sfltc.Overrides = []
            for subnet in params["SubnetId"]:
                for instance in instances_list:
                    sfltc.Overrides.append(
                        ec2.LaunchTemplateOverrides(InstanceType=instance,
                                                    SubnetId=subnet))
            sfrcd.LaunchTemplateConfigs = [sfltc]
            TagSpecifications = ec2.SpotFleetTagSpecification(
                ResourceType="spot-fleet-request",
                Tags=base_Tags(
                    Name=str(params["ClusterId"]) + "-compute-job-" +
                    str(params["JobId"]),
                    _soca_JobId=str(params["JobId"]),
                    _soca_JobName=str(params["JobName"]),
                    _soca_JobQueue=str(params["JobQueue"]),
                    _soca_StackId=stack_name,
                    _soca_JobOwner=str(params["JobOwner"]),
                    _soca_JobProject=str(params["JobProject"]),
                    _soca_TerminateWhenIdle=str(params["TerminateWhenIdle"]),
                    _soca_KeepForever=str(params["KeepForever"]).lower(),
                    _soca_ClusterId=str(params["ClusterId"]),
                    _soca_NodeType="soca-compute-node"))
            # End SpotFleetRequestConfigData Resource

            # Begin SpotFleet Resource
            spotfleet = ec2.SpotFleet("SpotFleet")
            spotfleet.SpotFleetRequestConfigData = sfrcd
            t.add_resource(spotfleet)
            # End SpotFleet Resource
        else:

            asg_lt.LaunchTemplateSpecification = LaunchTemplateSpecification(
                LaunchTemplateId=Ref(lt),
                Version=GetAtt(lt, "LatestVersionNumber"))

            asg_lt.Overrides = []
            for instance in instances_list:
                asg_lt.Overrides.append(
                    LaunchTemplateOverrides(InstanceType=instance))

            # Begin InstancesDistribution
            if params["SpotPrice"] is not False and \
                    params["SpotAllocationCount"] is not False and \
                    (int(params["DesiredCapacity"]) - int(params["SpotAllocationCount"])) > 0:
                mip_usage = True
                idistribution = InstancesDistribution()
                idistribution.OnDemandAllocationStrategy = "prioritized"  # only supported value
                idistribution.OnDemandBaseCapacity = params[
                    "DesiredCapacity"] - params["SpotAllocationCount"]
                idistribution.OnDemandPercentageAboveBaseCapacity = "0"  # force the other instances to be SPOT
                idistribution.SpotMaxPrice = Ref(
                    "AWS::NoValue") if params["SpotPrice"] == "auto" else str(
                        params["SpotPrice"])
                idistribution.SpotAllocationStrategy = params[
                    'SpotAllocationStrategy']
                mip.InstancesDistribution = idistribution

            # End MixedPolicyInstance

            # Begin AutoScalingGroup Resource
            asg = AutoScalingGroup("AutoScalingComputeGroup")
            asg.DependsOn = "NodeLaunchTemplate"
            if mip_usage is True or instances_list.__len__() > 1:
                mip.LaunchTemplate = asg_lt
                asg.MixedInstancesPolicy = mip

            else:
                asg.LaunchTemplate = LaunchTemplateSpecification(
                    LaunchTemplateId=Ref(lt),
                    Version=GetAtt(lt, "LatestVersionNumber"))

            asg.MinSize = int(params["DesiredCapacity"])
            asg.MaxSize = int(params["DesiredCapacity"])
            asg.VPCZoneIdentifier = params["SubnetId"]

            if params["PlacementGroup"] is True:
                pg = PlacementGroup("ComputeNodePlacementGroup")
                pg.Strategy = "cluster"
                t.add_resource(pg)
                asg.PlacementGroup = Ref(pg)

            asg.Tags = Tags(
                Name=str(params["ClusterId"]) + "-compute-job-" +
                str(params["JobId"]),
                _soca_JobId=str(params["JobId"]),
                _soca_JobName=str(params["JobName"]),
                _soca_JobQueue=str(params["JobQueue"]),
                _soca_StackId=stack_name,
                _soca_JobOwner=str(params["JobOwner"]),
                _soca_JobProject=str(params["JobProject"]),
                _soca_TerminateWhenIdle=str(params["TerminateWhenIdle"]),
                _soca_KeepForever=str(params["KeepForever"]).lower(),
                _soca_ClusterId=str(params["ClusterId"]),
                _soca_NodeType="soca-compute-node")
            t.add_resource(asg)
            # End AutoScalingGroup Resource

        # Begin FSx for Lustre
        if params["FSxLustreConfiguration"]["fsx_lustre"] is not False:
            if params["FSxLustreConfiguration"]["existing_fsx"] is False:
                fsx_lustre = FileSystem("FSxForLustre")
                fsx_lustre.FileSystemType = "LUSTRE"
                fsx_lustre.StorageCapacity = params["FSxLustreConfiguration"][
                    "capacity"]
                fsx_lustre.SecurityGroupIds = [params["SecurityGroupId"]]
                fsx_lustre.SubnetIds = params["SubnetId"]
                fsx_lustre_configuration = LustreConfiguration()
                fsx_lustre_configuration.DeploymentType = params[
                    "FSxLustreConfiguration"]["deployment_type"].upper()
                if params["FSxLustreConfiguration"]["deployment_type"].upper(
                ) == "PERSISTENT_1":
                    fsx_lustre_configuration.PerUnitStorageThroughput = params[
                        "FSxLustreConfiguration"]["per_unit_throughput"]

                if params["FSxLustreConfiguration"]["s3_backend"] is not False:
                    fsx_lustre_configuration.ImportPath = params[
                        "FSxLustreConfiguration"]["import_path"] if params[
                            "FSxLustreConfiguration"][
                                "import_path"] is not False else params[
                                    "FSxLustreConfiguration"]["s3_backend"]
                    fsx_lustre_configuration.ExportPath = params[
                        "FSxLustreConfiguration"]["import_path"] if params[
                            "FSxLustreConfiguration"][
                                "import_path"] is not False else params[
                                    "FSxLustreConfiguration"][
                                        "s3_backend"] + "/" + params[
                                            "ClusterId"] + "-fsxoutput/job-" + params[
                                                "JobId"] + "/"

                fsx_lustre.LustreConfiguration = fsx_lustre_configuration
                fsx_lustre.Tags = base_Tags(
                    # False disable PropagateAtLaunch
                    Name=str(params["ClusterId"] + "-compute-job-" +
                             params["JobId"]),
                    _soca_JobId=str(params["JobId"]),
                    _soca_JobName=str(params["JobName"]),
                    _soca_JobQueue=str(params["JobQueue"]),
                    _soca_TerminateWhenIdle=str(params["TerminateWhenIdle"]),
                    _soca_StackId=stack_name,
                    _soca_JobOwner=str(params["JobOwner"]),
                    _soca_JobProject=str(params["JobProject"]),
                    _soca_KeepForever=str(params["KeepForever"]).lower(),
                    _soca_FSx="true",
                    _soca_ClusterId=str(params["ClusterId"]),
                )
                t.add_resource(fsx_lustre)
        # End FSx For Lustre

        # Begin Custom Resource
        # Change Mapping to No if you want to disable this
        if allow_anonymous_data_collection is True:
            metrics = CustomResourceSendAnonymousMetrics("SendAnonymousData")
            metrics.ServiceToken = params["SolutionMetricLambda"]
            metrics.DesiredCapacity = str(params["DesiredCapacity"])
            metrics.InstanceType = str(params["InstanceType"])
            metrics.Efa = str(params["Efa"])
            metrics.ScratchSize = str(params["ScratchSize"])
            metrics.RootSize = str(params["RootSize"])
            metrics.SpotPrice = str(params["SpotPrice"])
            metrics.BaseOS = str(params["BaseOS"])
            metrics.StackUUID = str(params["StackUUID"])
            metrics.KeepForever = str(params["KeepForever"])
            metrics.FsxLustre = str(params["FSxLustreConfiguration"])
            metrics.TerminateWhenIdle = str(params["TerminateWhenIdle"])
            metrics.Dcv = "false"
            t.add_resource(metrics)
        # End Custom Resource

        if debug is True:
            print(t.to_json())

        # Tags must use "soca:<Key>" syntax
        template_output = t.to_yaml().replace("_soca_", "soca:")
        return {'success': True, 'output': template_output}

    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        return {
            'success':
            False,
            'output':
            'cloudformation_builder.py: ' +
            (str(e) + ': error :' + str(exc_type) + ' ' + str(fname) + ' ' +
             str(exc_tb.tb_lineno))
        }
Ejemplo n.º 5
0
launch_template = LaunchTemplate(
    region.replace("-", "") + "ecslivelaunchtemplate",
    LaunchTemplateName = "ecs-live-launch-template",
    LaunchTemplateData = LaunchTemplateData(
        ImageId = image_id,
        BlockDeviceMappings = [
            LaunchTemplateBlockDeviceMapping(
                DeviceName = "/dev/xvda",
                Ebs = EBSBlockDevice(
                    "ecsliveblockdevice",
                    VolumeSize = 30
                )
            )
        ],
        CreditSpecification = LaunchTemplateCreditSpecification(
            CpuCredits = "Unlimited"
        ),
        InstanceType = "t3.micro",
        IamInstanceProfile = IamInstanceProfile(
            region.replace("-", "") + "ecsliveiaminstanceprofile",
            Arn = GetAtt(ecs_instance_profile, "Arn")
        ),
        KeyName = "live-eu-west-1",
        SecurityGroupIds = [
            GetAtt(security_group, "GroupId")
        ],
        UserData = Base64(
            "#!/bin/bash\n" \
            "/usr/bin/yum install -y awscli && aws s3 cp s3://mgmt.eu-west-1.weblox.io/bootstrap/bootstrap.sh /root/bootstrap.sh && chmod 700 /root/bootstrap.sh && /root/bootstrap.sh\n"
        )
    ),
)