Example #1
    def add_asg_fleet(self, scope: BaseApp, cluster: Cluster,
                      fleet) -> List[AutoScalingGroup]:
        created_fleets: List[AutoScalingGroup] = []

        node_labels = fleet.get('nodeLabels', {})
        node_labels["fleetName"] = fleet.get('name')
        node_labels_as_str = ','.join(map('='.join, node_labels.items()))

        # Source of tweaks: https://kubedex.com/90-days-of-aws-eks-in-production/
        kubelet_extra_args = ' '.join([
            # Add node labels
            f'--node-labels {node_labels_as_str}'
            if len(node_labels_as_str) else '',

            # Reserve resources for Kubernetes system daemons such as the kubelet,
            # container runtime, and node problem detector.
            '--kube-reserved cpu=250m,memory=1Gi,ephemeral-storage=1Gi',

            # Reserve resources for vital system functions such as sshd and udev.
            '--system-reserved cpu=250m,memory=0.2Gi,ephemeral-storage=1Gi',

            # Start evicting pods from this node once these thresholds are crossed.
            '--eviction-hard memory.available<0.2Gi,nodefs.available<10%',
        ])

        cluster_sg = SecurityGroup.from_security_group_id(
            self,
            'eks-cluster-sg',
            security_group_id=cluster.cluster_security_group_id)

        asg_tags = {
            "k8s.io/cluster-autoscaler/enabled": "true",
            f"k8s.io/cluster-autoscaler/{cluster.cluster_name}": "owned",
        }

        # To autoscale the cluster correctly, the Auto Scaling groups must not span
        # multiple AZs (avoiding AZ rebalancing), so we create one ASG per subnet.
        for counter, subnet in enumerate(cluster.vpc.private_subnets):
            asg: AutoScalingGroup = cluster.add_capacity(
                id=scope.prefixed_str(f'{fleet.get("name")}-{counter}'),
                instance_type=InstanceType(fleet.get('instanceType')),
                min_capacity=fleet.get('autoscaling', {}).get('minInstances'),
                max_capacity=fleet.get('autoscaling', {}).get('maxInstances'),
                bootstrap_options=BootstrapOptions(
                    kubelet_extra_args=kubelet_extra_args),
                spot_price=str(fleet.get('spotPrice'))
                if fleet.get('spotPrice') else None,
                vpc_subnets=SubnetSelection(subnets=[subnet]),
            )
            created_fleets.append(asg)
            self._add_userdata_production_tweaks(asg)

            for key, value in asg_tags.items():
                Tag.add(asg, key, value)

        return created_fleets
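For reference, a minimal sketch of the fleet mapping this method expects, inferred from the fleet.get(...) calls above; the name, labels, and sizing values below are placeholders, not recommendations:

# Hypothetical fleet definition; each key corresponds to a lookup in add_asg_fleet.
fleet_config = {
    'name': 'workers',
    'nodeLabels': {'workload': 'general'},
    'instanceType': 'm5.large',
    'autoscaling': {'minInstances': 1, 'maxInstances': 4},
    'spotPrice': 0.05,  # optional; omit to use on-demand capacity
}
# asgs = eks_stack.add_asg_fleet(scope=app, cluster=cluster, fleet=fleet_config)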
Example #2
def get_file_system(scope: Construct) -> FileSystem:
    config = get_volume_config()
    stack_name = config.stack_name
    security_group = SecurityGroup.from_security_group_id(
        scope,
        'nfs_security_group',
        security_group_id=Fn.import_value(stack_name + 'SecurityGroupId'))
    return FileSystem.from_file_system_attributes(
        scope,
        'filesystem',
        file_system_id=Fn.import_value(stack_name + 'FileSystemId'),
        security_group=security_group)
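The two Fn.import_value calls assume a companion storage stack that exports <stack_name>SecurityGroupId and <stack_name>FileSystemId. A sketch of that exporting side (CDK v1 style, with illustrative construct IDs; file_system, security_group, and stack_name are assumed to exist in that stack) might look like:

from aws_cdk.core import CfnOutput

# Inside the stack that owns the EFS file system and its security group.
CfnOutput(self, 'FileSystemIdExport',
          value=file_system.file_system_id,
          export_name=stack_name + 'FileSystemId')
CfnOutput(self, 'SecurityGroupIdExport',
          value=security_group.security_group_id,
          export_name=stack_name + 'SecurityGroupId')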
Example #3
    def __init__(self, scope: core.Construct, id: str, deploy_env: str,
                 vpc: aws_ec2.Vpc, db_redis_stack: RdsElasticacheEfsStack,
                 config: dict, **kwargs) -> None:
        super().__init__(scope, id, **kwargs)
        self.config = config
        self.deploy_env = deploy_env
        self.db_port = DB_PORT
        # Volumes cannot be mapped to Fargate task definitions via CDK yet, so this
        # is done with Boto3 instead: https://github.com/aws/containers-roadmap/issues/825
        #self.efs_file_system_id = db_redis_stack.efs_file_system_id
        cluster_name = get_cluster_name(deploy_env)
        self.cluster = ecs.Cluster(self,
                                   cluster_name,
                                   cluster_name=cluster_name,
                                   vpc=vpc)
        pwd_secret = ecs.Secret.from_ssm_parameter(
            StringParameter.from_secure_string_parameter_attributes(
                self,
                f"dbpwd-{deploy_env}",
                version=1,
                parameter_name="postgres_pwd"))
        self.secrets = {"POSTGRES_PASSWORD": pwd_secret}
        environment = {
            "EXECUTOR": "Celery",
            "POSTGRES_HOST": db_redis_stack.db_host,
            "POSTGRES_PORT": str(self.db_port),
            "POSTGRES_DB": "airflow",
            "POSTGRES_USER": self.config["dbadmin"],
            "REDIS_HOST": db_redis_stack.redis_host,
            "VISIBILITY_TIMEOUT": str(self.config["celery_broker_visibility_timeout"]),
        }
        image_asset = DockerImageAsset(self,
                                       "AirflowImage",
                                       directory="build",
                                       repository_name=config["ecr_repo_name"])
        self.image = ecs.ContainerImage.from_docker_image_asset(image_asset)
        # web server - this initializes the DB, so it must be created first
        self.web_service = self.airflow_web_service(environment)
        # https://github.com/aws/aws-cdk/issues/1654
        self.web_service_sg().connections.allow_to_default_port(
            db_redis_stack.postgres_db, 'allow PG')
        redis_port_info = Port(protocol=Protocol.TCP,
                               string_representation="allow to redis",
                               from_port=REDIS_PORT,
                               to_port=REDIS_PORT)
        worker_port_info = Port(protocol=Protocol.TCP,
                                string_representation="allow to worker",
                                from_port=AIRFLOW_WORKER_PORT,
                                to_port=AIRFLOW_WORKER_PORT)
        redis_sg = SecurityGroup.from_security_group_id(
            self,
            id=f"Redis-SG-{deploy_env}",
            security_group_id=db_redis_stack.redis.vpc_security_group_ids[0])
        bastion_sg = db_redis_stack.bastion.connections.security_groups[0]
        self.web_service_sg().connections.allow_to(redis_sg, redis_port_info,
                                                   'allow Redis')
        self.web_service_sg().connections.allow_to_default_port(
            db_redis_stack.efs_file_system)
        # scheduler
        self.scheduler_service = self.create_scheduler_ecs_service(environment)
        # worker
        self.worker_service = self.worker_service(environment)
        self.scheduler_sg().connections.allow_to_default_port(
            db_redis_stack.postgres_db, 'allow PG')
        self.scheduler_sg().connections.allow_to(redis_sg, redis_port_info,
                                                 'allow Redis')
        self.scheduler_sg().connections.allow_to_default_port(
            db_redis_stack.efs_file_system)

        self.worker_sg().connections.allow_to_default_port(
            db_redis_stack.postgres_db, 'allow PG')
        self.worker_sg().connections.allow_to(redis_sg, redis_port_info,
                                              'allow Redis')
        self.worker_sg().connections.allow_to_default_port(
            db_redis_stack.efs_file_system)
        # When you start an Airflow worker, Airflow starts a tiny web server
        # subprocess to serve the worker's local log files to the main Airflow
        # web server, which then builds pages and sends them to users. This
        # defines the port on which the logs are served. The port needs to be
        # unused and reachable from the main web server so it can connect to
        # the workers.
        self.web_service_sg().connections.allow_to(self.worker_sg(),
                                                   worker_port_info,
                                                   'web service to worker')
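The snippet also relies on web_service_sg(), scheduler_sg(), and worker_sg() accessors that are not shown here. A minimal sketch of one of them, assuming airflow_web_service() returns an object whose .service exposes the underlying Fargate service's connections (as an ecs_patterns load-balanced service would), could be:

    def web_service_sg(self) -> ISecurityGroup:
        # Hypothetical accessor (not part of the original snippet): the security
        # group attached to the web service's tasks, so the allow_to(...) rules
        # above can reference it.
        return self.web_service.service.connections.security_groups[0]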