Example #1
def main(cluster_name, regions, sleep, kubeconfig,
         aws_access_key, aws_secret_key, idle_threshold, type_idle_threshold,
         instance_init_time, slack_hook, dry_run, verbose):
    if verbose > 0:
        logger_handler = logging.StreamHandler(sys.stderr)
        logger_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
        logger.addHandler(logger_handler)
        logger.setLevel(DEBUG_LOGGING_MAP.get(verbose, logging.DEBUG))

    if not (aws_secret_key and aws_access_key):
        logger.error("Missing AWS credentials. Please provide aws-access-key and aws-secret-key.")
        sys.exit(1)

    cluster = Cluster(aws_access_key=aws_access_key,
                      aws_secret_key=aws_secret_key,
                      regions=regions.split(','),
                      kubeconfig=kubeconfig,
                      idle_threshold=idle_threshold,
                      instance_init_time=instance_init_time,
                      type_idle_threshold=type_idle_threshold,
                      cluster_name=cluster_name,
                      slack_hook=slack_hook,
                      dry_run=dry_run
                      )
    backoff = sleep
    while True:
        scaled = cluster.scale_loop()
        if scaled:
            time.sleep(sleep)
            backoff = sleep
        else:
            logger.warning("backoff: %s", backoff)
            backoff *= 2
            time.sleep(backoff)
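
The option-heavy signature above is typical of a command-line entry point. A minimal sketch of how such a main() could be wired up with click, assuming click is the CLI library in use; the option names below are illustrative and abbreviated, not the project's actual flags:

import click


@click.command()
@click.option('--cluster-name', required=True)
@click.option('--regions', default='us-west-2')
@click.option('--sleep', default=60, type=int)
@click.option('--aws-access-key', envvar='AWS_ACCESS_KEY_ID', default='')
@click.option('--aws-secret-key', envvar='AWS_SECRET_ACCESS_KEY', default='')
@click.option('-v', '--verbose', count=True)
def cli(cluster_name, regions, sleep, aws_access_key, aws_secret_key, verbose):
    # In the real tool this would call a main() like the one above;
    # here the parsed options are only echoed.
    click.echo('cluster=%s regions=%s sleep=%s verbose=%s'
               % (cluster_name, regions, sleep, verbose))


if __name__ == '__main__':
    cli()
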
class TestCluster(unittest.TestCase):
    def setUp(self):
        # load dummy kube specs
        dir_path = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(dir_path, 'data/busybox.yaml'), 'r') as f:
            self.dummy_pod = yaml.safe_load(f)
        with open(os.path.join(dir_path, 'data/ds-pod.yaml'), 'r') as f:
            self.dummy_ds_pod = yaml.safe_load(f)
        with open(os.path.join(dir_path, 'data/rc-pod.yaml'), 'r') as f:
            self.dummy_rc_pod = yaml.safe_load(f)
        with open(os.path.join(dir_path, 'data/node.yaml'), 'r') as f:
            self.dummy_node = yaml.safe_load(f)
            for condition in self.dummy_node['status']['conditions']:
                if condition['type'] == 'Ready' and condition['status'] == 'True':
                    condition['lastHeartbeatTime'] = datetime.now(condition['lastHeartbeatTime'].tzinfo)
            # Convert timestamps to strings to match PyKube
            for condition in self.dummy_node['status']['conditions']:
                condition['lastHeartbeatTime'] = datetime.isoformat(condition['lastHeartbeatTime'])
                condition['lastTransitionTime'] = datetime.isoformat(condition['lastTransitionTime'])
        
        # this isn't actually used here
        # only needed to create the KubePod object...
        dir_path = os.path.dirname(os.path.realpath(__file__))
        self.api = pykube.HTTPClient(pykube.KubeConfig.from_file(os.path.join(dir_path, './data/kube_config.yaml')))

        self.cluster = Cluster(
            kubeconfig='~/.kube/config',
            idle_threshold=60,
            spare_agents=1,
            instance_init_time=60,
            resource_group='my-rg',
            notifier=None,
            service_principal_app_id='dummy',
            service_principal_secret='dummy',
            service_principal_tenant_id='dummy',
            subscription_id='dummy',
            client_private_key='dummy',
            ca_private_key='dummy',
            ignore_pools='',
            over_provision=0
        )

    def test_get_pending_pods(self):
        dummy_node = copy.deepcopy(self.dummy_node)
        dummy_node['metadata']['name'] = 'k8s-agentpool1-16334397-0'
        node = KubeNode(pykube.Node(self.api, dummy_node))
        node.capacity = capacity.get_capacity_for_instance_type(node.instance_type)
        pod = KubePod(pykube.Pod(self.api, self.dummy_pod))

        act = self.cluster.get_pending_pods([pod], [node])
        self.assertEqual(len(act), 0)

        node = KubeNode(pykube.Node(self.api, dummy_node))
        node.capacity = capacity.get_capacity_for_instance_type(node.instance_type)
        pod2 = KubePod(pykube.Pod(self.api, self.dummy_pod))
        pod3 = KubePod(pykube.Pod(self.api, self.dummy_pod))

        act = self.cluster.get_pending_pods([pod, pod2, pod3], [node])
        # only one of the three pods fits on the single node, so two remain pending
        self.assertEqual(len(act), 2)
Example #3
    def setUp(self):
        # load dummy kube specs
        dir_path = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(dir_path, 'data/busybox.yaml'), 'r') as f:
            self.dummy_pod = yaml.safe_load(f)
        with open(os.path.join(dir_path, 'data/ds-pod.yaml'), 'r') as f:
            self.dummy_ds_pod = yaml.safe_load(f)
        with open(os.path.join(dir_path, 'data/rc-pod.yaml'), 'r') as f:
            self.dummy_rc_pod = yaml.safe_load(f)
        with open(os.path.join(dir_path, 'data/node.yaml'), 'r') as f:
            self.dummy_node = yaml.safe_load(f)

        # this isn't actually used here
        # only needed to create the KubePod object...
        self.api = pykube.HTTPClient(
            pykube.KubeConfig.from_file('~/.kube/config'))

        # start creating our mock ec2 environment
        self.mocks = [moto.mock_ec2(), moto.mock_autoscaling()]
        for moto_mock in self.mocks:
            moto_mock.start()

        client = boto3.client('autoscaling', region_name='us-west-2')
        self.asg_client = client

        client.create_launch_configuration(LaunchConfigurationName='dummy-lc',
                                           ImageId='ami-deadbeef',
                                           KeyName='dummy-key',
                                           SecurityGroups=[
                                               'sg-cafebeef',
                                           ],
                                           InstanceType='t2.medium')

        client.create_auto_scaling_group(AutoScalingGroupName='dummy-asg',
                                         LaunchConfigurationName='dummy-lc',
                                         MinSize=0,
                                         MaxSize=10,
                                         VPCZoneIdentifier='subnet-beefbeef',
                                         Tags=[{
                                             'Key': 'KubernetesCluster',
                                             'Value': 'dummy-cluster',
                                             'PropagateAtLaunch': True
                                         }, {
                                             'Key': 'KubernetesRole',
                                             'Value': 'worker',
                                             'PropagateAtLaunch': True
                                         }])

        # finally our cluster
        self.cluster = Cluster(aws_access_key='',
                               aws_secret_key='',
                               regions=['us-west-2', 'us-east-1', 'us-west-1'],
                               kubeconfig='~/.kube/config',
                               pod_namespace=None,
                               idle_threshold=60,
                               instance_init_time=60,
                               type_idle_threshold=60,
                               cluster_name='dummy-cluster',
                               notifier=Notifier(),
                               dry_run=False)
Example #4
def main(cluster_name, aws_regions, azure_resource_groups,
         azure_slow_scale_classes, sleep, kubeconfig, azure_client_id,
         azure_client_secret, azure_subscription_id, azure_tenant_id,
         aws_access_key, aws_secret_key, pod_namespace, datadog_api_key,
         idle_threshold, type_idle_threshold, max_scale_in_fraction,
         drain_utilization, over_provision, instance_init_time, no_scale,
         no_maintenance, slack_hook, slack_bot_token, dry_run, verbose):
    logger_handler = logging.StreamHandler(sys.stderr)
    logger_handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            datefmt='%Y-%m-%dT%H:%M:%S%z'))
    logger.addHandler(logger_handler)
    logger.setLevel(DEBUG_LOGGING_MAP.get(verbose, logging.CRITICAL))

    aws_regions_list = aws_regions.split(',') if aws_regions else []
    if not (aws_secret_key and aws_access_key) and aws_regions_list:
        logger.error(
            "Missing AWS credentials. Please provide aws-access-key and aws-secret-key."
        )
        sys.exit(1)

    notifier = Notifier(slack_hook, slack_bot_token)
    cluster = Cluster(
        aws_access_key=aws_access_key,
        aws_secret_key=aws_secret_key,
        aws_regions=aws_regions_list,
        azure_client_id=azure_client_id,
        azure_client_secret=azure_client_secret,
        azure_subscription_id=azure_subscription_id,
        azure_tenant_id=azure_tenant_id,
        azure_resource_group_names=azure_resource_groups.split(',')
        if azure_resource_groups else [],
        azure_slow_scale_classes=azure_slow_scale_classes.split(',')
        if azure_slow_scale_classes else [],
        kubeconfig=kubeconfig,
        pod_namespace=pod_namespace,
        idle_threshold=idle_threshold,
        instance_init_time=instance_init_time,
        type_idle_threshold=type_idle_threshold,
        cluster_name=cluster_name,
        max_scale_in_fraction=max_scale_in_fraction,
        drain_utilization_below=drain_utilization,
        scale_up=not no_scale,
        maintainance=not no_maintenance,
        over_provision=over_provision,
        datadog_api_key=datadog_api_key,
        notifier=notifier,
        dry_run=dry_run,
    )
    backoff = sleep
    while True:
        scaled = cluster.scale_loop()
        if scaled:
            time.sleep(sleep)
            backoff = sleep
        else:
            logger.warning("backoff: %s", backoff)
            backoff *= 2
            time.sleep(backoff)
Example #5
def main(resource_group, acs_deployment, sleep, kubeconfig,
         service_principal_app_id, service_principal_secret,
         kubeconfig_private_key, client_private_key, 
         service_principal_tenant_id, spare_agents, idle_threshold,
         no_scale, over_provision, no_maintenance, ignore_pools, slack_hook, slack_bot_token,
         dry_run, verbose, debug):
    logger_handler = logging.StreamHandler(sys.stderr)
    logger_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(logger_handler)
    logger.setLevel(DEBUG_LOGGING_MAP.get(verbose, logging.CRITICAL))

    if not (service_principal_app_id and service_principal_secret and service_principal_tenant_id):
        logger.error("Missing Azure credentials. Please provide aws-service_principal_app_id, service_principal_secret and service_principal_tenant_id.")
        sys.exit(1)
    
    if not client_private_key:
        logger.error('Missing client_private_key. Provide it through --client-private-key or CLIENT_PRIVATE_KEY environment variable')
    
    if not kubeconfig_private_key:
        logger.error('Missing kubeconfig_private_key. Provide it through --kubeconfig-private-key or KUBECONFIG_PRIVATE_KEY environment variable')
    
    notifier = None
    if slack_hook and slack_bot_token:
        notifier = Notifier(slack_hook, slack_bot_token)

    instance_init_time = 600
    
    cluster = Cluster(kubeconfig=kubeconfig,
                      instance_init_time=instance_init_time,
                      spare_agents=spare_agents,
                      idle_threshold=idle_threshold,
                      resource_group=resource_group,
                      acs_deployment=acs_deployment,
                      service_principal_app_id=service_principal_app_id,
                      service_principal_secret=service_principal_secret,
                      service_principal_tenant_id=service_principal_tenant_id,
                      kubeconfig_private_key=kubeconfig_private_key,
                      client_private_key=client_private_key,
                      scale_up=not no_scale,
                      ignore_pools=ignore_pools,
                      maintainance=not no_maintenance,
                      over_provision=over_provision,
                      notifier=notifier,
                      dry_run=dry_run,
                      )
    cluster.login()
    backoff = sleep
    while True:
        scaled = cluster.loop(debug)
        if scaled:
            time.sleep(sleep)
            backoff = sleep
        else:
            logger.warning("backoff: %s", backoff)
            backoff *= 2
            time.sleep(backoff)
def main(container_service_name, resource_group, sleep, kubeconfig,
         service_principal_app_id, service_principal_secret,
         service_principal_tenant_id, cpu_per_node, datadog_api_key,
         idle_threshold, reserve_idle_threshold, over_provision,
         instance_init_time, no_scale, no_maintenance, slack_hook,
         slack_bot_token, dry_run, verbose):
    logger_handler = logging.StreamHandler(sys.stderr)
    logger_handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(logger_handler)
    logger.setLevel(DEBUG_LOGGING_MAP.get(verbose, logging.CRITICAL))

    if not (service_principal_app_id and service_principal_secret
            and service_principal_tenant_id):
        logger.error(
            "Missing Azure credentials. Please provide aws-service_principal_app_id, service_principal_secret and service_principal_tenant_id."
        )
        sys.exit(1)

    notifier = Notifier(slack_hook, slack_bot_token)
    cluster = Cluster(
        service_principal_app_id=service_principal_app_id,
        service_principal_secret=service_principal_secret,
        service_principal_tenant_id=service_principal_tenant_id,
        kubeconfig=kubeconfig,
        idle_threshold=idle_threshold,
        instance_init_time=instance_init_time,
        reserve_idle_threshold=reserve_idle_threshold,
        container_service_name=container_service_name,
        resource_group=resource_group,
        scale_up=not no_scale,
        maintainance=not no_maintenance,
        over_provision=over_provision,
        datadog_api_key=datadog_api_key,
        notifier=notifier,
        dry_run=dry_run,
    )
    backoff = sleep
    while True:
        scaled = cluster.scale_loop()
        if scaled:
            time.sleep(sleep)
            backoff = sleep
        else:
            logger.warning("backoff: %s", backoff)
            backoff *= 2
            time.sleep(backoff)
    def setUp(self):
        # load dummy kube specs
        dir_path = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(dir_path, 'data/busybox.yaml'), 'r') as f:
            self.dummy_pod = yaml.safe_load(f)
        with open(os.path.join(dir_path, 'data/ds-pod.yaml'), 'r') as f:
            self.dummy_ds_pod = yaml.safe_load(f)
        with open(os.path.join(dir_path, 'data/rc-pod.yaml'), 'r') as f:
            self.dummy_rc_pod = yaml.safe_load(f)
        with open(os.path.join(dir_path, 'data/node.yaml'), 'r') as f:
            self.dummy_node = yaml.safe_load(f)
            for condition in self.dummy_node['status']['conditions']:
                if condition['type'] == 'Ready' and condition[
                        'status'] == 'True':
                    condition['lastHeartbeatTime'] = datetime.now(
                        condition['lastHeartbeatTime'].tzinfo)
            # Convert timestamps to strings to match PyKube
            for condition in self.dummy_node['status']['conditions']:
                condition['lastHeartbeatTime'] = datetime.isoformat(
                    condition['lastHeartbeatTime'])
                condition['lastTransitionTime'] = datetime.isoformat(
                    condition['lastTransitionTime'])

        # this isn't actually used here
        # only needed to create the KubePod object...
        dir_path = os.path.dirname(os.path.realpath(__file__))
        self.api = pykube.HTTPClient(
            pykube.KubeConfig.from_file(
                os.path.join(dir_path, './data/kube_config.yaml')))

        self.cluster = Cluster(kubeconfig='~/.kube/config',
                               idle_threshold=60,
                               spare_agents=1,
                               instance_init_time=60,
                               resource_group='my-rg',
                               notifier=None,
                               service_principal_app_id='dummy',
                               service_principal_secret='dummy',
                               service_principal_tenant_id='dummy',
                               etcd_client_private_key='dummy',
                               etcd_server_private_key='dummy',
                               subscription_id='dummy',
                               kubeconfig_private_key='dummy',
                               client_private_key='dummy',
                               ca_private_key='dummy',
                               ignore_pools='',
                               over_provision=0)
def main(cluster_name, regions, sleep, kubeconfig, pod_namespace,
         aws_access_key, aws_secret_key, datadog_api_key, idle_threshold,
         type_idle_threshold, over_provision, instance_init_time, no_scale,
         no_maintenance, slack_hook, slack_bot_token, dry_run, verbose,
         drainable_labels, scale_label, instance_type_priorities):
    logger_handler = logging.StreamHandler(sys.stderr)
    logger_handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(logger_handler)
    logger.setLevel(DEBUG_LOGGING_MAP.get(verbose, logging.CRITICAL))

    if not (aws_secret_key and aws_access_key):
        logger.error(
            "Missing AWS credentials. Please provide aws-access-key and aws-secret-key."
        )
        sys.exit(1)

    notifier = Notifier(slack_hook, slack_bot_token)
    cluster = Cluster(aws_access_key=aws_access_key,
                      aws_secret_key=aws_secret_key,
                      regions=regions.split(','),
                      kubeconfig=kubeconfig,
                      pod_namespace=pod_namespace,
                      idle_threshold=idle_threshold,
                      instance_init_time=instance_init_time,
                      type_idle_threshold=type_idle_threshold,
                      cluster_name=cluster_name,
                      scale_up=not no_scale,
                      maintainance=not no_maintenance,
                      over_provision=over_provision,
                      datadog_api_key=datadog_api_key,
                      notifier=notifier,
                      dry_run=dry_run,
                      drainable_labels=drainable_labels,
                      scale_label=scale_label,
                      instance_type_priorities=instance_type_priorities)
    backoff = sleep
    while True:
        scaled = cluster.scale_loop()
        if scaled:
            time.sleep(sleep)
            backoff = sleep
        else:
            logger.warning("backoff: %s", backoff)
            backoff *= 2
            time.sleep(backoff)
def main(sleep, kubeconfig, kubecontext, scale_out_webhook, scale_in_webhook,
         spare_agents, pool_name_regex, idle_threshold, drain, no_scale,
         over_provision, no_maintenance, ignore_pools, slack_hook, verbose,
         debug):
    logger_handler = logging.StreamHandler(sys.stderr)
    logger_handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(logger_handler)
    logger.setLevel(DEBUG_LOGGING_MAP.get(verbose, logging.CRITICAL))

    notifier = None
    if slack_hook:
        notifier = Notifier(slack_hook)

    cluster = Cluster(kubeconfig=kubeconfig,
                      kubecontext=kubecontext,
                      scale_out_webhook=scale_out_webhook,
                      scale_in_webhook=scale_in_webhook,
                      pool_name_regex=pool_name_regex,
                      spare_agents=spare_agents,
                      idle_threshold=idle_threshold,
                      drain=drain,
                      scale_up=not no_scale,
                      ignore_pools=ignore_pools,
                      maintainance=not no_maintenance,
                      over_provision=over_provision,
                      notifier=notifier)
    cluster.login()
    backoff = sleep
    while True:
        scaled = cluster.loop(debug)
        if scaled:
            time.sleep(sleep)
            backoff = sleep
        else:
            logger.warning("backoff: %s", backoff)
            backoff *= 2
            time.sleep(backoff)
Example #10
def main(cluster_name, regions, sleep, kubeconfig,
         aws_access_key, aws_secret_key, datadog_api_key,
         idle_threshold, type_idle_threshold,
         instance_init_time, no_scale, no_maintenance,
         slack_hook, dry_run, verbose):
    logger_handler = logging.StreamHandler(sys.stderr)
    logger_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(logger_handler)
    logger.setLevel(DEBUG_LOGGING_MAP.get(verbose, logging.CRITICAL))

    if not (aws_secret_key and aws_access_key):
        logger.error("Missing AWS credentials. Please provide aws-access-key and aws-secret-key.")
        sys.exit(1)

    cluster = Cluster(aws_access_key=aws_access_key,
                      aws_secret_key=aws_secret_key,
                      regions=regions.split(','),
                      kubeconfig=kubeconfig,
                      idle_threshold=idle_threshold,
                      instance_init_time=instance_init_time,
                      type_idle_threshold=type_idle_threshold,
                      cluster_name=cluster_name,
                      scale_up=not no_scale,
                      maintainance=not no_maintenance,
                      datadog_api_key=datadog_api_key,
                      slack_hook=slack_hook,
                      dry_run=dry_run
                      )
    backoff = sleep
    while True:
        scaled = cluster.scale_loop()
        if scaled:
            time.sleep(sleep)
            backoff = sleep
        else:
            logger.warning("backoff: %s", backoff)
            backoff *= 2
            time.sleep(backoff)
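
Each of these scale loops doubles its sleep whenever scale_loop() reports a failure and resets it on success, with no upper bound on the wait. A standalone sketch of that backoff policy, with an added cap that the loops above do not have (run_forever, do_work and max_backoff are illustrative names):

import time


def run_forever(do_work, base_sleep=60, max_backoff=3600):
    """Call do_work() repeatedly, backing off exponentially on failure."""
    backoff = base_sleep
    while True:
        if do_work():
            # Success: return to the base polling interval.
            backoff = base_sleep
            time.sleep(base_sleep)
        else:
            # Failure: double the wait, capped so it cannot grow without
            # bound (the loops above omit this cap).
            backoff = min(backoff * 2, max_backoff)
            time.sleep(backoff)
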
Example #11
def main(provider_name, timer, scale_up_cap, scale_down_cap, scale_max,
         scale_min, azure_subscription_id, azure_tenant_id, azure_client_id,
         azure_client_secret, azure_location, azure_resource_group,
         azure_vmss_name, verbose):
    # Logger settings
    logger_handler = logging.StreamHandler(sys.stderr)
    logger_handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(logger_handler)
    logger.setLevel(DEBUG_LOGGING_MAP.get(verbose, logging.CRITICAL))

    logger.debug("Debug mode activated")

    if not provider_name:
        logger.error("Provider not specified, ex : --provider Azure")
        sys.exit(1)

    logger.debug("Provider Name : " + provider_name)
    logger.debug("Timer : " + str(timer))
    logger.debug("Scale Up Cap : " + str(scale_up_cap))
    logger.debug("Scale Down Cap : " + str(scale_down_cap))
    logger.debug("Maximum Nodes : " + str(scale_max))
    logger.debug("Minimum Nodes : " + str(scale_min))
    logger.debug("Azure Subscription ID : " + azure_subscription_id)
    logger.debug("Azure Tenant ID : " + azure_tenant_id)
    logger.debug("Azure Client ID : " + azure_client_id)
    logger.debug("Azure Client Secret : " + azure_client_secret)
    logger.debug("Azure Resource Group : " + azure_resource_group)
    logger.debug("Azure Location : " + azure_location)
    logger.debug("Azure VMSS Targeted : " + azure_vmss_name)

    logger.info("DC/OS Autoscaler Started")

    cluster = Cluster(provider_name=provider_name,
                      scale_up_cap=scale_up_cap,
                      scale_down_cap=scale_down_cap,
                      scale_max=scale_max,
                      scale_min=scale_min,
                      azure_subscription_id=azure_subscription_id,
                      azure_tenant_id=azure_tenant_id,
                      azure_client_id=azure_client_id,
                      azure_client_secret=azure_client_secret,
                      azure_location=azure_location,
                      azure_resource_group=azure_resource_group,
                      azure_vmss_name=azure_vmss_name)
    while True:
        cluster.check_health()
        cluster.decide_to_scale()
        time.sleep(timer)

    logger.info("DC/OS Autoscaler Stopped")
Example #12
class TestCluster(unittest.TestCase):
    def setUp(self):
        # load dummy kube specs
        dir_path = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(dir_path, 'data/busybox.yaml'), 'r') as f:
            self.dummy_pod = yaml.safe_load(f)
        with open(os.path.join(dir_path, 'data/ds-pod.yaml'), 'r') as f:
            self.dummy_ds_pod = yaml.safe_load(f)
        with open(os.path.join(dir_path, 'data/rc-pod.yaml'), 'r') as f:
            self.dummy_rc_pod = yaml.safe_load(f)
        with open(os.path.join(dir_path, 'data/node.yaml'), 'r') as f:
            self.dummy_node = yaml.safe_load(f)
            for condition in self.dummy_node['status']['conditions']:
                if condition['type'] == 'Ready' and condition['status'] == 'True':
                    condition['lastHeartbeatTime'] = datetime.now(condition['lastHeartbeatTime'].tzinfo)
            # Convert timestamps to strings to match PyKube
            for condition in self.dummy_node['status']['conditions']:
                condition['lastHeartbeatTime'] = datetime.isoformat(condition['lastHeartbeatTime'])
                condition['lastTransitionTime'] = datetime.isoformat(condition['lastTransitionTime'])


        # this isn't actually used here
        # only needed to create the KubePod object...
        self.api = pykube.HTTPClient(pykube.KubeConfig.from_file('~/.kube/config'))

        # start creating our mock ec2 environment
        self.mocks = [moto.mock_ec2(), moto.mock_autoscaling()]
        for moto_mock in self.mocks:
            moto_mock.start()

        client = boto3.client('autoscaling', region_name='us-west-2')
        self.asg_client = client

        client.create_launch_configuration(
            LaunchConfigurationName='dummy-lc',
            ImageId='ami-deadbeef',
            KeyName='dummy-key',
            SecurityGroups=[
                'sg-cafebeef',
            ],
            InstanceType='t2.medium'
        )

        client.create_auto_scaling_group(
            AutoScalingGroupName='dummy-asg',
            LaunchConfigurationName='dummy-lc',
            MinSize=0,
            MaxSize=10,
            VPCZoneIdentifier='subnet-beefbeef',
            Tags=[
                {
                    'Key': 'KubernetesCluster',
                    'Value': 'dummy-cluster',
                    'PropagateAtLaunch': True
                },
                {
                    'Key': 'KubernetesRole',
                    'Value': 'worker',
                    'PropagateAtLaunch': True
                }
            ]
        )

        # finally our cluster
        self.cluster = Cluster(
            aws_access_key='fake',
            aws_secret_key='fake',
            aws_regions=['us-west-2', 'us-east-1', 'us-west-1'],
            azure_client_id='',
            azure_client_secret='',
            azure_subscription_id='',
            azure_tenant_id='',
            azure_resource_group_names=[],
            azure_slow_scale_classes=[],
            kubeconfig='~/.kube/config',
            pod_namespace=None,
            idle_threshold=60,
            instance_init_time=60,
            type_idle_threshold=60,
            cluster_name='dummy-cluster',
            notifier=mock.Mock(),
            dry_run=False
        )

    def tearDown(self):
        for moto_mock in self.mocks:
            moto_mock.stop()

    def _spin_up_node(self, launch_time=None):
        return self._spin_up_nodes(1, launch_time=launch_time)[0]

    def _spin_up_nodes(self, count, launch_time=None):
        assert count <= 256
        # spin up dummy ec2 node
        self.asg_client.set_desired_capacity(AutoScalingGroupName='dummy-asg',
                                             DesiredCapacity=count)
        response = self.asg_client.describe_auto_scaling_groups()
        nodes = []
        for i, instance in enumerate(response['AutoScalingGroups'][0]['Instances']):
            instance_id = instance['InstanceId']

            dummy_node = copy.deepcopy(self.dummy_node)
            dummy_node['metadata']['labels']['aws/id'] = instance_id
            dummy_node['metadata']['name'] = '10.0.' + str(i) + '.228'
            node = KubeNode(pykube.Node(self.api, dummy_node))
            node.cordon = mock.Mock(return_value="mocked stuff")
            node.drain = mock.Mock(return_value="mocked stuff")
            node.uncordon = mock.Mock(return_value="mocked stuff")
            node.delete = mock.Mock(return_value="mocked stuff")
            nodes.append(node)
        return nodes

    def test_reap_dead_node(self):
        node = copy.deepcopy(self.dummy_node)
        TestInstance = collections.namedtuple('TestInstance', ['launch_time'])
        instance = TestInstance(datetime.now(pytz.utc))

        ready_condition = None
        for condition in node['status']['conditions']:
            if condition['type'] == 'Ready':
                ready_condition = condition
                break
        ready_condition['status'] = 'Unknown'

        ready_condition['lastHeartbeatTime'] = datetime.isoformat(datetime.now(pytz.utc) - timedelta(minutes=30))
        kube_node = KubeNode(pykube.Node(self.api, node))
        kube_node.delete = mock.Mock(return_value="mocked stuff")
        self.cluster.maintain([kube_node], {kube_node.instance_id: instance}, {}, [], [])
        kube_node.delete.assert_not_called()

        ready_condition['lastHeartbeatTime'] = datetime.isoformat(datetime.now(pytz.utc) - timedelta(hours=2))
        kube_node = KubeNode(pykube.Node(self.api, node))
        kube_node.delete = mock.Mock(return_value="mocked stuff")
        self.cluster.maintain([kube_node], {kube_node.instance_id: instance}, {}, [], [])
        kube_node.delete.assert_called_once_with()

    def test_max_scale_in(self):
        node1 = copy.deepcopy(self.dummy_node)
        node2 = copy.deepcopy(self.dummy_node)
        TestInstance = collections.namedtuple('TestInstance', ['launch_time'])
        instance1 = TestInstance(datetime.now(pytz.utc))
        instance2 = TestInstance(datetime.now(pytz.utc))

        for node in [node1, node2]:
            for condition in node['status']['conditions']:
                if condition['type'] == 'Ready':
                    condition['status'] = 'Unknown'
                    condition['lastHeartbeatTime'] = datetime.isoformat(datetime.now(pytz.utc) - timedelta(hours=2))
                    break

        kube_node1 = KubeNode(pykube.Node(self.api, node1))
        kube_node1.delete = mock.Mock(return_value="mocked stuff")
        kube_node2 = KubeNode(pykube.Node(self.api, node2))
        kube_node2.delete = mock.Mock(return_value="mocked stuff")
        self.cluster.maintain([kube_node1, kube_node2], {kube_node1.instance_id: instance1, kube_node2.instance_id: instance2}, {}, [], [])
        kube_node1.delete.assert_not_called()
        kube_node2.delete.assert_not_called()

    def test_scale_up_selector(self):
        self.dummy_pod['spec']['nodeSelector'] = {
            'aws/type': 'm4.large'
        }
        pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
        selectors_hash = utils.selectors_to_hash(pod.selectors)
        asgs = self.cluster.autoscaling_groups.get_all_groups([])
        self.cluster.fulfill_pending(asgs, selectors_hash, [pod])

        response = self.asg_client.describe_auto_scaling_groups()
        self.assertEqual(len(response['AutoScalingGroups']), 1)
        self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 0)

    def test_scale_up(self):
        pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
        selectors_hash = utils.selectors_to_hash(pod.selectors)
        asgs = self.cluster.autoscaling_groups.get_all_groups([])
        self.cluster.fulfill_pending(asgs, selectors_hash, [pod])

        response = self.asg_client.describe_auto_scaling_groups()
        self.assertEqual(len(response['AutoScalingGroups']), 1)
        self.assertGreater(response['AutoScalingGroups'][0]['DesiredCapacity'], 0)

    def test_scale_up_notification(self):
        big_pod_spec = copy.deepcopy(self.dummy_pod)
        for container in big_pod_spec['spec']['containers']:
            container['resources']['requests']['cpu'] = '100'
        pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
        big_pod = KubePod(pykube.Pod(self.api, big_pod_spec))
        selectors_hash = utils.selectors_to_hash(pod.selectors)
        asgs = self.cluster.autoscaling_groups.get_all_groups([])
        self.cluster.fulfill_pending(asgs, selectors_hash, [pod, big_pod])
        self.cluster.notifier.notify_scale.assert_called_with(mock.ANY, mock.ANY, [pod])

    def test_timed_out_group(self):
        with mock.patch('autoscaler.autoscaling_groups.AutoScalingGroup.is_timed_out') as is_timed_out:
            with mock.patch('autoscaler.autoscaling_groups.AutoScalingGroup.scale') as scale:
                is_timed_out.return_value = True
                scale.return_value = utils.CompletedFuture(None)

                pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
                selectors_hash = utils.selectors_to_hash(pod.selectors)
                asgs = self.cluster.autoscaling_groups.get_all_groups([])
                self.cluster.fulfill_pending(asgs, selectors_hash, [pod])

                scale.assert_not_called()

                response = self.asg_client.describe_auto_scaling_groups()
                self.assertEqual(len(response['AutoScalingGroups']), 1)
                self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 0)

    def test_scale_down(self):
        """
        kube node with daemonset and no pod --> cordon
        """
        node = self._spin_up_node()

        all_nodes = [node]
        managed_nodes = [n for n in all_nodes if n.is_managed()]
        running_insts_map = self.cluster.get_running_instances_map(managed_nodes, [])
        pods_to_schedule = {}
        asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes)

        ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod))
        running_or_pending_assigned_pods = [ds_pod]

        self.cluster.idle_threshold = -1
        self.cluster.type_idle_threshold = -1
        self.cluster.LAUNCH_HOUR_THRESHOLD['aws'] = -1
        self.cluster.maintain(
            managed_nodes, running_insts_map,
            pods_to_schedule, running_or_pending_assigned_pods, asgs)

        response = self.asg_client.describe_auto_scaling_groups()
        self.assertEqual(len(response['AutoScalingGroups']), 1)
        self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1)
        node.cordon.assert_called_once_with()

    def test_scale_down_launch_grace_period(self):
        """
        kube node with daemonset and no pod + launch grace period --> noop
        """
        node = self._spin_up_node()
        all_nodes = [node]
        managed_nodes = [n for n in all_nodes if n.is_managed()]
        running_insts_map = self.cluster.get_running_instances_map(managed_nodes, [])
        pods_to_schedule = {}
        asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes)

        ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod))
        running_or_pending_assigned_pods = [ds_pod]

        self.cluster.idle_threshold = -1
        self.cluster.type_idle_threshold = -1
        self.cluster.LAUNCH_HOUR_THRESHOLD['aws'] = 60*30
        self.cluster.maintain(
            managed_nodes, running_insts_map,
            pods_to_schedule, running_or_pending_assigned_pods, asgs)

        response = self.asg_client.describe_auto_scaling_groups()
        self.assertEqual(len(response['AutoScalingGroups']), 1)
        self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1)
        node.cordon.assert_not_called()

    def test_scale_down_grace_period(self):
        """
        kube node with daemonset and no pod + grace period --> noop
        """
        node = self._spin_up_node()
        all_nodes = [node]
        managed_nodes = [n for n in all_nodes if n.is_managed()]
        running_insts_map = self.cluster.get_running_instances_map(managed_nodes, [])
        pods_to_schedule = {}
        asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes)

        # kube node with daemonset and no pod --> cordon
        ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod))
        running_or_pending_assigned_pods = [ds_pod]

        self.cluster.maintain(
            managed_nodes, running_insts_map,
            pods_to_schedule, running_or_pending_assigned_pods, asgs)

        response = self.asg_client.describe_auto_scaling_groups()
        self.assertEqual(len(response['AutoScalingGroups']), 1)
        self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1)
        node.cordon.assert_not_called()

    def test_scale_down_busy(self):
        """
        kube node with daemonset and pod/rc-pod --> noop
        """
        node = self._spin_up_node()
        all_nodes = [node]
        managed_nodes = [n for n in all_nodes if n.is_managed()]
        running_insts_map = self.cluster.get_running_instances_map(managed_nodes, [])
        pods_to_schedule = {}
        asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes)

        # kube node with daemonset and pod --> noop
        ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod))
        pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
        rc_pod = KubePod(pykube.Pod(self.api, self.dummy_rc_pod))

        pod_scenarios = [
            # kube node with daemonset and pod --> noop
            [ds_pod, pod],
            # kube node with daemonset and rc pod --> noop
            [ds_pod, rc_pod]
        ]

        # make sure we're not on grace period
        self.cluster.idle_threshold = -1
        self.cluster.type_idle_threshold = -1

        for pods in pod_scenarios:
            state = self.cluster.get_node_state(
                node, asgs[0], pods, pods_to_schedule,
                running_insts_map, collections.Counter())
            self.assertEqual(state, ClusterNodeState.BUSY)

            self.cluster.maintain(
                managed_nodes, running_insts_map,
                pods_to_schedule, pods, asgs)

            response = self.asg_client.describe_auto_scaling_groups()
            self.assertEqual(len(response['AutoScalingGroups']), 1)
            self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1)
            node.cordon.assert_not_called()

    def test_scale_down_under_utilized_undrainable(self):
        """
        kube node with daemonset and pod/rc-pod --> noop
        """
        node = self._spin_up_node()
        all_nodes = [node]
        managed_nodes = [n for n in all_nodes if n.is_managed()]
        running_insts_map = self.cluster.get_running_instances_map(managed_nodes, [])
        pods_to_schedule = {}
        asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes)

        # create some undrainable pods
        ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod))
        for container in self.dummy_pod['spec']['containers']:
            container.pop('resources', None)
        pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
        self.dummy_rc_pod['metadata']['labels']['openai/do-not-drain'] = 'true'
        for container in self.dummy_rc_pod['spec']['containers']:
            container.pop('resources', None)
        rc_pod = KubePod(pykube.Pod(self.api, self.dummy_rc_pod))

        pod_scenarios = [
            # kube node with daemonset and pod with no resource ask --> noop
            [ds_pod, pod],
            # kube node with daemonset and critical rc pod --> noop
            [ds_pod, rc_pod]
        ]

        # make sure we're not on grace period
        self.cluster.idle_threshold = -1
        self.cluster.type_idle_threshold = -1
        self.cluster.LAUNCH_HOUR_THRESHOLD['aws'] = -1

        for pods in pod_scenarios:
            state = self.cluster.get_node_state(
                node, asgs[0], pods, pods_to_schedule,
                running_insts_map, collections.Counter())
            self.assertEqual(state, ClusterNodeState.UNDER_UTILIZED_UNDRAINABLE)

            self.cluster.maintain(
                managed_nodes, running_insts_map,
                pods_to_schedule, pods, asgs)

            response = self.asg_client.describe_auto_scaling_groups()
            self.assertEqual(len(response['AutoScalingGroups']), 1)
            self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1)
            node.cordon.assert_not_called()

    def test_scale_down_under_utilized_drainable(self):
        """
        kube node with daemonset and rc-pod --> cordon+drain
        """
        node = self._spin_up_node()
        all_nodes = [node]
        managed_nodes = [n for n in all_nodes if n.is_managed()]
        running_insts_map = self.cluster.get_running_instances_map(managed_nodes, [])
        pods_to_schedule = {}
        asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes)

        # create some drainable pods
        ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod))
        for container in self.dummy_rc_pod['spec']['containers']:
            container.pop('resources', None)
        rc_pod = KubePod(pykube.Pod(self.api, self.dummy_rc_pod))
        pods = [ds_pod, rc_pod]

        # make sure we're not on grace period
        self.cluster.idle_threshold = -1
        self.cluster.type_idle_threshold = -1
        self.cluster.LAUNCH_HOUR_THRESHOLD['aws'] = -1

        state = self.cluster.get_node_state(
            node, asgs[0], pods, pods_to_schedule,
            running_insts_map, collections.Counter())
        self.assertEqual(state, ClusterNodeState.UNDER_UTILIZED_DRAINABLE)

        self.cluster.maintain(
            managed_nodes, running_insts_map,
            pods_to_schedule, pods, asgs)

        response = self.asg_client.describe_auto_scaling_groups()
        self.assertEqual(len(response['AutoScalingGroups']), 1)
        self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1)
        node.cordon.assert_called_once_with()
        node.drain.assert_called_once_with(pods, notifier=mock.ANY)

    def test_prioritization(self):
        TestingGroup = collections.namedtuple('TestingGroup', ['region', 'name', 'selectors', 'global_priority', 'is_spot'])
        high_pri = TestingGroup('test', 'test', {}, -1, False)
        low_pri = TestingGroup('test', 'test', {}, 0, False)

        self.assertEqual([high_pri, low_pri], list(self.cluster._prioritize_groups([low_pri, high_pri])))
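
The assertion above is consistent with groups being ordered ascending by global_priority, so a lower value wins. A minimal sketch of that ordering under this assumption; prioritize_groups here is illustrative, not the project's actual _prioritize_groups implementation:

import collections

# Only the fields the ordering needs; mirrors the namedtuple used in the test.
TestingGroup = collections.namedtuple(
    'TestingGroup', ['region', 'name', 'selectors', 'global_priority', 'is_spot'])


def prioritize_groups(groups):
    # Sort ascending by global_priority so the -1 group precedes the 0 group.
    return sorted(groups, key=lambda group: group.global_priority)


high_pri = TestingGroup('test', 'test', {}, -1, False)
low_pri = TestingGroup('test', 'test', {}, 0, False)
assert prioritize_groups([low_pri, high_pri]) == [high_pri, low_pri]
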
class TestClusterWithPrioritiesConfiguredReversed(unittest.TestCase):
    def setUp(self):
        # load dummy kube specs
        dir_path = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(dir_path, 'data/busybox.yaml'), 'r') as f:
            self.dummy_pod = yaml.safe_load(f)

        # this isn't actually used here
        # only needed to create the KubePod object...
        self.api = pykube.HTTPClient(
            pykube.KubeConfig.from_file('~/.kube/config'))

        # start creating our mock ec2 environment
        self.mocks = [moto.mock_ec2(), moto.mock_autoscaling()]
        for moto_mock in self.mocks:
            moto_mock.start()

        client = boto3.client('autoscaling', region_name='us-west-2')
        self.asg_client = client

        client.create_launch_configuration(
            LaunchConfigurationName='dummy-lc-large-gpu',
            ImageId='ami-deadbeef',
            KeyName='dummy-key',
            SecurityGroups=[
                'sg-cafebeef',
            ],
            InstanceType='p2.8xlarge')

        client.create_launch_configuration(
            LaunchConfigurationName='dummy-lc-small-gpu',
            ImageId='ami-deadbeef',
            KeyName='dummy-key',
            SecurityGroups=[
                'sg-cafebeef',
            ],
            InstanceType='p2.xlarge')

        client.create_auto_scaling_group(
            AutoScalingGroupName='dummy-asg-large-gpu',
            LaunchConfigurationName='dummy-lc-large-gpu',
            MinSize=0,
            MaxSize=2,
            VPCZoneIdentifier='subnet-beefbeef',
            Tags=[{
                'Key': 'KubernetesCluster',
                'Value': 'dummy-cluster-with-priorities',
                'PropagateAtLaunch': True
            }, {
                'Key': 'KubernetesRole',
                'Value': 'worker',
                'PropagateAtLaunch': True
            }])

        client.create_auto_scaling_group(
            AutoScalingGroupName='dummy-asg-small-gpu',
            LaunchConfigurationName='dummy-lc-small-gpu',
            MinSize=0,
            MaxSize=2,
            VPCZoneIdentifier='subnet-beefbeef',
            Tags=[{
                'Key': 'KubernetesCluster',
                'Value': 'dummy-cluster-with-priorities',
                'PropagateAtLaunch': True
            }, {
                'Key': 'KubernetesRole',
                'Value': 'worker',
                'PropagateAtLaunch': True
            }])

        # Note that instance_type_priorities is set.
        # p2.8xlarges have a higher priority here.
        self.cluster = Cluster(aws_access_key='',
                               aws_secret_key='',
                               regions=['us-west-2'],
                               kubeconfig='~/.kube/config',
                               pod_namespace=None,
                               idle_threshold=60,
                               instance_init_time=60,
                               type_idle_threshold=60,
                               cluster_name='dummy-cluster-with-priorities',
                               instance_type_priorities={
                                   'p2.xlarge': set(['10']),
                                   'p2.8xlarge': set(['2'])
                               },
                               notifier=Notifier(),
                               dry_run=False)

    def tearDown(self):
        for moto_mock in self.mocks:
            moto_mock.stop()

    def test_scale_up(self):
        pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
        selectors_hash = utils.selectors_to_hash(pod.selectors)
        asgs = self.cluster.autoscaling_groups.get_all_groups([])
        self.cluster.fulfill_pending(asgs, selectors_hash, [pod])

        response = self.asg_client.describe_auto_scaling_groups()
        self.assertEqual(len(response['AutoScalingGroups']), 2)
        big_gpu_asg, small_gpu_asg = {}, {}
        if (response['AutoScalingGroups'][0]['AutoScalingGroupName'] ==
                'dummy-asg-small-gpu'):
            small_gpu_asg = response['AutoScalingGroups'][0]
            big_gpu_asg = response['AutoScalingGroups'][1]
        else:
            small_gpu_asg = response['AutoScalingGroups'][1]
            big_gpu_asg = response['AutoScalingGroups'][0]

        self.assertGreater(big_gpu_asg['DesiredCapacity'], 0)
        self.assertEqual(small_gpu_asg['DesiredCapacity'], 0)
Example #14
class TestCluster(unittest.TestCase):
    def setUp(self):
        # load dummy kube specs
        dir_path = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(dir_path, 'data/busybox.yaml'), 'r') as f:
            self.dummy_pod = yaml.safe_load(f)
        with open(os.path.join(dir_path, 'data/ds-pod.yaml'), 'r') as f:
            self.dummy_ds_pod = yaml.safe_load(f)
        with open(os.path.join(dir_path, 'data/rc-pod.yaml'), 'r') as f:
            self.dummy_rc_pod = yaml.safe_load(f)
        with open(os.path.join(dir_path, 'data/node.yaml'), 'r') as f:
            self.dummy_node = yaml.safe_load(f)

        # this isn't actually used here
        # only needed to create the KubePod object...
        self.api = pykube.HTTPClient(pykube.KubeConfig.from_file('~/.kube/config'))

        # start creating our mock ec2 environment
        self.mocks = [moto.mock_ec2(), moto.mock_autoscaling()]
        for moto_mock in self.mocks:
            moto_mock.start()

        client = boto3.client('autoscaling', region_name='us-west-2')
        self.asg_client = client

        client.create_launch_configuration(
            LaunchConfigurationName='dummy-lc',
            ImageId='ami-deadbeef',
            KeyName='dummy-key',
            SecurityGroups=[
                'sg-cafebeef',
            ],
            InstanceType='t2.medium'
        )

        client.create_auto_scaling_group(
            AutoScalingGroupName='dummy-asg',
            LaunchConfigurationName='dummy-lc',
            MinSize=0,
            MaxSize=10,
            VPCZoneIdentifier='subnet-beefbeef',
            Tags=[
                {
                    'Key': 'KubernetesCluster',
                    'Value': 'dummy-cluster',
                    'PropagateAtLaunch': True
                },
                {
                    'Key': 'KubernetesRole',
                    'Value': 'worker',
                    'PropagateAtLaunch': True
                }
            ]
        )

        # finally our cluster
        self.cluster = Cluster(
            aws_access_key='',
            aws_secret_key='',
            regions=['us-west-2', 'us-east-1', 'us-west-1'],
            kubeconfig='~/.kube/config',
            idle_threshold=60,
            instance_init_time=60,
            type_idle_threshold=60,
            cluster_name='dummy-cluster',
            slack_hook='',
            dry_run=False
        )

    def tearDown(self):
        for moto_mock in self.mocks:
            moto_mock.stop()

    def _spin_up_node(self):
        # spin up dummy ec2 node
        self.asg_client.set_desired_capacity(AutoScalingGroupName='dummy-asg',
                                             DesiredCapacity=1)
        response = self.asg_client.describe_auto_scaling_groups()
        instance_id = response['AutoScalingGroups'][0]['Instances'][0]['InstanceId']

        self.dummy_node['metadata']['labels']['aws/id'] = instance_id
        node = KubeNode(pykube.Node(self.api, self.dummy_node))
        node.cordon = mock.Mock(return_value="mocked stuff")
        node.drain = mock.Mock(return_value="mocked stuff")
        node.uncordon = mock.Mock(return_value="mocked stuff")
        node.delete = mock.Mock(return_value="mocked stuff")
        return node

    def test_scale_up_selector(self):
        self.dummy_pod['spec']['nodeSelector'] = {
            'aws/type': 'm4.large'
        }
        pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
        selectors_hash = utils.selectors_to_hash(pod.selectors)
        asgs = self.cluster.autoscaling_groups.get_all_groups([])
        self.cluster.fulfill_pending(asgs, selectors_hash, [pod])

        response = self.asg_client.describe_auto_scaling_groups()
        self.assertEqual(len(response['AutoScalingGroups']), 1)
        self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 0)

    def test_scale_up(self):
        pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
        selectors_hash = utils.selectors_to_hash(pod.selectors)
        asgs = self.cluster.autoscaling_groups.get_all_groups([])
        self.cluster.fulfill_pending(asgs, selectors_hash, [pod])

        response = self.asg_client.describe_auto_scaling_groups()
        self.assertEqual(len(response['AutoScalingGroups']), 1)
        self.assertGreater(response['AutoScalingGroups'][0]['DesiredCapacity'], 0)

    def test_scale_down(self):
        """
        kube node with daemonset and no pod --> cordon
        """
        node = self._spin_up_node()
        all_nodes = [node]
        managed_nodes = [n for n in all_nodes if n.is_managed()]
        running_insts_map = self.cluster.get_running_instances_map(managed_nodes)
        pods_to_schedule = {}
        asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes)

        ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod))
        running_or_pending_assigned_pods = [ds_pod]

        self.cluster.idle_threshold = -1
        self.cluster.type_idle_threshold = -1
        self.cluster.maintain(
            managed_nodes, running_insts_map,
            pods_to_schedule, running_or_pending_assigned_pods, asgs)

        response = self.asg_client.describe_auto_scaling_groups()
        self.assertEqual(len(response['AutoScalingGroups']), 1)
        self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1)
        node.cordon.assert_called_once_with()

    def test_scale_down_grace_period(self):
        """
        kube node with daemonset and no pod + grace period --> noop
        """
        node = self._spin_up_node()
        all_nodes = [node]
        managed_nodes = [n for n in all_nodes if n.is_managed()]
        running_insts_map = self.cluster.get_running_instances_map(managed_nodes)
        pods_to_schedule = {}
        asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes)

        # kube node with daemonset and no pod --> cordon
        ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod))
        running_or_pending_assigned_pods = [ds_pod]

        self.cluster.maintain(
            managed_nodes, running_insts_map,
            pods_to_schedule, running_or_pending_assigned_pods, asgs)

        response = self.asg_client.describe_auto_scaling_groups()
        self.assertEqual(len(response['AutoScalingGroups']), 1)
        self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1)
        node.cordon.assert_not_called()

    def test_scale_down_busy(self):
        """
        kube node with daemonset and pod/rc-pod --> noop
        """
        node = self._spin_up_node()
        all_nodes = [node]
        managed_nodes = [n for n in all_nodes if n.is_managed()]
        running_insts_map = self.cluster.get_running_instances_map(managed_nodes)
        pods_to_schedule = {}
        asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes)

        # kube node with daemonset and pod --> noop
        ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod))
        pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
        rc_pod = KubePod(pykube.Pod(self.api, self.dummy_rc_pod))

        pod_scenarios = [
            # kube node with daemonset and pod --> noop
            [ds_pod, pod],
            # kube node with daemonset and rc pod --> noop
            [ds_pod, rc_pod]
        ]

        # make sure we're not on grace period
        self.cluster.idle_threshold = -1
        self.cluster.type_idle_threshold = -1

        for pods in pod_scenarios:
            state = self.cluster.get_node_state(
                node, asgs[0], pods, pods_to_schedule,
                running_insts_map, collections.Counter())
            self.assertEqual(state, ClusterNodeState.BUSY)

            self.cluster.maintain(
                managed_nodes, running_insts_map,
                pods_to_schedule, pods, asgs)

            response = self.asg_client.describe_auto_scaling_groups()
            self.assertEqual(len(response['AutoScalingGroups']), 1)
            self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1)
            node.cordon.assert_not_called()

    def test_scale_down_under_utilized_undrainable(self):
        """
        kube node with daemonset and pod/rc-pod --> noop
        """
        node = self._spin_up_node()
        all_nodes = [node]
        managed_nodes = [n for n in all_nodes if n.is_managed()]
        running_insts_map = self.cluster.get_running_instances_map(managed_nodes)
        pods_to_schedule = {}
        asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes)

        # create some undrainable pods
        ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod))
        for container in self.dummy_pod['spec']['containers']:
            container.pop('resources', None)
        pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
        self.dummy_rc_pod['metadata']['labels']['openai/do-not-drain'] = 'true'
        for container in self.dummy_rc_pod['spec']['containers']:
            container.pop('resources', None)
        rc_pod = KubePod(pykube.Pod(self.api, self.dummy_rc_pod))

        pod_scenarios = [
            # kube node with daemonset and pod with no resource ask --> noop
            [ds_pod, pod],
            # kube node with daemonset and critical rc pod --> noop
            [ds_pod, rc_pod]
        ]

        # make sure we're not on grace period
        self.cluster.idle_threshold = -1
        self.cluster.type_idle_threshold = -1

        for pods in pod_scenarios:
            state = self.cluster.get_node_state(
                node, asgs[0], pods, pods_to_schedule,
                running_insts_map, collections.Counter())
            self.assertEqual(state, ClusterNodeState.UNDER_UTILIZED_UNDRAINABLE)

            self.cluster.maintain(
                managed_nodes, running_insts_map,
                pods_to_schedule, pods, asgs)

            response = self.asg_client.describe_auto_scaling_groups()
            self.assertEqual(len(response['AutoScalingGroups']), 1)
            self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1)
            node.cordon.assert_not_called()

    def test_scale_down_under_utilized_drainable(self):
        """
        kube node with daemonset and rc-pod --> cordon+drain
        """
        node = self._spin_up_node()
        all_nodes = [node]
        managed_nodes = [n for n in all_nodes if n.is_managed()]
        running_insts_map = self.cluster.get_running_instances_map(managed_nodes)
        pods_to_schedule = {}
        asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes)

        # create a daemonset pod and a drainable rc pod with no resource requests
        ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod))
        for container in self.dummy_rc_pod['spec']['containers']:
            container.pop('resources', None)
        rc_pod = KubePod(pykube.Pod(self.api, self.dummy_rc_pod))
        pods = [ds_pod, rc_pod]

        # make sure we're not on grace period
        self.cluster.idle_threshold = -1
        self.cluster.type_idle_threshold = -1

        state = self.cluster.get_node_state(
            node, asgs[0], pods, pods_to_schedule,
            running_insts_map, collections.Counter())
        self.assertEqual(state, ClusterNodeState.UNDER_UTILIZED_DRAINABLE)

        self.cluster.maintain(
            managed_nodes, running_insts_map,
            pods_to_schedule, pods, asgs)

        response = self.asg_client.describe_auto_scaling_groups()
        self.assertEqual(len(response['AutoScalingGroups']), 1)
        self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1)
        node.cordon.assert_called_once_with()
        node.drain.assert_called_once_with(pods)
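
The cordon/drain assertions above work because _spin_up_node swaps the node's side-effecting methods for unittest.mock objects before maintain() runs. A minimal, self-contained sketch of that pattern, detached from the autoscaler classes (DummyNode and the pod names are illustrative, not project code):

import unittest
from unittest import mock


class DummyNode(object):
    """Stand-in for a KubeNode-like object (illustrative only)."""

    def cordon(self):
        raise RuntimeError("would hit the Kubernetes API")

    def drain(self, pods):
        raise RuntimeError("would hit the Kubernetes API")


class TestMockedNodeCalls(unittest.TestCase):
    def test_cordon_and_drain_are_recorded(self):
        node = DummyNode()
        # Replace the side-effecting methods, as _spin_up_node does above.
        node.cordon = mock.Mock(return_value="mocked stuff")
        node.drain = mock.Mock(return_value="mocked stuff")

        pods = ['ds-pod', 'rc-pod']
        node.cordon()
        node.drain(pods)

        # The same assertions the scale-down tests rely on.
        node.cordon.assert_called_once_with()
        node.drain.assert_called_once_with(pods)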
Example #15
0
    def setUp(self):
        # load dummy kube specs
        dir_path = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(dir_path, 'data/busybox.yaml'), 'r') as f:
            self.dummy_pod = yaml.load(f.read())
        with open(os.path.join(dir_path, 'data/ds-pod.yaml'), 'r') as f:
            self.dummy_ds_pod = yaml.load(f.read())
        with open(os.path.join(dir_path, 'data/rc-pod.yaml'), 'r') as f:
            self.dummy_rc_pod = yaml.load(f.read())
        with open(os.path.join(dir_path, 'data/node.yaml'), 'r') as f:
            self.dummy_node = yaml.load(f.read())

        # this isn't actually used here
        # only needed to create the KubePod object...
        self.api = pykube.HTTPClient(pykube.KubeConfig.from_file('~/.kube/config'))

        # start creating our mock ec2 environment
        self.mocks = [moto.mock_ec2(), moto.mock_autoscaling()]
        for moto_mock in self.mocks:
            moto_mock.start()

        # pin a region so boto3/moto do not depend on external AWS config
        client = boto3.client('autoscaling', region_name='us-west-2')
        self.asg_client = client

        client.create_launch_configuration(
            LaunchConfigurationName='dummy-lc',
            ImageId='ami-deadbeef',
            KeyName='dummy-key',
            SecurityGroups=[
                'sg-cafebeef',
            ],
            InstanceType='t2.medium'
        )

        client.create_auto_scaling_group(
            AutoScalingGroupName='dummy-asg',
            LaunchConfigurationName='dummy-lc',
            MinSize=0,
            MaxSize=10,
            VPCZoneIdentifier='subnet-beefbeef',
            Tags=[
                {
                    'Key': 'KubernetesCluster',
                    'Value': 'dummy-cluster',
                    'PropagateAtLaunch': True
                },
                {
                    'Key': 'KubernetesRole',
                    'Value': 'worker',
                    'PropagateAtLaunch': True
                }
            ]
        )

        # finally our cluster
        self.cluster = Cluster(
            aws_access_key='',
            aws_secret_key='',
            regions=['us-west-2', 'us-east-1', 'us-west-1'],
            kubeconfig='~/.kube/config',
            idle_threshold=60,
            instance_init_time=60,
            type_idle_threshold=60,
            cluster_name='dummy-cluster',
            slack_hook='',
            dry_run=False
        )
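
Example #15 stops at setUp, but the moto mocks it starts should be stopped again once each test finishes, or the mocked AWS state leaks between tests; Example #16 below pairs the same setUp with such a tearDown. A minimal sketch of just that lifecycle (the class name is illustrative):

import unittest

import boto3
import moto


class MotoLifecycleSketch(unittest.TestCase):
    """Illustrative only: pairs the moto start calls above with a tearDown."""

    def setUp(self):
        self.mocks = [moto.mock_ec2(), moto.mock_autoscaling()]
        for moto_mock in self.mocks:
            moto_mock.start()

    def tearDown(self):
        # Undo setUp so mocked EC2/autoscaling state does not leak.
        for moto_mock in self.mocks:
            moto_mock.stop()

    def test_mocked_asg_api_starts_empty(self):
        client = boto3.client('autoscaling', region_name='us-west-2')
        self.assertEqual(
            client.describe_auto_scaling_groups()['AutoScalingGroups'], [])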
Example #16
0
class TestCluster(unittest.TestCase):
    def setUp(self):
        # load dummy kube specs
        dir_path = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(dir_path, 'data/busybox.yaml'), 'r') as f:
            self.dummy_pod = yaml.load(f.read())
        with open(os.path.join(dir_path, 'data/ds-pod.yaml'), 'r') as f:
            self.dummy_ds_pod = yaml.load(f.read())
        with open(os.path.join(dir_path, 'data/rc-pod.yaml'), 'r') as f:
            self.dummy_rc_pod = yaml.load(f.read())
        with open(os.path.join(dir_path, 'data/node.yaml'), 'r') as f:
            self.dummy_node = yaml.load(f.read())

        # this isn't actually used here
        # only needed to create the KubePod object...
        self.api = pykube.HTTPClient(
            pykube.KubeConfig.from_file('~/.kube/config'))

        # start creating our mock ec2 environment
        self.mocks = [moto.mock_ec2(), moto.mock_autoscaling()]
        for moto_mock in self.mocks:
            moto_mock.start()

        # pin a region so boto3/moto do not depend on external AWS config
        client = boto3.client('autoscaling', region_name='us-west-2')
        self.asg_client = client

        client.create_launch_configuration(LaunchConfigurationName='dummy-lc',
                                           ImageId='ami-deadbeef',
                                           KeyName='dummy-key',
                                           SecurityGroups=[
                                               'sg-cafebeef',
                                           ],
                                           InstanceType='t2.medium')

        client.create_auto_scaling_group(AutoScalingGroupName='dummy-asg',
                                         LaunchConfigurationName='dummy-lc',
                                         MinSize=0,
                                         MaxSize=10,
                                         VPCZoneIdentifier='subnet-beefbeef',
                                         Tags=[{
                                             'Key': 'KubernetesCluster',
                                             'Value': 'dummy-cluster',
                                             'PropagateAtLaunch': True
                                         }, {
                                             'Key': 'KubernetesRole',
                                             'Value': 'worker',
                                             'PropagateAtLaunch': True
                                         }])

        # finally our cluster
        self.cluster = Cluster(aws_access_key='',
                               aws_secret_key='',
                               regions=['us-west-2', 'us-east-1', 'us-west-1'],
                               kubeconfig='~/.kube/config',
                               idle_threshold=60,
                               instance_init_time=60,
                               type_idle_threshold=60,
                               cluster_name='dummy-cluster',
                               slack_hook='',
                               dry_run=False)

    def tearDown(self):
        for moto_mock in self.mocks:
            moto_mock.stop()

    def _spin_up_node(self):
        # spin up dummy ec2 node
        self.asg_client.set_desired_capacity(AutoScalingGroupName='dummy-asg',
                                             DesiredCapacity=1)
        response = self.asg_client.describe_auto_scaling_groups()
        instance_id = response['AutoScalingGroups'][0]['Instances'][0][
            'InstanceId']

        self.dummy_node['metadata']['labels']['aws/id'] = instance_id
        node = KubeNode(pykube.Node(self.api, self.dummy_node))
        node.cordon = mock.Mock(return_value="mocked stuff")
        node.drain = mock.Mock(return_value="mocked stuff")
        node.uncordon = mock.Mock(return_value="mocked stuff")
        node.delete = mock.Mock(return_value="mocked stuff")
        return node

    def test_scale_up_selector(self):
        self.dummy_pod['spec']['nodeSelector'] = {'aws/type': 'm4.large'}
        pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
        selectors_hash = utils.selectors_to_hash(pod.selectors)
        asgs = self.cluster.autoscaling_groups.get_all_groups([])
        self.cluster.fulfill_pending(asgs, selectors_hash, [pod])

        response = self.asg_client.describe_auto_scaling_groups()
        self.assertEqual(len(response['AutoScalingGroups']), 1)
        self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'],
                         0)

    def test_scale_up(self):
        pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
        selectors_hash = utils.selectors_to_hash(pod.selectors)
        asgs = self.cluster.autoscaling_groups.get_all_groups([])
        self.cluster.fulfill_pending(asgs, selectors_hash, [pod])

        response = self.asg_client.describe_auto_scaling_groups()
        self.assertEqual(len(response['AutoScalingGroups']), 1)
        self.assertGreater(response['AutoScalingGroups'][0]['DesiredCapacity'],
                           0)

    def test_scale_down(self):
        """
        kube node with daemonset and no pod --> cordon
        """
        node = self._spin_up_node()
        all_nodes = [node]
        managed_nodes = [n for n in all_nodes if n.is_managed()]
        running_insts_map = self.cluster.get_running_instances_map(
            managed_nodes)
        pods_to_schedule = {}
        asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes)

        ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod))
        running_or_pending_assigned_pods = [ds_pod]

        self.cluster.idle_threshold = -1
        self.cluster.type_idle_threshold = -1
        self.cluster.maintain(managed_nodes, running_insts_map,
                              pods_to_schedule,
                              running_or_pending_assigned_pods, asgs)

        response = self.asg_client.describe_auto_scaling_groups()
        self.assertEqual(len(response['AutoScalingGroups']), 1)
        self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'],
                         1)
        node.cordon.assert_called_once_with()

    def test_scale_down_grace_period(self):
        """
        kube node with daemonset and no pod + grace period --> noop
        """
        node = self._spin_up_node()
        all_nodes = [node]
        managed_nodes = [n for n in all_nodes if n.is_managed()]
        running_insts_map = self.cluster.get_running_instances_map(
            managed_nodes)
        pods_to_schedule = {}
        asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes)

        # kube node with daemonset and no pod, but still within the grace period --> noop
        ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod))
        running_or_pending_assigned_pods = [ds_pod]

        self.cluster.maintain(managed_nodes, running_insts_map,
                              pods_to_schedule,
                              running_or_pending_assigned_pods, asgs)

        response = self.asg_client.describe_auto_scaling_groups()
        self.assertEqual(len(response['AutoScalingGroups']), 1)
        self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'],
                         1)
        node.cordon.assert_not_called()

    def test_scale_down_busy(self):
        """
        kube node with daemonset and pod/rc-pod --> noop
        """
        node = self._spin_up_node()
        all_nodes = [node]
        managed_nodes = [n for n in all_nodes if n.is_managed()]
        running_insts_map = self.cluster.get_running_instances_map(
            managed_nodes)
        pods_to_schedule = {}
        asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes)

        # kube node with daemonset and pod --> noop
        ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod))
        pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
        rc_pod = KubePod(pykube.Pod(self.api, self.dummy_rc_pod))

        pod_scenarios = [
            # kube node with daemonset and pod --> noop
            [ds_pod, pod],
            # kube node with daemonset and rc pod --> noop
            [ds_pod, rc_pod]
        ]

        # make sure we're not on grace period
        self.cluster.idle_threshold = -1
        self.cluster.type_idle_threshold = -1

        for pods in pod_scenarios:
            state = self.cluster.get_node_state(node, asgs[0], pods,
                                                pods_to_schedule,
                                                running_insts_map,
                                                collections.Counter())
            self.assertEqual(state, ClusterNodeState.BUSY)

            self.cluster.maintain(managed_nodes, running_insts_map,
                                  pods_to_schedule, pods, asgs)

            response = self.asg_client.describe_auto_scaling_groups()
            self.assertEqual(len(response['AutoScalingGroups']), 1)
            self.assertEqual(
                response['AutoScalingGroups'][0]['DesiredCapacity'], 1)
            node.cordon.assert_not_called()

    def test_scale_down_under_utilized_undrainable(self):
        """
        kube node with daemonset and undrainable pod/rc-pod --> noop
        """
        node = self._spin_up_node()
        all_nodes = [node]
        managed_nodes = [n for n in all_nodes if n.is_managed()]
        running_insts_map = self.cluster.get_running_instances_map(
            managed_nodes)
        pods_to_schedule = {}
        asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes)

        # create some undrainable pods
        ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod))
        for container in self.dummy_pod['spec']['containers']:
            container.pop('resources', None)
        pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
        self.dummy_rc_pod['metadata']['labels']['openai/do-not-drain'] = 'true'
        for container in self.dummy_rc_pod['spec']['containers']:
            container.pop('resources', None)
        rc_pod = KubePod(pykube.Pod(self.api, self.dummy_rc_pod))

        pod_scenarios = [
            # kube node with daemonset and pod with no resource ask --> noop
            [ds_pod, pod],
            # kube node with daemonset and critical rc pod --> noop
            [ds_pod, rc_pod]
        ]

        # make sure we're not on grace period
        self.cluster.idle_threshold = -1
        self.cluster.type_idle_threshold = -1

        for pods in pod_scenarios:
            state = self.cluster.get_node_state(node, asgs[0], pods,
                                                pods_to_schedule,
                                                running_insts_map,
                                                collections.Counter())
            self.assertEqual(state,
                             ClusterNodeState.UNDER_UTILIZED_UNDRAINABLE)

            self.cluster.maintain(managed_nodes, running_insts_map,
                                  pods_to_schedule, pods, asgs)

            response = self.asg_client.describe_auto_scaling_groups()
            self.assertEqual(len(response['AutoScalingGroups']), 1)
            self.assertEqual(
                response['AutoScalingGroups'][0]['DesiredCapacity'], 1)
            node.cordon.assert_not_called()

    def test_scale_down_under_utilized_drainable(self):
        """
        kube node with daemonset and rc-pod --> cordon+drain
        """
        node = self._spin_up_node()
        all_nodes = [node]
        managed_nodes = [n for n in all_nodes if n.is_managed()]
        running_insts_map = self.cluster.get_running_instances_map(
            managed_nodes)
        pods_to_schedule = {}
        asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes)

        # create a daemonset pod and a drainable rc pod with no resource requests
        ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod))
        for container in self.dummy_rc_pod['spec']['containers']:
            container.pop('resources', None)
        rc_pod = KubePod(pykube.Pod(self.api, self.dummy_rc_pod))
        pods = [ds_pod, rc_pod]

        # make sure we're not on grace period
        self.cluster.idle_threshold = -1
        self.cluster.type_idle_threshold = -1

        state = self.cluster.get_node_state(node, asgs[0], pods,
                                            pods_to_schedule,
                                            running_insts_map,
                                            collections.Counter())
        self.assertEqual(state, ClusterNodeState.UNDER_UTILIZED_DRAINABLE)

        self.cluster.maintain(managed_nodes, running_insts_map,
                              pods_to_schedule, pods, asgs)

        response = self.asg_client.describe_auto_scaling_groups()
        self.assertEqual(len(response['AutoScalingGroups']), 1)
        self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'],
                         1)
        node.cordon.assert_called_once_with()
        node.drain.assert_called_once_with(pods)
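
The _spin_up_node helper above leans on moto to emulate the EC2/autoscaling APIs: bumping DesiredCapacity on the mocked ASG makes a fake instance appear in describe_auto_scaling_groups. The same behaviour can be checked in isolation with the sketch below, which assumes the moto/boto3 versions these tests were written against (newer moto releases replace the per-service mocks with mock_aws and may reject the fake subnet ID); the function name is illustrative.

import boto3
import moto


@moto.mock_ec2
@moto.mock_autoscaling
def check_asg_scaling():
    """Standalone sanity check of the pattern used by _spin_up_node."""
    client = boto3.client('autoscaling', region_name='us-west-2')
    client.create_launch_configuration(LaunchConfigurationName='dummy-lc',
                                       ImageId='ami-deadbeef',
                                       InstanceType='t2.medium')
    client.create_auto_scaling_group(AutoScalingGroupName='dummy-asg',
                                     LaunchConfigurationName='dummy-lc',
                                     MinSize=0,
                                     MaxSize=10,
                                     VPCZoneIdentifier='subnet-beefbeef')
    client.set_desired_capacity(AutoScalingGroupName='dummy-asg',
                                DesiredCapacity=1)
    group = client.describe_auto_scaling_groups()['AutoScalingGroups'][0]
    # Raising DesiredCapacity should have launched one mocked instance.
    assert group['DesiredCapacity'] == 1
    assert len(group['Instances']) == 1


check_asg_scaling()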
Example #17
0
def main(provider_name, timer, scale_up_cap, scale_down_cap, scale_max,
         scale_min, endpoint_path, azure_subscription_id, azure_tenant_id,
         azure_client_id, azure_client_secret, azure_location,
         azure_resource_group, azure_vmss_name, verbose):
    # Logger settings
    logger_handler = logging.StreamHandler(sys.stderr)
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logger_handler.setFormatter(logging.Formatter(log_format))
    LOGGER.addHandler(logger_handler)
    LOGGER.setLevel(DEBUG_LOGGING_MAP.get(verbose, logging.CRITICAL))

    LOGGER.debug("Debug mode activated")

    #os.environ.get('DATABASE_NAME', '')

    if not os.environ.get('AS_PROVIDER_NAME', provider_name):
        LOGGER.error("Provider not specified, ex : --provider-name Azure")
        sys.exit(1)

    LOGGER.debug("Provider Name : " +
                 str(os.environ.get('AS_PROVIDER_NAME', provider_name)))
    LOGGER.debug("Timer : " + str(os.environ.get('AS_TIMER', timer)))
    LOGGER.debug("Scale Up Cap : " +
                 str(os.environ.get('AS_SCALE_UP_MAX', scale_up_cap)))
    LOGGER.debug("Scale Down Cap : " +
                 str(os.environ.get('AS_SCALE_DOWN_MAX', scale_down_cap)))
    LOGGER.debug("Maximum Nodes : " +
                 str(os.environ.get('AS_SCALE_MAX', scale_max)))
    LOGGER.debug("Minimum Nodes : " +
                 str(os.environ.get('AS_SCALE_MIN', scale_min)))
    LOGGER.debug("EndPoint Path : " +
                 str(os.environ.get('AS_ENDPOINT', endpoint_path)))
    LOGGER.debug(
        "Azure Subscription ID : " +
        str(os.environ.get('AZURE_SUBSCRIPTION_ID', azure_subscription_id)))
    LOGGER.debug("Azure Tenant ID : " +
                 str(os.environ.get('AZURE_TENANT_ID', azure_tenant_id)))
    LOGGER.debug("Azure Client ID : " +
                 str(os.environ.get('AZURE_CLIENT_ID', azure_client_id)))
    LOGGER.debug(
        "Azure Client Secret : " +
        str(os.environ.get('AZURE_CLIENT_SECRET', azure_client_secret)))
    LOGGER.debug("Azure Resource Group : " +
                 str(os.environ.get('AZURE_RG', azure_resource_group)))
    LOGGER.debug("Azure Location : " +
                 str(os.environ.get('AZURE_LOCATION', azure_location)))
    LOGGER.debug("Azure VMSS Targeted : " +
                 str(os.environ.get('AZURE_VMSS', azure_vmss_name)))

    LOGGER.info("DC/OS Autoscaler Started")

    cluster = Cluster(
        provider_name=os.environ.get('AS_PROVIDER_NAME', provider_name),
        scale_up_cap=os.environ.get('AS_SCALE_UP_MAX', scale_up_cap),
        scale_down_cap=os.environ.get('AS_SCALE_DOWN_MAX', scale_down_cap),
        scale_max=os.environ.get('AS_SCALE_MAX', scale_max),
        scale_min=os.environ.get('AS_SCALE_MIN', scale_min),
        endpoint_path=os.environ.get('AS_ENDPOINT', endpoint_path),
        azure_subscription_id=os.environ.get('AZURE_SUBSCRIPTION_ID',
                                             azure_subscription_id),
        azure_tenant_id=os.environ.get('AZURE_TENANT_ID', azure_tenant_id),
        azure_client_id=os.environ.get('AZURE_CLIENT_ID', azure_client_id),
        azure_client_secret=os.environ.get('AZURE_CLIENT_SECRET',
                                           azure_client_secret),
        azure_location=os.environ.get('AZURE_LOCATION', azure_location),
        azure_resource_group=os.environ.get('AZURE_RG', azure_resource_group),
        azure_vmss_name=os.environ.get('AZURE_VMSS', azure_vmss_name))
    while True:
        metrics = {
            "totalCPU": 0,
            "totalMEM": 0,
            "usedCPU": 0,
            "usedMEM": 0,
            "ratioCPU": 0,
            "ratioMEM": 0,
            "nbNodes": 0
        }
        cluster.check_health(metrics)
        LOGGER.info("Total Cluster CPU = " + str(metrics["totalCPU"]) +
                    " - Total Cluster CPU = " + str(metrics["totalMEM"]))
        LOGGER.info("Total Used CPU = " + str(metrics["usedCPU"]) +
                    " - Total Cluster MEM = " + str(metrics["usedMEM"]))
        LOGGER.info("Ratio CPU = " + str(metrics["ratioCPU"]) +
                    "% - Ratio MEM = " + str(metrics["ratioMEM"]) + "%")
        if cluster.decide_to_scale(metrics) == 1:
            LOGGER.info("Scale Up Kicked ... In Progress")
            cluster.scale_cluster_up(metrics)
        if cluster.decide_to_scale(metrics) == -1:
            LOGGER.info("Scale Down Kicked... In Progress")
            cluster.scale_cluster_down(metrics)
        time.sleep(timer)

    LOGGER.info("DC/OS Autoscaler Stopped")
Example #18
0
def main(container_service_name, resource_group, sleep, kubeconfig,
         service_principal_app_id, service_principal_secret, service_principal_tenant_id,
         datadog_api_key, idle_threshold, spare_agents,
         template_file, parameters_file, template_file_url, parameters_file_url,
         over_provision, instance_init_time, no_scale, no_maintenance,
         slack_hook, slack_bot_token, dry_run, verbose, debug):
    logger_handler = logging.StreamHandler(sys.stderr)
    logger_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(logger_handler)
    logger.setLevel(DEBUG_LOGGING_MAP.get(verbose, logging.CRITICAL))

    if not (service_principal_app_id and service_principal_secret and service_principal_tenant_id):
        logger.error("Missing Azure credentials. Please provide aws-service_principal_app_id, service_principal_secret and service_principal_tenant_id.")
        sys.exit(1)
    
    if (template_file and not parameters_file) or (not template_file and parameters_file):
        logger.error("Both --template-file and --parameters-file should be provided when running on acs-engine")
        sys.exit(1)
    
    if (template_file and template_file_url):
        logger.error('--template-file and --template-file-url are mutually exclusive.')
        sys.exit(1)
    
    if (parameters_file and parameters_file_url):
        logger.error('--parameters-file and --parameters-file-url are mutually exclusive.')
        sys.exit(1)
        
    if template_file and container_service_name:
        logger.error("--template-file and --container-service-name cannot be specified simultaneously. Provide --container-service-name when running on ACS, or --template-file and --parameters-file when running on acs-engine")
        sys.exit(1)
        
    notifier = None
    if slack_hook and slack_bot_token:
        notifier = Notifier(slack_hook, slack_bot_token)
    
    cluster = Cluster(service_principal_app_id=service_principal_app_id,
                      service_principal_secret=service_principal_secret,
                      service_principal_tenant_id=service_principal_tenant_id,
                      kubeconfig=kubeconfig,
                      template_file=template_file,
                      template_file_url=template_file_url,
                      parameters_file_url=parameters_file_url,
                      parameters_file=parameters_file,
                      idle_threshold=idle_threshold,
                      instance_init_time=instance_init_time,
                      spare_agents=spare_agents,
                      container_service_name=container_service_name,
                      resource_group=resource_group,
                      scale_up=not no_scale,
                      maintainance=not no_maintenance,
                      over_provision=over_provision,
                      datadog_api_key=datadog_api_key,
                      notifier=notifier,
                      dry_run=dry_run,
                      )    
    backoff = sleep
    while True:
        scaled = cluster.scale_loop(debug)
        if scaled:
            time.sleep(sleep)
            backoff = sleep
        else:
            logger.warn("backoff: %s" % backoff)
            backoff *= 2
            time.sleep(backoff)
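
The while-loop above sleeps for the base interval after a successful scale_loop and otherwise doubles the delay before trying again. Extracted as a standalone helper, the pattern looks roughly like the sketch below; the max_backoff cap is an addition of this sketch, not something the original loop applies.

import time


def run_scale_loop(scale_once, base_sleep, max_backoff=None):
    """Sketch of the retry loop above.

    scale_once is any callable returning True when a scaling pass succeeded
    (e.g. cluster.scale_loop); max_backoff is an optional cap the original
    loop does not have.
    """
    backoff = base_sleep
    while True:
        if scale_once():
            backoff = base_sleep
            time.sleep(base_sleep)
        else:
            backoff *= 2
            if max_backoff is not None:
                backoff = min(backoff, max_backoff)
            time.sleep(backoff)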
    def setUp(self):
        # load dummy kube specs
        dir_path = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(dir_path, 'data/busybox.yaml'), 'r') as f:
            self.dummy_pod = yaml.load(f.read())
        with open(os.path.join(dir_path, 'data/ds-pod.yaml'), 'r') as f:
            self.dummy_ds_pod = yaml.load(f.read())
        with open(os.path.join(dir_path, 'data/rc-pod.yaml'), 'r') as f:
            self.dummy_rc_pod = yaml.load(f.read())
        with open(os.path.join(dir_path, 'data/node.yaml'), 'r') as f:
            self.dummy_node = yaml.load(f.read())
            for condition in self.dummy_node['status']['conditions']:
                if condition['type'] == 'Ready' and condition[
                        'status'] == 'True':
                    condition['lastHeartbeatTime'] = datetime.now(
                        condition['lastHeartbeatTime'].tzinfo)
            # Convert timestamps to strings to match PyKube
            for condition in self.dummy_node['status']['conditions']:
                condition['lastHeartbeatTime'] = datetime.isoformat(
                    condition['lastHeartbeatTime'])
                condition['lastTransitionTime'] = datetime.isoformat(
                    condition['lastTransitionTime'])

        # this isn't actually used here
        # only needed to create the KubePod object...
        self.api = pykube.HTTPClient(
            pykube.KubeConfig.from_file('~/.kube/config'))

        # start creating our mock ec2 environment
        self.mocks = [moto.mock_ec2(), moto.mock_autoscaling()]
        for moto_mock in self.mocks:
            moto_mock.start()

        client = boto3.client('autoscaling', region_name='us-west-2')
        self.asg_client = client

        client.create_launch_configuration(LaunchConfigurationName='dummy-lc',
                                           ImageId='ami-deadbeef',
                                           KeyName='dummy-key',
                                           SecurityGroups=[
                                               'sg-cafebeef',
                                           ],
                                           InstanceType='t2.medium')

        client.create_auto_scaling_group(AutoScalingGroupName='dummy-asg',
                                         LaunchConfigurationName='dummy-lc',
                                         MinSize=0,
                                         MaxSize=10,
                                         VPCZoneIdentifier='subnet-beefbeef',
                                         Tags=[{
                                             'Key': 'KubernetesCluster',
                                             'Value': 'dummy-cluster',
                                             'PropagateAtLaunch': True
                                         }, {
                                             'Key': 'KubernetesRole',
                                             'Value': 'worker',
                                             'PropagateAtLaunch': True
                                         }])

        # finally our cluster
        self.cluster = Cluster(
            aws_access_key='fake',
            aws_secret_key='fake',
            aws_regions=['us-west-2', 'us-east-1', 'us-west-1'],
            azure_client_id='',
            azure_client_secret='',
            azure_subscription_id='',
            azure_tenant_id='',
            azure_resource_group_names=[],
            azure_slow_scale_classes=[],
            kubeconfig='~/.kube/config',
            pod_namespace=None,
            drain_utilization_below=0.3,
            idle_threshold=60,
            instance_init_time=60,
            type_idle_threshold=60,
            cluster_name='dummy-cluster',
            notifier=mock.Mock(),
            dry_run=False)
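
The Ready-condition handling in this setUp first refreshes lastHeartbeatTime so the dummy node does not look stale, then serializes both timestamps back to ISO 8601 strings, the form in which PyKube hands them to the autoscaler. The same trick in isolation (the sample condition dict is illustrative):

from datetime import datetime, timezone

condition = {
    'type': 'Ready',
    'status': 'True',
    'lastHeartbeatTime': datetime(2017, 1, 1, tzinfo=timezone.utc),
    'lastTransitionTime': datetime(2017, 1, 1, tzinfo=timezone.utc),
}

# Pretend the node just reported in, keeping the original timezone.
if condition['type'] == 'Ready' and condition['status'] == 'True':
    condition['lastHeartbeatTime'] = datetime.now(
        condition['lastHeartbeatTime'].tzinfo)

# Serialize back to strings, matching what PyKube returns.
condition['lastHeartbeatTime'] = datetime.isoformat(
    condition['lastHeartbeatTime'])
condition['lastTransitionTime'] = datetime.isoformat(
    condition['lastTransitionTime'])
print(condition)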