def test_poll_cluster_matching_cluster_and_service_name(self, mock_boto):
    """Polling completes cleanly when the cluster and service share a name."""
    client = mock_boto.return_value
    client.describe_tasks.return_value = GOOD_TASKS
    client.list_tasks.return_value = TASKS
    client.describe_services.return_value = GOOD_SERVICE
    ecs_utils.poll_cluster_state(
        client, 'service-foo', ['service-foo'], POLL_S)
def test_poll_cluster_new_arn(self, mock_boto):
    """Polling handles the new-style ARN format ('cluster/service')."""
    client = mock_boto.return_value
    client.describe_tasks.return_value = GOOD_TASKS
    client.list_tasks.return_value = TASKS
    client.describe_services.return_value = GOOD_SERVICE
    ecs_utils.poll_cluster_state(
        client, 'cluster-foo', ['cluster-foo/service-foo'], POLL_S)
def test_poll_cluster_bad_tasks(self, mock_boto, mock_print_events):
    """Unhealthy tasks never stabilize, so polling times out."""
    client = mock_boto.return_value
    client.describe_tasks.return_value = BAD_TASKS
    client.list_tasks.return_value = TASKS
    client.describe_services.return_value = GOOD_SERVICE
    with self.assertRaises(ecs_utils.TimeoutException):
        ecs_utils.poll_cluster_state(
            client, 'cluster-foo', ['service-foo'], POLL_S)
def test_poll_cluster_with_inactive_service(self, mock_boto):
    """An INACTIVE service is skipped: describe_tasks must never be called."""
    client = mock_boto.return_value
    client.describe_services.return_value = INACTIVE_SERVICE
    client.list_tasks.return_value = EMPTY_TASKS
    # If polling ever reaches describe_tasks, this side effect fails the test.
    client.describe_tasks.side_effect = Exception(
        'Tasks cannot be empty.')
    ecs_utils.poll_cluster_state(
        client, 'cluster-foo', ['service-foo'], POLL_S)
def rolling_replace_instances(ecs, ec2, cluster_name, batches, ami_id, force, drain_timeout_s):
    """Drain and terminate an ECS cluster's EC2 container instances in rolling batches.

    For each batch: mark the instances DRAINING, wait (up to
    ``drain_timeout_s``) for their task counts to reach zero, terminate the
    drained EC2 instances, then wait for the cluster's services to report a
    steady state before moving to the next batch.

    Args:
        ecs: boto3 ECS client.
        ec2: boto3 EC2 client.
        cluster_name: name of the ECS cluster whose instances are replaced.
        batches: number of batches to split the cluster's instances into.
        ami_id: if provided, instances already running this AMI are treated
            as done and are not terminated.
        force: proceed even when a single batch covers the whole cluster
            (which causes downtime).
        drain_timeout_s: seconds to wait for each batch to finish draining;
            also used as the post-batch cluster-stability polling timeout.

    Raises:
        RollingException: no services found, batch size exceeds the AWS
            100-ARN API limit, or downtime would occur without ``force``.
        RollingTimeoutException: a batch did not finish draining in time.
    """
    replace_start_time = time.time()
    services = get_services(ecs, cluster_name)
    if not services:
        raise RollingException('No services found in cluster. exiting.')
    utils.print_info(
        f'Checking cluster {cluster_name}, services {str(services)} are stable'
    )
    # Fail fast if the cluster is not already stable before touching anything.
    ecs_utils.poll_cluster_state(
        ecs, cluster_name, services, polling_timeout=120
    )
    instances = get_container_instance_arns(ecs, cluster_name)
    # batches determines the number of instances you want to replace at once.
    # Choose conservatively, as this process temporarily reduces your capacity.
    # But note each batch can be time consuming (up to 10m per batch)
    # NOTE(review): despite the name, batch_count is the number of instances
    # PER batch, not the number of batches.
    batch_count = math.ceil(len(instances) / batches)
    utils.print_info(f'You have {len(instances)} instances.')
    utils.print_info(f'Terminating in batches of {batch_count}')
    if len(instances) <= batch_count:
        # One batch would take down every instance at once.
        utils.print_warning(
            f'Terminating {batch_count} instances will cause downtime.'
        )
        if not force:
            raise RollingException('Quitting, use --force to over-ride.')
    instance_batches = batch_instances(instances, batch_count)
    for to_drain in instance_batches:
        # describe_container_instances accepts at most 100 ARNs per call.
        if len(to_drain) > 100:
            utils.print_error('Batch size exceeded 100, try using more batches.')
            raise RollingException(
                f'Quitting, batch size exceeded 100: {batch_count}.'
            )
        response = ecs.describe_container_instances(
            cluster=cluster_name, containerInstances=to_drain)
        if not response.get('containerInstances'):
            raise RollingException('No containerInstances found.')
        # don't drain or terminate any instances that are already up to date
        # (if the user provided the --ami-id flag)
        done_instances = get_already_updated_instances(response, ami_id)
        if len(done_instances) == len(to_drain):
            # move on if the whole batch is already up to date
            continue
        # drain instances in this batch
        # NOTE(review): this drains the ENTIRE batch, including instances
        # already in done_instances; the loop below never terminates those or
        # returns them to ACTIVE, so they appear to stay DRAINING — confirm
        # this is intended.
        ecs.update_container_instances_state(cluster=cluster_name,
                                             status='DRAINING',
                                             containerInstances=to_drain)
        utils.print_info(f'Wait for drain to complete with {drain_timeout_s}s timeout...')
        start_time = time.time()
        # Poll until every instance in the batch is drained and terminated
        # (done_instances grows as each instance reaches zero running tasks).
        while len(done_instances) < len(to_drain):
            if (time.time() - start_time) > drain_timeout_s:
                raise RollingTimeoutException('Waiting for instance to complete draining. Giving up.')
            time.sleep(SLEEP_TIME_S)
            response = ecs.describe_container_instances(
                cluster=cluster_name, containerInstances=to_drain)
            for container_instance in response.get('containerInstances'):
                instance_id = container_instance.get('ec2InstanceId')
                running_tasks = container_instance.get('runningTasksCount')
                if running_tasks > 0:
                    # Still draining — emit a progress marker and keep waiting.
                    PRINT_PROGRESS()
                    continue
                if instance_id not in done_instances:
                    utils.print_info(f'{instance_id} is drained, terminate!')
                    ec2.terminate_instances(InstanceIds=[instance_id])
                    done_instances.append(instance_id)
        # new instance will take as much as 10m to go into service
        # then we wait for ECS to resume a steady state before moving on
        ecs_utils.poll_cluster_state(ecs, cluster_name, services,
                                     polling_timeout=drain_timeout_s)
    utils.print_success(f'EC2 instance replacement process complete! {int(time.time() - replace_start_time)}s elapsed')