# Minimal sketch of the module-level setup assumed by the functions below: boto3
# clients and a standard logger. The exact configuration, the referenced helpers
# (_get_defined_log_group_names, _get_existing_log_group_names, _get_existing_services,
# _is_stale_policy, _retrieve_service_task_definitions) and AeropressException are
# defined elsewhere in the aeropress package and may differ.
import logging
import time
from datetime import datetime, timedelta
from typing import Any, Dict, List

import boto3
from botocore.exceptions import ClientError

logger = logging.getLogger(__name__)

ecs_client = boto3.client('ecs')
logs_client = boto3.client('logs')
cloudwatch_client = boto3.client('cloudwatch')
scaling_client = boto3.client('application-autoscaling')  # Application Auto Scaling API


def handle_logs(tasks: list, clean_stale_log_groups: bool = False) -> None:
    # Prepare log groups.
    defined_log_group_names = _get_defined_log_group_names(tasks)
    existing_log_group_names = _get_existing_log_group_names()

    # Create missing log groups.
    missing_log_group_names = defined_log_group_names.difference(existing_log_group_names)
    _create_missing_log_groups(missing_log_group_names)

    # Set retention policy to 7 days.
    retention_days = 7  # TODO: Should be configurable
    for log_group_name in defined_log_group_names:
        logger.info('Setting retention days to %s for log group: %s', retention_days, log_group_name)
        response = logs_client.put_retention_policy(
            logGroupName=log_group_name, retentionInDays=retention_days)
        logger.debug('Set retention days to %s. Response: %s', retention_days, response)

    # Clean stale log groups.
    if clean_stale_log_groups:
        stale_log_group_names = existing_log_group_names.difference(defined_log_group_names)
        _clean_stale_log_groups(stale_log_group_names)
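# Example (hypothetical) shape of the `tasks` argument passed to handle_logs(),
# assuming log group names are taken from each container definition's awslogs
# configuration. The exact structure expected by _get_defined_log_group_names()
# is defined elsewhere in this module and may differ:
#
#     tasks = [{
#         'family': 'worker',
#         'containerDefinitions': [{
#             'name': 'worker',
#             'image': 'example/worker:latest',
#             'logConfiguration': {
#                 'logDriver': 'awslogs',
#                 'options': {'awslogs-group': 'worker', 'awslogs-region': 'eu-west-1'},
#             },
#         }],
#     }]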
def _register_scalable_target(scale_dict: dict, resource_id: str) -> None:
    logger.info('Registering service as a scalable target: %s', resource_id)
    response = scaling_client.register_scalable_target(
        ServiceNamespace='ecs',
        ResourceId=resource_id,
        ScalableDimension='ecs:service:DesiredCount',
        MinCapacity=scale_dict['MinCapacity'],
        MaxCapacity=scale_dict['MaxCapacity'],
    )
    logger.debug('Registered service as a scalable target details: %s', response)
def _deregister_scalable_target(resource_id: str) -> None:
    try:
        response = scaling_client.deregister_scalable_target(
            ServiceNamespace='ecs',
            ResourceId=resource_id,
            ScalableDimension='ecs:service:DesiredCount',
        )
        logger.info('Deregistered service as a scalable target: %s', resource_id)
        logger.debug('Service deregistration response: %s', response)
    except ClientError as e:
        if e.response['Error']['Code'] == 'ObjectNotFoundException':
            logger.debug('Service is not registered as a scalable target; nothing to deregister.')
        else:
            raise
def _clean_stale_policies(services: list, existing_policies: list) -> None:
    for existing_policy_dict in existing_policies:
        if not _is_stale_policy(existing_policy_dict, services):
            continue

        logger.info('Removing stale policy: %s', existing_policy_dict['PolicyName'])
        response = scaling_client.delete_scaling_policy(
            PolicyName=existing_policy_dict['PolicyName'],
            ServiceNamespace='ecs',
            ResourceId=existing_policy_dict['ResourceId'],
            ScalableDimension='ecs:service:DesiredCount',
        )
        logger.debug('Removed stale policy details: %s', response)
def _update_services(services: list) -> None:
    for service_dict in services:
        logger.info('Updating service: %s', service_dict['serviceName'])
        # If a task revision is not specified, the latest ACTIVE revision is used.
        response = ecs_client.update_service(
            cluster=service_dict['cluster'],
            service=service_dict['serviceName'],
            desiredCount=service_dict['desiredCount'],
            taskDefinition=service_dict['taskDefinition'],
            deploymentConfiguration=service_dict.get('deploymentConfiguration', {}),
            forceNewDeployment=True,
        )
        logger.debug('Updated service details: %s', response)
def _create_missing_metrics(missing_metrics: list) -> None:
    for missing_metric in missing_metrics:
        logger.info('Creating metric: %s', missing_metric['MetricStat']['Metric']['MetricName'])
        response = cloudwatch_client.put_metric_data(
            Namespace=missing_metric['MetricStat']['Metric']['Namespace'],
            MetricData=[
                {
                    'MetricName': missing_metric['MetricStat']['Metric']['MetricName'],
                    'Timestamp': datetime.utcnow(),
                    'Value': 0,
                    'Unit': missing_metric['MetricStat']['Unit'],
                },
            ],
        )
        logger.debug('Created metric details: %s', response)
def _clean_stale_alarms(existing_alarms: list, services: list) -> None:
    # Extract the defined alarm names into a list.
    defined_alarm_names = [sd['alarm']['AlarmName'] for sd in services]

    # If an alarm exists on CloudWatch but is no longer defined, it is stale.
    stale_alarm_names = [
        ea['AlarmName']
        for ea in existing_alarms
        if ea['AlarmName'] not in defined_alarm_names
    ]

    # Batch delete the stale alarms.
    if stale_alarm_names:
        logger.info('Deleting stale alarms: %s', stale_alarm_names)
        response = cloudwatch_client.delete_alarms(AlarmNames=stale_alarm_names)
        logger.debug('Deleted stale alarms details: %s', response)
def clean_stale_tasks() -> None:
    """
    Clean stale task definitions. Leave only the active task definitions that are used by services.
    """
    service_task_definitions = _retrieve_service_task_definitions()

    all_task_definitions = []  # type: List[Dict[str, Any]]
    next_token = None
    while True:
        if next_token:
            resp = ecs_client.list_task_definitions(status='ACTIVE', maxResults=100, nextToken=next_token)
        else:
            resp = ecs_client.list_task_definitions(status='ACTIVE', maxResults=100)

        all_task_definitions.extend(resp['taskDefinitionArns'])

        next_token = resp.get('nextToken')
        # All task definitions are loaded.
        if not next_token:
            break

    for task_definition in all_task_definitions:
        if task_definition not in service_task_definitions:
            logger.info('Deregistering task definition %s', task_definition)
            slept = 0
            while True:
                try:
                    response = ecs_client.deregister_task_definition(taskDefinition=task_definition)
                    logger.debug('Deregistered stale task: %s', response)
                    break
                except ClientError as e:
                    if e.response['Error']['Code'] == 'ThrottlingException':
                        logger.info('Request is throttled. Waiting...')
                        time.sleep(5)
                        slept += 5
                    else:
                        break

                # Give up trying after 20 seconds.
                if slept >= 20:
                    break

    logger.info('Cleaned all stale tasks.')
def _delete_log_streams(existing_logs: list, failed_container_ids: list, days_ago: int) -> None:
    """
    Delete log streams that are older than the given days_ago and whose containers are no longer running.
    """
    deleted_count = 0
    for existing_log in existing_logs:
        existing_log_streams = existing_log['log_streams']
        existing_log_group_name = existing_log['log_group_name']
        for existing_log_stream in existing_log_streams:
            # Example log stream name: 'ecs/container-foo/XXXXXXX-YYYY-WWWW-ZZZZ-XXXXXXXX'
            container_id = existing_log_stream['logStreamName'].split('/')[-1]
            if container_id not in failed_container_ids:
                continue

            last_event_time = existing_log_stream.get(
                'lastEventTimestamp', existing_log_stream.get('creationTime'))
            if not last_event_time:
                logger.warning('Neither creation time nor last event time is known! %s', existing_log_stream)
                continue

            # AWS returns the timestamp in milliseconds.
            last_event_datetime = datetime.fromtimestamp(last_event_time / 1000)
            if datetime.utcnow() - timedelta(days=days_ago) < last_event_datetime:
                continue

            logger.info(
                'Deleting log stream: %s of log group %s. Last event time: %s',
                existing_log_stream['logStreamName'],
                existing_log_group_name,
                last_event_datetime)
            response = logs_client.delete_log_stream(
                logGroupName=existing_log_group_name,
                logStreamName=existing_log_stream['logStreamName'],
            )
            deleted_count += 1
            logger.debug('Deleted log stream: %s', response)

    logger.info('Deleted %s stale log streams.', deleted_count)
def _create_missing_services(services: list) -> None:
    # Get existing services.
    existing_services = _get_existing_services(services)

    # Find missing services.
    missing_services = [s for s in services if s['serviceName'] not in existing_services]

    # Create missing services.
    for service_dict in missing_services:
        logger.info('Creating service: %s', service_dict['serviceName'])
        params = {
            'cluster': service_dict['cluster'],
            'serviceName': service_dict['serviceName'],
            'taskDefinition': service_dict['taskDefinition'],
            'desiredCount': service_dict['desiredCount'],
            'launchType': service_dict['launchType'],
            'schedulingStrategy': service_dict['schedulingStrategy'],
            'deploymentController': service_dict['deploymentController'],
            'loadBalancers': service_dict.get('loadBalancers', []),
            'placementConstraints': service_dict.get('placementConstraints', []),
            'placementStrategy': service_dict.get('placementStrategy', []),
            'deploymentConfiguration': service_dict.get('deploymentConfiguration', {}),
        }
        if service_dict.get('healthCheckGracePeriodSeconds'):
            params['healthCheckGracePeriodSeconds'] = service_dict.get('healthCheckGracePeriodSeconds')

        response = ecs_client.create_service(**params)
        logger.debug('Created service details: %s', response)
def _register_task_definitions(tasks: list) -> None:
    for task_dict in tasks:
        # Create container definitions.
        container_definitions = []
        for container_definition in task_dict['containerDefinitions']:
            d = {
                'name': container_definition['name'],
                'image': container_definition['image'],
                'logConfiguration': container_definition['logConfiguration'],
                'memoryReservation': container_definition['memoryReservation'],
                'cpu': container_definition.get('cpu', 0),
                'entryPoint': container_definition.get('entryPoint', []),
                'command': container_definition.get('command', []),
                'environment': container_definition.get('environment', []),
                'portMappings': container_definition.get('portMappings', []),
                'ulimits': container_definition.get('ulimits', []),
                'mountPoints': container_definition.get('mountPoints', []),
                'links': container_definition.get('links', []),
            }
            if container_definition.get('memory'):
                if container_definition['memory'] < container_definition['memoryReservation']:
                    logger.error('memory must be greater than or equal to memoryReservation')
                    raise AeropressException()
                d['memory'] = container_definition['memory']

            container_definitions.append(d)

        logger.info('Creating task definition: %s', task_dict['family'])
        response = ecs_client.register_task_definition(
            family=task_dict['family'],
            taskRoleArn=task_dict['taskRoleArn'],
            executionRoleArn=task_dict['executionRoleArn'],
            networkMode=task_dict['networkMode'],
            containerDefinitions=container_definitions,
            requiresCompatibilities=task_dict['requiresCompatibilities'],
            volumes=task_dict.get('volumes', []),
        )
        logger.debug('Created task definition details: %s', response)
def _create_or_update_all_policies(services: list, existing_policies: list) -> None:
    for service_dict in services:
        resource_id = 'service/' + service_dict['cluster'] + '/' + service_dict['serviceName']

        if not service_dict.get('scale'):
            continue

        if not service_dict['scale'].get('policies'):
            continue

        # Create or update the policies.
        for policy_dict in service_dict['scale']['policies']:
            logger.info('Creating scaling policy: %s for %s', policy_dict['PolicyName'], resource_id)
            response = scaling_client.put_scaling_policy(
                PolicyName=policy_dict['PolicyName'],
                PolicyType=policy_dict['PolicyType'],
                ServiceNamespace='ecs',
                ResourceId=resource_id,
                ScalableDimension=policy_dict['ScalableDimension'],
                StepScalingPolicyConfiguration=policy_dict['StepScalingPolicyConfiguration'],
            )
            logger.debug('Created scaling policy details: %s', response)
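# Example (hypothetical) 'scale' entry on a service definition, matching the keys
# read by _register_scalable_target() and _create_or_update_all_policies(); the
# StepScalingPolicyConfiguration fields follow the Application Auto Scaling API:
#
#     'scale': {
#         'MinCapacity': 1,
#         'MaxCapacity': 4,
#         'policies': [{
#             'PolicyName': 'scale-up-on-high-cpu',
#             'PolicyType': 'StepScaling',
#             'ScalableDimension': 'ecs:service:DesiredCount',
#             'StepScalingPolicyConfiguration': {
#                 'AdjustmentType': 'ChangeInCapacity',
#                 'Cooldown': 60,
#                 'StepAdjustments': [
#                     {'MetricIntervalLowerBound': 0, 'ScalingAdjustment': 1},
#                 ],
#             },
#         }],
#     }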
def create(cluster_name: str) -> None:
    logger.info('Creating cluster %s', cluster_name)
    response = ecs_client.create_cluster(clusterName=cluster_name)
    logger.debug('Created cluster details: %s', response)
def _clean_stale_log_groups(stale_log_group_names: set) -> None:
    for stale_log_group_name in stale_log_group_names:
        logger.info('Cleaning stale log group: %s', stale_log_group_name)
        response = logs_client.delete_log_group(logGroupName=stale_log_group_name)
        logger.debug('Cleaned stale log group details: %s', response)
def _create_missing_log_groups(missing_log_group_names: set) -> None:
    for missing_log_group_name in missing_log_group_names:
        logger.info('Creating log group: %s', missing_log_group_name)
        response = logs_client.create_log_group(logGroupName=missing_log_group_name)
        logger.debug('Created log group details: %s', response)