def main(): argp = argparse.ArgumentParser(description='ZMON AWS Agent') argp.add_argument('-e', '--entity-service', dest='entityservice') argp.add_argument('-r', '--region', dest='region', default=None) argp.add_argument('-j', '--json', dest='json', action='store_true') argp.add_argument('--no-oauth2', dest='disable_oauth2', action='store_true', default=False) argp.add_argument('--postgresql-user', dest='postgresql_user', default=os.environ.get('AGENT_POSTGRESQL_USER')) argp.add_argument('--postgresql-pass', dest='postgresql_pass', default=os.environ.get('AGENT_POSTGRESQL_PASS')) args = argp.parse_args() if not args.disable_oauth2: tokens.configure() tokens.manage('uid', ['uid']) tokens.start() logging.basicConfig(level=logging.INFO) # 1. Determine region if not args.region: logger.info('Trying to figure out region..') try: response = requests.get( 'http://169.254.169.254/latest/meta-data/placement/availability-zone', timeout=2) except: logger.exception( 'Region was not specified as a parameter and can not be fetched from instance meta-data!' ) raise region = response.text[:-1] else: region = args.region logger.info('Using region: {}'.format(region)) logger.info('Entity service URL: %s', args.entityservice) logger.info('Reading DNS data for hosted zones') aws.populate_dns_data() aws_account_id = aws.get_account_id(region) infrastructure_account = 'aws:{}'.format( aws_account_id) if aws_account_id else None if not infrastructure_account: logger.error( 'AWS agent: Cannot determine infrastructure account ID. Terminating!' ) return # 2. ZMON entities token = None if args.disable_oauth2 else tokens.get('uid') zmon_client = Zmon(args.entityservice, token=token, user_agent=get_user_agent()) query = { 'infrastructure_account': infrastructure_account, 'region': region, 'created_by': 'agent' } entities = zmon_client.get_entities(query) # 3. Get running apps apps = aws.get_running_apps(region, entities) elbs = [] scaling_groups = [] rds = [] elasticaches = [] dynamodbs = [] sqs = [] new_entities = [] to_be_removed = [] if len(apps) > 0: elbs = aws.get_running_elbs(region, infrastructure_account) scaling_groups = aws.get_auto_scaling_groups(region, infrastructure_account) rds = aws.get_rds_instances(region, infrastructure_account, entities) elasticaches = aws.get_elasticache_nodes(region, infrastructure_account) dynamodbs = aws.get_dynamodb_tables(region, infrastructure_account) certificates = aws.get_certificates(region, infrastructure_account) aws_limits = aws.get_limits(region, infrastructure_account, apps, elbs) sqs = aws.get_sqs_queues(region, infrastructure_account, entities) account_alias = aws.get_account_alias(region) ia_entity = { 'type': 'local', 'infrastructure_account': infrastructure_account, 'account_alias': account_alias, 'region': region, 'id': 'aws-ac[{}:{}]'.format(infrastructure_account, region), 'created_by': 'agent', } application_entities = aws.get_apps_from_entities(apps, infrastructure_account, region) if args.postgresql_user and args.postgresql_pass: postgresql_clusters = zmon_client.get_entities({ 'infrastructure_account': infrastructure_account, 'region': region, 'type': 'postgresql_cluster' }) postgresql_databases = postgresql.get_databases_from_clusters( postgresql_clusters, infrastructure_account, region, args.postgresql_user, args.postgresql_pass) else: # Pretend the list of DBs is empty, but also make sure we don't remove # any pre-existing database entities because we don't know about them. postgresql_databases = [] entities = [ e for e in entities if e.get('type') != 'postgresql_database' ] current_entities = (elbs + scaling_groups + apps + application_entities + rds + postgresql_databases + elasticaches + dynamodbs + certificates + sqs) current_entities.append(aws_limits) current_entities.append(ia_entity) # 4. Removing misssing entities existing_ids = get_existing_ids(entities) current_entities_ids = {e['id'] for e in current_entities} to_be_removed, delete_error_count = remove_missing_entities( existing_ids, current_entities_ids, zmon_client, json=args.json) logger.info( 'Found {} removed entities from {} entities ({} failed)'.format( len(new_entities), len(current_entities), delete_error_count)) # 5. Get new/updated entities new_entities, add_error_count = add_new_entities(current_entities, entities, zmon_client, json=args.json) logger.info('Found {} new entities from {} entities ({} failed)'.format( len(new_entities), len(current_entities), add_error_count)) # 6. Always add Local entity if not args.json: ia_entity['errors'] = { 'delete_count': delete_error_count, 'add_count': add_error_count } try: zmon_client.add_entity(ia_entity) except: logger.exception( 'Failed to add Local entity: {}'.format(ia_entity)) types = { e['type']: len([t for t in new_entities if t['type'] == e['type']]) for e in new_entities } for t, v in types.items(): logger.info('Found {} new entities of type: {}'.format(v, t)) # Check if it is a dry run! if args.json: d = { 'applications': application_entities, 'apps': apps, 'dynamodb': dynamodbs, 'elbs': elbs, 'elc': elasticaches, 'rds': rds, 'certificates': certificates, 'aws_limits': aws_limits, 'sqs_queues': sqs, 'new_entities': new_entities, 'to_be_removed': to_be_removed, } print(json.dumps(d, indent=4))
def test_aws_get_limits(monkeypatch, fail): ec2 = MagicMock() account_attrs = { 'AccountAttributes': [ {'AttributeName': 'max-instances', 'AttributeValues': [{'AttributeValue': '30'}]}, {'AttributeName': 'other', 'AttributeValues': [{'AttributeValue': '20'}]} ] } if not fail: ec2.describe_account_attributes.return_value = account_attrs else: ec2.describe_account_attributes.side_effect = Exception rds = MagicMock() account_attrs = { 'AccountQuotas': [ {'AccountQuotaName': 'ReservedDBInstances', 'Max': 100, 'Used': 10}, {'AccountQuotaName': 'AllocatedStorage', 'Max': 100, 'Used': 10}, {'AccountQuotaName': 'other', 'Max': 100, 'Used': 10}, ] } if not fail: rds.describe_account_attributes.return_value = account_attrs else: rds.describe_account_attributes.side_effect = Exception asg = MagicMock() account_limits = { 'MaxNumberOfAutoScalingGroups': 100, 'MaxNumberOfLaunchConfigurations': 100, 'NumberOfAutoScalingGroups': 20, 'NumberOfLaunchConfigurations': 20, } if not fail: asg.describe_account_limits.return_value = account_limits else: asg.describe_account_limits.side_effect = Exception iam = MagicMock() account_summary = { 'SummaryMap': { 'ServerCertificates': 1, 'ServerCertificatesQuota': 20, 'InstanceProfiles': 10, 'InstanceProfilesQuota': 20, 'Policies': 2, 'PoliciesQuota': 10, } } if not fail: iam.get_account_summary.return_value = account_summary else: iam.get_account_summary.side_effect = Exception boto = get_boto_client(monkeypatch, ec2, rds, asg, iam) apps = [ {'type': 'instance'}, {'type': 'instance'}, {'type': 'instance', 'spot_instance': True}, {'type': 'cassandra'} ] elbs = [{'type': 'elb'}, {'type': 'elb'}, {'type': 'elb'}] limits = aws.get_limits(REGION, ACCOUNT, apps, elbs, [{ 'id': aws.entity_id('aws-limits[{}:{}]'.format(ACCOUNT, REGION)), 'type': 'aws_limits', 'ec2-max-instances': 40, }]) if not fail: expected = { 'asg-max-groups': 100, 'asg-max-launch-configurations': 100, 'asg-used-groups': 20, 'asg-used-launch-configurations': 20, 'created_by': 'agent', 'ec2-max-instances': 30, 'ec2-used-instances': 2, 'ec2-max-spot-instances': 20, 'ec2-used-spot-instances': 1, 'elb-max-count': 20, 'elb-used-count': 3, 'iam-max-instance-profiles': 20, 'iam-max-policies': 10, 'iam-max-server-certificates': 20, 'iam-used-instance-profiles': 10, 'iam-used-policies': 2, 'iam-used-server-certificates': 1, 'id': 'aws-limits[aws:1234:eu-central-1]', 'infrastructure_account': 'aws:1234', 'rds-max-allocated': 100, 'rds-max-reserved': 100, 'rds-used-allocated': 10, 'rds-used-reserved': 10, 'region': 'eu-central-1', 'type': 'aws_limits' } else: expected = { 'created_by': 'agent', 'ec2-max-instances': 40, 'ec2-used-instances': 2, 'ec2-max-spot-instances': 20, 'ec2-used-spot-instances': 1, 'elb-max-count': 20, 'elb-used-count': 3, 'id': 'aws-limits[aws:1234:eu-central-1]', 'infrastructure_account': 'aws:1234', 'region': 'eu-central-1', 'type': 'aws_limits' } assert expected == limits calls = [ call('ec2', region_name=REGION), call('rds', region_name=REGION), call('autoscaling', region_name=REGION), call('iam', region_name=REGION), ] boto.assert_has_calls(calls)
def main(): argp = argparse.ArgumentParser(description='ZMON AWS Agent') argp.add_argument('-e', '--entity-service', dest='entityservice') argp.add_argument('-r', '--region', dest='region', default=None) argp.add_argument('-j', '--json', dest='json', action='store_true') argp.add_argument('-t', '--tracer', dest='tracer', default=os.environ.get('OPENTRACING_TRACER', 'noop')) argp.add_argument('--no-oauth2', dest='disable_oauth2', action='store_true', default=False) argp.add_argument('--postgresql-user', dest='postgresql_user', default=os.environ.get('AGENT_POSTGRESQL_USER')) argp.add_argument('--postgresql-pass', dest='postgresql_pass', default=os.environ.get('AGENT_POSTGRESQL_PASS')) args = argp.parse_args() if not args.disable_oauth2: tokens.configure() tokens.manage('uid', ['uid']) tokens.start() init_opentracing_tracer(args.tracer) root_span = opentracing.tracer.start_span( operation_name='aws_entity_discovery') with root_span: logging.basicConfig(level=logging.INFO) # 0. Fetch extra data for entities entity_extras = {} for ex in os.getenv('EXTRA_ENTITY_FIELDS', '').split(','): if '=' not in ex: continue k, v = ex.split('=', 1) if k and v: entity_extras[k] = v # 1. Determine region if not args.region: logger.info('Trying to figure out region..') try: response = requests.get( 'http://169.254.169.254/latest/meta-data/placement/availability-zone', timeout=2) except Exception: root_span.set_tag('error', True) root_span.log_kv({'exception': traceback.format_exc()}) logger.exception( 'Region was not specified as a parameter and' + 'can not be fetched from instance meta-data!') raise region = response.text[:-1] else: region = args.region root_span.set_tag('region', region) logger.info('Using region: {}'.format(region)) logger.info('Entity service URL: %s', args.entityservice) logger.info('Reading DNS data for hosted zones') aws.populate_dns_data() aws_account_id = aws.get_account_id(region) infrastructure_account = 'aws:{}'.format( aws_account_id) if aws_account_id else None if not infrastructure_account: logger.error( 'AWS agent: Cannot determine infrastructure account ID. Terminating!' ) return root_span.set_tag('account', infrastructure_account) # 2. ZMON entities if not args.disable_oauth2: token = os.getenv('ZMON_TOKEN', None) or tokens.get('uid') zmon_client = Zmon(args.entityservice, token=token, user_agent=get_user_agent()) query = { 'infrastructure_account': infrastructure_account, 'region': region, 'created_by': 'agent' } entities = zmon_client.get_entities(query) # 3. Get running apps apps = aws.get_running_apps(region, entities) elbs = [] scaling_groups = [] rds = [] elasticaches = [] dynamodbs = [] sqs = [] new_entities = [] to_be_removed = [] if len(apps) > 0: elbs = aws.get_running_elbs(region, infrastructure_account) scaling_groups = aws.get_auto_scaling_groups( region, infrastructure_account) rds = aws.get_rds_instances(region, infrastructure_account, entities) elasticaches = aws.get_elasticache_nodes(region, infrastructure_account) dynamodbs = aws.get_dynamodb_tables(region, infrastructure_account) certificates = aws.get_certificates(region, infrastructure_account) aws_limits = aws.get_limits(region, infrastructure_account, apps, elbs, entities) sqs = aws.get_sqs_queues(region, infrastructure_account, entities) postgresql_clusters = postgresql.get_postgresql_clusters( region, infrastructure_account, scaling_groups, apps) account_alias = aws.get_account_alias(region) ia_entity = { 'type': 'local', 'infrastructure_account': infrastructure_account, 'account_alias': account_alias, 'region': region, 'id': 'aws-ac[{}:{}]'.format(infrastructure_account, region), 'created_by': 'agent', } account_alias_prefix = os.getenv('ACCOUNT_ALIAS_PREFIX', None) owner = account_alias if account_alias_prefix: owner = owner.replace(account_alias_prefix, '', 1) root_span.set_tag('team', owner) application_entities = aws.get_apps_from_entities( apps, infrastructure_account, region) if args.postgresql_user and args.postgresql_pass: postgresql_databases = postgresql.get_databases_from_clusters( postgresql_clusters, infrastructure_account, region, args.postgresql_user, args.postgresql_pass) else: # Pretend the list of DBs is empty, but also make sure we don't remove # any pre-existing database entities because we don't know about them. postgresql_databases = [] entities = [ e for e in entities if e.get('type') != 'postgresql_database' ] current_entities = (elbs + scaling_groups + apps + application_entities + rds + postgresql_databases + postgresql_clusters + elasticaches + dynamodbs + certificates + sqs) current_entities.append(aws_limits) current_entities.append(ia_entity) for entity in current_entities: entity.update(entity_extras) # 4. Removing misssing entities existing_ids = get_existing_ids(entities) current_entities_ids = {e['id'] for e in current_entities} to_be_removed, delete_error_count = remove_missing_entities( existing_ids, current_entities_ids, zmon_client, json=args.json) root_span.log_kv({'total_entitites': str(len(current_entities))}) root_span.log_kv({'removed_entities': str(len(to_be_removed))}) logger.info( 'Found {} removed entities from {} entities ({} failed)'.format( len(to_be_removed), len(current_entities), delete_error_count)) # 5. Get new/updated entities new_entities, add_error_count = add_new_entities(current_entities, entities, zmon_client, json=args.json) root_span.log_kv({'new_entities': str(len(new_entities))}) logger.info( 'Found {} new entities from {} entities ({} failed)'.format( len(new_entities), len(current_entities), add_error_count)) # 6. Always add Local entity if not args.json: ia_entity['errors'] = { 'delete_count': delete_error_count, 'add_count': add_error_count } update_local_entity(zmon_client, ia_entity) types = { e['type']: len([t for t in new_entities if t['type'] == e['type']]) for e in new_entities } for t, v in types.items(): logger.info('Found {} new entities of type: {}'.format(v, t)) # Check if it is a dry run! if args.json: d = { 'applications': application_entities, 'apps': apps, 'dynamodb': dynamodbs, 'elbs': elbs, 'elc': elasticaches, 'rds': rds, 'certificates': certificates, 'aws_limits': aws_limits, 'sqs_queues': sqs, 'new_entities': new_entities, 'to_be_removed': to_be_removed, 'posgresql_clusters': postgresql_clusters } print(json.dumps(d, indent=4))
def main(): argp = argparse.ArgumentParser(description='ZMON AWS Agent') argp.add_argument('-e', '--entity-service', dest='entityservice') argp.add_argument('-r', '--region', dest='region', default=None) argp.add_argument('-j', '--json', dest='json', action='store_true') argp.add_argument('-t', '--tracer', dest='tracer', default=os.environ.get('OPENTRACING_TRACER', 'noop')) argp.add_argument('--no-oauth2', dest='disable_oauth2', action='store_true', default=False) argp.add_argument('--postgresql-user', dest='postgresql_user', default=os.environ.get('AGENT_POSTGRESQL_USER')) argp.add_argument('--postgresql-pass', dest='postgresql_pass', default=os.environ.get('AGENT_POSTGRESQL_PASS')) args = argp.parse_args() if not args.disable_oauth2: tokens.configure() tokens.manage('uid', ['uid']) tokens.start() init_opentracing_tracer(args.tracer) root_span = opentracing.tracer.start_span(operation_name='aws_entity_discovery') with root_span: logging.basicConfig(level=logging.INFO) # 0. Fetch extra data for entities entity_extras = {} for ex in os.getenv('EXTRA_ENTITY_FIELDS', '').split(','): if '=' not in ex: continue k, v = ex.split('=', 1) if k and v: entity_extras[k] = v # 1. Determine region if not args.region: logger.info('Trying to figure out region..') try: response = requests.get('http://169.254.169.254/latest/meta-data/placement/availability-zone', timeout=2) except Exception: root_span.set_tag('error', True) root_span.log_kv({'exception': traceback.format_exc()}) logger.exception('Region was not specified as a parameter and' + 'can not be fetched from instance meta-data!') raise region = response.text[:-1] else: region = args.region root_span.set_tag('region', region) logger.info('Using region: {}'.format(region)) logger.info('Entity service URL: %s', args.entityservice) logger.info('Reading DNS data for hosted zones') aws.populate_dns_data() aws_account_id = aws.get_account_id(region) infrastructure_account = 'aws:{}'.format(aws_account_id) if aws_account_id else None if not infrastructure_account: logger.error('AWS agent: Cannot determine infrastructure account ID. Terminating!') return root_span.set_tag('account', infrastructure_account) # 2. ZMON entities if not args.disable_oauth2: token = os.getenv('ZMON_TOKEN', None) or tokens.get('uid') zmon_client = Zmon(args.entityservice, token=token, user_agent=get_user_agent()) query = {'infrastructure_account': infrastructure_account, 'region': region, 'created_by': 'agent'} entities = zmon_client.get_entities(query) # 3. Get running apps apps = aws.get_running_apps(region, entities) elbs = [] scaling_groups = [] elastigroups = [] certificates = [] rds = [] elasticaches = [] dynamodbs = [] sqs = [] postgresql_clusters = [] aws_limits = [] new_entities = [] to_be_removed = [] if len(apps) > 0: elbs = aws.get_running_elbs(region, infrastructure_account) scaling_groups = aws.get_auto_scaling_groups(region, infrastructure_account) elastigroups = elastigroup.get_elastigroup_entities(region, infrastructure_account) rds = aws.get_rds_instances(region, infrastructure_account, entities) elasticaches = aws.get_elasticache_nodes(region, infrastructure_account) dynamodbs = aws.get_dynamodb_tables(region, infrastructure_account) certificates = aws.get_certificates(region, infrastructure_account) aws_limits = aws.get_limits(region, infrastructure_account, apps, elbs, entities) sqs = aws.get_sqs_queues(region, infrastructure_account, entities) postgresql_clusters = postgresql.get_postgresql_clusters(region, infrastructure_account, scaling_groups, apps) account_alias = aws.get_account_alias(region) ia_entity = { 'type': 'local', 'infrastructure_account': infrastructure_account, 'account_alias': account_alias, 'region': region, 'id': 'aws-ac[{}:{}]'.format(infrastructure_account, region), 'created_by': 'agent', } account_alias_prefix = os.getenv('ACCOUNT_ALIAS_PREFIX', None) owner = account_alias if account_alias_prefix: owner = owner.replace(account_alias_prefix, '', 1) root_span.set_tag('team', owner) application_entities = aws.get_apps_from_entities(apps, infrastructure_account, region) if args.postgresql_user and args.postgresql_pass: postgresql_databases = postgresql.get_databases_from_clusters(postgresql_clusters, infrastructure_account, region, args.postgresql_user, args.postgresql_pass) else: # Pretend the list of DBs is empty, but also make sure we don't remove # any pre-existing database entities because we don't know about them. postgresql_databases = [] entities = [e for e in entities if e.get('type') != 'postgresql_database'] current_entities = ( elbs + scaling_groups + elastigroups + apps + application_entities + rds + postgresql_databases + postgresql_clusters + elasticaches + dynamodbs + certificates + sqs) current_entities.append(aws_limits) current_entities.append(ia_entity) for entity in current_entities: entity.update(entity_extras) # 4. Removing misssing entities existing_ids = get_existing_ids(entities) current_entities_ids = {e['id'] for e in current_entities} to_be_removed, delete_error_count = remove_missing_entities( existing_ids, current_entities_ids, zmon_client, json=args.json) root_span.log_kv({'total_entitites': str(len(current_entities))}) root_span.log_kv({'removed_entities': str(len(to_be_removed))}) logger.info('Found {} removed entities from {} entities ({} failed)'.format( len(to_be_removed), len(current_entities), delete_error_count)) # 5. Get new/updated entities new_entities, add_error_count = add_new_entities(current_entities, entities, zmon_client, json=args.json) root_span.log_kv({'new_entities': str(len(new_entities))}) logger.info('Found {} new entities from {} entities ({} failed)'.format( len(new_entities), len(current_entities), add_error_count)) # 6. Always add Local entity if not args.json: ia_entity['errors'] = {'delete_count': delete_error_count, 'add_count': add_error_count} update_local_entity(zmon_client, ia_entity) types = {e['type']: len([t for t in new_entities if t['type'] == e['type']]) for e in new_entities} for t, v in types.items(): logger.info('Found {} new entities of type: {}'.format(v, t)) # Check if it is a dry run! if args.json: d = { 'applications': application_entities, 'apps': apps, 'elastigroups': elastigroups, 'dynamodb': dynamodbs, 'elbs': elbs, 'elc': elasticaches, 'rds': rds, 'certificates': certificates, 'aws_limits': aws_limits, 'sqs_queues': sqs, 'new_entities': new_entities, 'to_be_removed': to_be_removed, 'posgresql_clusters': postgresql_clusters } print(json.dumps(d, indent=4))