def __init__(self):
    config = load_config(include={'lambda.json', 'global.json'})
    prefix = config['global']['account']['prefix']
    athena_config = config['lambda']['athena_partitioner_config']

    self._file_format = get_data_file_format(config)

    if self._file_format == 'parquet':
        self._alerts_regex = self.ALERTS_REGEX_PARQUET
        self._data_regex = self.DATA_REGEX_PARQUET
    elif self._file_format == 'json':
        self._alerts_regex = self.ALERTS_REGEX
        self._data_regex = self.DATA_REGEX
    else:
        message = (
            'file format "{}" is not supported. Supported file formats are '
            '"parquet" and "json". Please update the setting in athena_partitioner_config '
            'in "conf/lambda.json"'.format(self._file_format)
        )
        raise ConfigError(message)

    self._athena_buckets = athena_partition_buckets(config)

    db_name = get_database_name(config)

    # Get the S3 bucket to store Athena query results
    results_bucket = athena_config.get(
        'results_bucket',
        's3://{}-streamalert-athena-results'.format(prefix)
    )

    self._s3_buckets_and_keys = defaultdict(set)

    self._create_client(db_name, results_bucket)
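# A minimal sketch of the conf/lambda.json block this constructor reads,
# assuming the default results-bucket naming convention; the prefix-derived
# bucket name below is hypothetical and 'results_bucket' is optional:
example_lambda_config = {
    'athena_partitioner_config': {
        'file_format': 'parquet',  # must be "parquet" or "json"
        'results_bucket': 's3://acme-streamalert-athena-results'  # optional override
    }
}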
def handler(cls, options, config):
    """Run Terraform with an optional set of targets and clusters

    Args:
        options (argparse.Namespace): Parsed arguments from manage.py
        config (CLIConfig): Loaded StreamAlert config

    Returns:
        bool: False if errors occurred, True otherwise
    """
    if not terraform_generate_handler(config=config):
        return False

    # Create the log tables only when file_format is set to "json", and return
    # early if log table creation fails. This capability will be phased out in
    # a future release.
    if get_data_file_format(config) == 'json' and not create_log_tables(config=config):
        return False

    target_modules, valid = _get_valid_tf_targets(config, options.target)
    if not valid:
        return False

    return terraform_runner(config, targets=target_modules if target_modules else None)
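# Hypothetical CLI usage for this handler; the module target name below is
# illustrative, and --target may be omitted to build everything:
#
#   python manage.py build --target kinesis_firehose_setup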
def _generate_global_module(config):
    # 2019-08-22 (Ryxias)
    # In version 3.0.0+, StreamAlert will default to always using the prefix when "use_prefix"
    # is not present.
    #
    # Refer to this PR for more information: https://github.com/airbnb/streamalert/pull/979
    use_prefix = config['global']['infrastructure'].get('classifier_sqs', {}).get(
        'use_prefix', True)

    global_module = {
        'source': './modules/tf_globals',
        'account_id': config['global']['account']['aws_account_id'],
        'region': config['global']['account']['region'],
        'prefix': config['global']['account']['prefix'],
        'kms_key_arn': '${aws_kms_key.server_side_encryption.arn}',
        'rules_engine_timeout': config['lambda']['rules_engine_config']['timeout'],
        'sqs_use_prefix': use_prefix,
        'alerts_db_name': get_database_name(config),
        'alerts_file_format': get_data_file_format(config),
        'alerts_schema': generate_alerts_table_schema()
    }

    # The below code applies settings for resources only if the settings are explicitly
    # defined. This is because these resources have defaults defined in the
    # ./modules/tf_globals module. This allows for overriding these settings, but
    # avoids storing defaults in multiple locations.
    if 'alerts_table' in config['global']['infrastructure']:
        for setting in {'read_capacity', 'write_capacity'}:
            value = config['global']['infrastructure']['alerts_table'].get(setting)
            if value:
                global_module['alerts_table_{}'.format(setting)] = value

    alert_fh_settings_with_defaults = {
        'bucket_name',
        'buffer_size',
        'buffer_interval',
        'cloudwatch_log_retention'
    }

    if 'alerts_firehose' in config['global']['infrastructure']:
        for setting in alert_fh_settings_with_defaults:
            value = config['global']['infrastructure']['alerts_firehose'].get(setting)
            if not value:
                continue

            global_module['alerts_firehose_{}'.format(setting)] = value

    if 'rule_staging' in config['global']['infrastructure']:
        if config['global']['infrastructure']['rule_staging'].get('enabled'):
            global_module['enable_rule_staging'] = True
            for setting in {'table_read_capacity', 'table_write_capacity'}:
                value = config['global']['infrastructure']['rule_staging'].get(setting)
                if value:
                    # Defaults are set for this in the terraform module, so only
                    # pass the value through when it is explicitly configured
                    global_module['rules_{}'.format(setting)] = value

    return global_module
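# A sketch of the module block this function emits for ./modules/tf_globals,
# assuming rule staging is enabled and no capacity overrides; the account
# values are hypothetical and 'alerts_schema' is omitted for brevity:
example_global_module = {
    'source': './modules/tf_globals',
    'account_id': '123456789012',
    'region': 'us-east-1',
    'prefix': 'acme',
    'kms_key_arn': '${aws_kms_key.server_side_encryption.arn}',
    'rules_engine_timeout': 300,
    'sqs_use_prefix': True,
    'alerts_db_name': 'acme_streamalert',
    'alerts_file_format': 'parquet',
    'enable_rule_staging': True
}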
def generate_global_lambda_settings(config, conf_name, generate_func, tf_tmp_file_name,
                                    required=True):
    """Generate settings for global Lambda functions

    Args:
        config (dict): Lambda function settings read from the 'conf/' directory
        conf_name (str): Key name of the Lambda function settings in the config
        generate_func (func): Method that generates the Lambda function settings
        tf_tmp_file_name (str): Filename of the Terraform file generated by the CLI
        required (bool): Whether the configuration must be present in the config
    """
    if conf_name == 'athena_partitioner_config':
        # Raise a ConfigError when the user hasn't explicitly set `file_format`
        # in `athena_partitioner_config` in conf/lambda.json when upgrading to v3.1.0
        file_format = get_data_file_format(config)

        if not file_format or file_format not in ('parquet', 'json'):
            message = (
                '"file_format" must be explicitly set for '
                'athena_partitioner_config in "conf/lambda.json" when upgrading to v3.1.0. '
                'Available values are "parquet" and "json". For more information, refer to '
                'https://github.com/airbnb/streamalert/issues/1143. '
                'In a future release, the default value of "file_format" will '
                'be changed to "parquet".'
            )
            raise ConfigError(message)

    tf_tmp_file = os.path.join(config.build_directory, '{}.tf.json'.format(tf_tmp_file_name))

    if required and conf_name not in config['lambda']:
        message = 'Required configuration missing in lambda.json: {}'.format(conf_name)
        raise ConfigError(message)

    if not config['lambda'].get(conf_name):
        LOGGER.warning('Optional configuration missing in lambda.json, skipping: %s', conf_name)
        remove_temp_terraform_file(tf_tmp_file)
        return

    if config['lambda'][conf_name].get('enabled', True):
        generated_config = generate_func(config=config)
        if generated_config:
            _create_terraform_module_file(generated_config, tf_tmp_file)
    else:
        remove_temp_terraform_file(tf_tmp_file)
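# A hypothetical call for the Athena partitioner settings; generate_athena and
# the 'athena' temp-file name are assumptions made for illustration, not
# confirmed call sites:
generate_global_lambda_settings(
    config,
    conf_name='athena_partitioner_config',
    generate_func=generate_athena,  # hypothetical generator function
    tf_tmp_file_name='athena'
)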
def handler(cls, options, config):
    """Initialize infrastructure using Terraform

    Args:
        config (CLIConfig): Loaded StreamAlert config

    Returns:
        bool: False if errors occurred, True otherwise
    """
    LOGGER.info('Initializing StreamAlert')

    # generate init Terraform files
    if not terraform_generate_handler(config=config, init=True):
        return False

    LOGGER.info('Initializing Terraform')
    if not run_command(['terraform', 'init'], cwd=config.build_directory):
        return False

    # build init infrastructure
    LOGGER.info('Building initial infrastructure')
    init_targets = [
        'aws_s3_bucket.lambda_source',
        'aws_s3_bucket.logging_bucket',
        'aws_s3_bucket.streamalert_secrets',
        'aws_s3_bucket.terraform_remote_state',
        'aws_s3_bucket.streamalerts',
        'aws_kms_key.server_side_encryption',
        'aws_kms_alias.server_side_encryption',
        'aws_kms_key.streamalert_secrets',
        'aws_kms_alias.streamalert_secrets',
        'module.streamalert_athena',  # required for the alerts table
        'aws_dynamodb_table.terraform_remote_state_lock'
    ]

    # This bucket must exist before the log tables can be created, but
    # shouldn't be created unless the firehose is enabled
    if config['global']['infrastructure'].get('firehose', {}).get('enabled'):
        init_targets.append('aws_s3_bucket.streamalert_data')

    if not terraform_runner(config, targets=init_targets):
        LOGGER.error('An error occurred while running StreamAlert init')
        return False

    # generate the main.tf with remote state enabled
    LOGGER.info('Configuring Terraform Remote State')
    if not terraform_generate_handler(config=config, check_tf=False, check_creds=False):
        return False

    if not run_command(['terraform', 'init'], cwd=config.build_directory):
        return False

    LOGGER.info('Deploying Lambda Functions')

    functions = ['rule', 'alert', 'alert_merger', 'athena', 'classifier']
    deploy(config, functions)

    # We need to manually create the streamalerts table since terraform does not support this
    # See: https://github.com/terraform-providers/terraform-provider-aws/issues/1486
    if get_data_file_format(config) == 'json':
        # Terraform v0.12 now supports creating Athena tables. We use the
        # terraform aws_glue_catalog_table resource to create tables only when
        # the data file_format is set to "parquet" in "athena_partitioner_config".
        #
        # For the "json" file_format, we continue to use Athena DDL queries to
        # create tables. However, this capability will be phased out in a future
        # release because we want users to take advantage of parquet performance.
        alerts_bucket = firehose_alerts_bucket(config)
        create_table('alerts', alerts_bucket, config)

        # Create the glue catalog tables for the enabled logs
        if not create_log_tables(config=config):
            return False

    LOGGER.info('Building remaining infrastructure')
    return terraform_runner(config, refresh=False)
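# A minimal sketch of the conf/global.json fragment read by the firehose check
# above; only infrastructure.firehose.enabled controls whether the data bucket
# is added to the init targets (the value shown is illustrative):
example_global_config = {
    'infrastructure': {
        'firehose': {
            'enabled': True
        }
    }
}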
def create_table(table, bucket, config, schema_override=None):
    """Create a 'streamalert' Athena table

    Args:
        table (str): The name of the table being rebuilt
        bucket (str): The s3 bucket to be used as the location for Athena data
        config (CLIConfig): Loaded StreamAlert config
        schema_override (set): An optional set of key=value pairs to be used for
            overriding the configured column_name=value_type

    Returns:
        bool: False if errors occurred, True otherwise
    """
    enabled_logs = FirehoseClient.load_enabled_log_sources(
        config['global']['infrastructure']['firehose'],
        config['logs']
    )

    # Convert special characters in the table name to underscores
    sanitized_table_name = FirehoseClient.sanitized_value(table)

    # Check that the log type is enabled via Firehose
    if sanitized_table_name != 'alerts' and sanitized_table_name not in enabled_logs:
        LOGGER.error(
            'Table name %s missing from configuration or is not enabled.',
            sanitized_table_name
        )
        return False

    athena_client = get_athena_client(config)

    # Check if the table exists
    if athena_client.check_table_exists(sanitized_table_name):
        LOGGER.info('The \'%s\' table already exists.', sanitized_table_name)
        return True

    if table == 'alerts':
        # Get a fake alert so we can get the keys needed and their types
        alert = Alert('temp_rule_name', {}, {})
        output = alert.output_dict()
        schema = record_to_schema(output)
        athena_schema = helpers.logs_schema_to_athena_schema(schema)

        # Use the bucket if supplied, otherwise use the default alerts bucket
        bucket = bucket or firehose_alerts_bucket(config)

        query = _construct_create_table_statement(
            schema=athena_schema,
            table_name=table,
            bucket=bucket,
            file_format=get_data_file_format(config)
        )

    else:  # all other tables are log types
        config_data_bucket = firehose_data_bucket(config)
        if not config_data_bucket:
            LOGGER.warning('The \'firehose\' module is not enabled in global.json')
            return False

        # Use the bucket if supplied, otherwise use the default data bucket
        bucket = bucket or config_data_bucket

        log_info = config['logs'][enabled_logs.get(sanitized_table_name)]

        schema = dict(log_info['schema'])
        sanitized_schema = FirehoseClient.sanitize_keys(schema)

        athena_schema = helpers.logs_schema_to_athena_schema(sanitized_schema)

        # Add envelope keys to the Athena schema
        configuration_options = log_info.get('configuration')
        if configuration_options:
            envelope_keys = configuration_options.get('envelope_keys')
            if envelope_keys:
                sanitized_envelope_key_schema = FirehoseClient.sanitize_keys(envelope_keys)
                # Note: this key is wrapped in backticks to be Hive compliant
                athena_schema['`streamalert:envelope_keys`'] = (
                    helpers.logs_schema_to_athena_schema(sanitized_envelope_key_schema))

        # Handle schema overrides
        # This is useful when an Athena schema needs to differ from the normal log schema
        if schema_override:
            for override in schema_override:
                column_name, column_type = override.split('=')
                # Columns are escaped to avoid Hive issues with special characters
                column_name = '`{}`'.format(column_name)
                if column_name in athena_schema:
                    athena_schema[column_name] = column_type
                    LOGGER.info('Applied schema override: %s:%s', column_name, column_type)
                else:
                    LOGGER.error(
                        'Schema override column %s not found in Athena schema, skipping',
                        column_name
                    )

        query = _construct_create_table_statement(
            schema=athena_schema,
            table_name=sanitized_table_name,
            bucket=bucket,
            file_format=get_data_file_format(config)
        )

    success = athena_client.run_query(query=query)
    if not success:
        LOGGER.error('The %s table could not be created', sanitized_table_name)
        return False

    # Update the CLI config
    if table != 'alerts' and bucket != config_data_bucket:
        # Only add buckets to the config if they are not one of the default/configured buckets
        # Ensure 'buckets' exists in the config (since it is not required)
        config['lambda']['athena_partitioner_config']['buckets'] = (
            config['lambda']['athena_partitioner_config'].get('buckets', {}))
        if bucket not in config['lambda']['athena_partitioner_config']['buckets']:
            config['lambda']['athena_partitioner_config']['buckets'][bucket] = 'data'
        config.write()

    LOGGER.info('The %s table was successfully created!', sanitized_table_name)

    return True
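# A hypothetical invocation showing the schema_override format parsed above:
# each entry is 'column_name=athena_type' and the column must already exist in
# the generated schema; the table, bucket, and column names are illustrative:
create_table(
    'cloudwatch:events',
    's3://acme-streamalert-data',
    config,
    schema_override={'detail=string', 'account=string'}
)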
def generate_firehose(logging_bucket, main_dict, config):
    """Generate the Firehose Terraform modules

    Args:
        logging_bucket (str): The name of the global logging bucket
        main_dict (infinitedict): The dict to marshal to a file
        config (CLIConfig): The loaded StreamAlert config
    """
    if not config['global']['infrastructure'].get('firehose', {}).get('enabled'):
        return

    prefix = config['global']['account']['prefix']

    # This can return False, but the check above ensures that should never happen
    firehose_s3_bucket_name = firehose_data_bucket(config)

    firehose_conf = config['global']['infrastructure']['firehose']

    # Firehose Setup module
    main_dict['module']['kinesis_firehose_setup'] = {
        'source': './modules/tf_kinesis_firehose_setup',
        'account_id': config['global']['account']['aws_account_id'],
        'prefix': prefix,
        'region': config['global']['account']['region'],
        's3_logging_bucket': logging_bucket,
        's3_bucket_name': firehose_s3_bucket_name,
        'kms_key_id': '${aws_kms_key.server_side_encryption.key_id}'
    }

    enabled_logs = FirehoseClient.load_enabled_log_sources(
        firehose_conf,
        config['logs'],
        force_load=True
    )

    log_alarms_config = firehose_conf.get('enabled_logs', {})

    db_name = get_database_name(config)

    firehose_prefix = prefix if firehose_conf.get('use_prefix', True) else ''

    # Add the Delivery Streams individually
    for log_stream_name, log_type_name in enabled_logs.items():
        module_dict = {
            'source': './modules/tf_kinesis_firehose_delivery_stream',
            'buffer_size': firehose_conf.get('buffer_size'),
            'buffer_interval': firehose_conf.get('buffer_interval', 300),
            'file_format': get_data_file_format(config),
            'stream_name': FirehoseClient.generate_firehose_name(
                firehose_prefix, log_stream_name),
            'role_arn': '${module.kinesis_firehose_setup.firehose_role_arn}',
            's3_bucket_name': firehose_s3_bucket_name,
            'kms_key_arn': '${aws_kms_key.server_side_encryption.arn}',
            'glue_catalog_db_name': db_name,
            'glue_catalog_table_name': log_stream_name,
            'schema': generate_data_table_schema(config, log_type_name)
        }

        # Try to get alarm info for this specific log type
        alarm_info = log_alarms_config.get(log_type_name)
        if not alarm_info and ':' in log_type_name:
            # Fall back on looking for alarm info for the parent log type
            alarm_info = log_alarms_config.get(log_type_name.split(':')[0])

        if alarm_info and alarm_info.get('enable_alarm'):
            module_dict['enable_alarm'] = True

            # There are defaults for these defined in the terraform module, so do
            # not set the variable values unless explicitly specified
            if alarm_info.get('log_min_count_threshold'):
                module_dict['alarm_threshold'] = alarm_info.get('log_min_count_threshold')

            if alarm_info.get('evaluation_periods'):
                module_dict['evaluation_periods'] = alarm_info.get('evaluation_periods')

            if alarm_info.get('period_seconds'):
                module_dict['period_seconds'] = alarm_info.get('period_seconds')

            if alarm_info.get('alarm_actions'):
                if not isinstance(alarm_info.get('alarm_actions'), list):
                    module_dict['alarm_actions'] = [alarm_info.get('alarm_actions')]
                else:
                    module_dict['alarm_actions'] = alarm_info.get('alarm_actions')
            else:
                module_dict['alarm_actions'] = [monitoring_topic_arn(config)]

        main_dict['module']['kinesis_firehose_{}'.format(log_stream_name)] = module_dict
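# A hedged sketch of the "firehose" block in conf/global.json that drives this
# generator; the log name, threshold, and SNS topic ARN are hypothetical:
example_firehose_conf = {
    'enabled': True,
    'use_prefix': True,
    'buffer_size': 64,
    'buffer_interval': 300,
    'enabled_logs': {
        'cloudwatch': {
            'enable_alarm': True,
            'log_min_count_threshold': 1000,  # optional; module default if omitted
            'alarm_actions': 'arn:aws:sns:us-east-1:123456789012:monitoring'
        }
    }
}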