def create_log_tables(config):
    """Create an Athena table for every log source enabled for Firehose

    Args:
        config (CLIConfig): Loaded StreamAlert config

    Returns:
        bool: True if Firehose is not enabled or every table was created,
            False as soon as one call to create_table() fails
    """
    firehose_config = config['global']['infrastructure'].get('firehose', {})
    if not firehose_config.get('enabled'):
        # Nothing to create when Firehose is disabled (or not configured)
        return True

    # Data bucket name is '<account prefix>-<suffix>', with a default suffix
    bucket_suffix = firehose_config.get('s3_bucket_suffix', 'streamalert-data')
    bucket_name = '{}-{}'.format(config['global']['account']['prefix'], bucket_suffix)

    enabled_logs = FirehoseClient.load_enabled_log_sources(firehose_config, config['logs'])

    # all() stops at the first failed table, mirroring an early 'return False'
    return all(
        create_table(log_stream_name, bucket_name, config)
        for log_stream_name in enabled_logs
    )
def generate_data_table_schema(config, table, schema_override=None):
    """Generate the schema for a data table in terraform

    Args:
        config (CLIConfig): Loaded StreamAlert config
        table (string): The name of data table
        schema_override (iterable): Optional 'column_name=column_type' strings
            used to replace the type of columns already present in the schema

    Returns:
        The result of format_schema_tf() for the generated Athena schema, or
        None if the table is not an enabled Firehose log source
    """
    enabled_logs = FirehoseClient.load_enabled_log_sources(
        config['global']['infrastructure']['firehose'],
        config['logs']
    )

    # Table names are looked up using their sanitized form
    table_name = FirehoseClient.sanitized_value(table)

    # Bail out early if this log type is not enabled via Firehose
    if table_name not in enabled_logs:
        LOGGER.error(
            'Table name %s missing from configuration or '
            'is not enabled.', table_name)
        return None

    log_info = config['logs'][enabled_logs.get(table_name)]

    athena_schema = logs_schema_to_athena_schema(
        FirehoseClient.sanitize_keys(dict(log_info['schema'])), False)

    # Envelope keys, when configured, are added under a dedicated schema key
    envelope_keys = (log_info.get('configuration') or {}).get('envelope_keys')
    if envelope_keys:
        athena_schema['streamalert:envelope_keys'] = logs_schema_to_athena_schema(
            FirehoseClient.sanitize_keys(envelope_keys), False)

    # Schema overrides allow the Athena schema to differ from the log schema
    for override in schema_override or ():
        column_name, column_type = override.split('=')
        # The column name is used verbatim here (the format adds no escaping)
        column_name = '{}'.format(column_name)

        if column_name not in athena_schema:
            LOGGER.error(
                'Schema override column %s not found in Athena Schema, skipping',
                column_name)
            continue

        athena_schema[column_name] = column_type
        LOGGER.info('Applied schema override: %s:%s', column_name, column_type)

    return format_schema_tf(athena_schema)
def test_load_enabled_sources_invalid_log_subtype(self, log_mock):
    """FirehoseClient - Load Enabled Log Sources, Invalid Log Sub-type"""
    # The enabled sub-type below has no matching entry in the logs config
    undeclared_log = 'log_type_01:sub_type_02'

    result = FirehoseClient.load_enabled_log_sources(
        {'enabled_logs': [undeclared_log]},
        {'log_type_01:sub_type_01': {}}
    )

    # Nothing should be loaded, and the undeclared log should be reported
    # (log_mock is presumably supplied by a patch decorator outside this view)
    assert_equal(result, {})
    log_mock.assert_called_with(
        'Enabled Firehose log %s not declared in logs.json', undeclared_log)
def test_load_enabled_sources(self):
    """FirehoseClient - Load Enabled Log Sources"""
    declared_logs = {
        'log_type_01:sub_type_01': {},
        'log_type_01:sub_type_02': {},  # This log type is not enabled
        'log_type_02:sub_type_01': {},
        'log_type_02:sub_type_02': {},
    }

    enabled = [
        'log_type_01:sub_type_01',  # One log for log_type_01
        'log_type_02',  # All of log_type_02
    ]

    result = FirehoseClient.load_enabled_log_sources(
        {'enabled_logs': enabled}, declared_logs)

    # Keys are the sanitized names, values are the original log type names
    assert_equal(
        result,
        {
            'log_type_01_sub_type_01': 'log_type_01:sub_type_01',
            'log_type_02_sub_type_01': 'log_type_02:sub_type_01',
            'log_type_02_sub_type_02': 'log_type_02:sub_type_02',
        }
    )
def create_table(table, bucket, config, schema_override=None):
    """Create a 'streamalert' Athena table

    Args:
        table (str): The name of the table being created; 'alerts' is handled
            separately from the log-type tables
        bucket (str): The s3 bucket to be used as the location for Athena data.
            If falsy, the default alerts or data bucket is used instead
        config (CLIConfig): Loaded StreamAlert config
        schema_override (set): An optional set of key=value pairs to be used for
            overriding the configured column_name=value_type.

    Returns:
        bool: False if errors occurred, True otherwise (including when the
            table already exists)
    """
    enabled_logs = FirehoseClient.load_enabled_log_sources(
        config['global']['infrastructure']['firehose'],
        config['logs']
    )

    # Special characters in the table name are converted by the sanitizer
    table_name = FirehoseClient.sanitized_value(table)

    # Anything other than 'alerts' must be a log type enabled via Firehose
    if not (table_name == 'alerts' or table_name in enabled_logs):
        LOGGER.error(
            'Table name %s missing from configuration or '
            'is not enabled.', table_name)
        return False

    athena_client = get_athena_client(config)

    # An existing table is treated as success
    if athena_client.check_table_exists(table_name):
        LOGGER.info('The \'%s\' table already exists.', table_name)
        return True

    is_alerts_table = table == 'alerts'

    if is_alerts_table:
        # Build a throwaway alert purely to discover the output keys and types
        placeholder_alert = Alert('temp_rule_name', {}, {})
        athena_schema = helpers.logs_schema_to_athena_schema(
            record_to_schema(placeholder_alert.output_dict()))

        # Fall back to the default alerts bucket when none was supplied
        bucket = bucket or firehose_alerts_bucket(config)

        query = _construct_create_table_statement(
            schema=athena_schema,
            table_name=table,
            bucket=bucket,
            file_format=get_data_file_format(config)
        )
    else:
        # Every other table corresponds to a log type
        config_data_bucket = firehose_data_bucket(config)
        if not config_data_bucket:
            LOGGER.warning('The \'firehose\' module is not enabled in global.json')
            return False

        # Fall back to the default data bucket when none was supplied
        bucket = bucket or config_data_bucket

        log_info = config['logs'][enabled_logs.get(table_name)]

        athena_schema = helpers.logs_schema_to_athena_schema(
            FirehoseClient.sanitize_keys(dict(log_info['schema'])))

        # Envelope keys, when configured, get their own backtick-wrapped column
        envelope_keys = (log_info.get('configuration') or {}).get('envelope_keys')
        if envelope_keys:
            athena_schema['`streamalert:envelope_keys`'] = (
                helpers.logs_schema_to_athena_schema(
                    FirehoseClient.sanitize_keys(envelope_keys)))

        # Schema overrides allow the Athena schema to differ from the log schema
        for override in schema_override or ():
            column_name, column_type = override.split('=')
            # Overrides are matched against backtick-wrapped column names
            column_name = '`{}`'.format(column_name)

            if column_name not in athena_schema:
                LOGGER.error(
                    'Schema override column %s not found in Athena Schema, skipping',
                    column_name)
                continue

            athena_schema[column_name] = column_type
            LOGGER.info('Applied schema override: %s:%s', column_name, column_type)

        query = _construct_create_table_statement(
            schema=athena_schema,
            table_name=table_name,
            bucket=bucket,
            file_format=get_data_file_format(config)
        )

    if not athena_client.run_query(query=query):
        LOGGER.error('The %s table could not be created', table_name)
        return False

    # Record non-default data buckets in the CLI config; the default/configured
    # data bucket is never added
    if not is_alerts_table and bucket != config_data_bucket:
        partitioner_config = config['lambda']['athena_partitioner_config']
        # 'buckets' is optional in the config, so make sure it exists first
        known_buckets = partitioner_config.setdefault('buckets', {})
        if bucket not in known_buckets:
            known_buckets[bucket] = 'data'
            config.write()

    LOGGER.info('The %s table was successfully created!', table_name)

    return True
def generate_firehose(logging_bucket, main_dict, config):
    """Generate the Firehose Terraform modules

    Adds one setup module plus one delivery stream module per enabled log
    to main_dict['module']. Does nothing if Firehose is not enabled.

    Args:
        logging_bucket (str): The name of the global logging bucket
        main_dict (infinitedict): The Dict to marshal to a file
        config (CLIConfig): The loaded StreamAlert Config
    """
    if not config['global']['infrastructure'].get('firehose', {}).get('enabled'):
        return

    prefix = config['global']['account']['prefix']

    # This can return False, but the enabled check above should prevent that
    data_bucket = firehose_data_bucket(config)

    firehose_conf = config['global']['infrastructure']['firehose']

    # Firehose Setup module
    setup_module = {
        'source': './modules/tf_kinesis_firehose_setup',
        'account_id': config['global']['account']['aws_account_id'],
        'prefix': prefix,
        'region': config['global']['account']['region'],
        's3_logging_bucket': logging_bucket,
        's3_bucket_name': data_bucket,
        'kms_key_id': '${aws_kms_key.server_side_encryption.key_id}'
    }
    main_dict['module']['kinesis_firehose_setup'] = setup_module

    enabled_logs = FirehoseClient.load_enabled_log_sources(
        firehose_conf,
        config['logs'],
        force_load=True
    )

    log_alarms_config = firehose_conf.get('enabled_logs', {})

    db_name = get_database_name(config)

    # The account prefix is only part of the stream name when 'use_prefix' is on
    firehose_prefix = prefix if firehose_conf.get('use_prefix', True) else ''

    # (alarm config key, terraform module variable) pairs. The terraform module
    # defines defaults for these, so they are only set when explicitly specified
    optional_alarm_settings = (
        ('log_min_count_threshold', 'alarm_threshold'),
        ('evaluation_periods', 'evaluation_periods'),
        ('period_seconds', 'period_seconds'),
    )

    # Add the Delivery Streams individually
    for log_stream_name, log_type_name in enabled_logs.items():
        module_dict = {
            'source': './modules/tf_kinesis_firehose_delivery_stream',
            'buffer_size': firehose_conf.get('buffer_size'),
            'buffer_interval': firehose_conf.get('buffer_interval', 300),
            'file_format': get_data_file_format(config),
            'stream_name': FirehoseClient.generate_firehose_name(
                firehose_prefix, log_stream_name),
            'role_arn': '${module.kinesis_firehose_setup.firehose_role_arn}',
            's3_bucket_name': data_bucket,
            'kms_key_arn': '${aws_kms_key.server_side_encryption.arn}',
            'glue_catalog_db_name': db_name,
            'glue_catalog_table_name': log_stream_name,
            'schema': generate_data_table_schema(config, log_type_name)
        }

        # Prefer alarm info for this specific log type, then fall back on the
        # parent log type (the part before the ':')
        alarm_info = log_alarms_config.get(log_type_name)
        if not alarm_info and ':' in log_type_name:
            alarm_info = log_alarms_config.get(log_type_name.partition(':')[0])

        if alarm_info and alarm_info.get('enable_alarm'):
            module_dict['enable_alarm'] = True

            for config_key, module_key in optional_alarm_settings:
                setting = alarm_info.get(config_key)
                if setting:
                    module_dict[module_key] = setting

            # Alarm actions are always passed as a list; without any configured,
            # the monitoring topic is used
            alarm_actions = alarm_info.get('alarm_actions')
            if not alarm_actions:
                module_dict['alarm_actions'] = [monitoring_topic_arn(config)]
            elif isinstance(alarm_actions, list):
                module_dict['alarm_actions'] = alarm_actions
            else:
                module_dict['alarm_actions'] = [alarm_actions]

        main_dict['module']['kinesis_firehose_{}'.format(log_stream_name)] = module_dict