Example #1
def create_log_tables(config):
    """Create all tables needed for historical search
    Args:
        config (CLIConfig): Loaded StreamAlert config
    Returns:
        bool: False if errors occurred, True otherwise
    """
    if not config['global']['infrastructure'].get('firehose', {}).get('enabled'):
        return True

    firehose_config = config['global']['infrastructure']['firehose']
    firehose_s3_bucket_suffix = firehose_config.get('s3_bucket_suffix',
                                                    'streamalert-data')
    firehose_s3_bucket_name = '{}-{}'.format(
        config['global']['account']['prefix'], firehose_s3_bucket_suffix)

    enabled_logs = FirehoseClient.load_enabled_log_sources(
        config['global']['infrastructure']['firehose'], config['logs'])

    for log_stream_name in enabled_logs:
        if not create_table(log_stream_name, firehose_s3_bucket_name, config):
            return False

    return True
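
For reference, here is a minimal sketch of the configuration fragment create_log_tables reads; the key names come from the code above, while the values are illustrative only:

# Illustrative fragment of config['global'] that create_log_tables inspects;
# 'enabled' gates the whole function, and 's3_bucket_suffix' is optional with
# a fallback to 'streamalert-data' in the code above
config_global = {
    'account': {'prefix': 'acme'},
    'infrastructure': {
        'firehose': {
            'enabled': True,
            's3_bucket_suffix': 'streamalert-data',
        }
    }
}
# With this fragment, firehose_s3_bucket_name would be 'acme-streamalert-data'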
Example #2
def generate_data_table_schema(config, table, schema_override=None):
    """Generate the schema for data table in terraform

    Args:
        config (CLIConfig): Loaded StreamAlert config
        table (string): The name of data table

    Returns:
        athena_schema (dict): Equivalent Athena schema used for generating create table statement
    """
    enabled_logs = FirehoseClient.load_enabled_log_sources(
        config['global']['infrastructure']['firehose'], config['logs'])

    # Convert special characters in schema name to underscores
    sanitized_table_name = FirehoseClient.sanitized_value(table)

    # Check that the log type is enabled via Firehose
    if sanitized_table_name not in enabled_logs:
        LOGGER.error(
            'Table name %s missing from configuration or '
            'is not enabled.', sanitized_table_name)
        return None

    log_info = config['logs'][enabled_logs.get(sanitized_table_name)]

    schema = dict(log_info['schema'])
    sanitized_schema = FirehoseClient.sanitize_keys(schema)

    athena_schema = logs_schema_to_athena_schema(sanitized_schema, False)

    # Add envelope keys to Athena Schema
    configuration_options = log_info.get('configuration')
    if configuration_options:
        envelope_keys = configuration_options.get('envelope_keys')
        if envelope_keys:
            sanitized_envelope_key_schema = FirehoseClient.sanitize_keys(
                envelope_keys)
            # Note: unlike the Athena DDL path in create_table, this key is
            # not wrapped in backticks here
            athena_schema[
                'streamalert:envelope_keys'] = logs_schema_to_athena_schema(
                    sanitized_envelope_key_schema, False)

    # Handle Schema overrides
    #   This is useful when an Athena schema needs to differ from the normal log schema
    if schema_override:
        for override in schema_override:
            column_name, column_type = override.split('=')
            # Unlike the Athena DDL path in create_table, column names are
            # not backtick-escaped here
            if column_name in athena_schema:
                athena_schema[column_name] = column_type
                LOGGER.info('Applied schema override: %s:%s', column_name,
                            column_type)
            else:
                LOGGER.error(
                    'Schema override column %s not found in Athena Schema, skipping',
                    column_name)

    return format_schema_tf(athena_schema)
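
A hypothetical invocation of the function above; it assumes a loaded CLIConfig named config with a 'cloudwatch:events' log enabled via Firehose, and the override shown is an example only:

# Hypothetical call; the log name and override are illustrative
tf_schema = generate_data_table_schema(
    config,
    'cloudwatch:events',
    schema_override={'detail=string'}  # force the 'detail' column to type string
)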
Example #3
    def test_load_enabled_sources_invalid_log_subtype(self, log_mock):
        """FirehoseClient - Load Enabled Log Sources, Invalid Log Sub-type"""
        logs_config = {'log_type_01:sub_type_01': {}}
        log_type = 'log_type_01:sub_type_02'
        firehose_config = {'enabled_logs': [log_type]}

        enabled_logs = FirehoseClient.load_enabled_log_sources(
            firehose_config, logs_config)
        assert_equal(enabled_logs, dict())
        log_mock.assert_called_with(
            'Enabled Firehose log %s not declared in logs.json', log_type)
Example #4
    def test_load_enabled_sources(self):
        """FirehoseClient - Load Enabled Log Sources"""
        logs_config = {
            'log_type_01:sub_type_01': {},
            'log_type_01:sub_type_02': {},  # This log type is not enabled
            'log_type_02:sub_type_01': {},
            'log_type_02:sub_type_02': {},
        }
        firehose_config = {
            'enabled_logs': [
                'log_type_01:sub_type_01',  # One log for log_type_01
                'log_type_02'  # All of log_type_02
            ]
        }
        expected_result = {
            'log_type_01_sub_type_01': 'log_type_01:sub_type_01',
            'log_type_02_sub_type_01': 'log_type_02:sub_type_01',
            'log_type_02_sub_type_02': 'log_type_02:sub_type_02'
        }

        enabled_logs = FirehoseClient.load_enabled_log_sources(
            firehose_config, logs_config)
        assert_equal(enabled_logs, expected_result)
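
The expected_result above implies what FirehoseClient.sanitized_value does to log names: special characters such as ':' become underscores. A minimal sketch of that behavior, which may differ from the real implementation:

import re

def sanitized_value_sketch(key):
    # Replace anything that is not alphanumeric or an underscore with '_',
    # so 'log_type_01:sub_type_01' becomes 'log_type_01_sub_type_01'
    return re.sub(r'[^a-zA-Z0-9_]', '_', key)

assert sanitized_value_sketch('log_type_01:sub_type_01') == 'log_type_01_sub_type_01'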
Example #5
def create_table(table, bucket, config, schema_override=None):
    """Create a 'streamalert' Athena table

    Args:
        table (str): The name of the table being rebuilt
        bucket (str): The S3 bucket to be used as the location for Athena data
        config (CLIConfig): Loaded StreamAlert config
        schema_override (set): An optional set of key=value pairs to be used for
            overriding the configured column_name=value_type.

    Returns:
        bool: False if errors occurred, True otherwise
    """
    enabled_logs = FirehoseClient.load_enabled_log_sources(
        config['global']['infrastructure']['firehose'], config['logs'])

    # Convert special characters in schema name to underscores
    sanitized_table_name = FirehoseClient.sanitized_value(table)

    # Check that the log type is enabled via Firehose
    if sanitized_table_name != 'alerts' and sanitized_table_name not in enabled_logs:
        LOGGER.error(
            'Table name %s missing from configuration or '
            'is not enabled.', sanitized_table_name)
        return False

    athena_client = get_athena_client(config)

    # Check if the table exists
    if athena_client.check_table_exists(sanitized_table_name):
        LOGGER.info('The \'%s\' table already exists.', sanitized_table_name)
        return True

    if table == 'alerts':
        # get a fake alert so we can get the keys needed and their types
        alert = Alert('temp_rule_name', {}, {})
        output = alert.output_dict()
        schema = record_to_schema(output)
        athena_schema = helpers.logs_schema_to_athena_schema(schema)

        # Use the bucket if supplied, otherwise use the default alerts bucket
        bucket = bucket or firehose_alerts_bucket(config)

        query = _construct_create_table_statement(
            schema=athena_schema,
            table_name=table,
            bucket=bucket,
            file_format=get_data_file_format(config))

    else:  # all other tables are log types

        config_data_bucket = firehose_data_bucket(config)
        if not config_data_bucket:
            LOGGER.warning(
                'The \'firehose\' module is not enabled in global.json')
            return False

        # Use the bucket if supplied, otherwise use the default data bucket
        bucket = bucket or config_data_bucket

        log_info = config['logs'][enabled_logs.get(sanitized_table_name)]

        schema = dict(log_info['schema'])
        sanitized_schema = FirehoseClient.sanitize_keys(schema)

        athena_schema = helpers.logs_schema_to_athena_schema(sanitized_schema)

        # Add envelope keys to Athena Schema
        configuration_options = log_info.get('configuration')
        if configuration_options:
            envelope_keys = configuration_options.get('envelope_keys')
            if envelope_keys:
                sanitized_envelope_key_schema = FirehoseClient.sanitize_keys(
                    envelope_keys)
                # Note: this key is wrapped in backticks to be Hive compliant
                athena_schema[
                    '`streamalert:envelope_keys`'] = helpers.logs_schema_to_athena_schema(
                        sanitized_envelope_key_schema)

        # Handle Schema overrides
        #   This is useful when an Athena schema needs to differ from the normal log schema
        if schema_override:
            for override in schema_override:
                column_name, column_type = override.split('=')
                # Columns are escaped to avoid Hive issues with special characters
                column_name = '`{}`'.format(column_name)
                if column_name in athena_schema:
                    athena_schema[column_name] = column_type
                    LOGGER.info('Applied schema override: %s:%s', column_name,
                                column_type)
                else:
                    LOGGER.error(
                        'Schema override column %s not found in Athena Schema, skipping',
                        column_name)

        query = _construct_create_table_statement(
            schema=athena_schema,
            table_name=sanitized_table_name,
            bucket=bucket,
            file_format=get_data_file_format(config))

    success = athena_client.run_query(query=query)
    if not success:
        LOGGER.error('The %s table could not be created', sanitized_table_name)
        return False

    # Update the CLI config
    if table != 'alerts' and bucket != config_data_bucket:
        # Only add buckets to the config if they are not one of the default/configured buckets
        # Ensure 'buckets' exists in the config (since it is not required)
        config['lambda']['athena_partitioner_config']['buckets'] = (
            config['lambda']['athena_partitioner_config'].get('buckets', {}))
        if bucket not in config['lambda']['athena_partitioner_config'][
                'buckets']:
            config['lambda']['athena_partitioner_config']['buckets'][
                bucket] = 'data'
            config.write()

    LOGGER.info('The %s table was successfully created!', sanitized_table_name)

    return True
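
A minimal usage sketch for create_table; the table name is hypothetical, config is assumed to be a loaded CLIConfig, and passing None for the bucket falls back to the configured default per the logic above:

# Hypothetical call; 'cloudwatch:events' must be enabled via Firehose
if not create_table('cloudwatch:events', None, config):
    LOGGER.error('Table creation failed')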
Example #6
def generate_firehose(logging_bucket, main_dict, config):
    """Generate the Firehose Terraform modules

    Args:
        logging_bucket (str): The name of the global logging bucket
        main_dict (infinitedict): The dict to marshal to a file
        config (CLIConfig): The loaded StreamAlert config
    """
    if not config['global']['infrastructure'].get('firehose', {}).get('enabled'):
        return

    prefix = config['global']['account']['prefix']

    # This can return False, but the check above ensures that should never happen
    firehose_s3_bucket_name = firehose_data_bucket(config)

    firehose_conf = config['global']['infrastructure']['firehose']

    # Firehose Setup module
    main_dict['module']['kinesis_firehose_setup'] = {
        'source': './modules/tf_kinesis_firehose_setup',
        'account_id': config['global']['account']['aws_account_id'],
        'prefix': prefix,
        'region': config['global']['account']['region'],
        's3_logging_bucket': logging_bucket,
        's3_bucket_name': firehose_s3_bucket_name,
        'kms_key_id': '${aws_kms_key.server_side_encryption.key_id}'
    }

    enabled_logs = FirehoseClient.load_enabled_log_sources(
        firehose_conf,
        config['logs'],
        force_load=True
    )

    log_alarms_config = firehose_conf.get('enabled_logs', {})

    db_name = get_database_name(config)

    firehose_prefix = prefix if firehose_conf.get('use_prefix', True) else ''

    # Add the Delivery Streams individually
    for log_stream_name, log_type_name in enabled_logs.items():
        module_dict = {
            'source': './modules/tf_kinesis_firehose_delivery_stream',
            'buffer_size': firehose_conf.get('buffer_size'),
            'buffer_interval': firehose_conf.get('buffer_interval', 300),
            'file_format': get_data_file_format(config),
            'stream_name': FirehoseClient.generate_firehose_name(firehose_prefix, log_stream_name),
            'role_arn': '${module.kinesis_firehose_setup.firehose_role_arn}',
            's3_bucket_name': firehose_s3_bucket_name,
            'kms_key_arn': '${aws_kms_key.server_side_encryption.arn}',
            'glue_catalog_db_name': db_name,
            'glue_catalog_table_name': log_stream_name,
            'schema': generate_data_table_schema(config, log_type_name)
        }

        # Try to get alarm info for this specific log type
        alarm_info = log_alarms_config.get(log_type_name)
        if not alarm_info and ':' in log_type_name:
            # Fallback on looking for alarm info for the parent log type
            alarm_info = log_alarms_config.get(log_type_name.split(':')[0])

        if alarm_info and alarm_info.get('enable_alarm'):
            module_dict['enable_alarm'] = True

            # Defaults for these are defined in the Terraform module, so only
            # set the variables when explicitly specified
            if alarm_info.get('log_min_count_threshold'):
                module_dict['alarm_threshold'] = alarm_info.get('log_min_count_threshold')

            if alarm_info.get('evaluation_periods'):
                module_dict['evaluation_periods'] = alarm_info.get('evaluation_periods')

            if alarm_info.get('period_seconds'):
                module_dict['period_seconds'] = alarm_info.get('period_seconds')

            if alarm_info.get('alarm_actions'):
                if not isinstance(alarm_info.get('alarm_actions'), list):
                    module_dict['alarm_actions'] = [alarm_info.get('alarm_actions')]
                else:
                    module_dict['alarm_actions'] = alarm_info.get('alarm_actions')
            else:
                module_dict['alarm_actions'] = [monitoring_topic_arn(config)]

        main_dict['module']['kinesis_firehose_{}'.format(log_stream_name)] = module_dict
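
For reference, an illustrative shape of the 'firehose' configuration block this function consumes; every key below is read somewhere in generate_firehose, but the values are examples only:

# Illustrative firehose_conf; keys mirror the reads above, values are examples
firehose_conf_example = {
    'use_prefix': True,
    'buffer_size': 128,
    'buffer_interval': 300,
    'enabled_logs': {
        'cloudwatch': {                      # parent type; also matches 'cloudwatch:*' logs
            'enable_alarm': True,
            'log_min_count_threshold': 100,  # becomes the module's 'alarm_threshold'
            'evaluation_periods': 2,
            'period_seconds': 3600,
            'alarm_actions': 'arn:aws:sns:us-east-1:123456789012:monitoring',
        }
    }
}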