def __init__(self):
    config = load_config(include={'lambda.json', 'global.json'})
    prefix = config['global']['account']['prefix']
    athena_config = config['lambda']['athena_partitioner_config']

    self._file_format = get_data_file_format(config)

    if self._file_format == 'parquet':
        self._alerts_regex = self.ALERTS_REGEX_PARQUET
        self._data_regex = self.DATA_REGEX_PARQUET
    elif self._file_format == 'json':
        self._alerts_regex = self.ALERTS_REGEX
        self._data_regex = self.DATA_REGEX
    else:
        message = (
            'file format "{}" is not supported. Supported file formats are '
            '"parquet" and "json". Please update the setting in athena_partitioner_config '
            'in "conf/lambda.json"'.format(self._file_format)
        )
        raise ConfigError(message)

    self._athena_buckets = athena_partition_buckets(config)

    db_name = get_database_name(config)

    # Get the S3 bucket to store Athena query results
    results_bucket = athena_config.get(
        'results_bucket',
        's3://{}-streamalert-athena-results'.format(prefix)
    )

    self._s3_buckets_and_keys = defaultdict(set)

    self._create_client(db_name, results_bucket)
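# A minimal sketch of the conf/lambda.json block this constructor reads,
# assuming the default results-bucket naming convention; the prefix-derived
# bucket name below is hypothetical and 'results_bucket' is optional:
example_lambda_config = {
    'athena_partitioner_config': {
        'file_format': 'parquet',  # must be "parquet" or "json"
        'results_bucket': 's3://acme-streamalert-athena-results'  # optional override
    }
}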
def handler(cls, options, config):
    """Run Terraform with an optional set of targets and clusters

    Args:
        options (argparse.Namespace): Parsed arguments from manage.py
        config (CLIConfig): Loaded StreamAlert config

    Returns:
        bool: False if errors occurred, True otherwise
    """
    if not terraform_generate_handler(config=config):
        return False

    # Create the log tables only when file_format is set to "json", and return
    # early if log table creation fails. This capability will be phased out in
    # a future release.
    if get_data_file_format(config) == 'json' and not create_log_tables(config=config):
        return False

    target_modules, valid = _get_valid_tf_targets(config, options.target)
    if not valid:
        return False

    return terraform_runner(config, targets=target_modules if target_modules else None)
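# Hypothetical CLI usage for this handler; the module target name below is
# illustrative, and --target may be omitted to build everything:
#
#   python manage.py build --target kinesis_firehose_setup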
def _generate_global_module(config):
    # 2019-08-22 (Ryxias)
    # In version 3.0.0+, StreamAlert will default to always using the prefix when "use_prefix"
    # is not present.
    #
    # Refer to this PR for more information: https://github.com/airbnb/streamalert/pull/979
    use_prefix = config['global']['infrastructure'].get('classifier_sqs', {}).get(
        'use_prefix', True)

    global_module = {
        'source': './modules/tf_globals',
        'account_id': config['global']['account']['aws_account_id'],
        'region': config['global']['account']['region'],
        'prefix': config['global']['account']['prefix'],
        'kms_key_arn': '${aws_kms_key.server_side_encryption.arn}',
        'rules_engine_timeout': config['lambda']['rules_engine_config']['timeout'],
        'sqs_use_prefix': use_prefix,
        'alerts_db_name': get_database_name(config),
        'alerts_file_format': get_data_file_format(config),
        'alerts_schema': generate_alerts_table_schema()
    }

    # The below code applies settings for resources only if the settings are explicitly
    # defined. This is because these resources have defaults defined in the
    # ./modules/tf_globals module. This allows for overriding these settings, but
    # avoids storing defaults in multiple locations.
    if 'alerts_table' in config['global']['infrastructure']:
        for setting in {'read_capacity', 'write_capacity'}:
            value = config['global']['infrastructure']['alerts_table'].get(setting)
            if value:
                global_module['alerts_table_{}'.format(setting)] = value

    alert_fh_settings_with_defaults = {
        'bucket_name',
        'buffer_size',
        'buffer_interval',
        'cloudwatch_log_retention'
    }

    if 'alerts_firehose' in config['global']['infrastructure']:
        for setting in alert_fh_settings_with_defaults:
            value = config['global']['infrastructure']['alerts_firehose'].get(setting)
            if not value:
                continue

            global_module['alerts_firehose_{}'.format(setting)] = value

    if 'rule_staging' in config['global']['infrastructure']:
        if config['global']['infrastructure']['rule_staging'].get('enabled'):
            global_module['enable_rule_staging'] = True
            for setting in {'table_read_capacity', 'table_write_capacity'}:
                value = config['global']['infrastructure']['rule_staging'].get(setting)
                if value:
                    # Defaults are set for this in the terraform module, so only
                    # pass the value through when it is explicitly configured
                    global_module['rules_{}'.format(setting)] = value

    return global_module
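# A sketch of the module block this function emits for ./modules/tf_globals,
# assuming rule staging is enabled and no capacity overrides; the account
# values are hypothetical and 'alerts_schema' is omitted for brevity:
example_global_module = {
    'source': './modules/tf_globals',
    'account_id': '123456789012',
    'region': 'us-east-1',
    'prefix': 'acme',
    'kms_key_arn': '${aws_kms_key.server_side_encryption.arn}',
    'rules_engine_timeout': 300,
    'sqs_use_prefix': True,
    'alerts_db_name': 'acme_streamalert',
    'alerts_file_format': 'parquet',
    'enable_rule_staging': True
}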
def generate_global_lambda_settings(config, conf_name, generate_func, tf_tmp_file_name,
                                    required=True):
    """Generate settings for global Lambda functions

    Args:
        config (dict): Lambda function settings read from the 'conf/' directory
        conf_name (str): Key name of the Lambda function settings in the config
        generate_func (func): Method that generates the Lambda function settings
        tf_tmp_file_name (str): Filename of the Terraform file generated by the CLI
        required (bool): Whether the configuration must be present in the config
    """
    if conf_name == 'athena_partitioner_config':
        # Raise a ConfigError when the user hasn't explicitly set `file_format`
        # in `athena_partitioner_config` in conf/lambda.json when upgrading to v3.1.0
        file_format = get_data_file_format(config)

        if not file_format or file_format not in ('parquet', 'json'):
            message = (
                '"file_format" must be explicitly set for '
                'athena_partitioner_config in "conf/lambda.json" when upgrading to v3.1.0. '
                'Available values are "parquet" and "json". For more information, refer to '
                'https://github.com/airbnb/streamalert/issues/1143. '
                'In a future release, the default value of "file_format" will '
                'be changed to "parquet".'
            )
            raise ConfigError(message)

    tf_tmp_file = os.path.join(config.build_directory, '{}.tf.json'.format(tf_tmp_file_name))

    if required and conf_name not in config['lambda']:
        message = 'Required configuration missing in lambda.json: {}'.format(conf_name)
        raise ConfigError(message)

    if not config['lambda'].get(conf_name):
        LOGGER.warning('Optional configuration missing in lambda.json, skipping: %s', conf_name)
        remove_temp_terraform_file(tf_tmp_file)
        return

    if config['lambda'][conf_name].get('enabled', True):
        generated_config = generate_func(config=config)
        if generated_config:
            _create_terraform_module_file(generated_config, tf_tmp_file)
    else:
        remove_temp_terraform_file(tf_tmp_file)
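# A hypothetical call for the Athena partitioner settings; generate_athena and
# the 'athena' temp-file name are assumptions made for illustration, not
# confirmed call sites:
generate_global_lambda_settings(
    config,
    conf_name='athena_partitioner_config',
    generate_func=generate_athena,  # hypothetical generator function
    tf_tmp_file_name='athena'
)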
def handler(cls, options, config):
    """Initialize infrastructure using Terraform

    Args:
        config (CLIConfig): Loaded StreamAlert config

    Returns:
        bool: False if errors occurred, True otherwise
    """
    LOGGER.info('Initializing StreamAlert')

    # generate init Terraform files
    if not terraform_generate_handler(config=config, init=True):
        return False

    LOGGER.info('Initializing Terraform')
    if not run_command(['terraform', 'init'], cwd=config.build_directory):
        return False

    # build init infrastructure
    LOGGER.info('Building initial infrastructure')
    init_targets = [
        'aws_s3_bucket.lambda_source',
        'aws_s3_bucket.logging_bucket',
        'aws_s3_bucket.streamalert_secrets',
        'aws_s3_bucket.terraform_remote_state',
        'aws_s3_bucket.streamalerts',
        'aws_kms_key.server_side_encryption',
        'aws_kms_alias.server_side_encryption',
        'aws_kms_key.streamalert_secrets',
        'aws_kms_alias.streamalert_secrets',
        'module.streamalert_athena',  # required for the alerts table
        'aws_dynamodb_table.terraform_remote_state_lock'
    ]

    # This bucket must exist before the log tables can be created, but
    # shouldn't be created unless the firehose is enabled
    if config['global']['infrastructure'].get('firehose', {}).get('enabled'):
        init_targets.append('aws_s3_bucket.streamalert_data')

    if not terraform_runner(config, targets=init_targets):
        LOGGER.error('An error occurred while running StreamAlert init')
        return False

    # generate the main.tf with remote state enabled
    LOGGER.info('Configuring Terraform Remote State')
    if not terraform_generate_handler(config=config, check_tf=False, check_creds=False):
        return False

    if not run_command(['terraform', 'init'], cwd=config.build_directory):
        return False

    LOGGER.info('Deploying Lambda Functions')

    functions = ['rule', 'alert', 'alert_merger', 'athena', 'classifier']
    deploy(config, functions)

    # We need to manually create the streamalerts table since terraform does not support this
    # See: https://github.com/terraform-providers/terraform-provider-aws/issues/1486
    if get_data_file_format(config) == 'json':
        # Terraform v0.12 now supports creating Athena tables. We use the
        # terraform aws_glue_catalog_table resource to create tables only when
        # the data file_format is set to "parquet" in "athena_partitioner_config".
        #
        # For the "json" file_format, we continue to use Athena DDL queries to
        # create tables. However, this capability will be phased out in a future
        # release because we want users to take advantage of parquet performance.
        alerts_bucket = firehose_alerts_bucket(config)
        create_table('alerts', alerts_bucket, config)

        # Create the glue catalog tables for the enabled logs
        if not create_log_tables(config=config):
            return False

    LOGGER.info('Building remaining infrastructure')
    return terraform_runner(config, refresh=False)
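# A minimal sketch of the conf/global.json fragment read by the firehose check
# above; only infrastructure.firehose.enabled controls whether the data bucket
# is added to the init targets (the value shown is illustrative):
example_global_config = {
    'infrastructure': {
        'firehose': {
            'enabled': True
        }
    }
}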
def create_table(table, bucket, config, schema_override=None):
    """Create a 'streamalert' Athena table

    Args:
        table (str): The name of the table being rebuilt
        bucket (str): The s3 bucket to be used as the location for Athena data
        config (CLIConfig): Loaded StreamAlert config
        schema_override (set): An optional set of key=value pairs to be used for
            overriding the configured column_name=value_type

    Returns:
        bool: False if errors occurred, True otherwise
    """
    enabled_logs = FirehoseClient.load_enabled_log_sources(
        config['global']['infrastructure']['firehose'],
        config['logs']
    )

    # Convert special characters in the table name to underscores
    sanitized_table_name = FirehoseClient.sanitized_value(table)

    # Check that the log type is enabled via Firehose
    if sanitized_table_name != 'alerts' and sanitized_table_name not in enabled_logs:
        LOGGER.error(
            'Table name %s missing from configuration or is not enabled.',
            sanitized_table_name
        )
        return False

    athena_client = get_athena_client(config)

    # Check if the table exists
    if athena_client.check_table_exists(sanitized_table_name):
        LOGGER.info('The \'%s\' table already exists.', sanitized_table_name)
        return True

    if table == 'alerts':
        # Get a fake alert so we can get the keys needed and their types
        alert = Alert('temp_rule_name', {}, {})
        output = alert.output_dict()
        schema = record_to_schema(output)
        athena_schema = helpers.logs_schema_to_athena_schema(schema)

        # Use the bucket if supplied, otherwise use the default alerts bucket
        bucket = bucket or firehose_alerts_bucket(config)

        query = _construct_create_table_statement(
            schema=athena_schema,
            table_name=table,
            bucket=bucket,
            file_format=get_data_file_format(config)
        )

    else:  # all other tables are log types
        config_data_bucket = firehose_data_bucket(config)
        if not config_data_bucket:
            LOGGER.warning('The \'firehose\' module is not enabled in global.json')
            return False

        # Use the bucket if supplied, otherwise use the default data bucket
        bucket = bucket or config_data_bucket

        log_info = config['logs'][enabled_logs.get(sanitized_table_name)]

        schema = dict(log_info['schema'])
        sanitized_schema = FirehoseClient.sanitize_keys(schema)

        athena_schema = helpers.logs_schema_to_athena_schema(sanitized_schema)

        # Add envelope keys to the Athena schema
        configuration_options = log_info.get('configuration')
        if configuration_options:
            envelope_keys = configuration_options.get('envelope_keys')
            if envelope_keys:
                sanitized_envelope_key_schema = FirehoseClient.sanitize_keys(envelope_keys)
                # Note: this key is wrapped in backticks to be Hive compliant
                athena_schema['`streamalert:envelope_keys`'] = (
                    helpers.logs_schema_to_athena_schema(sanitized_envelope_key_schema))

        # Handle schema overrides
        # This is useful when an Athena schema needs to differ from the normal log schema
        if schema_override:
            for override in schema_override:
                column_name, column_type = override.split('=')
                # Columns are escaped to avoid Hive issues with special characters
                column_name = '`{}`'.format(column_name)
                if column_name in athena_schema:
                    athena_schema[column_name] = column_type
                    LOGGER.info('Applied schema override: %s:%s', column_name, column_type)
                else:
                    LOGGER.error(
                        'Schema override column %s not found in Athena schema, skipping',
                        column_name
                    )

        query = _construct_create_table_statement(
            schema=athena_schema,
            table_name=sanitized_table_name,
            bucket=bucket,
            file_format=get_data_file_format(config)
        )

    success = athena_client.run_query(query=query)
    if not success:
        LOGGER.error('The %s table could not be created', sanitized_table_name)
        return False

    # Update the CLI config
    if table != 'alerts' and bucket != config_data_bucket:
        # Only add buckets to the config if they are not one of the default/configured buckets
        # Ensure 'buckets' exists in the config (since it is not required)
        config['lambda']['athena_partitioner_config']['buckets'] = (
            config['lambda']['athena_partitioner_config'].get('buckets', {}))
        if bucket not in config['lambda']['athena_partitioner_config']['buckets']:
            config['lambda']['athena_partitioner_config']['buckets'][bucket] = 'data'
        config.write()

    LOGGER.info('The %s table was successfully created!', sanitized_table_name)

    return True
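# A hypothetical invocation showing the schema_override format parsed above:
# each entry is 'column_name=athena_type' and the column must already exist in
# the generated schema; the table, bucket, and column names are illustrative:
create_table(
    'cloudwatch:events',
    's3://acme-streamalert-data',
    config,
    schema_override={'detail=string', 'account=string'}
)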
def generate_firehose(logging_bucket, main_dict, config):
    """Generate the Firehose Terraform modules

    Args:
        logging_bucket (str): The name of the global logging bucket
        main_dict (infinitedict): The dict to marshal to a file
        config (CLIConfig): The loaded StreamAlert config
    """
    if not config['global']['infrastructure'].get('firehose', {}).get('enabled'):
        return

    prefix = config['global']['account']['prefix']

    # This can return False, but the check above ensures that should never happen
    firehose_s3_bucket_name = firehose_data_bucket(config)

    firehose_conf = config['global']['infrastructure']['firehose']

    # Firehose Setup module
    main_dict['module']['kinesis_firehose_setup'] = {
        'source': './modules/tf_kinesis_firehose_setup',
        'account_id': config['global']['account']['aws_account_id'],
        'prefix': prefix,
        'region': config['global']['account']['region'],
        's3_logging_bucket': logging_bucket,
        's3_bucket_name': firehose_s3_bucket_name,
        'kms_key_id': '${aws_kms_key.server_side_encryption.key_id}'
    }

    enabled_logs = FirehoseClient.load_enabled_log_sources(
        firehose_conf,
        config['logs'],
        force_load=True
    )

    log_alarms_config = firehose_conf.get('enabled_logs', {})

    db_name = get_database_name(config)

    firehose_prefix = prefix if firehose_conf.get('use_prefix', True) else ''

    # Add the Delivery Streams individually
    for log_stream_name, log_type_name in enabled_logs.items():
        module_dict = {
            'source': './modules/tf_kinesis_firehose_delivery_stream',
            'buffer_size': firehose_conf.get('buffer_size'),
            'buffer_interval': firehose_conf.get('buffer_interval', 300),
            'file_format': get_data_file_format(config),
            'stream_name': FirehoseClient.generate_firehose_name(
                firehose_prefix, log_stream_name),
            'role_arn': '${module.kinesis_firehose_setup.firehose_role_arn}',
            's3_bucket_name': firehose_s3_bucket_name,
            'kms_key_arn': '${aws_kms_key.server_side_encryption.arn}',
            'glue_catalog_db_name': db_name,
            'glue_catalog_table_name': log_stream_name,
            'schema': generate_data_table_schema(config, log_type_name)
        }

        # Try to get alarm info for this specific log type
        alarm_info = log_alarms_config.get(log_type_name)
        if not alarm_info and ':' in log_type_name:
            # Fall back on looking for alarm info for the parent log type
            alarm_info = log_alarms_config.get(log_type_name.split(':')[0])

        if alarm_info and alarm_info.get('enable_alarm'):
            module_dict['enable_alarm'] = True

            # There are defaults for these defined in the terraform module, so do
            # not set the variable values unless explicitly specified
            if alarm_info.get('log_min_count_threshold'):
                module_dict['alarm_threshold'] = alarm_info.get('log_min_count_threshold')

            if alarm_info.get('evaluation_periods'):
                module_dict['evaluation_periods'] = alarm_info.get('evaluation_periods')

            if alarm_info.get('period_seconds'):
                module_dict['period_seconds'] = alarm_info.get('period_seconds')

            if alarm_info.get('alarm_actions'):
                if not isinstance(alarm_info.get('alarm_actions'), list):
                    module_dict['alarm_actions'] = [alarm_info.get('alarm_actions')]
                else:
                    module_dict['alarm_actions'] = alarm_info.get('alarm_actions')
            else:
                module_dict['alarm_actions'] = [monitoring_topic_arn(config)]

        main_dict['module']['kinesis_firehose_{}'.format(log_stream_name)] = module_dict
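# A hedged sketch of the "firehose" block in conf/global.json that drives this
# generator; the log name, threshold, and SNS topic ARN are hypothetical:
example_firehose_conf = {
    'enabled': True,
    'use_prefix': True,
    'buffer_size': 64,
    'buffer_interval': 300,
    'enabled_logs': {
        'cloudwatch': {
            'enable_alarm': True,
            'log_min_count_threshold': 1000,  # optional; module default if omitted
            'alarm_actions': 'arn:aws:sns:us-east-1:123456789012:monitoring'
        }
    }
}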