Exemple #1
0
def rebuild_partitions(table, bucket, config):
    """Rebuild an Athena table's partitions

    Steps:
      - Get the list of current partitions
      - Destroy existing table
      - Re-create tables
      - Re-create partitions

    Args:
        table (str): The name of the table being rebuilt
        bucket (str): The s3 bucket to be used as the location for Athena data
        config (CLIConfig): Loaded StreamAlert config

    Returns:
        bool: False if errors occurred or if the table has no partitions to
            rebuild, True otherwise
    """
    sanitized_table_name = FirehoseClient.sanitized_value(table)

    athena_client = get_athena_client(config)

    # Get the current set of partitions. This must happen before the table is
    # dropped, since the partitions are what gets re-created afterwards.
    partitions = athena_client.get_table_partitions(sanitized_table_name)
    if not partitions:
        LOGGER.info('No partitions to rebuild for %s, nothing to do',
                    sanitized_table_name)
        return False

    # Drop the table
    LOGGER.info('Dropping table %s', sanitized_table_name)
    if not athena_client.drop_table(sanitized_table_name):
        return False

    LOGGER.info('Creating table %s', sanitized_table_name)

    # Re-create the table with previous partitions
    if not create_table(table, bucket, config):
        return False

    new_partitions_statements = helpers.add_partition_statements(
        partitions, bucket, sanitized_table_name)

    LOGGER.info('Creating total %d new partitions for %s', len(partitions),
                sanitized_table_name)

    for idx, statement in enumerate(new_partitions_statements):
        if not athena_client.run_query(query=statement):
            LOGGER.error('Error re-creating new partitions for %s',
                         sanitized_table_name)
            # Hand every statement to write_partitions_statements before bailing out
            # NOTE(review): presumably this saves them so they can be re-run
            # manually; confirm against that helper
            write_partitions_statements(new_partitions_statements,
                                        sanitized_table_name)
            return False

        # Only report a part as rebuilt once its query has actually succeeded
        LOGGER.info('Rebuilt partitions part %d', idx + 1)

    LOGGER.info('Successfully rebuilt all partitions for %s',
                sanitized_table_name)
    return True
Exemple #2
0
def generate_data_table_schema(config, table, schema_override=None):
    """Generate the schema for data table in terraform

    Args:
        config (CLIConfig): Loaded StreamAlert config
        table (string): The name of data table
        schema_override (iterable of str): Optional 'column_name=column_type'
            pairs used to override the type of columns that already exist in
            the generated schema. Malformed pairs and unknown columns are
            logged and skipped.

    Returns:
        The Athena schema for the table as formatted by format_schema_tf, or
        None if the table is missing from the configuration or is not enabled
    """
    enabled_logs = FirehoseClient.load_enabled_log_sources(
        config['global']['infrastructure']['firehose'], config['logs'])

    # Convert special characters in schema name to underscores
    sanitized_table_name = FirehoseClient.sanitized_value(table)

    # Check that the log type is enabled via Firehose
    if sanitized_table_name not in enabled_logs:
        LOGGER.error(
            'Table name %s missing from configuration or '
            'is not enabled.', sanitized_table_name)
        return None

    log_info = config['logs'][enabled_logs.get(sanitized_table_name)]

    schema = dict(log_info['schema'])
    sanitized_schema = FirehoseClient.sanitize_keys(schema)

    athena_schema = logs_schema_to_athena_schema(sanitized_schema, False)

    # Add envelope keys to Athena Schema
    configuration_options = log_info.get('configuration')
    if configuration_options:
        envelope_keys = configuration_options.get('envelope_keys')
        if envelope_keys:
            sanitized_envelope_key_schema = FirehoseClient.sanitize_keys(
                envelope_keys)
            # Note: the key is stored as-is here (no backtick wrapping)
            athena_schema[
                'streamalert:envelope_keys'] = logs_schema_to_athena_schema(
                    sanitized_envelope_key_schema, False)

    # Handle Schema overrides
    #   This is useful when an Athena schema needs to differ from the normal log schema
    for override in schema_override or ():
        parts = override.split('=')
        if len(parts) != 2:
            # Skip malformed entries instead of crashing on tuple unpacking
            LOGGER.error(
                'Schema override %s is not in column_name=column_type format, skipping',
                override)
            continue

        # Column names are looked up unescaped, matching the keys built above
        column_name, column_type = parts
        if column_name in athena_schema:
            athena_schema[column_name] = column_type
            LOGGER.info('Applied schema override: %s:%s', column_name,
                        column_type)
        else:
            LOGGER.error(
                'Schema override column %s not found in Athena Schema, skipping',
                column_name)

    return format_schema_tf(athena_schema)
Exemple #3
0
 def test_sanitized_value(self):
     """FirehoseClient - Sanitized Value"""
     # Each of '*', '.' and '-' is expected to come back as an underscore
     raw_name = 'test*log.type-name'
     assert_equal(FirehoseClient.sanitized_value(raw_name), 'test_log_type_name')
Exemple #4
0
def create_table(table, bucket, config, schema_override=None):
    """Create a 'streamalert' Athena table

    Args:
        table (str): The name of the table being created
        bucket (str): The s3 bucket to be used as the location for Athena data.
            If falsy, the default alerts bucket (for the 'alerts' table) or the
            default data bucket (for all other tables) from the config is used
        config (CLIConfig): Loaded StreamAlert config
        schema_override (set): An optional set of key=value pairs to be used for
            overriding the configured column_name=value_type. Malformed pairs
            and unknown columns are logged and skipped.

    Returns:
        bool: False if errors occurred, True otherwise (including when the
            table already exists)
    """
    enabled_logs = FirehoseClient.load_enabled_log_sources(
        config['global']['infrastructure']['firehose'], config['logs'])

    # Convert special characters in schema name to underscores
    sanitized_table_name = FirehoseClient.sanitized_value(table)

    # Check that the log type is enabled via Firehose
    if sanitized_table_name != 'alerts' and sanitized_table_name not in enabled_logs:
        LOGGER.error(
            'Table name %s missing from configuration or '
            'is not enabled.', sanitized_table_name)
        return False

    athena_client = get_athena_client(config)

    # Check if the table exists
    if athena_client.check_table_exists(sanitized_table_name):
        LOGGER.info('The \'%s\' table already exists.', sanitized_table_name)
        return True

    # Only populated for log (non-alerts) tables. It is bound up front so the
    # config update at the end of this function never reads an unbound name.
    config_data_bucket = None

    if table == 'alerts':
        # get a fake alert so we can get the keys needed and their types
        alert = Alert('temp_rule_name', {}, {})
        output = alert.output_dict()
        schema = record_to_schema(output)
        athena_schema = helpers.logs_schema_to_athena_schema(schema)

        # Use the bucket if supplied, otherwise use the default alerts bucket
        bucket = bucket or firehose_alerts_bucket(config)

        query = _construct_create_table_statement(
            schema=athena_schema,
            table_name=table,
            bucket=bucket,
            file_format=get_data_file_format(config))

    else:  # all other tables are log types

        config_data_bucket = firehose_data_bucket(config)
        if not config_data_bucket:
            LOGGER.warning(
                'The \'firehose\' module is not enabled in global.json')
            return False

        # Use the bucket if supplied, otherwise use the default data bucket
        bucket = bucket or config_data_bucket

        log_info = config['logs'][enabled_logs.get(sanitized_table_name)]

        schema = dict(log_info['schema'])
        sanitized_schema = FirehoseClient.sanitize_keys(schema)

        athena_schema = helpers.logs_schema_to_athena_schema(sanitized_schema)

        # Add envelope keys to Athena Schema
        configuration_options = log_info.get('configuration')
        if configuration_options:
            envelope_keys = configuration_options.get('envelope_keys')
            if envelope_keys:
                sanitized_envelope_key_schema = FirehoseClient.sanitize_keys(
                    envelope_keys)
                # Note: this key is wrapped in backticks to be Hive compliant
                athena_schema[
                    '`streamalert:envelope_keys`'] = helpers.logs_schema_to_athena_schema(
                        sanitized_envelope_key_schema)

        # Handle Schema overrides
        #   This is useful when an Athena schema needs to differ from the normal log schema
        for override in schema_override or ():
            parts = override.split('=')
            if len(parts) != 2:
                # Skip malformed entries instead of crashing on tuple unpacking
                LOGGER.error(
                    'Schema override %s is not in column_name=column_type format, skipping',
                    override)
                continue

            column_name, column_type = parts
            # Columns are escaped to avoid Hive issues with special characters
            column_name = '`{}`'.format(column_name)
            if column_name in athena_schema:
                athena_schema[column_name] = column_type
                LOGGER.info('Applied schema override: %s:%s', column_name,
                            column_type)
            else:
                LOGGER.error(
                    'Schema override column %s not found in Athena Schema, skipping',
                    column_name)

        query = _construct_create_table_statement(
            schema=athena_schema,
            table_name=sanitized_table_name,
            bucket=bucket,
            file_format=get_data_file_format(config))

    success = athena_client.run_query(query=query)
    if not success:
        LOGGER.error('The %s table could not be created', sanitized_table_name)
        return False

    # Update the CLI config
    if table != 'alerts' and bucket != config_data_bucket:
        # Only add buckets to the config if they are not one of the default/configured buckets
        partitioner_config = config['lambda']['athena_partitioner_config']
        # Ensure 'buckets' exists in the config (since it is not required)
        partitioner_config['buckets'] = partitioner_config.get('buckets', {})
        if bucket not in partitioner_config['buckets']:
            partitioner_config['buckets'][bucket] = 'data'
            config.write()

    LOGGER.info('The %s table was successfully created!', sanitized_table_name)

    return True