Example #1
0
def rebuild_partitions(athena_client, options, config):
    """Rebuild an Athena table's partitions

    Steps:
      - Get the list of current partitions
      - Destroy existing table
      - Re-create tables
      - Re-create partitions

    Only tables of type 'data' are handled; any other type is logged and
    skipped. Each failure is logged and ends the rebuild early. The function
    has no return value.

    Args:
        athena_client (boto3.client): Instantiated CLI AthenaClient
        options (namedtuple): The parsed args passed from the CLI. The
            table_name, bucket and type attributes are read, and refresh_type
            is overwritten before the table is re-created.
        config (CLIConfig): Loaded StreamAlert CLI
    """
    if not options.table_name:
        LOGGER_CLI.error('Missing command line argument --table_name')
        return

    if not options.bucket:
        LOGGER_CLI.error('Missing command line argument --bucket')
        return

    sa_firehose = StreamAlertFirehose(
        config['global']['account']['region'],
        config['global']['infrastructure']['firehose'], config['logs'])
    sanitized_table_name = sa_firehose.firehose_log_name(options.table_name)

    if options.type != 'data':
        LOGGER_CLI.info('Refreshing alerts tables unsupported')
        return

    # Get the current set of partitions
    partition_success, partitions = athena_client.run_athena_query(
        query='SHOW PARTITIONS {}'.format(sanitized_table_name),
        database='streamalert')
    if not partition_success:
        LOGGER_CLI.error('An error occurred when loading partitions for %s',
                         sanitized_table_name)
        return

    unique_partitions = athena_helpers.unique_values_from_query(partitions)

    # Without partitions there is nothing to restore, so leave the table alone
    # rather than dropping it and issuing an empty partition statement
    if not unique_partitions:
        LOGGER_CLI.info('No partitions to rebuild for %s, nothing to do',
                        sanitized_table_name)
        return

    # Drop the table
    LOGGER_CLI.info('Dropping table %s', sanitized_table_name)
    drop_success, _ = athena_client.run_athena_query(
        query='DROP TABLE {}'.format(sanitized_table_name),
        database='streamalert')
    if not drop_success:
        LOGGER_CLI.error('An error occurred when dropping the %s table',
                         sanitized_table_name)
        return

    LOGGER_CLI.info('Dropped table %s', sanitized_table_name)

    # Re-create the table right after the drop. This happens before the
    # statement size check below so that an oversized partition statement
    # cannot leave the table missing.
    options.refresh_type = 'add_hive_partition'
    create_table(athena_client, options, config)

    new_partitions_statement = athena_helpers.partition_statement(
        unique_partitions, options.bucket, sanitized_table_name)

    # Make sure our new alter table statement is within the query API limits
    if len(new_partitions_statement) > MAX_QUERY_LENGTH:
        LOGGER_CLI.error(
            'Partition statement too large, writing to local file')
        with open('partitions_{}.txt'.format(sanitized_table_name),
                  'w') as partition_file:
            partition_file.write(new_partitions_statement)
        return

    LOGGER_CLI.info('Creating %d new partitions for %s',
                    len(unique_partitions), sanitized_table_name)
    new_part_success, _ = athena_client.run_athena_query(
        query=new_partitions_statement, database='streamalert')
    if not new_part_success:
        LOGGER_CLI.error('Error re-creating new partitions for %s',
                         sanitized_table_name)
        return

    LOGGER_CLI.info('Successfully rebuilt partitions for %s',
                    sanitized_table_name)
Example #2
0
def create_table(athena_client, options, config):
    """Create a 'streamalert' Athena table

    For 'data' tables the columns are derived from the log's schema in
    config['logs'], extended with any configured envelope keys and adjusted by
    options.schema_override. For the 'alerts' table the statement comes from
    ALERTS_TABLE_STATEMENT. When the create query succeeds, the bucket is
    recorded in the CLI config under the selected refresh type and the config
    is written out.

    Problems (missing arguments, unknown or disabled table, malformed schema
    override, unsupported table type, failed query) are logged and end the
    call early. The function has no return value.

    Args:
        athena_client (boto3.client): Instantiated CLI AthenaClient
        options (namedtuple): The parsed args passed from the CLI
        config (CLIConfig): Loaded StreamAlert CLI
    """
    sa_firehose = StreamAlertFirehose(
        config['global']['account']['region'],
        config['global']['infrastructure']['firehose'], config['logs'])

    if not options.bucket:
        LOGGER_CLI.error('Missing command line argument --bucket')
        return

    if not options.refresh_type:
        LOGGER_CLI.error('Missing command line argument --refresh_type')
        return

    if options.type == 'data':
        if not options.table_name:
            LOGGER_CLI.error('Missing command line argument --table_name')
            return

        # Convert special characters in schema name to underscores
        sanitized_table_name = sa_firehose.firehose_log_name(
            options.table_name)

        # Check that the log type is enabled via Firehose
        if sanitized_table_name not in sa_firehose.enabled_logs:
            LOGGER_CLI.error(
                'Table name %s missing from configuration or '
                'is not enabled.', sanitized_table_name)
            return

        # Check if the table exists
        if athena_client.check_table_exists(sanitized_table_name):
            LOGGER_CLI.info('The \'%s\' table already exists.',
                            sanitized_table_name)
            return

        # Only the first underscore is swapped for a colon to build the
        # lookup key into config['logs']
        log_info = config['logs'][options.table_name.replace('_', ':', 1)]

        schema = dict(log_info['schema'])
        sanitized_schema = StreamAlertFirehose.sanitize_keys(schema)

        athena_schema = handler_helpers.to_athena_schema(sanitized_schema)

        # Add envelope keys to Athena Schema
        configuration_options = log_info.get('configuration')
        if configuration_options:
            envelope_keys = configuration_options.get('envelope_keys')
            if envelope_keys:
                sanitized_envelope_key_schema = StreamAlertFirehose.sanitize_keys(
                    envelope_keys)
                # Note: this key is wrapped in backticks to be Hive compliant
                athena_schema[
                    '`streamalert:envelope_keys`'] = handler_helpers.to_athena_schema(
                        sanitized_envelope_key_schema)

        # Handle Schema overrides
        #   This is useful when an Athena schema needs to differ from the normal log schema
        if options.schema_override:
            for override in options.schema_override:
                # Require exactly one '=' with text on both sides. Anything
                # else aborts table creation instead of crashing on unpacking
                # or silently applying an empty column name or type.
                parts = override.split('=')
                if len(parts) != 2 or not all(parts):
                    LOGGER_CLI.error(
                        'Invalid schema override [%s], use column_name=type format',
                        override)
                    return

                column_name, column_type = parts

                # Columns are escaped to avoid Hive issues with special characters
                column_name = '`{}`'.format(column_name)
                if column_name in athena_schema:
                    athena_schema[column_name] = column_type
                    LOGGER_CLI.info('Applied schema override: %s:%s',
                                    column_name, column_type)
                else:
                    LOGGER_CLI.error(
                        'Schema override column %s not found in Athena Schema, skipping',
                        column_name)

        table_name = sanitized_table_name
        query = _construct_create_table_statement(
            schema=athena_schema,
            table_name=sanitized_table_name,
            bucket=options.bucket)

    elif options.type == 'alerts':
        if athena_client.check_table_exists(options.type):
            LOGGER_CLI.info('The \'alerts\' table already exists.')
            return

        table_name = options.type
        query = ALERTS_TABLE_STATEMENT.format(bucket=options.bucket)

    else:
        # Previously an unknown type fell through to an unbound `query` name
        LOGGER_CLI.error('Unsupported table type: %s', options.type)
        return

    if not query:
        return

    create_table_success, _ = athena_client.run_athena_query(
        query=query, database='streamalert')

    if not create_table_success:
        LOGGER_CLI.error('The %s table could not be created', table_name)
        return

    # Update the CLI config
    refresh_config = config['lambda']['athena_partition_refresh_config']
    refresh_config['refresh_type'][options.refresh_type][options.bucket] = options.type
    config.write()

    LOGGER_CLI.info('The %s table was successfully created!', table_name)
Example #3
0
def rebuild_partitions(table, bucket, config):
    """Drop an Athena table, re-create it, and restore its previous partitions

    The sequence is:
      1. Read the table's current partitions
      2. Drop the table
      3. Re-create the table via create_table
      4. Add the saved partitions back with a single query

    Nothing is dropped when the partition lookup fails or yields no
    partitions. If the generated partition statement is longer than
    MAX_QUERY_LENGTH, it is written to 'partitions_<table>.txt' (relative
    path) instead of being executed.

    Args:
        table (str): The name of the table being rebuilt
        bucket (str): The s3 bucket to be used as the location for Athena data
        config (CLIConfig): Loaded StreamAlert CLI
    """
    client = StreamAlertAthenaClient(
        config, results_key_prefix='stream_alert_cli')

    firehose = StreamAlertFirehose(
        config['global']['account']['region'],
        config['global']['infrastructure']['firehose'],
        config['logs'])

    athena_table = firehose.firehose_log_name(table)

    # Step 1: snapshot the partitions so they can be restored after the drop
    listed_ok, listing = client.run_athena_query(
        query='SHOW PARTITIONS {}'.format(athena_table),
        database=client.sa_database)
    if not listed_ok:
        LOGGER_CLI.error(
            'An error occurred when loading partitions for %s', athena_table)
        return

    saved_partitions = athena_helpers.unique_values_from_query(listing)
    if not saved_partitions:
        LOGGER_CLI.info(
            'No partitions to rebuild for %s, nothing to do', athena_table)
        return

    # Step 2: remove the existing table definition
    LOGGER_CLI.info('Dropping table %s', athena_table)
    dropped_ok, _ = client.run_athena_query(
        query='DROP TABLE {}'.format(athena_table),
        database=client.sa_database)
    if not dropped_ok:
        LOGGER_CLI.error(
            'An error occurred when dropping the %s table', athena_table)
        return

    LOGGER_CLI.info('Dropped table %s', athena_table)

    # Step 3: define the table again (create_table receives the original,
    # unsanitized name)
    LOGGER_CLI.info('Creating table %s', athena_table)
    create_table(table, bucket, config)

    # Step 4: put the saved partitions back
    add_statement = athena_helpers.partition_statement(
        saved_partitions, bucket, athena_table)

    # A statement over the query length limit is saved to disk, not executed
    if len(add_statement) > MAX_QUERY_LENGTH:
        LOGGER_CLI.error(
            'Partition statement too large, writing to local file')
        dump_path = 'partitions_{}.txt'.format(athena_table)
        with open(dump_path, 'w') as dump_file:
            dump_file.write(add_statement)
        return

    LOGGER_CLI.info(
        'Creating %d new partitions for %s',
        len(saved_partitions), athena_table)
    added_ok, _ = client.run_athena_query(
        query=add_statement, database=client.sa_database)
    if not added_ok:
        LOGGER_CLI.error(
            'Error re-creating new partitions for %s', athena_table)
        return

    LOGGER_CLI.info('Successfully rebuilt partitions for %s', athena_table)
Example #4
0
def create_table(table, bucket, config, schema_override=None):
    """Create a 'streamalert' Athena table

    The 'alerts' table schema is derived from the output of a placeholder
    Alert. Every other table is treated as a log type whose schema is read
    from config['logs'], extended with any configured envelope keys and
    adjusted by schema_override.

    Problems (unknown or disabled table, table already present, malformed
    schema override, failed query) are logged and end the call early. The
    function has no return value.

    Args:
        table (str): The name of the table being created
        bucket (str): The s3 bucket to be used as the location for Athena data
        config (CLIConfig): Loaded StreamAlert CLI
        schema_override (set): An optional set of key=value pairs to be used for
            overriding the configured column_name=value_type.
    """
    athena_client = StreamAlertAthenaClient(
        config, results_key_prefix='stream_alert_cli')

    sa_firehose = StreamAlertFirehose(
        config['global']['account']['region'],
        config['global']['infrastructure']['firehose'], config['logs'])

    # Convert special characters in schema name to underscores
    sanitized_table_name = sa_firehose.firehose_log_name(table)

    # Check that the log type is enabled via Firehose
    if sanitized_table_name != 'alerts' and sanitized_table_name not in sa_firehose.enabled_logs:
        LOGGER_CLI.error(
            'Table name %s missing from configuration or '
            'is not enabled.', sanitized_table_name)
        return

    # Check if the table exists
    if athena_client.check_table_exists(sanitized_table_name, True):
        LOGGER_CLI.info('The \'%s\' table already exists.',
                        sanitized_table_name)
        return

    if table == 'alerts':
        # get a fake alert so we can get the keys needed and their types
        alert = Alert('temp_rule_name', {}, {})
        output = alert.output_dict()
        schema = record_to_schema(output)
        athena_schema = handler_helpers.to_athena_schema(schema)

        query = _construct_create_table_statement(schema=athena_schema,
                                                  table_name=table,
                                                  bucket=bucket)

    else:  # all other tables are log types

        # Only the first underscore is swapped for a colon to build the
        # lookup key into config['logs']
        log_info = config['logs'][table.replace('_', ':', 1)]

        schema = dict(log_info['schema'])
        sanitized_schema = StreamAlertFirehose.sanitize_keys(schema)

        athena_schema = handler_helpers.to_athena_schema(sanitized_schema)

        # Add envelope keys to Athena Schema
        configuration_options = log_info.get('configuration')
        if configuration_options:
            envelope_keys = configuration_options.get('envelope_keys')
            if envelope_keys:
                sanitized_envelope_key_schema = StreamAlertFirehose.sanitize_keys(
                    envelope_keys)
                # Note: this key is wrapped in backticks to be Hive compliant
                athena_schema[
                    '`streamalert:envelope_keys`'] = handler_helpers.to_athena_schema(
                        sanitized_envelope_key_schema)

        # Handle Schema overrides
        #   This is useful when an Athena schema needs to differ from the normal log schema
        if schema_override:
            for override in schema_override:
                # Require exactly one '=' with text on both sides. Unpacking
                # the raw split result used to raise ValueError on a missing
                # or repeated '=', and an empty name/type was logged but
                # still applied.
                parts = override.split('=')
                if len(parts) != 2 or not all(parts):
                    LOGGER_CLI.error(
                        'Invalid schema override [%s], use column_name=type format',
                        override)
                    return

                column_name, column_type = parts

                # Columns are escaped to avoid Hive issues with special characters
                column_name = '`{}`'.format(column_name)
                if column_name in athena_schema:
                    athena_schema[column_name] = column_type
                    LOGGER_CLI.info('Applied schema override: %s:%s',
                                    column_name, column_type)
                else:
                    LOGGER_CLI.error(
                        'Schema override column %s not found in Athena Schema, skipping',
                        column_name)

        query = _construct_create_table_statement(
            schema=athena_schema,
            table_name=sanitized_table_name,
            bucket=bucket)

    create_table_success, _ = athena_client.run_athena_query(
        query=query, database=athena_client.sa_database)

    if not create_table_success:
        LOGGER_CLI.error('The %s table could not be created',
                         sanitized_table_name)
        return

    # Update the CLI config: register the bucket as a 'data' bucket the first
    # time a non-alerts table is created in it
    if table != 'alerts':
        refresh_buckets = config['lambda']['athena_partition_refresh_config']['buckets']
        if bucket not in refresh_buckets:
            refresh_buckets[bucket] = 'data'
            config.write()

    LOGGER_CLI.info('The %s table was successfully created!',
                    sanitized_table_name)
Example #5
0
def rebuild_partitions(table, bucket, config):
    """Drop an Athena table, re-create it, and restore its previous partitions

    The sequence is:
      1. Read the table's current partitions
      2. Drop the table
      3. Re-create the table via create_table
      4. Add the saved partitions back with a single query

    Nothing is dropped when no partitions are found. If the generated
    partition statement is longer than MAX_QUERY_LENGTH, it is written to
    'partitions_<table>.txt' (relative path) instead of being executed.

    Args:
        table (str): The name of the table being rebuilt
        bucket (str): The s3 bucket to be used as the location for Athena data
        config (CLIConfig): Loaded StreamAlert CLI
    """
    client = get_athena_client(config)

    firehose = StreamAlertFirehose(
        config['global']['account']['region'],
        config['global']['infrastructure']['firehose'],
        config['logs'])

    athena_table = firehose.firehose_log_name(table)

    # Step 1: snapshot the partitions so they can be restored after the drop
    saved_partitions = client.get_table_partitions(athena_table)
    if not saved_partitions:
        LOGGER_CLI.info(
            'No partitions to rebuild for %s, nothing to do', athena_table)
        return

    # Step 2: remove the existing table definition
    LOGGER_CLI.info('Dropping table %s', athena_table)
    dropped_ok = client.drop_table(athena_table)
    if not dropped_ok:
        return

    # Step 3: define the table again (create_table receives the original,
    # unsanitized name)
    LOGGER_CLI.info('Creating table %s', athena_table)
    create_table(table, bucket, config)

    # Step 4: put the saved partitions back
    add_statement = helpers.add_partition_statement(
        saved_partitions, bucket, athena_table)

    # A statement over the query length limit is saved to disk, not executed
    if len(add_statement) > MAX_QUERY_LENGTH:
        LOGGER_CLI.error(
            'Partition statement too large, writing to local file')
        dump_path = 'partitions_{}.txt'.format(athena_table)
        with open(dump_path, 'w') as dump_file:
            dump_file.write(add_statement)
        return

    LOGGER_CLI.info(
        'Creating %d new partitions for %s',
        len(saved_partitions), athena_table)

    added_ok = client.run_query(query=add_statement)
    if not added_ok:
        LOGGER_CLI.error(
            'Error re-creating new partitions for %s', athena_table)
        return

    LOGGER_CLI.info('Successfully rebuilt partitions for %s', athena_table)
Example #6
0
def create_table(athena_client, options, config):
    """Create a 'streamalert' Athena table

    For 'data' tables the column list is built from the log's schema in
    config['logs'] plus any configured envelope keys; nested dictionaries are
    rendered as struct columns. The 'alerts' table uses a fixed column list.
    When the create query succeeds, the bucket is recorded in the CLI config
    under the selected refresh type and the config is written out.

    Problems (missing arguments, unknown or disabled table, table already
    present, unsupported table type, failed query) are logged and end the
    call early. The function has no return value.

    Args:
        athena_client (boto3.client): Instantiated CLI AthenaClient
        options (namedtuple): The parsed args passed from the CLI
        config (CLIConfig): Loaded StreamAlert CLI
    """
    sa_firehose = StreamAlertFirehose(
        config['global']['account']['region'],
        config['global']['infrastructure']['firehose'], config['logs'])

    if not options.bucket:
        LOGGER_CLI.error('Missing command line argument --bucket')
        return

    if not options.refresh_type:
        LOGGER_CLI.error('Missing command line argument --refresh_type')
        return

    if options.type == 'data':
        if not options.table_name:
            LOGGER_CLI.error('Missing command line argument --table_name')
            return

        sanitized_table_name = sa_firehose.firehose_log_name(
            options.table_name)

        if sanitized_table_name not in sa_firehose.enabled_logs:
            LOGGER_CLI.error(
                'Table name %s missing from configuration or '
                'is not enabled.', sanitized_table_name)
            return

        if athena_client.check_table_exists(sanitized_table_name):
            LOGGER_CLI.info('The \'%s\' table already exists.',
                            sanitized_table_name)
            return

        # Only the first underscore is swapped for a colon to build the
        # lookup key into config['logs']
        log_info = config['logs'][options.table_name.replace('_', ':', 1)]
        schema = dict(log_info['schema'])

        sanitized_schema = StreamAlertFirehose.sanitize_keys(schema)
        athena_schema = {}

        _add_to_athena_schema(sanitized_schema, athena_schema)

        # Support envelope keys
        configuration_options = log_info.get('configuration')
        if configuration_options:
            envelope_keys = configuration_options.get('envelope_keys')
            if envelope_keys:
                sanitized_envelope_key_schema = StreamAlertFirehose.sanitize_keys(
                    envelope_keys)
                # Note: this key is wrapped in backticks to be Hive compliant
                _add_to_athena_schema(sanitized_envelope_key_schema,
                                      athena_schema,
                                      '`streamalert:envelope_keys`')

        # Collect one definition per column and join them afterwards. The
        # previous approach appended separators of differing length and then
        # trimmed a single character, which left a dangling comma whenever
        # the final column was a nested struct.
        column_definitions = []
        for key_name, key_type in athena_schema.items():
            # Account for nested structs
            if isinstance(key_type, dict):
                struct_fields = ','.join(
                    '{0}:{1}'.format(sub_key, sub_type)
                    for sub_key, sub_type in key_type.items())
                column_definitions.append(
                    '{0} struct<{1}>'.format(key_name, struct_fields))
            else:
                column_definitions.append(
                    '{0} {1}'.format(key_name, key_type))

        table_name = sanitized_table_name
        query = (
            'CREATE EXTERNAL TABLE {table_name} ({schema}) '
            'PARTITIONED BY (dt string) '
            'ROW FORMAT SERDE \'org.openx.data.jsonserde.JsonSerDe\' '
            'WITH SERDEPROPERTIES ( \'ignore.malformed.json\' = \'true\') '
            'LOCATION \'s3://{bucket}/{table_name}/\''.format(
                table_name=sanitized_table_name,
                schema=', '.join(column_definitions),
                bucket=options.bucket))

    elif options.type == 'alerts':
        if athena_client.check_table_exists(options.type):
            LOGGER_CLI.info('The \'alerts\' table already exists.')
            return

        table_name = options.type
        # Clauses are separated by a space so the statement does not depend
        # on the parser splitting tokens that run into each other
        query = ('CREATE EXTERNAL TABLE alerts ('
                 'log_source string,'
                 'log_type string,'
                 'outputs array<string>,'
                 'record string,'
                 'rule_description string,'
                 'rule_name string,'
                 'source_entity string,'
                 'source_service string) '
                 'PARTITIONED BY (dt string) '
                 'ROW FORMAT SERDE \'org.openx.data.jsonserde.JsonSerDe\' '
                 'LOCATION \'s3://{bucket}/alerts/\''.format(
                     bucket=options.bucket))

    else:
        # Previously an unknown type fell through to an unbound `query` name
        LOGGER_CLI.error('Unsupported table type: %s', options.type)
        return

    create_table_success, _ = athena_client.run_athena_query(
        query=query, database='streamalert')

    if not create_table_success:
        LOGGER_CLI.error('The %s table could not be created', table_name)
        return

    # Update the CLI config
    refresh_config = config['lambda']['athena_partition_refresh_config']
    refresh_config['refresh_type'][options.refresh_type][options.bucket] = options.type
    config.write()

    LOGGER_CLI.info('The %s table was successfully created!', table_name)