Ejemplo n.º 1
0
def drop_all_tables(config):
    """Drop all 'streamalert' Athena tables

    Meant for cleaning up an existing deployment. The user must confirm
    the prompt first; declining leaves every table in place.

    Args:
        config (CLIConfig): Loaded StreamAlert CLI
    """
    confirmed = continue_prompt(
        message='Are you sure you want to drop all Athena tables?')
    if not confirmed:
        return

    client = StreamAlertAthenaClient(
        config, results_key_prefix='stream_alert_cli')

    # List everything currently in the database
    listed, show_tables_result = client.run_athena_query(
        query='SHOW TABLES', database=client.sa_database)
    if not listed:
        LOGGER_CLI.error('There was an issue getting all tables')
        return

    for table in athena_helpers.unique_values_from_query(show_tables_result):
        dropped, _ = client.run_athena_query(
            query='DROP TABLE {}'.format(table),
            database=client.sa_database)

        # A failed drop is reported, but the remaining tables are still tried
        if not dropped:
            LOGGER_CLI.error('Unable to drop the %s table', table)
            continue

        LOGGER_CLI.info('Dropped %s', table)
Ejemplo n.º 2
0
def athena_handler(options, config):
    """Main Athena handler

    Routes ``options.subcommand`` to the matching operation. Subcommands
    that are not listed below are silently ignored.

    Args:
        options (namedtuple): The parsed args passed from the CLI
        config (CLIConfig): Loaded StreamAlert CLI
    """
    client = StreamAlertAthenaClient(
        config, results_key_prefix='stream_alert_cli')

    # Each entry is wrapped in a lambda so nothing is looked up or called
    # until its subcommand has actually been selected
    operations = {
        'init': lambda: config.generate_athena(),
        'enable': lambda: config.set_athena_lambda_enable(),
        'create-db': lambda: create_database(client),
        'rebuild-partitions': lambda: rebuild_partitions(
            client, options, config),
        'drop-all-tables': lambda: drop_all_tables(client),
        'create-table': lambda: create_table(client, options, config),
    }

    operation = operations.get(options.subcommand)
    if operation is not None:
        operation()
Ejemplo n.º 3
0
 def test_invalid_missing_config(self):
     """Athena - Load Missing Config File"""
     placeholder_contents = 'test'

     # os.path.exists is forced to report False for every path while the
     # client is being constructed
     with mock_open(LAMBDA_FILE, placeholder_contents), \
             mock_open(GLOBAL_FILE, placeholder_contents), \
             patch('os.path.exists', return_value=False):
         StreamAlertAthenaClient()
Ejemplo n.º 4
0
    def test_load_valid_config(self):
        """Athena - Load Config"""
        # Serialize each top-level section the way it would appear on disk
        serialized = {
            section: json.dumps(self.config_data[section], indent=4)
            for section in ('global', 'lambda')
        }

        with mock_open(GLOBAL_FILE, serialized['global']), \
                mock_open(LAMBDA_FILE, serialized['lambda']):
            client = StreamAlertAthenaClient()

            loaded_config = client.config
            assert_equal(type(loaded_config), dict)
            assert_equal(set(loaded_config.keys()), {'global', 'lambda'})
Ejemplo n.º 5
0
def athena_handler(options):
    """Handle Athena operations requested through the CLI

    Dispatches on ``options.subcommand``:
        init: calls ``CONFIG.generate_athena()``
        enable: calls ``CONFIG.set_athena_lambda_enable()``
        create-db: runs ``CREATE DATABASE streamalert`` unless the database
            already exists
        create-table: creates the 'alerts' table (the only table type
            handled here) and, when the query succeeds, records the bucket
            under the chosen refresh type and writes the config

    Any other subcommand, or 'create-table' with another type, does nothing.

    Args:
        options: The parsed args passed from the CLI. ``subcommand`` is
            always read; 'create-table' also reads ``type``, ``bucket``
            and ``refresh_type``.
    """
    athena_client = StreamAlertAthenaClient(
        CONFIG, results_key_prefix='stream_alert_cli')

    if options.subcommand == 'init':
        CONFIG.generate_athena()

    elif options.subcommand == 'enable':
        CONFIG.set_athena_lambda_enable()

    elif options.subcommand == 'create-db':
        if athena_client.check_database_exists():
            LOGGER_CLI.info(
                'The \'streamalert\' database already exists, nothing to do')
            return

        create_db_success, create_db_result = athena_client.run_athena_query(
            query='CREATE DATABASE streamalert')

        if create_db_success and create_db_result['ResultSet'].get('Rows'):
            LOGGER_CLI.info('streamalert database successfully created!')
            LOGGER_CLI.info('results: %s',
                            create_db_result['ResultSet']['Rows'])

    elif options.subcommand == 'create-table':
        if options.type == 'alerts':
            if not options.bucket:
                LOGGER_CLI.error('Missing command line argument --bucket')
                return

            # The refresh type is used as a config key once the table
            # exists, so reject a missing value before creating anything
            if not options.refresh_type:
                LOGGER_CLI.error(
                    'Missing command line argument --refresh_type')
                return

            if athena_client.check_table_exists(options.type):
                LOGGER_CLI.info('The \'alerts\' table already exists.')
                return

            # Each clause ends with a space so the concatenated statement
            # keeps its keywords separated
            query = ('CREATE EXTERNAL TABLE alerts ('
                     'log_source string,'
                     'log_type string,'
                     'outputs array<string>,'
                     'record string,'
                     'rule_description string,'
                     'rule_name string,'
                     'source_entity string,'
                     'source_service string) '
                     'PARTITIONED BY (dt string) '
                     'ROW FORMAT SERDE \'org.openx.data.jsonserde.JsonSerDe\' '
                     'LOCATION \'s3://{bucket}/alerts/\''.format(
                         bucket=options.bucket))

            create_table_success, _ = athena_client.run_athena_query(
                query=query, database='streamalert')

            if create_table_success:
                refresh_config = CONFIG['lambda'][
                    'athena_partition_refresh_config']['refresh_type']
                refresh_config[options.refresh_type][options.bucket] = 'alerts'
                CONFIG.write()
                LOGGER_CLI.info('The alerts table was successfully created!')
 def setup(self):
     """Create the client under test from the shared CONFIG_DATA"""
     client = StreamAlertAthenaClient(
         CONFIG_DATA, results_key_prefix='unit-testing')
     self.client = client
class TestStreamAlertAthenaClient(object):
    """Test class for StreamAlertAthenaClient

    Every test swaps ``self.client.athena_client`` for a MockAthenaClient
    so that no real Athena queries are issued.
    """

    def setup(self):
        """Create the client under test from the shared CONFIG_DATA"""
        self.client = StreamAlertAthenaClient(
            CONFIG_DATA, results_key_prefix='unit-testing')

    @patch('stream_alert.athena_partition_refresh.main.LOGGER')
    def test_add_hive_partition(self, mock_logging):
        """Athena - Add Hive Partition"""
        query_result = [{'Repair: added data to metastore:foobar'},
                        {'Repair: added data to metastore:foobaz'}]
        self.client.athena_client = MockAthenaClient(results=query_result)

        # Maps each bucket name to the set of S3 keys seen in that bucket
        result = self.client.add_hive_partition({
            'unit-testing.streamalerts': {
                'alerts/dt=2017-08-26-14/rule_name_alerts-1304134918401.json',
                'alerts/dt=2017-08-27-14/rule_name_alerts-1304134918401.json'
            },
            'unit-testing.streamalert.data': {
                'log_type_1/2017/08/26/14/test-data-11111-22222-33333.snappy',
                'log_type_2/2017/08/26/14/test-data-11111-22222-33333.snappy',
                'log_type_2/2017/08/26/15/test-data-11111-22222-33333.snappy',
                'log_type_2/2017/08/26/16/test-data-11111-22222-33333.snappy',
                'log_type_3/2017/08/26/14/test-data-11111-22222-33333.snappy',
                'log_type_1/2017/08/26/11/test-data-11111-22222-33333.snappy'
            },
            'test-bucket-with-data': {
                '2017/08/26/14/rule_name_alerts-1304134918401.json',
                '2017/08/28/14/rule_name_alerts-1304134918401.json',
                '2017/07/30/14/rule_name_alerts-1304134918401.json'
            }
        })

        assert_true(mock_logging.info.called)
        assert_true(result)

    @patch('stream_alert.athena_partition_refresh.main.LOGGER')
    def test_add_hive_partition_unknown_bucket(self, mock_logging):
        """Athena - Add Hive Partition - Unknown Bucket"""
        self.client.athena_client = MockAthenaClient(results=[])
        result = self.client.add_hive_partition({
            'bucket-not-in-config.streamalerts': {
                'alerts/dt=2017-08-26-14/rule_name_alerts-1304134918401.json',
                'alerts/dt=2017-08-27-14/rule_name_alerts-1304134918401.json',
            }
        })

        assert_true(mock_logging.error.called)
        assert_false(result)

    @patch('stream_alert.athena_partition_refresh.main.LOGGER')
    def test_add_hive_partition_unexpected_s3_key(self, mock_logging):
        """Athena - Add Hive Partition - Unexpected S3 Key"""
        self.client.athena_client = MockAthenaClient(results=[])
        result = self.client.add_hive_partition({
            'unit-testing.streamalerts': {'a/pattern/that/does/not-match'},
            'test-bucket-with-data': {'another/pattern/that/does/not-match'}
        })

        assert_true(mock_logging.error.called)
        assert_false(result)

    def test_check_table_exists(self):
        """Athena - Check Table Exists"""
        query_result = [{'alerts': True}]
        self.client.athena_client = MockAthenaClient(results=query_result)

        result = self.client.check_table_exists('unit-test')
        assert_true(result)

        # The results key is expected to be the prefix plus today's date
        generated_results_key = 'unit-testing/{}'.format(
            datetime.now().strftime('%Y/%m/%d'))
        assert_equal(self.client.athena_results_key, generated_results_key)

    @patch('stream_alert.athena_partition_refresh.main.LOGGER')
    def test_check_table_exists_invalid(self, mock_logging):
        """Athena - Check Table Exists - Does Not Exist"""
        query_result = None
        self.client.athena_client = MockAthenaClient(results=query_result)

        result = self.client.check_table_exists('unit-test')
        assert_false(result)
        assert_true(mock_logging.info.called)

    def test_check_database_exists_invalid(self):
        """Athena - Check Database Exists - Does Not Exist"""
        query_result = None
        self.client.athena_client = MockAthenaClient(results=query_result)

        assert_false(self.client.check_database_exists())

    def test_check_database_exists(self):
        """Athena - Check Database Exists"""
        query_result = [{'streamalert': True}]
        self.client.athena_client = MockAthenaClient(results=query_result)

        assert_true(self.client.check_database_exists())

    @patch('stream_alert.athena_partition_refresh.main.LOGGER')
    def test_run_athena_query_empty(self, mock_logging):
        """Athena - Run Athena Query - Empty Result"""
        query_result = None
        self.client.athena_client = MockAthenaClient(results=query_result)

        query_success, query_results = self.client.run_athena_query(
            query='SHOW DATABASES;')

        assert_true(query_success)
        assert_equal(query_results['ResultSet']['Rows'], [])
        assert_true(mock_logging.debug.called)

    def test_run_athena_query_async(self):
        """Athena - Run Athena Query - Async Call"""
        query_result = []
        self.client.athena_client = MockAthenaClient(results=query_result)

        # 'async' is a reserved word from Python 3.7 on and cannot be
        # written as a literal keyword argument, so it is unpacked from a
        # dict; the callee still receives the same 'async' keyword
        query_success, _ = self.client.run_athena_query(
            query='SHOW DATABASES;', **{'async': True})

        assert_true(query_success)

    @patch('stream_alert.athena_partition_refresh.main.LOGGER')
    def test_run_athena_query_error(self, mock_logging):
        """Athena - Run Athena Query - Error Result"""
        self.client.athena_client = MockAthenaClient(results=None,
                                                     result_state='FAILED')

        query_success, query_results = self.client.run_athena_query(
            query='SHOW DATABASES;')

        assert_true(mock_logging.error.called)
        assert_false(query_success)
        assert_equal(query_results, {})

    @patch('stream_alert.athena_partition_refresh.main.LOGGER')
    def test_repair_hive_table_unknown_bucket(self, mock_logging):
        """Athena - Repair Hive Table - Unknown Bucket"""
        self.client.athena_client = MockAthenaClient(result_state='SUCCEEDED')

        # This bucket is not in our `repair_hive_table` config map
        self.client.repair_hive_table({'my-test.result.bucket'})
        assert_true(mock_logging.warning.called)

    @patch('stream_alert.athena_partition_refresh.main.LOGGER')
    def test_repair_hive_table_failed_refresh(self, mock_logging):
        """Athena - Repair Hive Table - Failed Refresh"""
        self.client.athena_client = MockAthenaClient(result_state='FAILED')

        # The mocked query finishes in the FAILED state, so the repair is
        # expected to log an error
        self.client.repair_hive_table({'unit-testing.streamalerts'})
        assert_true(mock_logging.error.called)

    @patch('stream_alert.athena_partition_refresh.main.LOGGER')
    def test_repair_hive_table(self, mock_logging):
        """Athena - Repair Hive Table"""
        query_result = [{'Status': 'SUCCEEDED'}]
        self.client.athena_client = MockAthenaClient(results=query_result)

        self.client.repair_hive_table({'unit-testing.streamalerts'})
        assert_true(mock_logging.info.called)

    def test_run_athena_query(self):
        """Athena - Run Athena Query - Normal Result"""
        self.client.athena_client = MockAthenaClient()

        query_success, query_results = self.client.run_athena_query(
            query='SHOW DATABASES;')

        assert_true(query_success)
        assert_equal(query_results['ResultSet']['Rows'], [{
            'Data': [{
                'test': 'test'
            }]
        }])
def athena_handler(options):
    """Handle Athena operations requested through the CLI

    Dispatches on ``options.subcommand``:
        init: calls ``CONFIG.generate_athena()``
        enable: calls ``CONFIG.set_athena_lambda_enable()``
        create-db: runs ``CREATE DATABASE streamalert`` unless the database
            already exists
        create-table: creates either a 'data' table, whose columns are
            derived from the log schema in ``CONFIG['logs']``, or the fixed
            'alerts' table. When the query succeeds, the bucket is recorded
            under the chosen refresh type and the config is written.

    Any other subcommand does nothing.

    Args:
        options: The parsed args passed from the CLI. ``subcommand`` is
            always read; 'create-table' also reads ``bucket``,
            ``refresh_type``, ``type`` and, for 'data' tables,
            ``table_name``.
    """
    athena_client = StreamAlertAthenaClient(
        CONFIG, results_key_prefix='stream_alert_cli')

    if options.subcommand == 'init':
        CONFIG.generate_athena()

    elif options.subcommand == 'enable':
        CONFIG.set_athena_lambda_enable()

    elif options.subcommand == 'create-db':
        if athena_client.check_database_exists():
            LOGGER_CLI.info(
                'The \'streamalert\' database already exists, nothing to do')
            return

        create_db_success, create_db_result = athena_client.run_athena_query(
            query='CREATE DATABASE streamalert')

        if create_db_success and create_db_result['ResultSet'].get('Rows'):
            LOGGER_CLI.info('streamalert database successfully created!')
            LOGGER_CLI.info('results: %s',
                            create_db_result['ResultSet']['Rows'])

    elif options.subcommand == 'create-table':
        if not options.bucket:
            LOGGER_CLI.error('Missing command line argument --bucket')
            return

        if not options.refresh_type:
            LOGGER_CLI.error('Missing command line argument --refresh_type')
            return

        if options.type == 'data':
            if not options.table_name:
                LOGGER_CLI.error('Missing command line argument --table_name')
                return

            if options.table_name not in enabled_firehose_logs(CONFIG):
                LOGGER_CLI.error(
                    'Table name %s missing from configuration or '
                    'is not enabled.', options.table_name)
                return

            if athena_client.check_table_exists(options.table_name):
                LOGGER_CLI.info('The \'%s\' table already exists.',
                                options.table_name)
                return

            # The table name carries '_' where the key in CONFIG['logs']
            # has its first ':'
            log_info = CONFIG['logs'][options.table_name.replace('_', ':', 1)]
            sanitized_schema = StreamAlert.sanitize_keys(
                dict(log_info['schema']))

            schema_type_mapping = {
                'string': 'string',
                'integer': 'int',
                'boolean': 'boolean',
                'float': 'decimal',
                dict: 'map<string, string>',
                list: 'array<string>'
            }

            def athena_type(key_type):
                """Translate one schema value into its Athena column type"""
                # {} and [] are not hashable, so they are looked up through
                # their types instead
                if key_type == {}:
                    return schema_type_mapping[dict]
                if key_type == []:
                    return schema_type_mapping[list]
                # Cast nested dict as a string for now
                # TODO(jacknagz): support recursive schemas
                if isinstance(key_type, dict):
                    return schema_type_mapping['string']
                return schema_type_mapping[key_type]

            def athena_columns(schema):
                """Map backtick-quoted column names to Athena types"""
                # When using special characters in the beginning or end
                # of a column name, they have to be wrapped in backticks
                return {
                    '`{}`'.format(key_name): athena_type(key_type)
                    for key_name, key_type in schema.items()
                }

            athena_schema = athena_columns(sanitized_schema)

            # Support envelope keys, stored as one nested struct column
            configuration_options = log_info.get('configuration')
            if configuration_options:
                envelope_keys = configuration_options.get('envelope_keys')
                if envelope_keys:
                    # Note: this key is wrapped in backticks to be Hive compliant
                    athena_schema['`streamalert:envelope_keys`'] = (
                        athena_columns(
                            StreamAlert.sanitize_keys(envelope_keys)))

            # Collect the column definitions and join them once, so the
            # statement never ends with a dangling separator regardless of
            # which column happens to be iterated last
            column_definitions = []
            for key_name, key_type in athena_schema.items():
                # Account for nested structs
                if isinstance(key_type, dict):
                    struct_schema = ','.join(
                        '{0}:{1}'.format(sub_key, sub_type)
                        for sub_key, sub_type in key_type.items())
                    column_definitions.append('{0} struct<{1}>'.format(
                        key_name, struct_schema))
                else:
                    column_definitions.append('{0} {1}'.format(
                        key_name, key_type))

            query = (
                'CREATE EXTERNAL TABLE {table_name} ({schema}) '
                'PARTITIONED BY (dt string) '
                'ROW FORMAT SERDE \'org.openx.data.jsonserde.JsonSerDe\' '
                'LOCATION \'s3://{bucket}/{table_name}/\''.format(
                    table_name=options.table_name,
                    schema=', '.join(column_definitions),
                    bucket=options.bucket))

        elif options.type == 'alerts':
            if athena_client.check_table_exists(options.type):
                LOGGER_CLI.info('The \'alerts\' table already exists.')
                return

            # Each clause ends with a space so the concatenated statement
            # keeps its keywords separated
            query = ('CREATE EXTERNAL TABLE alerts ('
                     'log_source string,'
                     'log_type string,'
                     'outputs array<string>,'
                     'record string,'
                     'rule_description string,'
                     'rule_name string,'
                     'source_entity string,'
                     'source_service string) '
                     'PARTITIONED BY (dt string) '
                     'ROW FORMAT SERDE \'org.openx.data.jsonserde.JsonSerDe\' '
                     'LOCATION \'s3://{bucket}/alerts/\''.format(
                         bucket=options.bucket))

        else:
            # Without this branch `query` would be unbound below
            LOGGER_CLI.error('Unsupported table type: %s', options.type)
            return

        create_table_success, _ = athena_client.run_athena_query(
            query=query, database='streamalert')

        if create_table_success:
            refresh_config = CONFIG['lambda'][
                'athena_partition_refresh_config']['refresh_type']
            refresh_config[options.refresh_type][options.bucket] = options.type
            CONFIG.write()
            table_name = (options.type if options.type == 'alerts'
                          else options.table_name)
            LOGGER_CLI.info('The %s table was successfully created!',
                            table_name)
Ejemplo n.º 9
0
def athena_handler(options):
    """Handle Athena operations requested through the CLI

    Dispatches on ``options.subcommand``:
        init: calls ``CONFIG.generate_athena()``
        enable: calls ``CONFIG.set_athena_lambda_enable()``
        create-db: runs ``CREATE DATABASE streamalert`` unless the database
            already exists
        create-table: creates either a 'data' table, whose columns are
            derived from the log schema in ``CONFIG['logs']``, or the fixed
            'alerts' table. When the query succeeds, the bucket is recorded
            under the chosen refresh type and the config is written.

    Any other subcommand does nothing.

    Args:
        options: The parsed args passed from the CLI. ``subcommand`` is
            always read; 'create-table' also reads ``bucket``,
            ``refresh_type``, ``type`` and, for 'data' tables,
            ``table_name``.
    """
    athena_client = StreamAlertAthenaClient(
        CONFIG, results_key_prefix='stream_alert_cli')

    if options.subcommand == 'init':
        CONFIG.generate_athena()

    elif options.subcommand == 'enable':
        CONFIG.set_athena_lambda_enable()

    elif options.subcommand == 'create-db':
        if athena_client.check_database_exists():
            LOGGER_CLI.info(
                'The \'streamalert\' database already exists, nothing to do')
            return

        create_db_success, create_db_result = athena_client.run_athena_query(
            query='CREATE DATABASE streamalert')

        if create_db_success and create_db_result['ResultSet'].get('Rows'):
            LOGGER_CLI.info('streamalert database successfully created!')
            LOGGER_CLI.info('results: %s',
                            create_db_result['ResultSet']['Rows'])

    elif options.subcommand == 'create-table':
        if not options.bucket:
            LOGGER_CLI.error('Missing command line argument --bucket')
            return

        if not options.refresh_type:
            LOGGER_CLI.error('Missing command line argument --refresh_type')
            return

        if options.type == 'data':
            if not options.table_name:
                LOGGER_CLI.error('Missing command line argument --table_name')
                return

            if options.table_name not in enabled_firehose_logs(CONFIG):
                LOGGER_CLI.error(
                    'Table name %s missing from configuration or '
                    'is not enabled.', options.table_name)
                return

            if athena_client.check_table_exists(options.table_name):
                LOGGER_CLI.info('The \'%s\' table already exists.',
                                options.table_name)
                return

            # Only the first underscore is turned back into ':' so that
            # underscores later in the name are kept when looking up the
            # log in CONFIG['logs']
            log_name = options.table_name.replace('_', ':', 1)
            sanitized_schema = StreamAlert.sanitize_keys(
                CONFIG['logs'][log_name]['schema'])

            schema_type_mapping = {
                'string': 'string',
                'integer': 'int',
                'boolean': 'boolean',
                'float': 'decimal',
                dict: 'map<string, string>',
                list: 'array<string>'
            }

            column_definitions = []
            for key_name, key_type in sanitized_schema.items():
                # Transform the {} or [] into hashable types
                if key_type == {}:
                    key_type = dict
                elif key_type == []:
                    key_type = list
                # A non-empty nested dict is unhashable and has no mapping
                # entry, so it is stored as a string column
                elif isinstance(key_type, dict):
                    key_type = 'string'

                # Backticks keep column names with special characters valid
                column_definitions.append('`{0}` {1}'.format(
                    key_name, schema_type_mapping[key_type]))

            # Each clause ends with a space so the concatenated statement
            # keeps its keywords separated
            query = ('CREATE EXTERNAL TABLE {table_name} ({schema}) '
                     'PARTITIONED BY (dt string) '
                     'ROW FORMAT SERDE \'org.openx.data.jsonserde.JsonSerDe\' '
                     'LOCATION \'s3://{bucket}/{table_name}/\''.format(
                         table_name=options.table_name,
                         schema=', '.join(column_definitions),
                         bucket=options.bucket))

        elif options.type == 'alerts':
            if athena_client.check_table_exists(options.type):
                LOGGER_CLI.info('The \'alerts\' table already exists.')
                return

            query = ('CREATE EXTERNAL TABLE alerts ('
                     'log_source string,'
                     'log_type string,'
                     'outputs array<string>,'
                     'record string,'
                     'rule_description string,'
                     'rule_name string,'
                     'source_entity string,'
                     'source_service string) '
                     'PARTITIONED BY (dt string) '
                     'ROW FORMAT SERDE \'org.openx.data.jsonserde.JsonSerDe\' '
                     'LOCATION \'s3://{bucket}/alerts/\''.format(
                         bucket=options.bucket))

        else:
            # Without this branch `query` would be unbound below
            LOGGER_CLI.error('Unsupported table type: %s', options.type)
            return

        create_table_success, _ = athena_client.run_athena_query(
            query=query, database='streamalert')

        if create_table_success:
            refresh_config = CONFIG['lambda'][
                'athena_partition_refresh_config']['refresh_type']
            refresh_config[options.refresh_type][options.bucket] = options.type
            CONFIG.write()
            # Report the table that was actually created, not just its type
            table_name = (options.type if options.type == 'alerts'
                          else options.table_name)
            LOGGER_CLI.info('The %s table was successfully created!',
                            table_name)
Ejemplo n.º 10
0
 def test_invalid_json_config(self):
     """Athena - Load Invalid Config"""
     not_json = 'This is not JSON!!!'

     # Both config files are mocked with the same unparseable contents
     with mock_open(LAMBDA_FILE, not_json), \
             mock_open(GLOBAL_FILE, not_json):
         StreamAlertAthenaClient()
Ejemplo n.º 11
0
 def setup(self):
     """Create the client under test from this instance's config_data"""
     client = StreamAlertAthenaClient(
         config=self.config_data, results_key_prefix='unit-testing')
     self.client = client
Ejemplo n.º 12
0
class TestStreamAlertAthenaClient(object):
    """Test class for StreamAlertAthenaClient"""

    def __init__(self):
        """Build the in-memory config fixture shared by the tests"""

        def processor_config(handler, libraries):
            """Return a Lambda config entry with the common source fields"""
            return {
                'handler': handler,
                'source_bucket': 'unit-testing.streamalert.source',
                'source_current_hash': '<auto_generated>',
                'source_object_key': '<auto_generated>',
                'third_party_libraries': libraries
            }

        global_section = {
            'account': {
                'aws_account_id': '111111111111',
                'kms_key_alias': 'stream_alert_secrets',
                'prefix': 'unit-testing',
                'region': 'us-east-2'
            },
            'terraform': {
                'tfstate_bucket': 'unit-testing.streamalert.terraform.state',
                'tfstate_s3_key': 'stream_alert_state/terraform.tfstate',
                'tfvars': 'terraform.tfvars'
            },
            'infrastructure': {
                'monitoring': {
                    'create_sns_topic': True
                }
            }
        }

        lambda_section = {
            'alert_processor_config': processor_config(
                'stream_alert.alert_processor.main.handler', []),
            'rule_processor_config': processor_config(
                'stream_alert.rule_processor.main.handler',
                ['jsonpath_rw', 'netaddr']),
            'athena_partition_refresh_config': {
                'enabled': True,
                'refresh_type': {
                    'repair_hive_table': {
                        'unit-testing.streamalerts': 'alerts'
                    },
                    'add_hive_partition': {}
                },
                'handler': 'main.handler',
                'timeout': '60',
                'memory': '128',
                'source_bucket': 'unit-testing.streamalert.source',
                'source_current_hash': '<auto_generated>',
                # NOTE(review): the missing closing '>' is in the original
                # fixture and is kept unchanged here
                'source_object_key': '<auto_generated',
                'third_party_libraries': ['backoff']
            }
        }

        self.config_data = {
            'global': global_section,
            'lambda': lambda_section
        }

    def setup(self):
        """Create the client under test from this instance's config_data"""
        self.client = StreamAlertAthenaClient(
            config=self.config_data, results_key_prefix='unit-testing')

    @raises(ConfigError)
    def test_invalid_json_config(self):
        """Athena - Load Invalid Config"""
        not_json = 'This is not JSON!!!'
        with mock_open(LAMBDA_FILE, not_json), \
                mock_open(GLOBAL_FILE, not_json):
            StreamAlertAthenaClient()

    @raises(ConfigError)
    def test_invalid_missing_config(self):
        """Athena - Load Missing Config File"""
        placeholder_contents = 'test'
        # os.path.exists is forced to report False for every path
        with mock_open(LAMBDA_FILE, placeholder_contents), \
                mock_open(GLOBAL_FILE, placeholder_contents), \
                patch('os.path.exists', return_value=False):
            StreamAlertAthenaClient()

    def test_load_valid_config(self):
        """Athena - Load Config"""
        serialized = {
            section: json.dumps(self.config_data[section], indent=4)
            for section in ('global', 'lambda')
        }

        with mock_open(GLOBAL_FILE, serialized['global']), \
                mock_open(LAMBDA_FILE, serialized['lambda']):
            client = StreamAlertAthenaClient()

            loaded_config = client.config
            assert_equal(type(loaded_config), dict)
            assert_equal(set(loaded_config.keys()), {'global', 'lambda'})

    @patch('stream_alert.athena_partition_refresh.main.LOGGER')
    @raises(NotImplementedError)
    def test_firehose_partition_refresh(self, mock_logging):
        """Athena - Test Firehose Partition Refresh"""
        self.client.firehose_partition_refresh(None)

        # NOTE(review): when the call above raises, as @raises expects,
        # this assertion is never reached
        assert_true(mock_logging.error.called)

    @patch('stream_alert.athena_partition_refresh.main.LOGGER')
    def test_backoff_and_success_handlers(self, mock_logging):
        """Athena - Test Backoff Handlers"""
        backoff_details = {'wait': 1.0, 'tries': 3, 'target': 'backoff'}
        self.client._backoff_handler(backoff_details)
        assert_true(mock_logging.debug.called)

        # NOTE(review): `debug.called` is already set by the first handler,
        # so this second check cannot fail on its own
        success_details = {'tries': 3, 'target': 'backoff'}
        self.client._success_handler(success_details)
        assert_true(mock_logging.debug.called)

    def test_check_table_exists(self):
        """Athena - Check Table Exists"""
        self.client.athena_client = MockAthenaClient(
            results=[{'alerts': True}])

        assert_true(self.client.check_table_exists('unit-test'))

        # The results key is expected to be the prefix plus today's date
        today = datetime.now().strftime('%Y/%m/%d')
        assert_equal(self.client.athena_results_key,
                     'unit-testing/{}'.format(today))

    @patch('stream_alert.athena_partition_refresh.main.LOGGER')
    def test_check_table_exists_invalid(self, mock_logging):
        """Athena - Check Table Exists - Does Not Exist"""
        self.client.athena_client = MockAthenaClient(results=None)

        table_found = self.client.check_table_exists('unit-test')
        assert_false(table_found)
        assert_true(mock_logging.info.called)

    def test_check_database_exists_invalid(self):
        """Athena - Check Database Exists - Does Not Exist"""
        self.client.athena_client = MockAthenaClient(results=None)

        database_found = self.client.check_database_exists()
        assert_false(database_found)

    def test_check_database_exists(self):
        """Athena - Check Database Exists"""
        self.client.athena_client = MockAthenaClient(
            results=[{'streamalert': True}])

        database_found = self.client.check_database_exists()
        assert_true(database_found)

    @patch('stream_alert.athena_partition_refresh.main.LOGGER')
    def test_run_athena_query_empty(self, mock_logging):
        """Athena - Run Athena Query"""
        self.client.athena_client = MockAthenaClient(results=None)

        succeeded, response = self.client.run_athena_query(
            query='SHOW DATABASES;')

        assert_true(succeeded)
        assert_equal(response['ResultSet']['Rows'], [])
        assert_true(mock_logging.debug.called)

    @patch('stream_alert.athena_partition_refresh.main.LOGGER')
    def test_run_athena_query_error(self, mock_logging):
        """Athena - Run Athena Query"""
        failing_client = MockAthenaClient(results=None, result_state='FAILED')
        self.client.athena_client = failing_client

        succeeded, response = self.client.run_athena_query(
            query='SHOW DATABASES;')

        assert_true(mock_logging.error.called)
        assert_false(succeeded)
        assert_equal(response, {})

    @patch('stream_alert.athena_partition_refresh.main.LOGGER')
    def test_repair_hive_table(self, mock_logging):
        """Athena - Repair Hive Table"""
        self.client.athena_client = MockAthenaClient(
            results=[{'Status': 'Success'}])

        self.client.repair_hive_table()
        assert_true(mock_logging.info.called)

    def test_run_athena_query(self):
        """Athena - Run Athena Query"""
        self.client.athena_client = MockAthenaClient()

        succeeded, response = self.client.run_athena_query(
            query='SHOW DATABASES;')

        expected_rows = [{'Data': [{'test': 'test'}]}]
        assert_true(succeeded)
        assert_equal(response['ResultSet']['Rows'], expected_rows)

    @patch('stream_alert.athena_partition_refresh.main.LOGGER.error')
    @patch(
        'stream_alert.athena_partition_refresh.main.StreamAlertAthenaClient.run_athena_query'
    )
    def test_repair_hive_table_fail(self, mock_run_athena, mock_logging):
        """Athena - Repair Hive Table, Failure"""
        # Every query made through the client reports failure
        mock_run_athena.return_value = (False, None)
        self.client.athena_client = MockAthenaClient()

        self.client.repair_hive_table()
        assert_true(mock_logging.called)
Ejemplo n.º 13
0
def rebuild_partitions(table, bucket, config):
    """Rebuild the partitions of a 'streamalert' Athena table

    The rebuild runs in this order:
      1. Read the table's current partitions
      2. Drop the table
      3. Create the table again
      4. Add the previously read partitions back

    Nothing is dropped when the partitions cannot be read or when there are
    none. If the statement that restores the partitions is longer than
    MAX_QUERY_LENGTH, it is written to a local 'partitions_<table>.txt' file
    instead of being executed.

    Args:
        table (str): The name of the table being rebuilt
        bucket (str): The s3 bucket to be used as the location for Athena data
        config (CLIConfig): Loaded StreamAlert CLI
    """
    client = StreamAlertAthenaClient(
        config, results_key_prefix='stream_alert_cli')

    global_config = config['global']
    firehose = StreamAlertFirehose(global_config['account']['region'],
                                   global_config['infrastructure']['firehose'],
                                   config['logs'])

    table_name = firehose.firehose_log_name(table)

    # Step 1: capture the partitions that exist right now
    show_query = 'SHOW PARTITIONS {}'.format(table_name)
    loaded, partition_rows = client.run_athena_query(
        query=show_query, database=client.sa_database)
    if not loaded:
        LOGGER_CLI.error('An error occurred when loading partitions for %s',
                         table_name)
        return

    existing_partitions = athena_helpers.unique_values_from_query(
        partition_rows)
    if not existing_partitions:
        LOGGER_CLI.info('No partitions to rebuild for %s, nothing to do',
                        table_name)
        return

    # Step 2: drop the existing table
    LOGGER_CLI.info('Dropping table %s', table_name)
    drop_query = 'DROP TABLE {}'.format(table_name)
    dropped, _ = client.run_athena_query(
        query=drop_query, database=client.sa_database)
    if not dropped:
        LOGGER_CLI.error('An error occurred when dropping the %s table',
                         table_name)
        return

    LOGGER_CLI.info('Dropped table %s', table_name)

    # Step 3: create the table again
    LOGGER_CLI.info('Creating table %s', table_name)
    create_table(table, bucket, config)

    # Step 4: put the partitions captured in step 1 back
    restore_statement = athena_helpers.partition_statement(
        existing_partitions, bucket, table_name)

    # A statement over the query length limit is saved locally, not executed
    if len(restore_statement) > MAX_QUERY_LENGTH:
        LOGGER_CLI.error(
            'Partition statement too large, writing to local file')
        dump_path = 'partitions_{}.txt'.format(table_name)
        with open(dump_path, 'w') as partition_file:
            partition_file.write(restore_statement)
        return

    LOGGER_CLI.info('Creating %d new partitions for %s',
                    len(existing_partitions), table_name)
    restored, _ = client.run_athena_query(
        query=restore_statement, database=client.sa_database)
    if not restored:
        LOGGER_CLI.error('Error re-creating new partitions for %s',
                         table_name)
        return

    LOGGER_CLI.info('Successfully rebuilt partitions for %s', table_name)
Ejemplo n.º 14
0
def create_table(table, bucket, config, schema_override=None):
    """Create a 'streamalert' Athena table

    The 'alerts' table takes its schema from the output of a placeholder
    Alert. Every other table is treated as a log type and uses the schema
    (plus any envelope keys) configured for that log in config['logs'].

    Nothing is created when the table name is missing from the enabled
    Firehose logs or when the table already exists. After a log table is
    created, its bucket is registered in the Athena partition refresh config
    if it is not already listed there.

    Args:
        table (str): The name of the table being created
        bucket (str): The s3 bucket to be used as the location for Athena data
        config (CLIConfig): Loaded StreamAlert CLI
        schema_override (set): An optional set of 'column_name=value_type'
            strings used to override the type of columns in the configured
            schema. Only applied to log tables. Malformed entries and entries
            naming a column that is not in the schema are logged and skipped.
    """
    athena_client = StreamAlertAthenaClient(
        config, results_key_prefix='stream_alert_cli')

    sa_firehose = StreamAlertFirehose(
        config['global']['account']['region'],
        config['global']['infrastructure']['firehose'], config['logs'])

    # Convert special characters in schema name to underscores
    sanitized_table_name = sa_firehose.firehose_log_name(table)

    # Check that the log type is enabled via Firehose
    if sanitized_table_name != 'alerts' and sanitized_table_name not in sa_firehose.enabled_logs:
        LOGGER_CLI.error(
            'Table name %s missing from configuration or '
            'is not enabled.', sanitized_table_name)
        return

    # Check if the table exists
    if athena_client.check_table_exists(sanitized_table_name, True):
        LOGGER_CLI.info('The \'%s\' table already exists.',
                        sanitized_table_name)
        return

    if table == 'alerts':
        # get a fake alert so we can get the keys needed and their types
        alert = Alert('temp_rule_name', {}, {})
        output = alert.output_dict()
        schema = record_to_schema(output)
        athena_schema = handler_helpers.to_athena_schema(schema)

        query = _construct_create_table_statement(schema=athena_schema,
                                                  table_name=table,
                                                  bucket=bucket)

    else:  # all other tables are log types

        # Config keys use ':' where the table name has its first '_'
        log_info = config['logs'][table.replace('_', ':', 1)]

        schema = dict(log_info['schema'])
        sanitized_schema = StreamAlertFirehose.sanitize_keys(schema)

        athena_schema = handler_helpers.to_athena_schema(sanitized_schema)

        # Add envelope keys to Athena Schema
        configuration_options = log_info.get('configuration')
        if configuration_options:
            envelope_keys = configuration_options.get('envelope_keys')
            if envelope_keys:
                sanitized_envelope_key_schema = StreamAlertFirehose.sanitize_keys(
                    envelope_keys)
                # Note: this key is wrapped in backticks to be Hive compliant
                athena_schema[
                    '`streamalert:envelope_keys`'] = handler_helpers.to_athena_schema(
                        sanitized_envelope_key_schema)

        # Handle Schema overrides
        #   This is useful when an Athena schema needs to differ from the normal log schema
        if schema_override:
            for override in schema_override:
                # Require exactly one '=' with text on both sides. Anything
                # else would either fail to unpack or write an empty column
                # name/type into the schema, so it is reported and skipped.
                parts = override.split('=')
                if len(parts) != 2 or not all(parts):
                    LOGGER_CLI.error(
                        'Invalid schema override [%s], use column_name=type format',
                        override)
                    continue

                column_name, column_type = parts

                # Columns are escaped to avoid Hive issues with special characters
                column_name = '`{}`'.format(column_name)
                if column_name in athena_schema:
                    athena_schema[column_name] = column_type
                    LOGGER_CLI.info('Applied schema override: %s:%s',
                                    column_name, column_type)
                else:
                    LOGGER_CLI.error(
                        'Schema override column %s not found in Athena Schema, skipping',
                        column_name)

        query = _construct_create_table_statement(
            schema=athena_schema,
            table_name=sanitized_table_name,
            bucket=bucket)

    create_table_success, _ = athena_client.run_athena_query(
        query=query, database=athena_client.sa_database)

    if not create_table_success:
        LOGGER_CLI.error('The %s table could not be created',
                         sanitized_table_name)
        return

    # Update the CLI config so the bucket is registered as a 'data' bucket
    if (table != 'alerts' and bucket not in config['lambda']
            ['athena_partition_refresh_config']['buckets']):
        config['lambda']['athena_partition_refresh_config']['buckets'][
            bucket] = 'data'
        config.write()

    LOGGER_CLI.info('The %s table was successfully created!',
                    sanitized_table_name)