def generate_alerts_table_schema(): """Generate the schema for alerts table in terraform by using a fake alert Returns: athena_schema (dict): Equivalent Athena schema used for generating create table statement """ alert = Alert('temp_rule_name', {}, {}) output = alert.output_dict() schema = record_to_schema(output) athena_schema = logs_schema_to_athena_schema(schema, False) return format_schema_tf(athena_schema)
def create_table(table, bucket, config, schema_override=None): """Create a 'streamalert' Athena table Args: table (str): The name of the table being rebuilt bucket (str): The s3 bucket to be used as the location for Athena data table_type (str): The type of table being refreshed config (CLIConfig): Loaded StreamAlert config schema_override (set): An optional set of key=value pairs to be used for overriding the configured column_name=value_type. Returns: bool: False if errors occurred, True otherwise """ enabled_logs = FirehoseClient.load_enabled_log_sources( config['global']['infrastructure']['firehose'], config['logs']) # Convert special characters in schema name to underscores sanitized_table_name = FirehoseClient.firehose_log_name(table) # Check that the log type is enabled via Firehose if sanitized_table_name != 'alerts' and sanitized_table_name not in enabled_logs: LOGGER.error( 'Table name %s missing from configuration or ' 'is not enabled.', sanitized_table_name) return False athena_client = get_athena_client(config) config_data_bucket = firehose_data_bucket(config) if not config_data_bucket: LOGGER.error('The \'firehose\' module is not enabled in global.json') return False # Check if the table exists if athena_client.check_table_exists(sanitized_table_name): LOGGER.info('The \'%s\' table already exists.', sanitized_table_name) return False if table == 'alerts': # get a fake alert so we can get the keys needed and their types alert = Alert('temp_rule_name', {}, {}) output = alert.output_dict() schema = record_to_schema(output) athena_schema = helpers.logs_schema_to_athena_schema(schema) # Use the bucket if supplied, otherwise use the default alerts bucket bucket = bucket or firehose_alerts_bucket(config) query = _construct_create_table_statement(schema=athena_schema, table_name=table, bucket=bucket) else: # all other tables are log types # Use the bucket if supplied, otherwise use the default data bucket bucket = bucket or config_data_bucket log_info = config['logs'][table.replace('_', ':', 1)] schema = dict(log_info['schema']) sanitized_schema = FirehoseClient.sanitize_keys(schema) athena_schema = helpers.logs_schema_to_athena_schema(sanitized_schema) # Add envelope keys to Athena Schema configuration_options = log_info.get('configuration') if configuration_options: envelope_keys = configuration_options.get('envelope_keys') if envelope_keys: sanitized_envelope_key_schema = FirehoseClient.sanitize_keys( envelope_keys) # Note: this key is wrapped in backticks to be Hive compliant athena_schema[ '`streamalert:envelope_keys`'] = helpers.logs_schema_to_athena_schema( sanitized_envelope_key_schema) # Handle Schema overrides # This is useful when an Athena schema needs to differ from the normal log schema if schema_override: for override in schema_override: column_name, column_type = override.split('=') # Columns are escaped to avoid Hive issues with special characters column_name = '`{}`'.format(column_name) if column_name in athena_schema: athena_schema[column_name] = column_type LOGGER.info('Applied schema override: %s:%s', column_name, column_type) else: LOGGER.error( 'Schema override column %s not found in Athena Schema, skipping', column_name) query = _construct_create_table_statement( schema=athena_schema, table_name=sanitized_table_name, bucket=bucket) success = athena_client.run_query(query=query) if not success: LOGGER.error('The %s table could not be created', sanitized_table_name) return False # Update the CLI config if table != 'alerts' and bucket != config_data_bucket: # Only add buckets to the config if they are not one of the default/configured buckets # Ensure 'buckets' exists in the config (since it is not required) config['lambda']['athena_partition_refresh_config']['buckets'] = ( config['lambda']['athena_partition_refresh_config'].get( 'buckets', {})) if bucket not in config['lambda']['athena_partition_refresh_config'][ 'buckets']: config['lambda']['athena_partition_refresh_config']['buckets'][ bucket] = 'data' config.write() LOGGER.info('The %s table was successfully created!', sanitized_table_name) return True