def test_pre_parse_s3_debug(s3_mock, log_mock, _):
    """S3Payload - Pre Parse, Debug On"""
    with patch.object(payload, 'LOGGER_DEBUG_ENABLED', True):

        records = ['_first_line_test_' * 10,
                   '_second_line_test_' * 10]

        s3_mock.side_effect = [((100, records[0]), (200, records[1]))]

        raw_record = make_s3_raw_record('unit_bucket_name', 'unit_key_name')
        s3_payload = load_stream_payload('s3', 'unit_key_name', raw_record)
        S3Payload.s3_object_size = 350

        _ = [_ for _ in s3_payload.pre_parse()]

        calls = [
            call('Processed %s S3 records out of an approximate total of %s '
                 '(average record size: %s bytes, total size: %s bytes)',
                 100, 350, 1, 350),
            call('Processed %s S3 records out of an approximate total of %s '
                 '(average record size: %s bytes, total size: %s bytes)',
                 200, 350, 1, 350)
        ]

        log_mock.assert_has_calls(calls)
def test_s3_object_too_large():
    """S3Payload - S3ObjectSizeError, Object too Large"""
    raw_record = make_s3_raw_record('unit_bucket_name', 'unit_key_name')
    s3_payload = load_stream_payload('s3', 'unit_key_name', raw_record)
    S3Payload.s3_object_size = (128 * 1024 * 1024) + 10

    s3_payload._download_object('region', 'bucket', 'key')
def test_mult_schema_match(self, log_mock):
    """StreamClassifier - Multiple Schema Matching with Log Patterns"""
    kinesis_data = json.dumps({
        'name': 'file removal test',
        'identifier': 'host4.this.test.also',
        'time': 'Jan 01 2017',
        'type': 'random',
        'message': 'bad_001.txt was removed'
    })
    sa_classifier.SUPPORT_MULTIPLE_SCHEMA_MATCHING = True

    service, entity = 'kinesis', 'test_stream_2'
    raw_record = make_kinesis_raw_record(entity, kinesis_data)
    payload = load_stream_payload(service, entity, raw_record)

    self.classifier.load_sources(service, entity)

    payload = list(payload.pre_parse())[0]

    schema_matches = self.classifier._process_log_schemas(payload)

    assert_equal(len(schema_matches), 2)
    self.classifier._check_schema_match(schema_matches)

    calls = [
        call('Log classification matched for multiple schemas: %s',
             'test_multiple_schemas:01, test_multiple_schemas:02'),
        call('Proceeding with schema for: %s', 'test_multiple_schemas:01')
    ]

    log_mock.assert_has_calls(calls)
def test_mult_schema_match_success(self):
    """StreamClassifier - Multiple Schema Matching with Log Patterns, Success"""
    kinesis_data = json.dumps({
        'name': 'file added test',
        'identifier': 'host4.this.test',
        'time': 'Jan 01 2017',
        'type': 'lol_file_added_event_test',
        'message': 'bad_001.txt was added'
    })
    # Make sure support for multiple schema matching is ON
    sa_classifier.SUPPORT_MULTIPLE_SCHEMA_MATCHING = True

    service, entity = 'kinesis', 'test_stream_2'
    raw_record = make_kinesis_raw_record(entity, kinesis_data)
    payload = load_stream_payload(service, entity, raw_record)

    self.classifier.load_sources(service, entity)

    payload = list(payload.pre_parse())[0]

    schema_matches = self.classifier._process_log_schemas(payload)

    assert_equal(len(schema_matches), 2)
    assert_equal(schema_matches[0].log_name, 'test_multiple_schemas:01')
    assert_equal(schema_matches[1].log_name, 'test_multiple_schemas:02')
    schema_match = self.classifier._check_schema_match(schema_matches)

    assert_equal(schema_match.log_name, 'test_multiple_schemas:01')
def test_pre_parse_s3_debug(s3_mock, log_mock, _):
    """S3Payload - Pre Parse, Debug On"""
    # Cache the logger level
    log_level = LOGGER.getEffectiveLevel()

    # Increase the logger level to debug
    LOGGER.setLevel(logging.DEBUG)

    records = ['_first_line_test_' * 10,
               '_second_line_test_' * 10]

    s3_mock.side_effect = [((100, records[0]), (200, records[1]))]

    raw_record = make_s3_raw_record('unit_bucket_name', 'unit_key_name')
    s3_payload = load_stream_payload('s3', 'unit_key_name', raw_record)
    S3Payload.s3_object_size = 350

    _ = [_ for _ in s3_payload.pre_parse()]

    calls = [
        call('Processed %s S3 records out of an approximate total of %s '
             '(average record size: %s bytes, total size: %s bytes)',
             100, 350, 1, 350),
        call('Processed %s S3 records out of an approximate total of %s '
             '(average record size: %s bytes, total size: %s bytes)',
             200, 350, 1, 350)
    ]

    log_mock.assert_has_calls(calls)

    # Reset the logger level and stop the patchers
    LOGGER.setLevel(log_level)
def test_mult_schema_match_failure(self, log_mock):
    """StreamClassifier - Multiple Schema Matching with Log Patterns, Fail"""
    kinesis_data = json.dumps({
        'name': 'file removal test',
        'identifier': 'host4.this.test.also',
        'time': 'Jan 01 2017',
        'type': 'file_removed_event_test_file_added_event',
        'message': 'bad_001.txt was removed'
    })
    sa_classifier.SUPPORT_MULTIPLE_SCHEMA_MATCHING = True

    service, entity = 'kinesis', 'test_stream_2'
    raw_record = make_kinesis_raw_record(entity, kinesis_data)
    payload = load_stream_payload(service, entity, raw_record)

    self.classifier.load_sources(service, entity)

    payload = payload.pre_parse().next()

    schema_matches = self.classifier._process_log_schemas(payload)

    assert_equal(len(schema_matches), 2)
    self.classifier._check_schema_match(schema_matches)

    log_mock.assert_called_with('Proceeding with schema for: %s',
                                'test_multiple_schemas:01')
def test_s3_download_object_zero_size(*_):
    """S3Payload - Download Object of Zero Size"""
    raw_record = make_s3_raw_record('unit_bucket_name', 'unit_key_name', 0)
    s3_payload = load_stream_payload('s3', 'unit_key_name', raw_record)

    assert_is_none(
        s3_payload._download_object('us-east-1', 'unit_bucket_name', 'unit_key_name'))
def test_get_object_ioerror(download_object_mock):
    """S3Payload - IOError Test"""
    download_object_mock.side_effect = IOError('No space left on device')
    raw_record = make_s3_raw_record('unit_bucket_name', 'unit_key_name')
    s3_payload = load_stream_payload('s3', 'unit_key_name', raw_record)

    result = s3_payload._get_object()

    assert_equal(result, None)
def test_get_object(log_mock, _):
    """S3Payload - Get S3 Info from Raw Record"""
    raw_record = make_s3_raw_record('unit_bucket_name', 'unit_key_name')
    s3_payload = load_stream_payload('s3', 'unit_key_name', raw_record)

    s3_payload._get_object()
    log_mock.assert_called_with(
        'Pre-parsing record from S3. Bucket: %s, Key: %s, Size: %d',
        'unit_bucket_name', 'unit_key_name', 100)
def test_s3_download_object(log_mock, *_):
    """S3Payload - Download Object"""
    raw_record = make_s3_raw_record('unit_bucket_name', 'unit_key_name')
    s3_payload = load_stream_payload('s3', 'unit_key_name', raw_record)
    s3_payload._download_object('us-east-1', 'unit_bucket_name', 'unit_key_name')

    assert_equal(log_mock.call_args_list[1][0][0],
                 'Completed download in %s seconds')
def _prepare_and_classify_payload(self, service, entity, raw_record):
    """Helper method to return a pre-parsed and classified payload"""
    payload = load_stream_payload(service, entity, raw_record)

    payload = list(payload.pre_parse())[0]
    self.classifier.load_sources(service, entity)
    self.classifier.classify_record(payload)

    return payload
def mock_normalized_records(default_data=None):
    """Mock records which have been normalized"""
    if not default_data:
        default_data = [
            {
                'account': 12345,
                'region': '123456123456',
                'detail': {
                    'eventName': 'ConsoleLogin',
                    'userIdentity': {
                        'userName': '******',
                        'accountId': '12345'
                    },
                    'sourceIPAddress': '1.1.1.2',
                    'recipientAccountId': '12345'
                },
                'source': '1.1.1.2',
                'streamalert:normalization': {
                    'sourceAddress': [['detail', 'sourceIPAddress'], ['source']],
                    'userName': [['detail', 'userIdentity', 'userName']]
                }
            },
            {
                'domain': 'evil.com',
                'pc_name': 'test-pc',
                'date': 'Dec 1st, 2016',
                'data': 'ABCDEF',
                'streamalert:normalization': {
                    'destinationDomain': [['domain']]
                }
            },
            {
                'domain': 'evil2.com',
                'pc_name': 'test-pc',
                'date': 'Dec 1st, 2016',
                'data': 'ABCDEF',
                'streamalert:normalization': {
                    'destinationDomain': [['domain']]
                }
            },
            {
                'process_md5': 'abcdef0123456789',
                'server': 'test-server',
                'date': 'Dec 2nd, 2016',
                'data': 'Foo',
                'streamalert:normalization': {
                    'fileHash': [['process_md5']]
                }
            }
        ]

    kinesis_payload = []
    for record in default_data:
        entity = 'unit_test_entity'
        raw_record = make_kinesis_raw_record(entity, 'None')
        payload = load_stream_payload('kinesis', entity, raw_record)
        payload = payload.pre_parse().next()
        payload.pre_parsed_record = record
        kinesis_payload.append(payload)

    return kinesis_payload
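# Illustrative sketch (not part of the original suite): each mocked record above
# carries a 'streamalert:normalization' map of normalized type -> list of key paths
# into the record. A consumer can resolve those paths back to concrete values; the
# helper below is a hypothetical example of that lookup, not an existing API.
def example_resolve_normalized_values(record, normalized_type):
    """Return the values referenced by a normalized type's key paths"""
    values = []
    for key_path in record.get('streamalert:normalization', {}).get(normalized_type, []):
        value = record
        for key in key_path:
            value = value[key]
        values.append(value)
    return values

# For the first mocked record above, example_resolve_normalized_values(record, 'sourceAddress')
# would return ['1.1.1.2', '1.1.1.2'] (one value per key path).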
def test_s3_download_object(*_):
    """S3Payload - Download Object"""
    key = 'test/unit/s3-object.gz'
    raw_record = make_s3_raw_record('unit_bucket_name', key)
    s3_payload = load_stream_payload('s3', key, raw_record)
    S3Payload.s3_object_size = (1024 * 1024)
    downloaded_path = s3_payload._download_object('us-east-1', 'unit_bucket_name', key)

    assert_true(downloaded_path.endswith('test-unit-s3-object.gz'))
def test_pre_parse_s3(s3_mock, *_):
    """S3Payload - Pre Parse"""
    records = ['{"record01": "value01"}', '{"record02": "value02"}']
    s3_mock.side_effect = [((0, records[0]), (1, records[1]))]

    raw_record = make_s3_raw_record('unit_bucket_name', 'unit_key_name')
    s3_payload = load_stream_payload('s3', 'unit_key_name', raw_record)

    for index, record in enumerate(s3_payload.pre_parse()):
        assert_equal(record.pre_parsed_record, records[index])
def load_and_classify_payload(config, service, entity, raw_record):
    """Return a loaded and classified payload."""
    # Prepare the payload
    payload = load_stream_payload(service, entity, raw_record)

    payload = list(payload.pre_parse())[0]
    classifier = StreamClassifier(config=config)
    classifier.load_sources(service, entity)
    classifier.classify_record(payload)

    return payload
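# Illustrative sketch (not part of the original helpers): a typical call site for
# load_and_classify_payload() above. The stream name and schema keys mirror the
# 'unit_test_default_stream' fixtures used elsewhere in these tests; treat them as
# assumptions about the test config rather than guaranteed values.
def example_load_and_classify(config):
    kinesis_data = json.dumps({
        'unit_key_01': 10,
        'unit_key_02': 'test'
    })
    raw_record = make_kinesis_raw_record('unit_test_default_stream', kinesis_data)
    payload = load_and_classify_payload(
        config, 'kinesis', 'unit_test_default_stream', raw_record)

    # A successfully classified payload is marked valid and has a log_source set
    assert_true(payload.valid)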
def test_s3_download_object_mb(log_mock, *_):
    """S3Payload - Download Object, Size in MB"""
    raw_record = make_s3_raw_record('unit_bucket_name', 'unit_key_name')
    s3_payload = load_stream_payload('s3', 'unit_key_name', raw_record)
    S3Payload.s3_object_size = (127.8 * 1024 * 1024)
    s3_payload._download_object('us-east-1', 'unit_bucket_name', 'unit_key_name')

    assert_equal(log_mock.call_args_list[0],
                 call('[S3Payload] Starting download from S3: %s/%s [%s]',
                      'unit_bucket_name', 'unit_key_name', '127.8MB'))

    assert_equal(log_mock.call_args_list[1][0][0],
                 'Completed download in %s seconds')
def test_repr_string():
    """StreamPayload - String Representation"""
    s3_payload = load_stream_payload('s3', 'entity', 'record')

    # Set some values that are different than the defaults
    s3_payload.type = 'unit_type'
    s3_payload.log_source = 'unit_source'
    s3_payload.records = ['rec1', 'rec2']

    print_value = ('<S3Payload valid:False log_source:unit_source '
                   'entity:entity type:unit_type '
                   'record:[\'rec1\', \'rec2\']>')

    output_print = s3_payload.__repr__()
    assert_equal(output_print, print_value)
def test_pre_parse_sns(log_mock):
    """SNSPayload - Pre Parse"""
    sns_data = json.dumps({'test': 'value'})
    raw_record = make_sns_raw_record('unit_topic', sns_data)
    sns_payload = load_stream_payload('sns', 'entity', raw_record)

    sns_payload = sns_payload.pre_parse().next()

    assert_equal(sns_payload.pre_parsed_record, '{"test": "value"}')
    log_mock.assert_called_with(
        'Pre-parsing record from SNS. '
        'MessageId: %s, EventSubscriptionArn: %s',
        'unit test message id',
        'arn:aws:sns:us-east-1:123456789012:unit_topic')
def test_pre_parse_kinesis(log_mock):
    """KinesisPayload - Pre Parse"""
    kinesis_data = json.dumps({'test': 'value'})
    entity = 'unit_test_entity'
    raw_record = make_kinesis_raw_record(entity, kinesis_data)
    kinesis_payload = load_stream_payload('kinesis', entity, raw_record)

    kinesis_payload = kinesis_payload.pre_parse().next()

    assert_equal(kinesis_payload.pre_parsed_record, '{"test": "value"}')
    log_mock.assert_called_with(
        'Pre-parsing record from Kinesis. '
        'eventID: %s, eventSourceARN: %s',
        'unit test event id',
        'arn:aws:kinesis:us-east-1:123456789012:stream/{}'.format(entity))
def test_refresh_record():
    """StreamPayload - Refresh Record"""
    s3_payload = load_stream_payload('s3', 'entity', 'record')

    # Set some values that are different than the defaults
    s3_payload.type = 'unit_type'
    s3_payload.log_source = 'unit_source'
    s3_payload.records = ['rec1']
    s3_payload.valid = True

    s3_payload._refresh_record('new pre_parsed_record')

    assert_equal(s3_payload.pre_parsed_record, 'new pre_parsed_record')
    assert_is_none(s3_payload.type)
    assert_is_none(s3_payload.log_source)
    assert_is_none(s3_payload.records)
    assert_false(s3_payload.valid)
def _validate_test_records(self, rule_name, test_record, formatted_record,
                           print_header_line):
    """Function to validate test records and log any errors

    Args:
        rule_name (str): The name of the rule being tested
        test_record (dict): A single record to test
        formatted_record (dict): A dictionary that includes the 'data' from the
            test record, formatted into a structure that resembles how an
            incoming record from a service would be formatted.
            See test/integration/templates for examples of how each service
            formats records.
        print_header_line (bool): Indicates if this is the first record for the
            rule, and therefore we should print some header information
    """
    service, entity = self.processor.classifier.extract_service_and_entity(
        formatted_record)

    if not self.processor.classifier.load_sources(service, entity):
        self.all_tests_passed = False
        return

    # Create the StreamPayload to use for encapsulating parsed info
    payload = load_stream_payload(service, entity, formatted_record)
    if not payload:
        self.all_tests_passed = False
        return

    if print_header_line:
        print '\n{}'.format(rule_name)

    for record in payload.pre_parse():
        self.processor.classifier.classify_record(record)
        if not record.valid:
            self.all_tests_passed = False
            self.analyze_record_delta(rule_name, test_record)

        report_output(record.valid, [
            '[log=\'{}\']'.format(record.log_source or 'unknown'),
            'validation',
            record.service(),
            test_record['description']
        ])
def _validate_test_record(self, file_name, test_event, formatted_record,
                          print_header_line):
    """Function to validate test records and log any errors

    Args:
        file_name (str): The base name of the test event file
        test_event (dict): A single test event containing the record and other detail
        formatted_record (dict): A dictionary that includes the 'data' from the
            test record, formatted into a structure that resembles how an
            incoming record from a service would be formatted.
            See test/integration/templates for examples of how each service
            formats records.
        print_header_line (bool): Indicates if this is the first record from a
            test file, and therefore we should print some header information
    """
    service, entity = self.processor.classifier.extract_service_and_entity(
        formatted_record)

    if not self.processor.classifier.load_sources(service, entity):
        return False

    # Create the StreamPayload to use for encapsulating parsed info
    payload = load_stream_payload(service, entity, formatted_record)
    if not payload:
        return False

    if print_header_line:
        print '\n{}'.format(file_name)

    for record in payload.pre_parse():
        self.processor.classifier.classify_record(record)
        if not record.valid:
            self.all_tests_passed = False
            self.analyze_record_delta(file_name, test_event)

        report_output(record.valid, [
            '[log=\'{}\']'.format(record.log_source or 'unknown'),
            'validation',
            record.service(),
            test_event['description']
        ])
def test_parse_convert_fail(self, log_mock):
    """StreamClassifier - Convert Failed"""
    service, entity = 'kinesis', 'unit_test_default_stream'

    result = self.classifier.load_sources(service, entity)
    assert_true(result)

    kinesis_data = json.dumps({
        'unit_key_01': 'not an integer',
        'unit_key_02': 'valid string'
    })

    raw_record = make_kinesis_raw_record(entity, kinesis_data)
    payload = load_stream_payload(service, entity, raw_record)
    payload = list(payload.pre_parse())[0]

    result = self.classifier._parse(payload)
    assert_false(result)
    log_mock.assert_called_with(
        'Invalid schema. Value for key [%s] is not an int: %s',
        'unit_key_01', 'not an integer')
def test_get_service_sns():
    """StreamPayload - Get Service, SNS"""
    sns_payload = load_stream_payload('sns', 'entity', 'record')
    assert_equal(sns_payload.service(), 'sns')
def test_get_service_s3():
    """StreamPayload - Get Service, S3"""
    s3_payload = load_stream_payload('s3', 'entity', 'record')
    assert_equal(s3_payload.service(), 's3')
def test_get_service_kinesis():
    """StreamPayload - Get Service, Kinesis"""
    kinesis_payload = load_stream_payload('kinesis', 'entity', 'record')
    assert_equal(kinesis_payload.service(), 'kinesis')
def test_load_payload_invalid(log_mock):
    """StreamPayload - Loading Stream Payload, Invalid"""
    load_stream_payload('blah', 'entity', 'record')

    log_mock.assert_called_with('Service payload not supported: %s', 'blah')
def run(self, event):
    """StreamAlert Lambda function handler.

    Loads the configuration for the StreamAlert function which contains
    available data sources, log schemas, normalized types, and outputs.
    Classifies logs sent into a parsed type. Matches records against rules.

    Args:
        event (dict): An AWS event mapped to a specific source/entity
            containing data read by Lambda.

    Returns:
        bool: True if all logs being parsed match a schema
    """
    records = event.get('Records', [])
    LOGGER.debug('Number of incoming records: %d', len(records))
    if not records:
        return False

    firehose_config = self.config['global'].get('infrastructure', {}).get('firehose', {})
    if firehose_config.get('enabled'):
        self._firehose_client = StreamAlertFirehose(self.env['lambda_region'],
                                                    firehose_config,
                                                    self.config['logs'])

    payload_with_normalized_records = []
    for raw_record in records:
        # Get the service and entity from the payload. If the service/entity
        # is not in our config, log an error and go on to the next record
        service, entity = self.classifier.extract_service_and_entity(raw_record)
        if not service:
            LOGGER.error('No valid service found in payload\'s raw record. Skipping '
                         'record: %s', raw_record)
            continue

        if not entity:
            LOGGER.error(
                'Unable to extract entity from payload\'s raw record for service %s. '
                'Skipping record: %s', service, raw_record)
            continue

        # Cache the log sources for this service and entity on the classifier
        if not self.classifier.load_sources(service, entity):
            continue

        # Create the StreamPayload to use for encapsulating parsed info
        payload = load_stream_payload(service, entity, raw_record)
        if not payload:
            continue

        payload_with_normalized_records.extend(self._process_alerts(payload))

    # Log normalized records metric
    MetricLogger.log_metric(FUNCTION_NAME,
                            MetricLogger.NORMALIZED_RECORDS,
                            len(payload_with_normalized_records))

    # Apply Threat Intel to normalized records at the end of the Rule Processor invocation
    record_alerts = self._rules_engine.threat_intel_match(payload_with_normalized_records)
    self._alerts.extend(record_alerts)
    if record_alerts:
        self.alert_forwarder.send_alerts(record_alerts)

    MetricLogger.log_metric(FUNCTION_NAME,
                            MetricLogger.TOTAL_RECORDS,
                            self._processed_record_count)

    MetricLogger.log_metric(FUNCTION_NAME,
                            MetricLogger.TOTAL_PROCESSED_SIZE,
                            self._processed_size)

    LOGGER.debug('Invalid record count: %d', self._failed_record_count)

    MetricLogger.log_metric(FUNCTION_NAME,
                            MetricLogger.FAILED_PARSES,
                            self._failed_record_count)

    LOGGER.debug('%s alerts triggered', len(self._alerts))

    MetricLogger.log_metric(FUNCTION_NAME,
                            MetricLogger.TRIGGERED_ALERTS,
                            len(self._alerts))

    # Check if debug logging is on before JSON dumping the alerts, since
    # this can be time consuming if there are a lot of alerts
    if self._alerts and LOGGER.isEnabledFor(LOG_LEVEL_DEBUG):
        LOGGER.debug('Alerts:\n%s',
                     json.dumps([alert.output_dict() for alert in self._alerts],
                                indent=2,
                                sort_keys=True))

    if self._firehose_client:
        self._firehose_client.send()

    # Only log rule info here if this is not running tests.
    # During testing, this gets logged at the end, and printing here could be
    # confusing since stress testing calls this method multiple times
    if self.env['lambda_alias'] != 'development':
        stats.print_rule_stats(True)

    return self._failed_record_count == 0
def test_load_payload_valid():
    """StreamPayload - Loading Stream Payload, Valid"""
    payload = load_stream_payload('s3', 'entity', 'record')

    assert_is_instance(payload, S3Payload)
def run(self, event):
    """StreamAlert Lambda function handler.

    Loads the configuration for the StreamAlert function which contains
    available data sources, log schemas, normalized types, and outputs.
    Classifies logs sent into a parsed type. Matches records against rules.

    Args:
        event (dict): An AWS event mapped to a specific source/entity
            containing data read by Lambda.

    Returns:
        bool: True if all logs being parsed match a schema
    """
    records = event.get('Records', [])
    LOGGER.debug('Number of Records: %d', len(records))
    if not records:
        return False

    MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.TOTAL_RECORDS, len(records))

    firehose_config = self.config['global'].get('infrastructure', {}).get('firehose', {})
    if firehose_config.get('enabled'):
        self.firehose_client = boto3.client('firehose',
                                            region_name=self.env['lambda_region'])

    for raw_record in records:
        # Get the service and entity from the payload. If the service/entity
        # is not in our config, log an error and go on to the next record
        service, entity = self.classifier.extract_service_and_entity(raw_record)
        if not service:
            LOGGER.error('No valid service found in payload\'s raw record. Skipping '
                         'record: %s', raw_record)
            continue

        if not entity:
            LOGGER.error(
                'Unable to extract entity from payload\'s raw record for service %s. '
                'Skipping record: %s', service, raw_record)
            continue

        # Cache the log sources for this service and entity on the classifier
        if not self.classifier.load_sources(service, entity):
            continue

        # Create the StreamPayload to use for encapsulating parsed info
        payload = load_stream_payload(service, entity, raw_record)
        if not payload:
            continue

        self._process_alerts(payload)

    MetricLogger.log_metric(FUNCTION_NAME,
                            MetricLogger.TOTAL_PROCESSED_SIZE,
                            self._processed_size)

    LOGGER.debug('Invalid record count: %d', self._failed_record_count)

    MetricLogger.log_metric(FUNCTION_NAME,
                            MetricLogger.FAILED_PARSES,
                            self._failed_record_count)

    LOGGER.debug('%s alerts triggered', len(self._alerts))

    MetricLogger.log_metric(FUNCTION_NAME,
                            MetricLogger.TRIGGERED_ALERTS,
                            len(self._alerts))

    # Check if debug logging is on before JSON dumping the alerts, since
    # this can be time consuming if there are a lot of alerts
    if self._alerts and LOGGER.isEnabledFor(LOG_LEVEL_DEBUG):
        LOGGER.debug('Alerts:\n%s', json.dumps(self._alerts, indent=2))

    if self.firehose_client:
        self._send_to_firehose()

    return self._failed_record_count == 0