Code Example #1
File: handler.py Project: pondurance/streamalert
    def __init__(self, context, enable_alert_processor=True):
        """
        Args:
            context: An AWS context object which provides metadata on the currently
                executing lambda function.
            enable_alert_processor (bool): If the user wants to send the alerts using their
                own methods, 'enable_alert_processor' can be set to False to suppress
                sending with the StreamAlert alert processor.
        """
        # Load the config. Validation occurs during load, which will
        # raise exceptions on any ConfigErrors
        config = load_config()

        # Load the environment from the context arn
        self.env = load_env(context)

        # Instantiate the sink here to handle sending the triggered alerts to the
        # alert processor
        self.sinker = StreamSink(self.env)

        # Instantiate a classifier that is used for this run
        self.classifier = StreamClassifier(config=config)

        self.enable_alert_processor = enable_alert_processor
        self._failed_record_count = 0
        self._alerts = []
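
For orientation, here is a minimal sketch of how a Lambda entry point might wire this class up; the module path and handler name are assumptions for illustration and are not taken from the snippet above.

# Hypothetical Lambda entry point (module path assumed for illustration)
from stream_alert.rule_processor.handler import StreamAlert

def handler(event, context):
    """Instantiate StreamAlert for this invocation and process the incoming event."""
    StreamAlert(context).run(event)
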
Code Example #2
File: handler.py Project: ykv-name/streamalert
    def __init__(self, context):
        """Initializer

        Args:
            context (dict): An AWS context object which provides metadata on the currently
                executing lambda function.
        """
        # Load the config. Validation occurs during load, which will
        # raise exceptions on any ConfigError
        StreamAlert.config = StreamAlert.config or config.load_config(validate=True)

        # Load the environment from the context arn
        self.env = config.parse_lambda_arn(context.invoked_function_arn)

        # Instantiate the alert forwarder here to handle sending the triggered alerts to the
        # alert processor
        self.alert_forwarder = AlertForwarder()

        # Instantiate a classifier that is used for this run
        self.classifier = StreamClassifier(config=self.config)

        self._failed_record_count = 0
        self._processed_record_count = 0
        self._processed_size = 0
        self._alerts = []

        rule_import_paths = [item for location in {'rule_locations', 'matcher_locations'}
                             for item in self.config['global']['general'][location]]

        # Create an instance of the RulesEngine class that gets cached in the
        # StreamAlert class as an instance property
        self._rules_engine = RulesEngine(self.config, *rule_import_paths)

        # Firehose client attribute
        self._firehose_client = None
Code Example #3
    def run(self, event, context):
        """StreamAlert Lambda function handler.

        Loads the configuration for the StreamAlert function which contains:
        available data sources, log formats, parser modes, and sinks.  Classifies
        logs sent into the stream into a parsed type.  Matches records against
        rules.

        Args:
            event: An AWS event mapped to a specific source/entity (kinesis stream or
                an s3 bucket event) containing data emitted to the stream.
            context: An AWS context object which provides metadata on the currently
                executing lambda function.

        Returns:
            None
        """
        logger.debug('Number of Records: %d', len(event.get('Records', [])))

        config = load_config()
        env = load_env(context)

        for record in event.get('Records', []):
            payload = StreamPayload(raw_record=record)
            classifier = StreamClassifier(config=config)
            classifier.map_source(payload)

            # If the kinesis stream or s3 bucket is not in our config,
            # go on to the next record
            if not payload.valid_source:
                continue

            if payload.service == 's3':
                self.s3_process(payload, classifier)
            elif payload.service == 'kinesis':
                self.kinesis_process(payload, classifier)
            else:
                logger.info('Unsupported service: %s', payload.service)

        # returns the list of generated alerts
        if self.return_alerts:
            return self.alerts
        # send alerts to SNS
        self.send_alerts(env, payload)
Code Example #4
File: test_classifier.py Project: mruba/streamalert
    def test_map_source_2(self):
        """Payload Source Mapping 2"""
        data_encoded = base64.b64encode('test_map_source_data_2')
        payload = self.payload_generator(kinesis_stream='test_stream_2',
                                         kinesis_data=data_encoded)

        classifier = StreamClassifier(config=self.config)
        classifier.map_source(payload)

        test_stream_2_logs = {
            'test_log_type_json_2', 'test_log_type_json_nested_osquery',
            'test_log_type_syslog'
        }
        metadata = classifier.log_metadata(payload)

        # service, entity, metadata test
        assert_equal(payload.service, 'kinesis')
        assert_equal(payload.entity, 'test_stream_2')
        assert_equal(set(metadata.keys()), test_stream_2_logs)
Code Example #5
    def run(self, event):
        """StreamAlert Lambda function handler.

        Loads the configuration for the StreamAlert function which contains:
        available data sources, log formats, parser modes, and sinks.  Classifies
        logs sent into the stream into a parsed type.  Matches records against
        rules.

        Args:
            event: An AWS event mapped to a specific source/entity (kinesis stream or
                an s3 bucket event) containing data emitted to the stream.

        Returns:
            None
        """
        LOGGER.debug('Number of Records: %d', len(event.get('Records', [])))

        config = load_config()

        for record in event.get('Records', []):
            payload = StreamPayload(raw_record=record)
            classifier = StreamClassifier(config=config)

            # If the kinesis stream, s3 bucket, or sns topic is not in our config,
            # go on to the next record
            if not classifier.map_source(payload):
                continue

            if payload.service == 's3':
                self._s3_process(payload, classifier)
            elif payload.service == 'kinesis':
                self._kinesis_process(payload, classifier)
            elif payload.service == 'sns':
                self._sns_process(payload, classifier)
            else:
                LOGGER.info('Unsupported service: %s', payload.service)

        LOGGER.debug('%s alerts triggered', len(self.alerts))
        LOGGER.debug('\n%s\n', json.dumps(self.alerts, indent=4))

        if self.return_alerts:
            return self.alerts
Code Example #6
File: test.py Project: VVMichaelSawyer/streamalert
    def test_rule(self, rule_name, test_record, formatted_record):
        """Feed formatted records into StreamAlert and check for alerts
        Args:
            rule_name [str]: The rule name being tested
            test_record [dict]: A single record to test
            formatted_record [dict]: A dictionary that includes the 'data' from the
                test record, formatted into a structure that resembles how
                an incoming record from a service would be formatted.
                See test/integration/templates for example of how each service
                formats records.

        Returns:
            [list] alerts that hit for this rule
            [integer] count of expected alerts for this rule
            [bool] boolean where False indicates errors occurred during processing
        """
        event = {'Records': [formatted_record]}

        expected_alert_count = test_record.get('trigger_count')
        if not expected_alert_count:
            expected_alert_count = 1 if test_record['trigger'] else 0

        # Run the rule processor. Passing mocked context object with fake
        # values and False for suppressing sending of alerts
        processor = StreamAlert(self.context, False)
        all_records_matched_schema = processor.run(event)

        if not all_records_matched_schema:
            payload = StreamPayload(raw_record=formatted_record)
            classifier = StreamClassifier(config=load_config())
            classifier.map_source(payload)
            logs = classifier._log_metadata()
            self.analyze_record_delta(logs, rule_name, test_record)

        alerts = processor.get_alerts()

        # we only want alerts for the specific rule being tested
        alerts = [alert for alert in alerts
                  if alert['rule_name'] == rule_name]

        return alerts, expected_alert_count, all_records_matched_schema
Code Example #7
    def test_map_source_1(self):
        """Payload Source Mapping 1"""
        data_encoded = base64.b64encode('test_map_source data')
        payload = self.payload_generator(kinesis_stream='test_kinesis_stream',
                                         kinesis_data=data_encoded)

        classifier = StreamClassifier(config=self.config)
        classifier.map_source(payload)

        test_kinesis_stream_logs = {
            'test_log_type_json', 'test_log_type_json_2',
            'test_log_type_json_nested', 'test_log_type_json_nested_with_data',
            'test_log_type_csv', 'test_log_type_csv_nested',
            'test_log_type_kv_auditd'
        }
        metadata = classifier._log_metadata()

        # service, entity, metadata test
        assert_equal(payload.service, 'kinesis')
        assert_equal(payload.entity, 'test_kinesis_stream')
        assert_equal(set(metadata.keys()), test_kinesis_stream_logs)
Code Example #8
    def test_classify_record_kinesis_json(self):
        """Payload Classify JSON - boolean, float, integer types"""
        kinesis_data = json.dumps({
            'key4': 'true',
            'key5': '10.001',
            'key6': '10',
            'key7': False
        })
        payload = self.payload_generator(kinesis_stream='test_kinesis_stream',
                                         kinesis_data=kinesis_data)
        classifier = StreamClassifier(config=self.config)
        classifier.map_source(payload)

        # pre parse and classify
        data = self.pre_parse_kinesis(payload)
        classifier.classify_record(payload, data)

        # valid record test
        assert_equal(payload.valid, True)
        assert_equal(type(payload.records[0]), dict)

        # log type test
        assert_equal(payload.log_source, 'test_log_type_json_2')

        # payload type test
        assert_equal(payload.type, 'json')
        assert_not_equal(payload.type, 'csv')

        # record type test
        assert_equal(payload.records[0]['key4'], True)
        assert_equal(payload.records[0]['key5'], 10.001)
        assert_equal(payload.records[0]['key6'], 10)
        assert_equal(payload.records[0]['key7'], False)
Code Example #9
    def test_classify_record_kinesis_nested_json_missing_subkey_fields(self):
        """Payload Classify Nested JSON Missing Subkeys"""
        kinesis_data = json.dumps({
            'name': 'testquery',
            'hostIdentifier': 'host1.test.prod',
            'calendarTime': 'Jan 01 2017',
            'unixTime': '12321412321',
            'columns': {
                'key1': 'test',
                'key2': 'one'
            },
            'action': 'added',
            'decorations': {
                'role': 'web-server',
                'env': 'production',
                # 'cluster': 'eu-east',
                'number': '100'
            }
        })
        payload = self.payload_generator(kinesis_stream='test_stream_2',
                                         kinesis_data=kinesis_data)

        classifier = StreamClassifier(config=self.config)
        classifier.map_source(payload)

        data = self.pre_parse_kinesis(payload)
        classifier.classify_record(payload, data)

        # invalid record test
        assert_equal(payload.valid, False)
        assert_equal(payload.records, None)
Code Example #10
    def __init__(self, context, enable_alert_processor=True):
        """Initializer

        Args:
            context (dict): An AWS context object which provides metadata on the currently
                executing lambda function.
            enable_alert_processor (bool): If the user wants to send the alerts using their
                own methods, 'enable_alert_processor' can be set to False to suppress
                sending with the StreamAlert alert processor.
        """
        # Load the config. Validation occurs during load, which will
        # raise exceptions on any ConfigErrors
        StreamAlert.config = StreamAlert.config or load_config()

        # Load the environment from the context arn
        self.env = load_env(context)

        # Instantiate the sink here to handle sending the triggered alerts to the
        # alert processor
        self.sinker = StreamSink(self.env)

        # Instantiate a classifier that is used for this run
        self.classifier = StreamClassifier(config=self.config)

        self.enable_alert_processor = enable_alert_processor
        self._failed_record_count = 0
        self._processed_size = 0
        self._alerts = []

        # Create a dictionary to hold parsed payloads by log type.
        # Firehose needs this information to send to its corresponding
        # delivery stream.
        self.categorized_payloads = defaultdict(list)

        # Firehose client initialization
        self.firehose_client = None

        # create an instance of the StreamRules class that gets cached in the
        # StreamAlert class as an instance property
        self._rule_engine = StreamRules(self.config)
Code Example #11
    def test_classify_record_kinesis_csv(self):
        """Payload Classify CSV"""
        csv_data = 'jan102017,0100,host1,thisis some data with keyword1 in it'
        payload = self.payload_generator(kinesis_stream='test_kinesis_stream',
                                         kinesis_data=csv_data)

        classifier = StreamClassifier(config=self.config)
        classifier.map_source(payload)

        data = self.pre_parse_kinesis(payload)
        classifier.classify_record(payload, data)

        # valid record test
        assert_equal(payload.valid, True)
        assert_equal(type(payload.records[0]), dict)

        # record value tests
        assert_equal(payload.records[0]['message'],
                     'thisis some data with keyword1 in it')
        assert_equal(payload.records[0]['host'], 'host1')

        # type test
        assert_equal(payload.type, 'csv')
        assert_not_equal(payload.type, 'json')

        # log source test
        assert_equal(payload.log_source, 'test_log_type_csv')
Code Example #12
    def test_classify_record_kinesis_csv_nested(self):
        """Payload Classify Nested CSV"""
        csv_nested_data = (
            '"Jan 10 2017","1485635414","host1.prod.test","Corp",'
            '"chef,web-server,1,10,success"')
        payload = self.payload_generator(kinesis_stream='test_kinesis_stream',
                                         kinesis_data=csv_nested_data)

        classifier = StreamClassifier(config=self.config)
        classifier.map_source(payload)

        data = self.pre_parse_kinesis(payload)
        classifier.classify_record(payload, data)

        # valid record test
        assert_equal(payload.valid, True)
        assert_equal(type(payload.records[0]), dict)

        # record value tests
        assert_equal(payload.records[0]['date'], 'Jan 10 2017')
        assert_equal(payload.records[0]['host'], 'host1.prod.test')
        assert_equal(payload.records[0]['time'], 1485635414)
        assert_equal(payload.records[0]['message']['role'], 'web-server')
        assert_equal(payload.records[0]['message']['cluster_size'], 10)

        # type test
        assert_equal(payload.type, 'csv')
        assert_not_equal(payload.type, 'json')

        # log source test
        assert_equal(payload.log_source, 'test_log_type_csv_nested')
Code Example #13
    def test_classify_record_kinesis_json(self):
        """Payload Classify JSON"""
        kinesis_data = json.dumps({
            'key1': 'sample data!!!!',
            'key2': 'more sample data',
            'key3': '1'
        })
        payload = self.payload_generator(kinesis_stream='test_kinesis_stream',
                                         kinesis_data=kinesis_data)
        classifier = StreamClassifier(config=self.config)
        classifier.map_source(payload)

        # pre parse and classify
        data = self.pre_parse_kinesis(payload)
        classifier.classify_record(payload, data)

        # valid record test
        assert_equal(payload.valid, True)
        assert_equal(type(payload.records[0]), dict)

        # log type test
        assert_equal(payload.log_source, 'test_log_type_json')

        # payload type test
        assert_equal(payload.type, 'json')
        assert_not_equal(payload.type, 'csv')

        # record type test
        assert_equal(type(payload.records[0]['key1']), str)
        assert_equal(type(payload.records[0]['key2']), str)
        assert_equal(type(payload.records[0]['key3']), int)
Code Example #14
    def test_multiple_schema_matching(self):
        """Test Matching Multiple Schemas with Log Patterns"""
        kinesis_data = json.dumps({
            'name': 'file added test',
            'identifier': 'host4.this.test',
            'time': 'Jan 01 2017',
            'type': 'lol_file_added_event_test',
            'message': 'bad_001.txt was added'
        })
        # Make sure support for multiple schema matching is ON
        sa_classifier.SUPPORT_MULTIPLE_SCHEMA_MATCHING = True

        payload = self.payload_generator(kinesis_stream='test_stream_2',
                                         kinesis_data=kinesis_data)
        classifier = StreamClassifier(config=self.config)
        classifier.map_source(payload)

        data = self.pre_parse_kinesis(payload)
        valid_parses = classifier._process_log_schemas(payload, data)

        assert_equal(len(valid_parses), 2)
        assert_equal(valid_parses[0].log_name, 'test_multiple_schemas:01')
        assert_equal(valid_parses[1].log_name, 'test_multiple_schemas:02')
        valid_parse = classifier._check_valid_parse(valid_parses)

        assert_equal(valid_parse.log_name, 'test_multiple_schemas:01')
Code Example #15
def load_and_classify_payload(config, service, entity, raw_record):
    """Return a loaded and classified payload."""
    # prepare the payloads
    payload = load_stream_payload(service, entity, raw_record)

    payload = list(payload.pre_parse())[0]
    classifier = StreamClassifier(config=config)
    classifier.load_sources(service, entity)
    classifier.classify_record(payload)

    return payload
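
A hedged usage sketch of the helper above. The raw record mirrors the Kinesis shape built in Code Example #20; the data values are illustrative assumptions only.

import base64
import json

raw_record = {
    'eventSource': 'aws:kinesis',
    'eventSourceARN': 'arn:aws:kinesis:us-east-1:123456789012:stream/test_kinesis_stream',
    'kinesis': {
        # b64encode of a str, matching the Python 2 style used throughout these tests
        'data': base64.b64encode(json.dumps({'key1': 'sample data!!!!',
                                             'key2': 'more sample data',
                                             'key3': '1'}))
    }
}

config = load_config()  # or point load_config at a test-specific conf directory
payload = load_and_classify_payload(config, 'kinesis', 'test_kinesis_stream', raw_record)
# payload.valid is True only when the record matched one of the configured schemas
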
Code Example #16
    def test_classify_record_kinesis_json_optional(self):
        """Payload Classify JSON - optional fields"""
        kinesis_data = json.dumps({
            'key1': [{
                'test': 1,
                'test2': 2
            }, {
                'test3': 3,
                'test4': 4
            }],
            'key2':
            'more sample data',
            'key3':
            '1',
            'key10': {
                'test-field': 1,
                'test-field2': 2
            }
        })
        payload = self.payload_generator(kinesis_stream='test_kinesis_stream',
                                         kinesis_data=kinesis_data)
        classifier = StreamClassifier(config=self.config)
        classifier.map_source(payload)

        # pre parse and classify
        data = self.pre_parse_kinesis(payload)
        classifier.classify_record(payload, data)

        # valid record test
        assert_equal(payload.valid, True)
        assert_equal(type(payload.records[0]), dict)

        # log type test
        assert_equal(payload.log_source, 'test_log_type_json')

        # payload type test
        assert_equal(payload.type, 'json')
        assert_not_equal(payload.type, 'csv')

        # record value tests
        assert_equal(len(payload.records[0]['key1']), 2)
        assert_equal(payload.records[0]['key3'], 1)
        assert_equal(payload.records[0]['key1'][1]['test4'], 4)

        # optional field tests
        assert_equal(payload.records[0]['key11'], 0.0)
        assert_equal(payload.records[0]['key9'], False)
        assert_equal(len(payload.records[0]['key10']), 2)

        # record type tests
        assert_equal(type(payload.records[0]['key1']), list)
        assert_equal(type(payload.records[0]['key2']), str)
        assert_equal(type(payload.records[0]['key3']), int)
Code Example #17
    def test_classify_record_kinesis_nested_json_osquery(self):
        """Payload Classify JSON osquery"""
        kinesis_data = json.dumps({
            'name': 'testquery',
            'hostIdentifier': 'host1.test.prod',
            'calendarTime': 'Jan 01 2017',
            'unixTime': '1485556524',
            'columns': {
                'key1': 'test',
                'key2': 'one'
            },
            'action': 'added',
            'decorations': {
                'role': 'web-server',
                'env': 'production',
                'cluster': 'eu-east',
                'number': '100'
            }
        })
        payload = self.payload_generator(kinesis_stream='test_stream_2',
                                         kinesis_data=kinesis_data)

        classifier = StreamClassifier(config=self.config)
        classifier.map_source(payload)

        data = self.pre_parse_kinesis(payload)
        classifier.classify_record(payload, data)

        # valid record test
        assert_equal(payload.valid, True)
        assert_equal(type(payload.records[0]), dict)

        # log type test
        assert_equal(payload.log_source, 'test_log_type_json_nested_osquery')

        # payload type test
        assert_equal(payload.type, 'json')
        assert_not_equal(payload.type, 'csv')

        # record type test
        assert_equal(type(payload.records[0]['hostIdentifier']), str)
        assert_equal(type(payload.records[0]['unixTime']), int)
        assert_equal(type(payload.records[0]['columns']), dict)
        assert_equal(type(payload.records[0]['decorations']), dict)

        # record value test
        assert_equal(payload.records[0]['unixTime'], 1485556524)
        assert_equal(payload.records[0]['columns']['key1'], 'test')
        assert_equal(payload.records[0]['decorations']['cluster'], 'eu-east')
        assert_equal(payload.records[0]['decorations']['number'], 100)
        assert_equal(payload.records[0]['log_type'], '')
Code Example #18
    def test_classify_record_syslog(self):
        """Payload Classify Syslog"""
        test_data_1 = ('Jan 26 19:35:33 vagrant-ubuntu-trusty-64 '
                       'sudo: pam_unix(sudo:session): '
                       'session opened for user root by (uid=0)')
        test_data_2 = (
            "Jan 26 12:28:06 macbook004154test authd[122]: "
            "Succeeded authorizing right 'com.apple.trust-settings.admin' "
            "by client '/usr/sbin/ocspd' [11835] for authorization created by"
            " '/usr/bin/security' [21322] (3,0)")

        fixtures = {'test_1': test_data_1, 'test_2': test_data_2}
        for name, syslog_message in fixtures.iteritems():
            payload = self.payload_generator(kinesis_stream='test_stream_2',
                                             kinesis_data=syslog_message)

            classifier = StreamClassifier(config=self.config)
            classifier.map_source(payload)

            data = self.pre_parse_kinesis(payload)
            classifier.classify_record(payload, data)

            # valid record test
            assert_equal(payload.valid, True)
            assert_equal(type(payload.records[0]), dict)

            # type test
            assert_equal(payload.type, 'syslog')
            assert_not_equal(payload.type, 'csv')
            assert_not_equal(payload.type, 'json')
            assert_not_equal(payload.type, 'kv')

            # record value tests
            if name == 'test_1':
                assert_equal(payload.records[0]['host'],
                             'vagrant-ubuntu-trusty-64')
                assert_equal(payload.records[0]['application'], 'sudo')
                assert_equal(
                    payload.records[0]['message'], 'pam_unix(sudo:session):'
                    ' session opened for user'
                    ' root by (uid=0)')
            elif name == 'test_2':
                assert_equal(payload.records[0]['host'], 'macbook004154test')
                assert_equal(payload.records[0]['application'], 'authd')
Code Example #19
    def test_classify_record_kinesis_nested_json_with_data(self):
        """Payload Classify Nested JSON Generic"""
        kinesis_data = json.dumps({
            'date': 'Jan 01 2017',
            'unixtime': '1485556524',
            'host': 'host1',
            'application': 'myapp',
            'environment': 'development',
            'data': {
                'category': 'test',
                'type': '1',
                'source': 'dev-app-1'
            }
        })
        payload = self.payload_generator(kinesis_stream='test_kinesis_stream',
                                         kinesis_data=kinesis_data)

        classifier = StreamClassifier(config=self.config)
        classifier.map_source(payload)

        data = self.pre_parse_kinesis(payload)
        classifier.classify_record(payload, data)

        # valid record test
        assert_equal(payload.valid, True)
        assert_equal(type(payload.records[0]), dict)

        # log type test
        assert_equal(payload.log_source, 'test_log_type_json_nested_with_data')

        # payload type test
        assert_equal(payload.type, 'json')
        assert_not_equal(payload.type, 'csv')

        # record type test
        assert_equal(type(payload.records[0]['date']), str)
        assert_equal(type(payload.records[0]['unixtime']), int)
        assert_equal(type(payload.records[0]['data']), dict)
        assert_equal(type(payload.records[0]['data']['type']), int)
        assert_equal(type(payload.records[0]['data']['category']), str)

        # record value test
        assert_equal(payload.records[0]['date'], 'Jan 01 2017')
        assert_equal(payload.records[0]['data']['source'], 'dev-app-1')
Code Example #20
    def make_kinesis_payload(self, kinesis_stream, kinesis_data):
        """Helper for creating the kinesis payload"""
        raw_record = {
            'eventSource':
            'aws:kinesis',
            'eventSourceARN':
            'arn:aws:kinesis:us-east-1:123456789012:stream/{}'.format(
                kinesis_stream),
            'kinesis': {
                'data': base64.b64encode(kinesis_data)
            }
        }
        payload = StreamPayload(raw_record=raw_record)
        classifier = StreamClassifier(config=self.config)

        classifier.map_source(payload)
        data = self.pre_parse_kinesis(payload)
        classifier.classify_record(payload, data)

        if payload.valid:
            return payload
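
A brief, hypothetical call to the helper above, shown only to illustrate how a test might use it; the data values are illustrative and the helper returns None when classification fails.

        # Hypothetical usage inside the same test class (values are illustrative only)
        payload = self.make_kinesis_payload(
            kinesis_stream='test_kinesis_stream',
            kinesis_data=json.dumps({'key1': 'sample data!!!!',
                                     'key2': 'more sample data',
                                     'key3': '1'}))
        if payload:
            # Attributes populated by map_source/classify_record, as asserted elsewhere
            assert_equal(payload.service, 'kinesis')
            assert_equal(payload.entity, 'test_kinesis_stream')
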
Code Example #21
    def test_classify_record_kinesis_kv(self):
        """Payload Classify KV"""
        auditd_test_data = (
            'type=SYSCALL msg=audit(1364481363.243:24287): '
            'arch=c000003e syscall=2 success=no exit=-13 a0=7fffd19c5592 a1=0 '
            'a2=7fffd19c4b50 a3=a items=1 ppid=2686 pid=3538 auid=500 uid=500 '
            'gid=500 euid=500 suid=500 fsuid=500 egid=500 sgid=500 fsgid=500 tty=pts0 '
            'ses=1 comm="cat" exe="/bin/cat" '
            'subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 '
            'key="sshd_config" type=CWD msg=audit(1364481363.243:24287):  '
            'cwd="/home/shadowman" type=PATH '
            'msg=audit(1364481363.243:24287): item=0 name="/etc/ssh/sshd_config" '
            'inode=409248 dev=fd:00 mode=0100600 ouid=0 ogid=0 '
            'rdev=00:00 obj=system_u:object_r:etc_t:s0')

        payload = self.payload_generator(kinesis_stream='test_kinesis_stream',
                                         kinesis_data=auditd_test_data)

        classifier = StreamClassifier(config=self.config)
        classifier.map_source(payload)

        data = self.pre_parse_kinesis(payload)
        classifier.classify_record(payload, data)

        # valid record test
        assert_equal(payload.valid, True)
        assert_equal(type(payload.records[0]), dict)

        # record value tests
        assert_equal(payload.records[0]['type'], 'SYSCALL')
        assert_equal(payload.records[0]['suid'], 500)
        assert_equal(payload.records[0]['pid'], 3538)
        assert_equal(payload.records[0]['type_3'], 'PATH')

        # type test
        assert_equal(payload.type, 'kv')
        assert_not_equal(payload.type, 'csv')
        assert_not_equal(payload.type, 'json')
Code Example #22
    def test_classify_record_kinesis_nested_json(self):
        """Payload Classify Nested JSON"""
        kinesis_data = json.dumps({
            'date': 'Jan 01 2017',
            'unixtime': '1485556524',
            'host': 'my-host-name',
            'data': {
                'key1': 'test',
                'key2': 'one'
            }
        })
        payload = self.payload_generator(kinesis_stream='test_kinesis_stream',
                                         kinesis_data=kinesis_data)
        classifier = StreamClassifier(config=self.config)
        classifier.map_source(payload)

        data = self.pre_parse_kinesis(payload)
        classifier.classify_record(payload, data)

        # valid record test
        assert_equal(payload.valid, True)
        assert_equal(type(payload.records[0]), dict)

        # log type test
        assert_equal(payload.log_source, 'test_log_type_json_nested')

        # payload type test
        assert_equal(payload.type, 'json')
        assert_not_equal(payload.type, 'csv')

        # record type test
        assert_equal(type(payload.records[0]['date']), str)
        assert_equal(type(payload.records[0]['unixtime']), int)
        assert_equal(type(payload.records[0]['data']), dict)

        # record value test
        assert_equal(payload.records[0]['date'], 'Jan 01 2017')
        assert_equal(payload.records[0]['data']['key1'], 'test')
Code Example #23
File: handler.py Project: pondurance/streamalert
class StreamAlert(object):
    """Wrapper class for handling all StreamAlert classificaiton and processing"""
    def __init__(self, context, enable_alert_processor=True):
        """
        Args:
            context: An AWS context object which provides metadata on the currently
                executing lambda function.
            enable_alert_processor (bool): If the user wants to send the alerts using their
                own methods, 'enable_alert_processor' can be set to False to suppress
                sending with the StreamAlert alert processor.
        """
        # Load the config. Validation occurs during load, which will
        # raise exceptions on any ConfigErrors
        config = load_config()

        # Load the environment from the context arn
        self.env = load_env(context)

        # Instantiate the sink here to handle sending the triggered alerts to the
        # alert processor
        self.sinker = StreamSink(self.env)

        # Instantiate a classifier that is used for this run
        self.classifier = StreamClassifier(config=config)

        self.enable_alert_processor = enable_alert_processor
        self._failed_record_count = 0
        self._alerts = []

    def run(self, event):
        """StreamAlert Lambda function handler.

        Loads the configuration for the StreamAlert function which contains:
        available data sources, log formats, parser modes, and sinks.  Classifies
        logs sent into the stream into a parsed type.  Matches records against
        rules.

        Args:
            event: An AWS event mapped to a specific source/entity (kinesis stream or
                an s3 bucket event) containing data emitted to the stream.

        Returns:
            bool: True if all logs being parsed match a schema
        """
        records = event.get('Records', [])
        LOGGER.debug('Number of Records: %d', len(records))
        if not records:
            return False

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.TOTAL_RECORDS,
                                len(records))

        for raw_record in records:
            # Get the service and entity from the payload. If the service/entity
            # is not in our config, log an error and go on to the next record
            service, entity = self.classifier.extract_service_and_entity(
                raw_record)
            if not service:
                LOGGER.error(
                    'No valid service found in payload\'s raw record. Skipping '
                    'record: %s', raw_record)
                continue

            if not entity:
                LOGGER.error(
                    'Unable to extract entity from payload\'s raw record for service %s. '
                    'Skipping record: %s', service, raw_record)
                continue

            # Cache the log sources for this service and entity on the classifier
            if not self.classifier.load_sources(service, entity):
                continue

            # Create the StreamPayload to use for encapsulating parsed info
            payload = load_stream_payload(service, entity, raw_record)
            if not payload:
                continue

            self._process_alerts(payload)

        LOGGER.debug('Invalid record count: %d', self._failed_record_count)

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.FAILED_PARSES,
                                self._failed_record_count)

        LOGGER.debug('%s alerts triggered', len(self._alerts))

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.TRIGGERED_ALERTS,
                                len(self._alerts))

        # Check if debug logging is enabled before json dumping alerts since
        # this can be time consuming if there are a lot of alerts
        if self._alerts and LOGGER.isEnabledFor(LOG_LEVEL_DEBUG):
            LOGGER.debug('Alerts:\n%s', json.dumps(self._alerts, indent=2))

        return self._failed_record_count == 0

    def get_alerts(self):
        """Public method to return alerts from class. Useful for testing.

        Returns:
            list: list of alerts as dictionaries
        """
        return self._alerts

    def _process_alerts(self, payload):
        """Process records for alerts and send them to the correct places

        Args:
            payload (StreamPayload): StreamAlert payload object being processed
        """
        for record in payload.pre_parse():
            self.classifier.classify_record(record)
            if not record.valid:
                if self.env['lambda_alias'] != 'development':
                    LOGGER.error(
                        'Record does not match any defined schemas: %s\n%s',
                        record, record.pre_parsed_record)

                self._failed_record_count += 1
                continue

            LOGGER.debug(
                'Classified and Parsed Payload: <Valid: %s, Log Source: %s, Entity: %s>',
                record.valid, record.log_source, record.entity)

            record_alerts = StreamRules.process(record)

            LOGGER.debug(
                'Processed %d valid record(s) that resulted in %d alert(s).',
                len(payload.records), len(record_alerts))

            if not record_alerts:
                continue

            # Extend the list of alerts with any new ones so they can be returned
            self._alerts.extend(record_alerts)

            if self.enable_alert_processor:
                self.sinker.sink(record_alerts)
Code Example #24
class StreamAlert(object):
    """Wrapper class for handling StreamAlert classificaiton and processing"""
    __config = {}

    def __init__(self, context, enable_alert_processor=True):
        """Initializer

        Args:
            context (dict): An AWS context object which provides metadata on the currently
                executing lambda function.
            enable_alert_processor (bool): If the user wants to send the alerts using their
                own methods, 'enable_alert_processor' can be set to False to suppress
                sending with the StreamAlert alert processor.
        """
        # Load the config. Validation occurs during load, which will
        # raise exceptions on any ConfigErrors
        StreamAlert.__config = StreamAlert.__config or load_config()

        # Load the environment from the context arn
        self.env = load_env(context)

        # Instantiate the sink here to handle sending the triggered alerts to the
        # alert processor
        self.sinker = StreamSink(self.env)

        # Instantiate a classifier that is used for this run
        self.classifier = StreamClassifier(config=self.__config)

        self.enable_alert_processor = enable_alert_processor
        self._failed_record_count = 0
        self._processed_size = 0
        self._alerts = []

        # Create a dictionary to hold parsed payloads by log type.
        # Firehose needs this information to send to its corresponding
        # delivery stream.
        self.categorized_payloads = defaultdict(list)

        # Firehose client initialization
        self.firehose_client = None

    def run(self, event):
        """StreamAlert Lambda function handler.

        Loads the configuration for the StreamAlert function which contains
        available data sources, log schemas, normalized types, and outputs.
        Classifies logs sent into a parsed type.
        Matches records against rules.

        Args:
            event (dict): An AWS event mapped to a specific source/entity
                containing data read by Lambda.

        Returns:
            bool: True if all logs being parsed match a schema
        """
        records = event.get('Records', [])
        LOGGER.debug('Number of Records: %d', len(records))
        if not records:
            return False

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.TOTAL_RECORDS, len(records))

        firehose_config = self.__config['global'].get(
            'infrastructure', {}).get('firehose', {})
        if firehose_config.get('enabled'):
            self.firehose_client = boto3.client('firehose',
                                                region_name=self.env['lambda_region'])

        for raw_record in records:
            # Get the service and entity from the payload. If the service/entity
            # is not in our config, log an error and go on to the next record
            service, entity = self.classifier.extract_service_and_entity(raw_record)
            if not service:
                LOGGER.error('No valid service found in payload\'s raw record. Skipping '
                             'record: %s', raw_record)
                continue

            if not entity:
                LOGGER.error(
                    'Unable to extract entity from payload\'s raw record for service %s. '
                    'Skipping record: %s', service, raw_record)
                continue

            # Cache the log sources for this service and entity on the classifier
            if not self.classifier.load_sources(service, entity):
                continue

            # Create the StreamPayload to use for encapsulating parsed info
            payload = load_stream_payload(service, entity, raw_record)
            if not payload:
                continue

            self._process_alerts(payload)

        MetricLogger.log_metric(FUNCTION_NAME,
                                MetricLogger.TOTAL_PROCESSED_SIZE,
                                self._processed_size)

        LOGGER.debug('Invalid record count: %d', self._failed_record_count)

        MetricLogger.log_metric(FUNCTION_NAME,
                                MetricLogger.FAILED_PARSES,
                                self._failed_record_count)

        LOGGER.debug('%s alerts triggered', len(self._alerts))

        MetricLogger.log_metric(
            FUNCTION_NAME, MetricLogger.TRIGGERED_ALERTS, len(
                self._alerts))

        # Check if debug logging is enabled before json dumping alerts since
        # this can be time consuming if there are a lot of alerts
        if self._alerts and LOGGER.isEnabledFor(LOG_LEVEL_DEBUG):
            LOGGER.debug('Alerts:\n%s', json.dumps(self._alerts, indent=2))

        if self.firehose_client:
            self._send_to_firehose()

        return self._failed_record_count == 0

    def get_alerts(self):
        """Public method to return alerts from class. Useful for testing.

        Returns:
            list: list of alerts as dictionaries
        """
        return self._alerts

    def _send_to_firehose(self):
        """Send all classified records to a respective Firehose Delivery Stream"""
        def _chunk(record_list, chunk_size):
            """Helper function to chunk payloads"""
            for item in range(0, len(record_list), chunk_size):
                yield record_list[item:item + chunk_size]

        def _check_record_batch(batch):
            """Helper function to verify record size"""
            for index, record in enumerate(batch):
                if len(str(record)) > MAX_RECORD_SIZE:
                    # Show the first 1k bytes in order to not overload
                    # CloudWatch logs
                    LOGGER.error('The following record is too large to '
                                 'be sent to Firehose: %s', str(record)[:1000])
                    MetricLogger.log_metric(FUNCTION_NAME,
                                            MetricLogger.FIREHOSE_FAILED_RECORDS,
                                            1)
                    batch.pop(index)

        delivery_stream_name_pattern = 'streamalert_data_{}'

        # Iterate through each payload type
        for log_type, records in self.categorized_payloads.items():
            # This same method is used when naming the Delivery Streams
            formatted_log_type = log_type.replace(':', '_')

            for record_batch in _chunk(records, MAX_BATCH_SIZE):
                stream_name = delivery_stream_name_pattern.format(formatted_log_type)
                _check_record_batch(record_batch)

                resp = self.firehose_client.put_record_batch(
                    DeliveryStreamName=stream_name,
                    # The newline at the end is required by Firehose,
                    # otherwise all records will be on a single line and
                    # unsearchable in Athena.
                    Records=[{'Data': json.dumps(record, separators=(",", ":")) + '\n'}
                             for record
                             in record_batch])

                # Handle errors if any failures occurred
                # TODO(jack) implement backoff here once the rule processor is split
                if resp.get('FailedPutCount') > 0:
                    failed_records = [failed
                                      for failed
                                      in resp['RequestResponses']
                                      if failed.get('ErrorCode')]
                    MetricLogger.log_metric(FUNCTION_NAME,
                                            MetricLogger.FIREHOSE_FAILED_RECORDS,
                                            resp['FailedPutCount'])
                    # Only print the first 100 failed records
                    LOGGER.error('The following records failed to Put to the '
                                 'Delivery stream %s: %s',
                                 stream_name,
                                 json.dumps(failed_records[:100], indent=2))
                else:
                    MetricLogger.log_metric(FUNCTION_NAME,
                                            MetricLogger.FIREHOSE_RECORDS_SENT,
                                            len(record_batch))
                    LOGGER.info('Successfully sent %d messages to Firehose: %s',
                                len(record_batch),
                                stream_name)

    def _process_alerts(self, payload):
        """Process records for alerts and send them to the correct places

        Args:
            payload (StreamPayload): StreamAlert payload object being processed
        """
        for record in payload.pre_parse():
            # Increment the processed size using the length of this record
            self._processed_size += len(record.pre_parsed_record)
            self.classifier.classify_record(record)
            if not record.valid:
                if self.env['lambda_alias'] != 'development':
                    LOGGER.error('Record does not match any defined schemas: %s\n%s',
                                 record, record.pre_parsed_record)

                self._failed_record_count += 1
                continue

            LOGGER.debug(
                'Classified and Parsed Payload: <Valid: %s, Log Source: %s, Entity: %s>',
                record.valid,
                record.log_source,
                record.entity)

            record_alerts = StreamRules.process(record)

            LOGGER.debug('Processed %d valid record(s) that resulted in %d alert(s).',
                         len(payload.records),
                         len(record_alerts))

            # Add all parsed records to the categorized payload dict
            # only if Firehose is enabled
            if self.firehose_client:
                self.categorized_payloads[payload.log_source].extend(payload.records)

            if not record_alerts:
                continue

            # Extend the list of alerts with any new ones so they can be returned
            self._alerts.extend(record_alerts)

            if self.enable_alert_processor:
                self.sinker.sink(record_alerts)
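
To make the Firehose batching in _send_to_firehose above easier to follow, here is a small, self-contained illustration of the chunking and newline-delimited serialization it performs. The batch size of 3 is arbitrary; the real code uses the module-level MAX_BATCH_SIZE and MAX_RECORD_SIZE constants, which are not shown in this snippet.

import json

def _chunk(record_list, chunk_size):
    """Yield fixed-size batches of records, as in _send_to_firehose above."""
    for item in range(0, len(record_list), chunk_size):
        yield record_list[item:item + chunk_size]

records = [{'idx': i} for i in range(7)]
for batch in _chunk(records, 3):
    # Each Firehose record is compact JSON with a trailing newline so rows
    # remain separable when queried in Athena
    firehose_records = [{'Data': json.dumps(rec, separators=(",", ":")) + '\n'}
                        for rec in batch]
    # firehose_records would then be passed as the Records argument of
    # put_record_batch(DeliveryStreamName=..., Records=firehose_records)
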
Code Example #25
File: handler.py Project: suhasjrv/streamalert
class StreamAlert(object):
    """Wrapper class for handling StreamAlert classification and processing"""
    config = {}

    def __init__(self, context):
        """Initializer

        Args:
            context (dict): An AWS context object which provides metadata on the currently
                executing lambda function.
        """
        # Load the config. Validation occurs during load, which will
        # raise exceptions on any ConfigErrors
        StreamAlert.config = StreamAlert.config or load_config()

        # Load the environment from the context arn
        self.env = load_env(context)

        # Instantiate the alert forwarder here to handle sending the triggered alerts to the
        # alert processor
        self.alert_forwarder = AlertForwarder()

        # Instantiate a classifier that is used for this run
        self.classifier = StreamClassifier(config=self.config)

        self._failed_record_count = 0
        self._processed_record_count = 0
        self._processed_size = 0
        self._alerts = []

        rule_import_paths = [
            item for location in {'rule_locations', 'matcher_locations'}
            for item in self.config['global']['general'][location]
        ]

        # Create an instance of the RulesEngine class that gets cached in the
        # StreamAlert class as an instance property
        self._rules_engine = RulesEngine(self.config, *rule_import_paths)

        # Firehose client attribute
        self._firehose_client = None

    def run(self, event):
        """StreamAlert Lambda function handler.

        Loads the configuration for the StreamAlert function which contains
        available data sources, log schemas, normalized types, and outputs.
        Classifies logs sent into a parsed type.
        Matches records against rules.

        Args:
            event (dict): An AWS event mapped to a specific source/entity
                containing data read by Lambda.

        Returns:
            bool: True if all logs being parsed match a schema
        """
        records = event.get('Records', [])
        LOGGER.debug('Number of incoming records: %d', len(records))
        if not records:
            return False

        firehose_config = self.config['global'].get('infrastructure',
                                                    {}).get('firehose', {})
        if firehose_config.get('enabled'):
            self._firehose_client = StreamAlertFirehose(
                self.env['lambda_region'], firehose_config,
                self.config['logs'])

        payload_with_normalized_records = []
        for raw_record in records:
            # Get the service and entity from the payload. If the service/entity
            # is not in our config, log an error and go on to the next record
            service, entity = self.classifier.extract_service_and_entity(
                raw_record)
            if not service:
                LOGGER.error(
                    'No valid service found in payload\'s raw record. Skipping '
                    'record: %s', raw_record)
                continue

            if not entity:
                LOGGER.error(
                    'Unable to extract entity from payload\'s raw record for service %s. '
                    'Skipping record: %s', service, raw_record)
                continue

            # Cache the log sources for this service and entity on the classifier
            if not self.classifier.load_sources(service, entity):
                continue

            # Create the StreamPayload to use for encapsulating parsed info
            payload = load_stream_payload(service, entity, raw_record)
            if not payload:
                continue

            payload_with_normalized_records.extend(
                self._process_alerts(payload))

        # Log normalized records metric
        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.NORMALIZED_RECORDS,
                                len(payload_with_normalized_records))

        # Apply Threat Intel to normalized records in the end of Rule Processor invocation
        record_alerts = self._rules_engine.threat_intel_match(
            payload_with_normalized_records)
        self._alerts.extend(record_alerts)
        if record_alerts:
            self.alert_forwarder.send_alerts(record_alerts)

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.TOTAL_RECORDS,
                                self._processed_record_count)

        MetricLogger.log_metric(FUNCTION_NAME,
                                MetricLogger.TOTAL_PROCESSED_SIZE,
                                self._processed_size)

        LOGGER.debug('Invalid record count: %d', self._failed_record_count)

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.FAILED_PARSES,
                                self._failed_record_count)

        LOGGER.debug('%s alerts triggered', len(self._alerts))

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.TRIGGERED_ALERTS,
                                len(self._alerts))

        # Check if debug logging is enabled before json dumping alerts since
        # this can be time consuming if there are a lot of alerts
        if self._alerts and LOGGER.isEnabledFor(LOG_LEVEL_DEBUG):
            LOGGER.debug(
                'Alerts:\n%s',
                json.dumps([alert.output_dict() for alert in self._alerts],
                           indent=2,
                           sort_keys=True))

        if self._firehose_client:
            self._firehose_client.send()

        # Only log rule info here if this is not running tests
        # During testing, this gets logged at the end and printing here could be confusing
        # since stress testing calls this method multiple times
        if self.env['lambda_alias'] != 'development':
            stats.print_rule_stats(True)

        return self._failed_record_count == 0

    @property
    def alerts(self):
        """Returns list of Alert instances (useful for testing)."""
        return self._alerts

    def _process_alerts(self, payload):
        """Run the record through the rules, saving any alerts and forwarding them to Dynamo.

        Args:
            payload (StreamPayload): StreamAlert payload object being processed
        """
        payload_with_normalized_records = []
        for record in payload.pre_parse():
            # Increment the processed size using the length of this record
            self._processed_size += len(record.pre_parsed_record)
            self.classifier.classify_record(record)
            if not record.valid:
                if self.env['lambda_alias'] != 'development':
                    LOGGER.error(
                        'Record does not match any defined schemas: %s\n%s',
                        record, record.pre_parsed_record)

                self._failed_record_count += 1
                continue

            # Increment the total processed records to get an accurate assessment of throughput
            self._processed_record_count += len(record.records)

            LOGGER.debug(
                'Classified and Parsed Payload: <Valid: %s, Log Source: %s, Entity: %s>',
                record.valid, record.log_source, record.entity)

            record_alerts, normalized_records = self._rules_engine.run(record)

            payload_with_normalized_records.extend(normalized_records)

            LOGGER.debug(
                'Processed %d valid record(s) that resulted in %d alert(s).',
                len(payload.records), len(record_alerts))

            # Add all parsed records to the categorized payload dict only if Firehose is enabled
            if self._firehose_client:
                # Only send payloads with enabled log sources
                if self._firehose_client.enabled_log_source(
                        payload.log_source):
                    self._firehose_client.categorized_payloads[
                        payload.log_source].extend(payload.records)

            if not record_alerts:
                continue

            # Extend the list of alerts with any new ones so they can be returned
            self._alerts.extend(record_alerts)

            self.alert_forwarder.send_alerts(record_alerts)

        return payload_with_normalized_records
Code Example #26
class StreamAlert(object):
    """Wrapper class for handling StreamAlert classificaiton and processing"""
    config = {}
    # Used to detect special characters in payload keys.
    # This is necessary for sanitization of data prior to searching in Athena.
    special_char_regex = re.compile(r'\W')
    special_char_sub = '_'

    def __init__(self, context, enable_alert_processor=True):
        """Initializer

        Args:
            context (dict): An AWS context object which provides metadata on the currently
                executing lambda function.
            enable_alert_processor (bool): If the user wants to send the alerts using their
                own methods, 'enable_alert_processor' can be set to False to suppress
                sending with the StreamAlert alert processor.
        """
        # Load the config. Validation occurs during load, which will
        # raise exceptions on any ConfigErrors
        StreamAlert.config = StreamAlert.config or load_config()

        # Load the environment from the context arn
        self.env = load_env(context)

        # Instantiate the sink here to handle sending the triggered alerts to the
        # alert processor
        self.sinker = StreamSink(self.env)

        # Instantiate a classifier that is used for this run
        self.classifier = StreamClassifier(config=self.config)

        self.enable_alert_processor = enable_alert_processor
        self._failed_record_count = 0
        self._processed_size = 0
        self._alerts = []

        # Create a dictionary to hold parsed payloads by log type.
        # Firehose needs this information to send to its corresponding
        # delivery stream.
        self.categorized_payloads = defaultdict(list)

        # Firehose client initialization
        self.firehose_client = None
        StreamThreatIntel.load_intelligence(self.config)

    def run(self, event):
        """StreamAlert Lambda function handler.

        Loads the configuration for the StreamAlert function which contains
        available data sources, log schemas, normalized types, and outputs.
        Classifies logs sent into a parsed type.
        Matches records against rules.

        Args:
            event (dict): An AWS event mapped to a specific source/entity
                containing data read by Lambda.

        Returns:
            bool: True if all logs being parsed match a schema
        """
        records = event.get('Records', [])
        LOGGER.debug('Number of Records: %d', len(records))
        if not records:
            return False

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.TOTAL_RECORDS, len(records))

        firehose_config = self.config['global'].get(
            'infrastructure', {}).get('firehose', {})
        if firehose_config.get('enabled'):
            self.firehose_client = boto3.client('firehose',
                                                region_name=self.env['lambda_region'])

        for raw_record in records:
            # Get the service and entity from the payload. If the service/entity
            # is not in our config, log an error and go on to the next record
            service, entity = self.classifier.extract_service_and_entity(raw_record)
            if not service:
                LOGGER.error('No valid service found in payload\'s raw record. Skipping '
                             'record: %s', raw_record)
                continue

            if not entity:
                LOGGER.error(
                    'Unable to extract entity from payload\'s raw record for service %s. '
                    'Skipping record: %s', service, raw_record)
                continue

            # Cache the log sources for this service and entity on the classifier
            if not self.classifier.load_sources(service, entity):
                continue

            # Create the StreamPayload to use for encapsulating parsed info
            payload = load_stream_payload(service, entity, raw_record)
            if not payload:
                continue

            self._process_alerts(payload)

        MetricLogger.log_metric(FUNCTION_NAME,
                                MetricLogger.TOTAL_PROCESSED_SIZE,
                                self._processed_size)

        LOGGER.debug('Invalid record count: %d', self._failed_record_count)

        MetricLogger.log_metric(FUNCTION_NAME,
                                MetricLogger.FAILED_PARSES,
                                self._failed_record_count)

        LOGGER.debug('%s alerts triggered', len(self._alerts))

        MetricLogger.log_metric(
            FUNCTION_NAME, MetricLogger.TRIGGERED_ALERTS, len(
                self._alerts))

        # Check if debugging logging is on before json dumping alerts since
        # this can be time consuming if there are a lot of alerts
        if self._alerts and LOGGER.isEnabledFor(LOG_LEVEL_DEBUG):
            LOGGER.debug('Alerts:\n%s', json.dumps(self._alerts, indent=2))

        if self.firehose_client:
            self._send_to_firehose()

        return self._failed_record_count == 0

    def get_alerts(self):
        """Public method to return alerts from class. Useful for testing.

        Returns:
            list: list of alerts as dictionaries
        """
        return self._alerts

    @staticmethod
    def _segment_records_by_count(record_list, max_count):
        """Segment records by length

        Args:
            record_list (list): The original records list to be segmented
            max_count (int): The max amount of records to yield per group
        """
        for index in range(0, len(record_list), max_count):
            yield record_list[index:index + max_count]
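    # For example, with max_count=3 a list of seven records is yielded as
    # three groups containing 3, 3 and 1 records, in their original order.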

    def _segment_records_by_size(self, record_batch):
        """Segment record groups by size

        Args:
            record_batch (list): The original record batch to measure and segment

        Returns:
            generator: Used to iterate on each newly segmented group
        """
        split_factor = 1
        len_batch = len(record_batch)

        # Sample the first batch of records to determine the split factor.
        # Generally, it's very rare for a group of records to have
        # drastically different sizes in a single Lambda invocation.
        while len(json.dumps(record_batch[:len_batch // split_factor],
                             separators=(",", ":"))) > MAX_BATCH_SIZE:
            split_factor += 1

        return self._segment_records_by_count(record_batch, len_batch // split_factor)
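    # Example, assuming roughly uniform record sizes: if the serialized batch
    # is about 2.5x MAX_BATCH_SIZE (presumably sized to the Firehose
    # PutRecordBatch 4 MB request limit), split_factor settles at 3 and groups
    # of len_batch // 3 records are returned.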

    @staticmethod
    def _limit_record_size(batch):
        """Limit the record size to be sent to Firehose

        Args:
            batch (list): Record batch to iterate on
        """
        for index, record in enumerate(batch):
            if len(json.dumps(record, separators=(",", ":"))) > MAX_RECORD_SIZE:
                # Show the first 1k bytes in order to not overload
                # CloudWatch logs
                LOGGER.error('The following record is too large'
                             'be sent to Firehose: %s', str(record)[:1000])
                MetricLogger.log_metric(FUNCTION_NAME,
                                        MetricLogger.FIREHOSE_FAILED_RECORDS,
                                        1)
                batch.pop(index)
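    # Note: Firehose rejects individual records larger than 1,000 KiB (before
    # base64 encoding), which is presumably what MAX_RECORD_SIZE encodes, so
    # oversized records are dropped client-side rather than being sent to the
    # API at all.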

    @classmethod
    def sanitize_keys(cls, record):
        """Remove special characters from parsed record keys

        This is required when searching in Athena, which does not handle
        special characters in column names; any character that is not
        alphanumeric or an underscore is replaced with an underscore.

        Args:
            record (dict): Original parsed record

        Returns:
            dict: A sanitized record
        """
        new_record = {}
        for key, value in record.iteritems():
            sanitized_key = re.sub(cls.special_char_regex,
                                   cls.special_char_sub,
                                   key)

            # Handle nested objects
            if isinstance(value, dict):
                new_record[sanitized_key] = cls.sanitize_keys(record[key])
            else:
                new_record[sanitized_key] = record[key]

        return new_record
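    # For example, {'user-name': {'first.name': 'alice'}} is sanitized to
    # {'user_name': {'first_name': 'alice'}} before being written to Firehose.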

    def _firehose_request_helper(self, stream_name, record_batch):
        """Send record batches to Firehose

        Args:
            stream_name (str): The name of the Delivery Stream to send to
            record_batch (list): The records to send
        """
        record_batch_size = len(record_batch)
        resp = {}

        try:
            LOGGER.debug('Sending %d records to Firehose:%s',
                         record_batch_size,
                         stream_name)
            resp = self.firehose_client.put_record_batch(
                DeliveryStreamName=stream_name,
                # The newline at the end is required by Firehose,
                # otherwise all records will be on a single line and
                # unsearchable in Athena.
                Records=[{'Data': json.dumps(self.sanitize_keys(record),
                                             separators=(",", ":")) + '\n'}
                         for record
                         in record_batch])
        except ClientError as firehose_err:
            LOGGER.error(firehose_err)
            MetricLogger.log_metric(FUNCTION_NAME,
                                    MetricLogger.FIREHOSE_FAILED_RECORDS,
                                    record_batch_size)
            return

        # Handle any failures that occurred in the PutRecordBatch call
        # TODO(jack) implement backoff here for additional message reliability
        if resp.get('FailedPutCount', 0) > 0:
            failed_records = [failed
                              for failed
                              in resp['RequestResponses']
                              if failed.get('ErrorCode')]
            MetricLogger.log_metric(FUNCTION_NAME,
                                    MetricLogger.FIREHOSE_FAILED_RECORDS,
                                    resp['FailedPutCount'])
            # Only print the first 100 failed records to CloudWatch logs
            LOGGER.error('The following records failed to Put to the '
                         'Delivery stream %s: %s',
                         stream_name,
                         json.dumps(failed_records[:100], indent=2))
        else:
            MetricLogger.log_metric(FUNCTION_NAME,
                                    MetricLogger.FIREHOSE_RECORDS_SENT,
                                    record_batch_size)
            LOGGER.info('Successfully sent %d messages to Firehose:%s',
                        record_batch_size,
                        stream_name)

    def _send_to_firehose(self):
        """Send all classified records to a respective Firehose Delivery Stream"""
        delivery_stream_name_pattern = 'streamalert_data_{}'

        # Iterate through each payload type
        for log_type, records in self.categorized_payloads.items():
            # This same method is used when naming the Delivery Streams
            formatted_log_type = log_type.replace(':', '_')

            for record_batch in self._segment_records_by_count(records, MAX_BATCH_COUNT):
                stream_name = delivery_stream_name_pattern.format(formatted_log_type)
                self._limit_record_size(record_batch)
                for sized_batch in self._segment_records_by_size(record_batch):
                    self._firehose_request_helper(stream_name, sized_batch)
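    # For example, records classified under a log type such as
    # 'osquery:differential' would be sent to a Delivery Stream named
    # 'streamalert_data_osquery_differential'.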

    def _process_alerts(self, payload):
        """Process records for alerts and send them to the correct places

        Args:
            payload (StreamPayload): StreamAlert payload object being processed
        """
        for record in payload.pre_parse():
            # Increment the processed size using the length of this record
            self._processed_size += len(record.pre_parsed_record)
            self.classifier.classify_record(record)
            if not record.valid:
                if self.env['lambda_alias'] != 'development':
                    LOGGER.error('Record does not match any defined schemas: %s\n%s',
                                 record, record.pre_parsed_record)

                self._failed_record_count += 1
                continue

            LOGGER.debug(
                'Classified and Parsed Payload: <Valid: %s, Log Source: %s, Entity: %s>',
                record.valid,
                record.log_source,
                record.entity)

            record_alerts = StreamRules.process(record)

            LOGGER.debug('Processed %d valid record(s) that resulted in %d alert(s).',
                         len(payload.records),
                         len(record_alerts))

            # Add all parsed records to the categorized payload dict
            # only if Firehose is enabled
            if self.firehose_client:
                # Only send payloads with enabled types
                disabled_logs = self.config['global']['infrastructure'].get(
                    'firehose', {}).get('disabled_logs', [])
                if payload.log_source.split(':')[0] not in disabled_logs:
                    self.categorized_payloads[payload.log_source].extend(payload.records)

            if not record_alerts:
                continue

            # Extend the list of alerts with any new ones so they can be returned
            self._alerts.extend(record_alerts)

            if self.enable_alert_processor:
                self.sinker.sink(record_alerts)
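            # When enable_alert_processor is False the alerts are not sent to
            # the sink; they are only accumulated on the instance and can be
            # retrieved afterwards via get_alerts() (useful for testing).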