Example 1
    def process(cls, input_payload):
        """Process rules on a record.

        Gather a list of rules based on the record's datasource type.
        For each rule, evaluate the record through all listed matchers
        and the rule itself to determine if a match occurs.

        Returns:
            list: alerts

            An alert is represented as a dictionary with the following keys:
                record: the parsed record that triggered the rule
                rule_name: the name of the triggered rule
                rule_description: the docstring of the triggered rule's function
                log_source: the log source of the payload
                log_type: the type of the payload (e.g. json)
                outputs: list of outputs to send to
                source_service: the service the payload came from
                source_entity: the entity of the payload
        """
        alerts = []
        payload = copy(input_payload)

        rules = [
            rule_attrs for rule_attrs in cls.__rules.values()
            if payload.log_source in rule_attrs.logs
        ]

        if not rules:
            LOGGER.debug('No rules to process for %s', payload)
            return alerts

        for record in payload.records:
            for rule in rules:
                # subkey check
                has_sub_keys = cls.process_subkeys(record, payload.type, rule)
                if not has_sub_keys:
                    continue

                # matcher check
                matcher_result = cls.match_event(record, rule)
                if not matcher_result:
                    continue

                # rule analysis
                rule_result = cls.process_rule(record, rule)
                if rule_result:
                    LOGGER.info(
                        'Rule [%s] triggered an alert on log type [%s] from entity \'%s\' '
                        'in service \'%s\'', rule.rule_name,
                        payload.log_source, payload.entity, payload.service())
                    alert = {
                        'record': record,
                        'rule_name': rule.rule_name,
                        'rule_description': rule.rule_function.__doc__
                        or DEFAULT_RULE_DESCRIPTION,
                        'log_source': str(payload.log_source),
                        'log_type': payload.type,
                        'outputs': rule.outputs,
                        'source_service': payload.service(),
                        'source_entity': payload.entity
                    }
                    alerts.append(alert)

        return alerts
Example 2
    def parse(self, schema, data):
        """Parse a string into a list of JSON payloads.

        Args:
            schema (dict): Parsing schema.
            data (str|dict): Data to be parsed.

        Returns:
            list: A list of dictionaries representing parsed records OR
            False if the data is not JSON or the data does not follow the schema.
        """
        if isinstance(data, (unicode, str)):
            try:
                loaded_data = json.loads(data)
            except ValueError as err:
                LOGGER.debug('JSON parse failed: %s', str(err))
                LOGGER.debug('JSON parse could not load data: %s', str(data))
                return False
            else:
                json_records = self._parse_records(schema, loaded_data)
        else:
            json_records = self._parse_records(schema, data)

        if not json_records:
            return False

        self._add_optional_keys(json_records, schema,
                                self.options.get('optional_top_level_keys'))
        # Make sure all keys match the schema, including nested maps
        if not self._key_check(schema, json_records):
            return False

        return json_records
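
The `_key_check` referenced in the comment above verifies that parsed records carry exactly the keys the schema declares, including nested maps. A minimal standalone sketch of that kind of check (not the project's implementation; the schema and record values are made up):

def keys_match(schema, record):
    """Return True if the record's keys match the schema, including nested maps."""
    if set(record) != set(schema):
        return False
    for key, value_type in schema.items():
        # Recurse into nested schema definitions
        if isinstance(value_type, dict):
            if not isinstance(record[key], dict) or not keys_match(value_type, record[key]):
                return False
    return True

schema = {'name': 'string', 'detail': {'ip': 'string', 'port': 'integer'}}
assert keys_match(schema, {'name': 'login', 'detail': {'ip': '10.0.0.1', 'port': 22}})
assert not keys_match(schema, {'name': 'login'})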
Example 3
    def _process_alerts(self, payload):
        """Process records for alerts and send them to the correct places

        Args:
            payload (StreamPayload): StreamAlert payload object being processed
        """
        for record in payload.pre_parse():
            self.classifier.classify_record(record)
            if not record.valid:
                if self.env['lambda_alias'] != 'development':
                    LOGGER.error(
                        'Record does not match any defined schemas: %s\n%s',
                        record, record.pre_parsed_record)

                self._failed_record_count += 1
                continue

            LOGGER.debug(
                'Classified and Parsed Payload: <Valid: %s, Log Source: %s, Entity: %s>',
                record.valid, record.log_source, record.entity)

            record_alerts = StreamRules.process(record)

            LOGGER.debug(
                'Processed %d valid record(s) that resulted in %d alert(s).',
                len(payload.records), len(record_alerts))

            if not record_alerts:
                continue

            # Extend the list of alerts with any new ones so they can be returned
            self._alerts.extend(record_alerts)

            if self.enable_alert_processor:
                self.sinker.sink(record_alerts)
Example 4
    def _read_downloaded_s3_object(s3_object):
        """Read lines from a downloaded file from S3

        Supports reading both gzipped files and plaintext files.

        Args:
            s3_object (str): A full path to the downloaded file.

        Yields:
            tuple: Line number (int) and line (str) from the downloaded S3 object.
        """
        _, extension = os.path.splitext(s3_object)

        if extension == '.gz':
            with gzip.open(s3_object, 'r') as s3_file:
                for num, line in enumerate(s3_file, start=1):
                    yield num, line.rstrip()
        else:
            with open(s3_object, 'r') as s3_file:
                for num, line in enumerate(s3_file, start=1):
                    yield num, line.rstrip()

        # AWS Lambda apparently does not reallocate disk space when files are
        # removed using os.remove(), so we must truncate them before removal
        with open(s3_object, 'w'):
            pass

        os.remove(s3_object)
        if not os.path.exists(s3_object):
            LOGGER.debug('Removed temp S3 file: %s', s3_object)
        else:
            LOGGER.error('Failed to remove temp S3 file: %s', s3_object)
Example 5
    def process_subkeys(cls, record, payload_type, rule):
        """Check payload record contains all subkeys needed for rules

        Because each log is processed by every rule for a given log type,
        it's possible that a rule references a subkey that doesn't exist in
        that specific log. This method verifies that the declared subkeys
        in a rule are contained in the JSON payload prior to rule processing.

        Args:
            record: Payload record to process
            payload_type (str): type of the record
            rule: Rule attributes

        Returns:
            bool: result of subkey check.
        """
        if not rule.req_subkeys or payload_type != 'json':
            return True

        for key, nested_keys in rule.req_subkeys.iteritems():
            # Using record.get() here is an extra layer of protection against
            # a subkey that exists in the record but has a null value. In
            # CloudTrail logs, a top-level key has been observed as either a
            # map with subkeys or null.
            if not record.get(key):
                LOGGER.debug(
                    'The required subkey %s is not found when trying to process %s: \n%s',
                    key, rule.rule_name, json.dumps(record, indent=2))
                return False
            if not all(x in record[key] for x in nested_keys):
                return False

        return True
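
A standalone sketch of the same subkey check, with an illustrative `req_subkeys` mapping (the key names are made up; in the example above the mapping comes from the rule's attributes):

req_subkeys = {'requestParameters': ['bucketName']}

def has_required_subkeys(record, req_subkeys):
    for key, nested_keys in req_subkeys.items():
        # record.get() guards against the key being absent or set to null
        if not record.get(key):
            return False
        if not all(nested in record[key] for nested in nested_keys):
            return False
    return True

assert has_required_subkeys({'requestParameters': {'bucketName': 'logs'}}, req_subkeys)
assert not has_required_subkeys({'requestParameters': None}, req_subkeys)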
Example 6
    def _extract_json_path(self, json_payload):
        """Extract records from the original json payload using a provided JSON path

        Args:
            json_payload (dict): The parsed json data

        Returns:
            list: JSON records extracted via the configured JSON path, an empty
                list if no 'json_path' option is set, or False if the path
                yields no matches
        """
        records = []
        json_path_expression = self.options.get('json_path')
        if not json_path_expression:
            return records

        # Handle jsonpath extraction of records
        LOGGER.debug('Parsing records with JSONPath')
        records_jsonpath = jsonpath_rw.parse(json_path_expression)

        # If the csv parser is extracting csv from json, the payload is likely
        # a string and needs to be loaded to a dict
        if not isinstance(json_payload, dict):
            json_payload = json.loads(json_payload)

        matches = records_jsonpath.find(json_payload)
        if not matches:
            return False

        return [match.value for match in matches]
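
A short standalone illustration of the jsonpath_rw calls used above; the payload and the 'json_path' expression are made up:

import jsonpath_rw

payload = {'Records': [{'name': 'a'}, {'name': 'b'}]}

# Equivalent to self.options.get('json_path') returning 'Records[*]'
expression = jsonpath_rw.parse('Records[*]')
records = [match.value for match in expression.find(payload)]
# records == [{'name': 'a'}, {'name': 'b'}]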
Example 7
    def _get_object(self):
        """Given an S3 record, download and parse the data.

        Returns:
            str: Path to the downloaded s3 object.
        """
        # Use the urllib unquote method to decode any url encoded characters
        # (ie - %26 --> &) from the bucket and key names
        unquoted = lambda data: unquote(data).decode('utf-8')
        region = self.raw_record['awsRegion']

        bucket = unquoted(self.raw_record['s3']['bucket']['name'])
        key = unquoted(self.raw_record['s3']['object']['key'])
        self.s3_object_size = int(self.raw_record['s3']['object']['size'])

        LOGGER.debug(
            'Pre-parsing record from S3. Bucket: %s, Key: %s, Size: %d',
            bucket, key, self.s3_object_size)

        try:
            return self._download_object(region, bucket, key)
        except IOError:
            LOGGER.exception(
                '[S3Payload] The following error occurred while downloading')
            return
Example 8
    def _load_rule_table(cls, config):
        """Load and return a RuleTable class for communicating with the DynamoDB rule table

        Args:
            config (dict): Loaded configuration from 'conf/' directory

        Returns:
            rule_table.RuleTable: Loaded frontend for DynamoDB rules table
        """
        # Ensure the rules table is enabled
        rt_config = config['global']['infrastructure']['rules_table']
        if not rt_config.get('enabled', False):
            return

        now = datetime.utcnow()
        refresh_delta = timedelta(
            minutes=rt_config.get('cache_refresh_minutes', 10))

        # The rule table will need to be refreshed if the refresh interval has been surpassed
        needs_refresh = cls._RULE_TABLE_LAST_REFRESH + refresh_delta < now

        if not needs_refresh:
            LOGGER.debug(
                'Rule table does not need to be refreshed (last refresh time: %s; '
                'current time: %s)', cls._RULE_TABLE_LAST_REFRESH, now)
            return

        LOGGER.info(
            'Refreshing rule table (last refresh time: %s; current time: %s)',
            cls._RULE_TABLE_LAST_REFRESH, now)

        table_name = '{}_streamalert_rules'.format(
            config['global']['account']['prefix'])
        cls._RULE_TABLE = RuleTable(table_name)
        cls._RULE_TABLE_LAST_REFRESH = now
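
The refresh check above is a plain interval comparison; a minimal sketch with fixed timestamps standing in for the class attributes:

from datetime import datetime, timedelta

last_refresh = datetime(2018, 1, 1, 12, 0, 0)   # stands in for cls._RULE_TABLE_LAST_REFRESH
refresh_delta = timedelta(minutes=10)           # cache_refresh_minutes
now = datetime(2018, 1, 1, 12, 15, 0)

needs_refresh = last_refresh + refresh_delta < now
assert needs_refresh  # more than 10 minutes have elapsed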
Example 9
    def _parse_records(self, schema, json_payload):
        """Identify and extract nested payloads from parsed JSON records.

        Nested payloads can be detected with log_patterns (`records` should be a
        JSONpath selector that yields the desired nested records). If desired,
        fields present on the root record can be merged into child events
        using the `envelope_keys` option.

        Args:
            schema (dict): Parsing schema
            json_payload (dict): The parsed json data

        Returns:
            list: A list of parsed JSON records
        """
        # Check options and return the payload if there is nothing special to do
        if not self.options:
            return [json_payload]

        envelope_schema = self.options.get('envelope_keys')
        optional_envelope_keys = self.options.get('optional_envelope_keys')

        # Handle the case where the schema defines an envelope schema that
        # includes optional envelope keys. This occurs in some cases when
        # using json_regex_key.
        if envelope_schema and optional_envelope_keys:
            missing_keys_schema = {}
            for key in optional_envelope_keys:
                if key not in json_payload:
                    missing_keys_schema[key] = envelope_schema[key]
            if missing_keys_schema:
                self._add_optional_keys([json_payload], envelope_schema,
                                        missing_keys_schema)

        # If the envelope schema is defined and all envelope keys are required
        # to be present in the record.
        elif envelope_schema and not all(x in json_payload
                                         for x in envelope_schema):
            return [json_payload]

        envelope = {}
        if envelope_schema:
            LOGGER.debug('Parsing envelope keys')
            schema.update({ENVELOPE_KEY: envelope_schema})
            envelope_keys = envelope_schema.keys()
            envelope_jsonpath = jsonpath_rw.parse("$." +
                                                  ",".join(envelope_keys))
            envelope_matches = [
                match.value for match in envelope_jsonpath.find(json_payload)
            ]
            envelope = dict(zip(envelope_keys, envelope_matches))

        json_records = self._extract_records(json_payload, envelope)
        if json_records is False:
            return False

        # If the final parsed record is singular
        if not json_records:
            json_records.append(json_payload)

        return json_records
Example 10
    def _download_object(self, region, bucket, key):
        """Download an object from S3.

        Verifies the S3 object is less than or equal to 128MB, and
        downloads it into a temp file. Lambda can only execute for a
        maximum of 300 seconds, and the size of the file being downloaded
        directly impacts that time.

        Args:
            region (str): AWS region to use for boto client instance.
            bucket (str): S3 bucket to download object from.
            key (str): Key of s3 object.

        Returns:
            str: The downloaded path of the S3 object.
        """
        size_kb = self.s3_object_size / 1024.0
        size_mb = size_kb / 1024.0
        display_size = '{}MB'.format(size_mb) if size_mb else '{}KB'.format(
            size_kb)

        # File size checks before downloading
        if size_kb == 0:
            return
        elif size_mb > 128:
            raise S3ObjectSizeError(
                '[S3Payload] The S3 object {}/{} is too large [{}] to download '
                'from S3'.format(bucket, key, display_size))

        # Bandit warns about using a shell process, ignore with #nosec
        LOGGER.debug(os.popen('df -h /tmp | tail -1').read().strip())  # nosec
        LOGGER.info('[S3Payload] Starting download from S3: %s/%s [%s]',
                    bucket, key, display_size)

        # Convert the S3 object name to store as a file in the Lambda container
        suffix = key.replace('/', '-')
        file_descriptor, downloaded_s3_object = tempfile.mkstemp(suffix=suffix)

        with open(downloaded_s3_object, 'wb') as data:
            client = boto3.client('s3', region_name=region)
            start_time = time.time()
            client.download_fileobj(bucket, key, data)

        # Explicitly call os.close on the underlying open file descriptor
        # Addresses https://github.com/airbnb/streamalert/issues/587
        os.close(file_descriptor)

        total_time = time.time() - start_time
        LOGGER.info('Completed download in %s seconds', round(total_time, 2))

        # Log a metric on how long this object took to download
        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.S3_DOWNLOAD_TIME,
                                total_time)

        return downloaded_s3_object
Example 11
    def _process_alerts(self, payload):
        """Process records for alerts and send them to the correct places

        Args:
            payload (StreamPayload): StreamAlert payload object being processed
        """
        payload_with_normalized_records = []
        for record in payload.pre_parse():
            # Increment the processed size using the length of this record
            self._processed_size += len(record.pre_parsed_record)
            self.classifier.classify_record(record)
            if not record.valid:
                if self.env['lambda_alias'] != 'development':
                    LOGGER.error(
                        'Record does not match any defined schemas: %s\n%s',
                        record, record.pre_parsed_record)

                self._failed_record_count += 1
                continue

            # Increment the total processed records to get an accurate assessment of throughput
            self._processed_record_count += len(record.records)

            LOGGER.debug(
                'Classified and Parsed Payload: <Valid: %s, Log Source: %s, Entity: %s>',
                record.valid, record.log_source, record.entity)

            record_alerts, normalized_records = self._rule_engine.process(
                record)

            payload_with_normalized_records.extend(normalized_records)

            LOGGER.debug(
                'Processed %d valid record(s) that resulted in %d alert(s).',
                len(payload.records), len(record_alerts))

            # Add all parsed records to the categorized payload dict only if Firehose is enabled
            if self._firehose_client:
                # Only send payloads with enabled log sources
                if self._firehose_client.enabled_log_source(
                        payload.log_source):
                    self._firehose_client.categorized_payloads[
                        payload.log_source].extend(payload.records)

            if not record_alerts:
                continue

            # Extend the list of alerts with any new ones so they can be returned
            self._alerts.extend(record_alerts)

            if self.enable_alert_processor:
                self.sinker.sink(record_alerts)

        return payload_with_normalized_records
Example 12
    def _shred_temp_directory():
        """Delete all objects in the container's temp directory"""
        LOGGER.debug('Shredding temp directory')

        for root, dirs, files in os.walk(tempfile.gettempdir(), topdown=False):
            for name in files:
                subprocess.check_call([ #nosec
                    'shred', '--force', '--iterations=1',
                    '--remove', os.path.join(root, name)])
            for name in dirs:
                os.rmdir(os.path.join(root, name)) #nosec
Example 13
    def _firehose_request_helper(self, stream_name, record_batch):
        """Send record batches to Firehose

        Args:
            stream_name (str): The name of the Delivery Stream to send to
            record_batch (list): The records to send
        """
        record_batch_size = len(record_batch)
        resp = {}

        try:
            LOGGER.debug('Sending %d records to Firehose:%s',
                         record_batch_size,
                         stream_name)
            resp = self.firehose_client.put_record_batch(
                DeliveryStreamName=stream_name,
                # The newline at the end is required by Firehose,
                # otherwise all records will be on a single line and
                # unsearchable in Athena.
                Records=[{'Data': json.dumps(self.sanitize_keys(record),
                                             separators=(",", ":")) + '\n'}
                         for record
                         in record_batch])
        except ClientError as firehose_err:
            LOGGER.error(firehose_err)
            MetricLogger.log_metric(FUNCTION_NAME,
                                    MetricLogger.FIREHOSE_FAILED_RECORDS,
                                    record_batch_size)
            return

        # Handle errors if failures occurred in PutRecordBatch
        # TODO(jack) implement backoff here for additional message reliability
        if resp.get('FailedPutCount') > 0:
            failed_records = [failed
                              for failed
                              in resp['RequestResponses']
                              if failed.get('ErrorCode')]
            MetricLogger.log_metric(FUNCTION_NAME,
                                    MetricLogger.FIREHOSE_FAILED_RECORDS,
                                    resp['FailedPutCount'])
            # Only print the first 100 failed records to Cloudwatch logs
            LOGGER.error('The following records failed to put to the '
                         'Delivery Stream %s: %s',
                         stream_name,
                         json.dumps(failed_records[:100], indent=2))
        else:
            MetricLogger.log_metric(FUNCTION_NAME,
                                    MetricLogger.FIREHOSE_RECORDS_SENT,
                                    record_batch_size)
            LOGGER.info('Successfully sent %d messages to Firehose:%s',
                        record_batch_size,
                        stream_name)
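
The TODO above suggests retrying failed records with backoff. A hedged sketch of such a retry, re-sending only the records whose parallel entry in 'RequestResponses' carries an 'ErrorCode' (the helper name and retry count are illustrative; the response shape follows the documented PutRecordBatch API):

import boto3

def put_batch_with_retry(client, stream_name, records, attempts=3):
    """Send pre-built {'Data': ...} records to Firehose, retrying per-record failures."""
    for _ in range(attempts):
        resp = client.put_record_batch(
            DeliveryStreamName=stream_name, Records=records)
        if not resp.get('FailedPutCount'):
            return []
        # 'RequestResponses' is ordered the same as the request records;
        # keep only those whose entry reported an error
        records = [
            record for record, result in zip(records, resp['RequestResponses'])
            if result.get('ErrorCode')
        ]
    return records  # records that still failed after all attempts

# Usage sketch: put_batch_with_retry(boto3.client('firehose'), 'my_stream', records)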
Example 14
    def _process_ioc(self, ioc_collections):
        """Check if any info is malicious by querying DynamoDB IOC table

        Args:
            ioc_collections (list): A list of StreamIoc instances.
        """
        LOGGER.debug('[Threat Intel] Rule Processor queries %d IOCs',
                     len(ioc_collections))
        # Segment data before calling DynamoDB table with batch_get_item.
        for subset in self._segment(ioc_collections):
            query_values = []
            for ioc in subset:
                if ioc.value not in query_values:
                    query_values.append(ioc.value)

            query_result = []

            query_error_msg = 'An error occurred while querying the DynamoDB table. Error is: %s'
            try:
                result, unprocessed_keys = self._query(query_values)
                query_result.extend(result)
            except ClientError as err:
                LOGGER.error(query_error_msg, err.response)
                return
            except ParamValidationError as err:
                LOGGER.error(query_error_msg, err)
                return

            # If there are unprocessed keys, we will re-query once with unprocessed
            # keys only
            if unprocessed_keys:
                deserializer = self._deserialize(
                    unprocessed_keys[self._table]['Keys'])
                query_values = [elem[PRIMARY_KEY] for elem in deserializer]
                query_error_msg = 'An error occurred while processing unprocessed keys. Error is: %s'
                try:
                    result, _ = self._query(query_values)
                    query_result.extend(result)
                except ClientError as err:
                    LOGGER.error(query_error_msg, err.response)
                    return
                except ParamValidationError as err:
                    LOGGER.error(query_error_msg, err)
                    return

            for value in ioc_collections:
                for ioc in query_result:
                    if value.value == ioc[PRIMARY_KEY]:
                        value.sub_type = ioc[SUB_TYPE_KEY]
                        value.is_ioc = True
                        continue
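
The `_query` helper is not shown above; a hedged sketch of what such a lookup might look like with boto3's low-level batch_get_item (the table and key attribute names are assumptions, while the Responses/UnprocessedKeys shape is the documented API):

import boto3

def query_iocs(table_name, values, key_name='ioc_value'):
    """Return (matched items, unprocessed keys) for a batch of IOC values."""
    client = boto3.client('dynamodb')
    resp = client.batch_get_item(RequestItems={
        table_name: {'Keys': [{key_name: {'S': value}} for value in values]}
    })
    items = resp.get('Responses', {}).get(table_name, [])
    return items, resp.get('UnprocessedKeys', {})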
Example 15
    def read_compressed_files(cls, intel_dir, delimiter=','):
        """Read intelligence into memory

        Read all intelligence from csv.gz files located in threat_intel
        directory into a dictionary. CSV filename should follow the convention
        <ioc_type_as_basename>.csv.gz. The basename (without extension) of csv
        file will be the key in return dictionary.

        Args:
            intel_dir (str): Directory containing the <ioc_type>.csv.gz intelligence files
            delimiter (str): CSV field delimiter used in the files

        Returns:
            (dict): Threat intelligence in the following format:
                {
                    "domain": {
                        "evil1.com": ["apt_domain", "source1 reported evil1.com"],
                        "evil2.com": ["c2_domain", "source2 reported evil2.com"]
                    },
                    "ip": {
                        "1.1.1.2": ["scan_ip", "source reported ip1"],
                        "2.2.2.2": ["scan_ip", "source reported ip2"]
                    },
                    "url": {
                        "www.hacker.com/evil_page": ["mal_url", "source_foo"]
                    },
                    "md5": {
                        "0123456789abcdef0123456789abcdef": ["mal_md5", "source_bar"]
                    }
                }
            None: if the intelligence directory does not exist
        """
        if not os.path.exists(intel_dir):
            return

        gz_files = [
            os.path.join(intel_dir, gz_file)
            for gz_file in os.listdir(intel_dir) if gz_file.endswith('.gz')
        ]

        for gz_file in gz_files:
            with gzip.open(gz_file, 'r') as ioc_file:
                csv_reader = csv.reader(ioc_file, delimiter=delimiter)
                ioc_type = os.path.basename(gz_file).split('.')[0]
                if ioc_type not in cls.__intelligence:
                    cls.__intelligence[ioc_type] = dict()
                for row in csv_reader:
                    if len(row) < 2:
                        LOGGER.debug(
                            'Warning: each row in the CSV file should '
                            'contain at least two fields. Bad row: [%s]', row)
                        continue
                    cls.__intelligence[ioc_type][row[0]] = row[1:]

        return cls.__intelligence
Example 16
    def _extract_records(self, json_payload):
        """Extract records from the original json payload using the JSON configuration

        Args:
            json_payload (dict): The parsed json data

        Returns:
            list: A list of JSON records extracted via JSON path or regex
        """
        json_records = []
        extracted_records = self._extract_json_path(json_payload)
        if extracted_records is False:
            return False

        if extracted_records:
            if not self.options.get('embedded_json'):
                return extracted_records

            for record in extracted_records:
                try:
                    record = json.loads(record)
                except ValueError:
                    LOGGER.warning('Embedded json is invalid')
                    continue
                json_records.append(record)
            return json_records

        json_regex_key = self.options.get('json_regex_key')
        # Handle nested json object regex matching
        if json_regex_key and json_payload.get(json_regex_key):
            LOGGER.debug('Parsing records with JSON Regex Key')
            match = self.__regex.search(str(json_payload[json_regex_key]))
            if not match:
                return False
            match_str = match.groups('json_blob')[0]
            try:
                new_record = json.loads(match_str)
            except ValueError:
                LOGGER.debug('Matched regex string is not valid JSON: %s',
                             match_str)
                return False
            else:
                # Make sure the new_record is a dictionary and not a list.
                # Valid JSON can be either
                if not isinstance(new_record, dict):
                    return False

                json_records.append(new_record)

        return json_records
Example 17
    def _process_log_schemas(self, payload):
        """Get any log schemas that matched this log format

        Args:
            payload: A StreamAlert payload object

        Returns:
            list: Contains any schemas that matched this log format
                Each list entry contains the namedtuple of 'SchemaMatch' with
                values of log_name, root_schema, parser, and parsed_data
        """
        schema_match = namedtuple(
            'SchemaMatch', 'log_name, root_schema, parser, parsed_data')
        schema_matches = []
        log_info = self.get_log_info_for_source()

        # Loop over all logs declared in logs.json
        for log_name, attributes in log_info.iteritems():
            # Get the parser type to use for this log
            parser_name = payload.type or attributes['parser']

            schema = attributes['schema']
            options = attributes.get('configuration', {})

            # Setup the parser class
            parser_class = get_parser(parser_name)
            parser = parser_class(options)

            # Get a list of parsed records
            LOGGER.debug('Trying schema: %s', log_name)
            parsed_data = parser.parse(schema, payload.pre_parsed_record)

            if not parsed_data:
                continue

            LOGGER.debug('Parsed %d records with schema %s', len(parsed_data),
                         log_name)

            if SUPPORT_MULTIPLE_SCHEMA_MATCHING:
                schema_matches.append(
                    schema_match(log_name, schema, parser, parsed_data))
                continue

            log_patterns = parser.options.get('log_patterns')
            if all(
                    parser.matched_log_pattern(rec, log_patterns)
                    for rec in parsed_data):
                return [schema_match(log_name, schema, parser, parsed_data)]

        return schema_matches
Example 18
    def pre_parse(self):
        """Pre-parsing method for SNS records. Extracts the SNS payload from the
        record itself and sets it as the `pre_parsed_record` property.

        Yields:
            This object with the pre_parsed_record now set
        """
        LOGGER.debug(
            'Pre-parsing record from SNS. MessageId: %s, EventSubscriptionArn: %s',
            self.raw_record['Sns']['MessageId'],
            self.raw_record['EventSubscriptionArn'])

        self.pre_parsed_record = self.raw_record['Sns']['Message']

        yield self
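
For reference, a trimmed-down `raw_record` carrying just the fields the method above touches; a real SNS-to-Lambda record contains additional keys, and the ARN and IDs here are placeholders:

raw_record = {
    'EventSubscriptionArn': 'arn:aws:sns:us-east-1:123456789012:example-topic:subscription-id',
    'Sns': {
        'MessageId': 'f2c72d2c-0000-0000-0000-000000000000',
        'Message': '{"key": "value"}',
    },
}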
Example 19
    def _check_schema_match(schema_matches):
        """Check to see if the log matches multiple schemas. If so, fall back
        on using log_patterns to look for the proper log. If no log_patterns
        exist, or they do not resolve the problem, fall back on using the
        first matched schema.

        Args:
            schema_matches (list): A list of named tuples containing the info for schemas that
                have validly parsed this record. Each entry contains: (log_name, root_schema, parser, parsed_data)

        Returns:
            tuple: The proper tuple to use for parsing from the list of tuples
        """
        # If there is only one parse or we do not have support for multiple schemas
        # enabled, then just return the first parse that was valid
        if len(schema_matches) == 1 or not SUPPORT_MULTIPLE_SCHEMA_MATCHING:
            return schema_matches[0]

        matches = []
        for i, schema_match in enumerate(schema_matches):
            log_patterns = schema_match.parser.options.get('log_patterns', {})
            LOGGER.debug('Log patterns: %s', log_patterns)
            if (all(
                    schema_match.parser.matched_log_pattern(
                        data, log_patterns)
                    for data in schema_match.parsed_data)):
                matches.append(schema_matches[i])
            else:
                if LOGGER_DEBUG_ENABLED:
                    LOGGER.debug(
                        'Log pattern matching failed for:\n%s',
                        json.dumps(schema_match.parsed_data, indent=2))

        if matches:
            if len(matches) > 1:
                LOGGER.error('Log patterns matched for multiple schemas: %s',
                             ', '.join(match.log_name for match in matches))
                LOGGER.error('Proceeding with schema for: %s',
                             matches[0].log_name)

            return matches[0]

        LOGGER.error('Log classification matched for multiple schemas: %s',
                     ', '.join(match.log_name for match in schema_matches))
        LOGGER.error('Proceeding with schema for: %s',
                     schema_matches[0].log_name)

        return schema_matches[0]
Example 20
    def run(self, input_payload):
        """Process rules on a record.

        Gather a list of rules based on the record's datasource type.
        For each rule, evaluate the record through all listed matchers
        and the rule itself to determine if a match occurs.

        Returns:
            A tuple(list, list).
                First return is a list of Alert instances.
                Second return is a list of payload instances with normalized records.
        """
        alerts = []
        # Store normalized records for later processing in Threat Intel
        normalized_records = []
        payload = copy(input_payload)

        rules = Rule.rules_for_log_type(payload.log_source)

        if not rules:
            LOGGER.debug('No rules to process for %s', payload)
            return alerts, normalized_records

        # Fetch all datatypes from the rules and run data normalization before
        # rule matching to improve performance, so each record is normalized
        # only once against the combined datatypes from all rules.
        datatypes_set = {
            datatype
            for rule in rules if rule.datatypes for datatype in rule.datatypes
        }

        if datatypes_set:
            for record in payload.records:
                self._apply_normalization(record, normalized_records,
                                          datatypes_set, payload)

        for record in payload.records:
            for rule in rules:
                # subkey check
                if not self.process_subkeys(record, payload.type, rule):
                    continue

                # matcher check
                if not rule.check_matchers(record):
                    continue

                self.rule_analysis(record, rule, payload, alerts)

        return alerts, normalized_records
Example 21
    def _process_alerts(self, payload):
        """Process records for alerts and send them to the correct places

        Args:
            payload (StreamPayload): StreamAlert payload object being processed
        """
        for record in payload.pre_parse():
            # Increment the processed size using the length of this record
            self._processed_size += len(record.pre_parsed_record)
            self.classifier.classify_record(record)
            if not record.valid:
                if self.env['lambda_alias'] != 'development':
                    LOGGER.error('Record does not match any defined schemas: %s\n%s',
                                 record, record.pre_parsed_record)

                self._failed_record_count += 1
                continue

            LOGGER.debug(
                'Classified and Parsed Payload: <Valid: %s, Log Source: %s, Entity: %s>',
                record.valid,
                record.log_source,
                record.entity)

            record_alerts = StreamRules.process(record)

            LOGGER.debug('Processed %d valid record(s) that resulted in %d alert(s).',
                         len(payload.records),
                         len(record_alerts))

            # Add all parsed records to the categorized payload dict
            # only if Firehose is enabled
            if self.firehose_client:
                # Only send payloads with enabled types
                firehose_config = self.config['global']['infrastructure'].get(
                    'firehose', {})
                if payload.log_source.split(':')[0] not in firehose_config.get(
                        'disabled_logs', []):
                    self.categorized_payloads[payload.log_source].extend(payload.records)

            if not record_alerts:
                continue

            # Extend the list of alerts with any new ones so they can be returned
            self._alerts.extend(record_alerts)

            if self.enable_alert_processor:
                self.sinker.sink(record_alerts)
Example 22
    def process(self, input_payload):
        """Process rules on a record.

        Gather a list of rules based on the record's datasource type.
        For each rule, evaluate the record through all listed matchers
        and the rule itself to determine if a match occurs.

        Returns:
            A tuple(list, list).
                First return is a list of alerts.
                Second return is a list of payload instances with normalized records.
        """
        alerts = []
        # Store normalized records for later processing in Threat Intel
        normalized_records = []
        payload = copy(input_payload)

        rules = [rule_attrs for rule_attrs in self.__rules.values()
                 if rule_attrs.logs is None or payload.log_source in rule_attrs.logs]

        if not rules:
            LOGGER.debug('No rules to process for %s', payload)
            return alerts, normalized_records

        for record in payload.records:
            for rule in rules:
                # subkey check
                has_sub_keys = self.process_subkeys(record, payload.type, rule)
                if not has_sub_keys:
                    continue

                # matcher check
                matcher_result = self.match_event(record, rule)
                if not matcher_result:
                    continue

                if rule.datatypes:
                    # When the rule's 'datatypes' option is defined, the rules
                    # engine applies data normalization to the record.
                    record_copy = self._apply_normalization(record, normalized_records,
                                                            rule, payload)
                    self.rule_analysis(record_copy, rule, payload, alerts)
                else:
                    self.rule_analysis(record, rule, payload, alerts)

        return alerts, normalized_records
Example 23
    def _download_object(self, region, bucket, key):
        """Download an object from S3.

        Verifies the S3 object is less than or equal to 128MB, and
        downloads it into a temp file. Lambda can only execute for a
        maximum of 300 seconds, and the size of the file being downloaded
        directly impacts that time.

        Args:
            region (str): AWS region to use for boto client instance.
            bucket (str): S3 bucket to download object from.
            key (str): Key of s3 object.

        Returns:
            str: The downloaded path of the S3 object.
        """
        size_kb = self.s3_object_size / 1024.0
        size_mb = size_kb / 1024.0
        if size_mb > 128:
            raise S3ObjectSizeError('S3 object to download is above 128MB')

        # Bandit warns about using a shell process, ignore with #nosec
        LOGGER.debug(os.popen('df -h /tmp | tail -1').read().strip())  # nosec

        display_size = '{}MB'.format(size_mb) if size_mb else '{}KB'.format(
            size_kb)

        LOGGER.info('Starting download from S3: %s/%s [%s]', bucket, key,
                    display_size)

        suffix = key.replace('/', '-')
        _, downloaded_s3_object = tempfile.mkstemp(suffix=suffix)
        with open(downloaded_s3_object, 'wb') as data:
            client = boto3.client('s3', region_name=region)
            start_time = time.time()
            client.download_fileobj(bucket, key, data)

        total_time = time.time() - start_time
        LOGGER.info('Completed download in %s seconds', round(total_time, 2))

        # Log a metric on how long this object took to download
        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.S3_DOWNLOAD_TIME,
                                total_time)

        return downloaded_s3_object
Example 24
    def _extract_envelope(envelope_schema, json_payload):
        """Extract envelope key/values from the original payload

        Args:
            envelope_schema (dict): Envelope keys to be extracted
            json_payload (dict): The parsed json data

        Returns:
            dict: Key/values extracted from the log to be used as the envelope
        """
        if not isinstance(json_payload, dict):
            json_payload = json.loads(json_payload)
        LOGGER.debug('Parsing envelope keys')
        envelope_keys = envelope_schema.keys()
        envelope_jsonpath = jsonpath_rw.parse("$." + ",".join(envelope_keys))
        envelope_matches = [match.value for match in envelope_jsonpath.find(json_payload)]
        return dict(zip(envelope_keys, envelope_matches))
Example 25
    def pre_parse(self):
        """Pre-parsing method for S3 objects that will download the s3 object,
        open it for reading and iterate over lines (records) in the file.
        This yields back references of this S3Payload instance to the caller
        with a propertly set `pre_parsed_record` for this record.

        Yields:
            Instances of `self` back to the caller with the
                proper `pre_parsed_record` set. Conforms to the interface of
                returning a generator, providing the ability to support
                multi-record like this (s3).
        """
        s3_file_path = self._get_object()
        if not s3_file_path:
            return

        line_num, processed_size = 0, 0
        for line_num, data in self._read_downloaded_s3_object(s3_file_path):

            self._refresh_record(data)
            yield self

            # Only do the extra calculations below if debug logging is enabled
            if not LOGGER.isEnabledFor(LOG_LEVEL_DEBUG):
                continue

            # Add the current data to the total processed size
            # +1 to account for line feed
            processed_size += (len(data) + 1)

            # Log a debug message on every 100 lines processed
            if line_num % 100 == 0:
                avg_record_size = ((processed_size - 1) / line_num)
                if avg_record_size:
                    approx_record_count = self.s3_object_size / avg_record_size
                    LOGGER.debug(
                        'Processed %s S3 records out of an approximate total of %s '
                        '(average record size: %s bytes, total size: %s bytes)',
                        line_num,
                        approx_record_count,
                        avg_record_size,
                        self.s3_object_size)

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.TOTAL_S3_RECORDS, line_num)
Example 26
    def _get_object(self):
        """Given an S3 record, download and parse the data.

        Returns:
            str: Path to the downloaded s3 object.
        """
        # Use the urllib unquote method to decode any url encoded characters
        # (ie - %26 --> &) from the bucket and key names
        def unquoted(data):
            return unquote(data).decode('utf-8')
        region = self.raw_record['awsRegion']
        bucket = unquoted(self.raw_record['s3']['bucket']['name'])
        key = unquoted(self.raw_record['s3']['object']['key'])
        self.s3_object_size = int(self.raw_record['s3']['object']['size'])

        LOGGER.debug('Pre-parsing record from S3. Bucket: %s, Key: %s, Size: %d',
                     bucket, key, self.s3_object_size)

        return self._download_object(region, bucket, key)
Example 27
    def pre_parse(self):
        """Pre-parsing method for Kinesis records. Extracts the base64 encoded
        payload from the record itself, decodes it and sets it as the
        `pre_parsed_record` property.

        Yields:
            This object with the pre_parsed_record now set
        """
        LOGGER.debug('Pre-parsing record from Kinesis. eventID: %s, eventSourceARN: %s',
                     self.raw_record['eventID'], self.raw_record['eventSourceARN'])

        # Kinesis records have the potential to be gzipped, so try to decompress
        record = base64.b64decode(self.raw_record['kinesis']['data'])
        try:
            self.pre_parsed_record = zlib.decompress(record, 47)
        except zlib.error:
            self.pre_parsed_record = record

        yield self
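
A standalone sketch of the decode path above: wbits=47 (32 + 15) tells zlib to auto-detect a zlib or gzip header, and the except branch falls back to the raw bytes when the data was never compressed. The sample payload is made up:

import base64
import zlib

encoded = base64.b64encode(zlib.compress(b'{"action": "login"}'))

record = base64.b64decode(encoded)
try:
    data = zlib.decompress(record, 47)
except zlib.error:
    data = record

assert data == b'{"action": "login"}'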
Example 28
    def matched_log_pattern(self, record, log_patterns):
        """Return True if all log patterns of this record match"""
        # Return True immediately if there are no log patterns to check
        if not log_patterns:
            return True

        pattern_result = []
        for field, pattern_list in log_patterns.iteritems():
            # handle nested log_patterns
            if isinstance(pattern_list, dict):
                return self.matched_log_pattern(record[field], pattern_list)

            if not isinstance(pattern_list, list):
                LOGGER.debug('Configured `log_patterns` should be a \'list\'')
                continue

            # The pattern field value in the record
            try:
                value = record[field]
            except (KeyError, TypeError):
                LOGGER.debug(
                    'Declared log pattern field [%s] is not a valid type '
                    'for this record: %s', field, record)
                continue
            # Append the result of any of the log_patterns being True
            pattern_result.append(
                any(fnmatch(value, pattern) for pattern in pattern_list))

        all_patterns_result = all(pattern_result)
        LOGGER.debug('%s log pattern match result: %s', self.type(),
                     all_patterns_result)

        # if all pattern group results are True
        return all_patterns_result
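
A minimal illustration of the fnmatch-based check above, using a made-up `log_patterns` mapping and record:

from fnmatch import fnmatch

log_patterns = {'eventName': ['Put*', 'Delete*']}
record = {'eventName': 'PutObject'}

matched = all(
    any(fnmatch(record[field], pattern) for pattern in patterns)
    for field, patterns in log_patterns.items()
)
assert matched  # 'PutObject' matches 'Put*'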
Example 29
    def _parse(self, payload):
        """Parse a record into a declared type.

        Args:
            payload: A StreamAlert payload object

        Sets:
            payload.log_source: The detected log name from the data_sources config.
            payload.type: The record's type.
            payload.records: The parsed records as a list.

        Returns:
            bool: the success of the parse.
        """
        schema_matches = self._process_log_schemas(payload)

        if not schema_matches:
            return False

        if LOGGER_DEBUG_ENABLED:
            LOGGER.debug(
                'Schema Matched Records:\n%s',
                json.dumps([
                    schema_match.parsed_data for schema_match in schema_matches
                ],
                           indent=2))

        schema_match = self._check_schema_match(schema_matches)

        if LOGGER_DEBUG_ENABLED:
            LOGGER.debug('Log name: %s', schema_match.log_name)
            LOGGER.debug('Parsed data:\n%s',
                         json.dumps(schema_match.parsed_data, indent=2))

        for parsed_data_value in schema_match.parsed_data:
            # Convert data types per the schema
            # Use the root schema for the parser due to updates caused by
            # configuration settings such as envelope_keys and optional_keys
            try:
                if not self._convert_type(parsed_data_value,
                                          schema_match.root_schema):
                    return False
            except KeyError:
                LOGGER.error('The payload is mis-classified. Payload [%s]',
                             parsed_data_value)
                return False

        normalized_types = StreamThreatIntel.normalized_type_mapping()

        payload.log_source = schema_match.log_name
        payload.type = schema_match.parser.type()
        payload.records = schema_match.parsed_data
        payload.normalized_types = normalized_types.get(
            payload.log_source.split(':')[0])

        return True
Example 30
    def _extract_records(self, json_payload, envelope):
        """Extract records from the original json payload using the JSON configuration

        Args:
            json_payload (dict): The parsed json data
            envelope (dict): Envelope key/values to embed in each extracted record

        Returns:
            list: A list of JSON records extracted via JSON path or regex
        """
        json_records = []
        json_path_expression = self.options.get('json_path')
        json_regex_key = self.options.get('json_regex_key')
        # Handle jsonpath extraction of records
        if json_path_expression:
            LOGGER.debug('Parsing records with JSONPath')
            records_jsonpath = jsonpath_rw.parse(json_path_expression)
            matches = records_jsonpath.find(json_payload)
            if not matches:
                return False
            for match in matches:
                record = match.value
                embedded_json = self.options.get('embedded_json')
                if embedded_json:
                    try:
                        record = json.loads(match.value)
                    except ValueError:
                        LOGGER.warning('Embedded json is invalid')
                        continue
                if envelope:
                    record.update({ENVELOPE_KEY: envelope})
                json_records.append(record)

        # Handle nested json object regex matching
        elif json_regex_key and json_payload.get(json_regex_key):
            LOGGER.debug('Parsing records with JSON Regex Key')
            match = self.__regex.search(str(json_payload[json_regex_key]))
            if not match:
                return False
            match_str = match.groups('json_blob')[0]
            try:
                new_record = json.loads(match_str)
            except ValueError:
                LOGGER.debug('Matched regex string is not valid JSON: %s',
                             match_str)
                return False
            else:
                # Make sure the new_record is a dictionary and not a list.
                # Valid JSON can be either
                if not isinstance(new_record, dict):
                    return False
                if envelope:
                    new_record.update({ENVELOPE_KEY: envelope})

                json_records.append(new_record)

        return json_records