def process(cls, input_payload):
    """Process rules on a record.

    Gather a list of rules based on the record's datasource type.
    For each rule, evaluate the record through all listed matchers
    and the rule itself to determine if a match occurs.

    Returns:
        list: alerts

        An alert is represented as a dictionary with the following keys:
            record: the parsed record that triggered the rule
            rule_name: the name of the triggered rule
            rule_description: the docstring of the rule function
            log_source: the log source name
            log_type: the parsed type of the record
            outputs: list of outputs to send to
            source_service: the service that produced this payload
            source_entity: the entity that produced this payload
    """
    alerts = []
    payload = copy(input_payload)

    rules = [rule_attrs for rule_attrs in cls.__rules.values()
             if payload.log_source in rule_attrs.logs]

    if not rules:
        LOGGER.debug('No rules to process for %s', payload)
        return alerts

    for record in payload.records:
        for rule in rules:
            # subkey check
            has_sub_keys = cls.process_subkeys(record, payload.type, rule)
            if not has_sub_keys:
                continue

            # matcher check
            matcher_result = cls.match_event(record, rule)
            if not matcher_result:
                continue

            # rule analysis
            rule_result = cls.process_rule(record, rule)
            if rule_result:
                LOGGER.info(
                    'Rule [%s] triggered an alert on log type [%s] from entity \'%s\' '
                    'in service \'%s\'', rule.rule_name, payload.log_source,
                    payload.entity, payload.service())
                alert = {
                    'record': record,
                    'rule_name': rule.rule_name,
                    'rule_description': rule.rule_function.__doc__ or DEFAULT_RULE_DESCRIPTION,
                    'log_source': str(payload.log_source),
                    'log_type': payload.type,
                    'outputs': rule.outputs,
                    'source_service': payload.service(),
                    'source_entity': payload.entity
                }
                alerts.append(alert)

    return alerts
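# Illustrative sketch (not part of the processor): the shape of a single alert
# dictionary as built in process() above. The values are hypothetical examples
# only; the keys mirror the dictionary constructed in the method.
def _example_alert():
    """Return a hypothetical alert dict matching the keys built in process()."""
    return {
        'record': {'eventName': 'PutObject', 'sourceIPAddress': '1.2.3.4'},
        'rule_name': 'example_rule',
        'rule_description': 'Example rule description',
        'log_source': 'cloudtrail:events',
        'log_type': 'json',
        'outputs': ['aws-s3:sample-bucket'],
        'source_service': 's3',
        'source_entity': 'example-bucket'
    }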
def parse(self, schema, data):
    """Parse a string into a list of JSON payloads.

    Args:
        schema (dict): Parsing schema.
        data (str|dict): Data to be parsed.

    Returns:
        list: A list of dictionaries representing parsed records OR
        False if the data is not JSON or the data does not follow the schema.
    """
    if isinstance(data, (unicode, str)):
        try:
            loaded_data = json.loads(data)
        except ValueError as err:
            LOGGER.debug('JSON parse failed: %s', str(err))
            LOGGER.debug('JSON parse could not load data: %s', str(data))
            return False
        else:
            json_records = self._parse_records(schema, loaded_data)
    else:
        json_records = self._parse_records(schema, data)

    if not json_records:
        return False

    self._add_optional_keys(json_records, schema,
                            self.options.get('optional_top_level_keys'))

    # Make sure all keys match the schema, including nested maps
    if not self._key_check(schema, json_records):
        return False

    return json_records
def _process_alerts(self, payload):
    """Process records for alerts and send them to the correct places

    Args:
        payload (StreamPayload): StreamAlert payload object being processed
    """
    for record in payload.pre_parse():
        self.classifier.classify_record(record)
        if not record.valid:
            if self.env['lambda_alias'] != 'development':
                LOGGER.error('Record does not match any defined schemas: %s\n%s',
                             record, record.pre_parsed_record)
            self._failed_record_count += 1
            continue

        LOGGER.debug(
            'Classified and Parsed Payload: <Valid: %s, Log Source: %s, Entity: %s>',
            record.valid, record.log_source, record.entity)

        record_alerts = StreamRules.process(record)

        LOGGER.debug('Processed %d valid record(s) that resulted in %d alert(s).',
                     len(payload.records), len(record_alerts))

        if not record_alerts:
            continue

        # Extend the list of alerts with any new ones so they can be returned
        self._alerts.extend(record_alerts)

        if self.enable_alert_processor:
            self.sinker.sink(record_alerts)
def _read_downloaded_s3_object(s3_object):
    """Read lines from a downloaded file from S3

    Supports reading both gzipped files and plaintext files.

    Args:
        s3_object (str): A full path to the downloaded file.

    Yields:
        tuple: (int, str) The line number and line from the downloaded s3 object.
    """
    _, extension = os.path.splitext(s3_object)

    if extension == '.gz':
        with gzip.open(s3_object, 'r') as s3_file:
            for num, line in enumerate(s3_file, start=1):
                yield num, line.rstrip()
    else:
        with open(s3_object, 'r') as s3_file:
            for num, line in enumerate(s3_file, start=1):
                yield num, line.rstrip()

    # AWS Lambda apparently does not reallocate disk space when files are
    # removed using os.remove(), so we must truncate them before removal
    with open(s3_object, 'w'):
        pass

    os.remove(s3_object)
    if not os.path.exists(s3_object):
        LOGGER.debug('Removed temp S3 file: %s', s3_object)
    else:
        LOGGER.error('Failed to remove temp S3 file: %s', s3_object)
def process_subkeys(cls, record, payload_type, rule):
    """Check that the payload record contains all subkeys needed for rules

    Because each log is processed by every rule for a given log type,
    it's possible that a rule references a subkey that doesn't exist in
    that specific log. This method verifies that the declared subkeys
    in a rule are contained in the JSON payload prior to rule processing.

    Args:
        record: Payload record to process
        payload_type (str): type of the record
        rule: Rule attributes

    Returns:
        bool: result of subkey check.
    """
    if not rule.req_subkeys or payload_type != 'json':
        return True

    for key, nested_keys in rule.req_subkeys.iteritems():
        # This is an extra layer of protection when
        # verifying a subkey exists in a record with a null value.
        # In the case of CloudTrail, a top level key has been
        # observed as either a map with subkeys, or null.
        if not record.get(key):
            LOGGER.debug(
                'The required subkey %s is not found when trying to process %s: \n%s',
                key, rule.rule_name, json.dumps(record, indent=2))
            return False
        if not all(x in record[key] for x in nested_keys):
            return False

    return True
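# Self-contained sketch of the subkey check above, outside of the rules engine.
# The record and the req_subkeys mapping are hypothetical; the real values come
# from the rule declaration and the parsed log.
def _example_subkey_check():
    """Show why a null top-level key (as seen in CloudTrail) fails the check."""
    req_subkeys = {'requestParameters': {'bucketName'}}
    record = {'eventName': 'GetObject', 'requestParameters': None}

    for key, nested_keys in req_subkeys.items():
        # Mirrors the null-value guard in process_subkeys(): a key that is
        # present but null fails the record.get(key) check
        if not record.get(key):
            return False
        if not all(x in record[key] for x in nested_keys):
            return False
    return True
    # -> False for this record because requestParameters is null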
def _extract_json_path(self, json_payload):
    """Extract records from the original json payload using a provided JSON path

    Args:
        json_payload (dict): The parsed json data

    Returns:
        list: A list of JSON records extracted via the JSON path expression,
            or False if the expression did not match anything.
    """
    records = []
    json_path_expression = self.options.get('json_path')
    if not json_path_expression:
        return records

    # Handle jsonpath extraction of records
    LOGGER.debug('Parsing records with JSONPath')

    records_jsonpath = jsonpath_rw.parse(json_path_expression)

    # If the csv parser is extracting csv from json, the payload is likely
    # a string and needs to be loaded to a dict
    if not isinstance(json_payload, dict):
        json_payload = json.loads(json_payload)

    matches = records_jsonpath.find(json_payload)
    if not matches:
        return False

    return [match.value for match in matches]
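# Minimal, self-contained sketch of the JSONPath extraction used above. The
# payload and the 'records[*].data' expression are hypothetical; a real schema
# would declare its own json_path option.
import jsonpath_rw


def _example_jsonpath_extraction():
    """Extract nested records from a wrapper object with jsonpath_rw."""
    payload = {'records': [{'data': {'name': 'a'}}, {'data': {'name': 'b'}}]}
    expression = jsonpath_rw.parse('records[*].data')
    return [match.value for match in expression.find(payload)]
    # -> [{'name': 'a'}, {'name': 'b'}]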
def _get_object(self):
    """Given an S3 record, download and parse the data.

    Returns:
        str: Path to the downloaded s3 object.
    """
    # Use the urllib unquote method to decode any url encoded characters
    # (ie - %26 --> &) from the bucket and key names
    unquoted = lambda data: unquote(data).decode('utf-8')

    region = self.raw_record['awsRegion']
    bucket = unquoted(self.raw_record['s3']['bucket']['name'])
    key = unquoted(self.raw_record['s3']['object']['key'])
    self.s3_object_size = int(self.raw_record['s3']['object']['size'])

    LOGGER.debug('Pre-parsing record from S3. Bucket: %s, Key: %s, Size: %d',
                 bucket, key, self.s3_object_size)

    try:
        return self._download_object(region, bucket, key)
    except IOError:
        LOGGER.exception('[S3Payload] The following error occurred while downloading')
        return
def _load_rule_table(cls, config):
    """Load and return a RuleTable class for communicating with the DynamoDB rule table

    Args:
        config (dict): Loaded configuration from 'conf/' directory

    Returns:
        rule_table.RuleTable: Loaded frontend for DynamoDB rules table
    """
    # Ensure the rules table is enabled
    rt_config = config['global']['infrastructure']['rules_table']
    if not rt_config.get('enabled', False):
        return

    now = datetime.utcnow()
    refresh_delta = timedelta(minutes=rt_config.get('cache_refresh_minutes', 10))

    # The rule table needs to be refreshed if the refresh interval has been surpassed
    needs_refresh = cls._RULE_TABLE_LAST_REFRESH + refresh_delta < now
    if not needs_refresh:
        LOGGER.debug(
            'Rule table does not need to be refreshed (last refresh time: %s; '
            'current time: %s)', cls._RULE_TABLE_LAST_REFRESH, now)
        return

    LOGGER.info('Refreshing rule table (last refresh time: %s; current time: %s)',
                cls._RULE_TABLE_LAST_REFRESH, now)

    table_name = '{}_streamalert_rules'.format(config['global']['account']['prefix'])
    cls._RULE_TABLE = RuleTable(table_name)
    cls._RULE_TABLE_LAST_REFRESH = now
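# Hypothetical sketch of the refresh-interval check performed above. The
# 'enabled' and 'cache_refresh_minutes' keys come from the rules_table config
# read by _load_rule_table(); the inline values here are examples only.
from datetime import datetime, timedelta


def _example_rules_table_needs_refresh(last_refresh):
    """Return True if the cached rule table is older than the refresh interval."""
    rt_config = {'enabled': True, 'cache_refresh_minutes': 10}
    if not rt_config.get('enabled', False):
        return False
    refresh_delta = timedelta(minutes=rt_config.get('cache_refresh_minutes', 10))
    return last_refresh + refresh_delta < datetime.utcnow()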
def _parse_records(self, schema, json_payload):
    """Identify and extract nested payloads from parsed JSON records.

    Nested payloads can be extracted with the `json_path` option, which should
    be a JSONPath selector that yields the desired nested records. If desired,
    fields present on the root record can be merged into child events using
    the `envelope_keys` option.

    Args:
        schema (dict): Parsing schema.
        json_payload (dict): The parsed json data

    Returns:
        list: A list of parsed JSON records
    """
    # Check options and return the payload if there is nothing special to do
    if not self.options:
        return [json_payload]

    envelope_schema = self.options.get('envelope_keys')
    optional_envelope_keys = self.options.get('optional_envelope_keys')

    # If the schema has a defined envelope schema, with optional keys in
    # the envelope. This occurs in some cases when using json_regex_key.
    if envelope_schema and optional_envelope_keys:
        missing_keys_schema = {}
        for key in optional_envelope_keys:
            if key not in json_payload:
                missing_keys_schema[key] = envelope_schema[key]
        if missing_keys_schema:
            self._add_optional_keys([json_payload], envelope_schema, missing_keys_schema)

    # If the envelope schema is defined and all envelope keys are required
    # to be present in the record.
    elif envelope_schema and not all(x in json_payload for x in envelope_schema):
        return [json_payload]

    envelope = {}
    if envelope_schema:
        LOGGER.debug('Parsing envelope keys')
        schema.update({ENVELOPE_KEY: envelope_schema})
        envelope_keys = envelope_schema.keys()
        envelope_jsonpath = jsonpath_rw.parse("$." + ",".join(envelope_keys))
        envelope_matches = [match.value for match in envelope_jsonpath.find(json_payload)]
        envelope = dict(zip(envelope_keys, envelope_matches))

    json_records = self._extract_records(json_payload, envelope)
    if json_records is False:
        return False

    # If the final parsed record is singular
    if not json_records:
        json_records.append(json_payload)

    return json_records
def _download_object(self, region, bucket, key):
    """Download an object from S3.

    Verifies the S3 object is less than or equal to 128MB, and
    downloads it into a temp file.  Lambda can only execute for a
    maximum of 300 seconds, and the file to download greatly impacts
    that time.

    Args:
        region (str): AWS region to use for boto client instance.
        bucket (str): S3 bucket to download object from.
        key (str): Key of s3 object.

    Returns:
        str: The downloaded path of the S3 object.
    """
    size_kb = self.s3_object_size / 1024.0
    size_mb = size_kb / 1024.0
    display_size = '{}MB'.format(size_mb) if size_mb else '{}KB'.format(size_kb)

    # File size checks before downloading
    if size_kb == 0:
        return
    elif size_mb > 128:
        raise S3ObjectSizeError(
            '[S3Payload] The S3 object {}/{} is too large [{}] to download '
            'from S3'.format(bucket, key, display_size))

    # Bandit warns about using a shell process, ignore with #nosec
    LOGGER.debug(os.popen('df -h /tmp | tail -1').read().strip())  # nosec

    LOGGER.info('[S3Payload] Starting download from S3: %s/%s [%s]',
                bucket, key, display_size)

    # Convert the S3 object name to store as a file in the Lambda container
    suffix = key.replace('/', '-')
    file_descriptor, downloaded_s3_object = tempfile.mkstemp(suffix=suffix)

    with open(downloaded_s3_object, 'wb') as data:
        client = boto3.client('s3', region_name=region)
        start_time = time.time()
        client.download_fileobj(bucket, key, data)

    # Explicitly call os.close on the underlying open file descriptor
    # Addresses https://github.com/airbnb/streamalert/issues/587
    os.close(file_descriptor)

    total_time = time.time() - start_time
    LOGGER.info('Completed download in %s seconds', round(total_time, 2))

    # Log a metric on how long this object took to download
    MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.S3_DOWNLOAD_TIME, total_time)

    return downloaded_s3_object
def _process_alerts(self, payload):
    """Process records for alerts and send them to the correct places

    Args:
        payload (StreamPayload): StreamAlert payload object being processed
    """
    payload_with_normalized_records = []
    for record in payload.pre_parse():
        # Increment the processed size using the length of this record
        self._processed_size += len(record.pre_parsed_record)
        self.classifier.classify_record(record)
        if not record.valid:
            if self.env['lambda_alias'] != 'development':
                LOGGER.error('Record does not match any defined schemas: %s\n%s',
                             record, record.pre_parsed_record)
            self._failed_record_count += 1
            continue

        # Increment the total processed records to get an accurate assessment of throughput
        self._processed_record_count += len(record.records)

        LOGGER.debug(
            'Classified and Parsed Payload: <Valid: %s, Log Source: %s, Entity: %s>',
            record.valid, record.log_source, record.entity)

        record_alerts, normalized_records = self._rule_engine.process(record)
        payload_with_normalized_records.extend(normalized_records)

        LOGGER.debug('Processed %d valid record(s) that resulted in %d alert(s).',
                     len(payload.records), len(record_alerts))

        # Add all parsed records to the categorized payload dict only if Firehose is enabled
        if self._firehose_client:
            # Only send payloads with enabled log sources
            if self._firehose_client.enabled_log_source(payload.log_source):
                self._firehose_client.categorized_payloads[payload.log_source].extend(
                    payload.records)

        if not record_alerts:
            continue

        # Extend the list of alerts with any new ones so they can be returned
        self._alerts.extend(record_alerts)

        if self.enable_alert_processor:
            self.sinker.sink(record_alerts)

    return payload_with_normalized_records
def _shred_temp_directory():
    """Delete all objects in the container's temp directory"""
    LOGGER.debug('Shredding temp directory')

    for root, dirs, files in os.walk(tempfile.gettempdir(), topdown=False):
        for name in files:
            subprocess.check_call([  # nosec
                'shred', '--force', '--iterations=1', '--remove',
                os.path.join(root, name)])
        for name in dirs:
            os.rmdir(os.path.join(root, name))  # nosec
def _firehose_request_helper(self, stream_name, record_batch):
    """Send record batches to Firehose

    Args:
        stream_name (str): The name of the Delivery Stream to send to
        record_batch (list): The records to send
    """
    record_batch_size = len(record_batch)
    resp = {}

    try:
        LOGGER.debug('Sending %d records to Firehose:%s', record_batch_size, stream_name)
        resp = self.firehose_client.put_record_batch(
            DeliveryStreamName=stream_name,
            # The newline at the end is required by Firehose,
            # otherwise all records will be on a single line and
            # unsearchable in Athena.
            Records=[{'Data': json.dumps(self.sanitize_keys(record),
                                         separators=(",", ":")) + '\n'}
                     for record in record_batch])
    except ClientError as firehose_err:
        LOGGER.error(firehose_err)
        MetricLogger.log_metric(FUNCTION_NAME,
                                MetricLogger.FIREHOSE_FAILED_RECORDS,
                                record_batch_size)
        return

    # Error handle if failures occurred in PutRecordBatch
    # TODO(jack) implement backoff here for additional message reliability
    if resp.get('FailedPutCount') > 0:
        failed_records = [failed for failed in resp['RequestResponses']
                          if failed.get('ErrorCode')]
        MetricLogger.log_metric(FUNCTION_NAME,
                                MetricLogger.FIREHOSE_FAILED_RECORDS,
                                resp['FailedPutCount'])
        # Only print the first 100 failed records to Cloudwatch logs
        LOGGER.error('The following records failed to Put to the '
                     'Delivery stream %s: %s',
                     stream_name, json.dumps(failed_records[:100], indent=2))
    else:
        MetricLogger.log_metric(FUNCTION_NAME,
                                MetricLogger.FIREHOSE_RECORDS_SENT,
                                record_batch_size)
        LOGGER.info('Successfully sent %d messages to Firehose:%s',
                    record_batch_size, stream_name)
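# Sketch of the retry/backoff approach the TODO above refers to; this is not
# the project's implementation. It re-sends only the records that Firehose
# reported as failed, with a simple exponential sleep between attempts.
import time

import boto3


def _example_put_record_batch_with_backoff(stream_name, records, max_attempts=3):
    """Send records to Firehose, retrying only the failed subset with backoff."""
    client = boto3.client('firehose')
    pending = [{'Data': data} for data in records]

    for attempt in range(max_attempts):
        resp = client.put_record_batch(DeliveryStreamName=stream_name, Records=pending)
        if not resp.get('FailedPutCount'):
            return []
        # Keep only the records whose individual response carried an ErrorCode
        pending = [record for record, result in zip(pending, resp['RequestResponses'])
                   if result.get('ErrorCode')]
        time.sleep(2 ** attempt)

    return pending  # records that still failed after all attempts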
def _process_ioc(self, ioc_collections):
    """Check if any info is malicious by querying DynamoDB IOC table

    Args:
        ioc_collections (list): A list of StreamIoc instances.
    """
    LOGGER.debug('[Threat Intel] Rule Processor queries %d IOCs', len(ioc_collections))

    # Segment data before calling DynamoDB table with batch_get_item.
    for subset in self._segment(ioc_collections):
        query_values = []
        for ioc in subset:
            if ioc.value not in query_values:
                query_values.append(ioc.value)

        query_result = []
        query_error_msg = 'An error occurred while querying dynamodb table. Error is: %s'
        try:
            result, unprocessed_keys = self._query(query_values)
            query_result.extend(result)
        except ClientError as err:
            LOGGER.error(query_error_msg, err.response)
            return
        except ParamValidationError as err:
            LOGGER.error(query_error_msg, err)
            return

        # If there are unprocessed keys, we will re-query once with unprocessed
        # keys only
        if unprocessed_keys:
            deserializer = self._deserialize(unprocessed_keys[self._table]['Keys'])
            query_values = [elem[PRIMARY_KEY] for elem in deserializer]
            query_error_msg = 'An error occurred while processing unprocessed keys. Error is: %s'
            try:
                result, _ = self._query(query_values)
                query_result.extend(result)
            except ClientError as err:
                LOGGER.error(query_error_msg, err.response)
                return
            except ParamValidationError as err:
                LOGGER.error(query_error_msg, err)
                return

        for value in ioc_collections:
            for ioc in query_result:
                if value.value == ioc[PRIMARY_KEY]:
                    value.sub_type = ioc[SUB_TYPE_KEY]
                    value.is_ioc = True
                    continue
def read_compressed_files(cls, intel_dir, delimiter=','):
    """Read intelligence into memory

    Read all intelligence from csv.gz files located in the threat_intel
    directory into a dictionary. CSV filenames should follow the convention
    <ioc_type_as_basename>.csv.gz. The basename (without extension) of the
    csv file will be the key in the returned dictionary.

    Returns:
        (dict): Threat intelligence in the following format:
            {
                "domain": {
                    "evil1.com": ["apt_domain", "source1 reported evil1.com"],
                    "evil2.com": ["c2_domain", "source2 reported evil2.com"]
                },
                "ip": {
                    "1.1.1.2": ["scan_ip", "source reported ip1"],
                    "2.2.2.2": ["scan_ip", "source reported ip2"]
                },
                "url": {
                    "www.hacker.com/evil_page": ["mal_url", "source_foo"]
                },
                "md5": {
                    "0123456789abcdef0123456789abcdef": ["mal_md5", "source_bar"]
                }
            }
        None: if the intelligence directory does not exist
    """
    if not os.path.exists(intel_dir):
        return

    gz_files = [os.path.join(intel_dir, gz_file)
                for gz_file in os.listdir(intel_dir)
                if gz_file.endswith('.gz')]

    for gz_file in gz_files:
        with gzip.open(gz_file, 'r') as ioc_file:
            csv_reader = csv.reader(ioc_file, delimiter=delimiter)

            ioc_type = os.path.basename(gz_file).split('.')[0]
            if ioc_type not in cls.__intelligence:
                cls.__intelligence[ioc_type] = dict()

            for row in csv_reader:
                if len(row) < 2:
                    LOGGER.debug('Warning, each row in CSV file should '
                                 'contain at least two fields. Bad row [%s]', row)
                    continue
                cls.__intelligence[ioc_type][row[0]] = row[1:]

    return cls.__intelligence
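# Hypothetical sketch showing the expected on-disk layout read above: one
# gzipped CSV per IOC type, named <ioc_type>.csv.gz, where the first column is
# the indicator and the remaining columns are free-form context. The file path
# and rows are examples only, e.g. useful as a test fixture.
import csv
import gzip
import os


def _example_write_intel_fixture(intel_dir):
    """Write a small domain.csv.gz file in the format read_compressed_files expects."""
    rows = [
        ['evil1.com', 'apt_domain', 'source1 reported evil1.com'],
        ['evil2.com', 'c2_domain', 'source2 reported evil2.com'],
    ]
    path = os.path.join(intel_dir, 'domain.csv.gz')
    with gzip.open(path, 'wb') as gz_file:
        writer = csv.writer(gz_file)
        writer.writerows(rows)
    return path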
def _extract_records(self, json_payload):
    """Extract records from the original json payload using the JSON configuration

    Args:
        json_payload (dict): The parsed json data

    Returns:
        list: A list of JSON records extracted via JSON path or regex
    """
    json_records = []

    extracted_records = self._extract_json_path(json_payload)
    if extracted_records is False:
        return False

    if extracted_records:
        if not self.options.get('embedded_json'):
            return extracted_records

        for record in extracted_records:
            try:
                record = json.loads(record)
            except ValueError:
                LOGGER.warning('Embedded json is invalid')
                continue
            json_records.append(record)

        return json_records

    json_regex_key = self.options.get('json_regex_key')
    # Handle nested json object regex matching
    if json_regex_key and json_payload.get(json_regex_key):
        LOGGER.debug('Parsing records with JSON Regex Key')
        match = self.__regex.search(str(json_payload[json_regex_key]))
        if not match:
            return False

        match_str = match.groups('json_blob')[0]
        try:
            new_record = json.loads(match_str)
        except ValueError:
            LOGGER.debug('Matched regex string is not valid JSON: %s', match_str)
            return False
        else:
            # Make sure the new_record is a dictionary and not a list.
            # Valid JSON can be either
            if not isinstance(new_record, dict):
                return False
            json_records.append(new_record)

    return json_records
def _process_log_schemas(self, payload):
    """Get any log schemas that matched this log format

    Args:
        payload: A StreamAlert payload object

    Returns:
        list: Contains any schemas that matched this log format
            Each list entry contains the namedtuple of 'SchemaMatch' with
            values of log_name, root_schema, parser, and parsed_data
    """
    schema_match = namedtuple('SchemaMatch', 'log_name, root_schema, parser, parsed_data')
    schema_matches = []
    log_info = self.get_log_info_for_source()

    # Loop over all logs declared in logs.json
    for log_name, attributes in log_info.iteritems():
        # Get the parser type to use for this log
        parser_name = payload.type or attributes['parser']

        schema = attributes['schema']
        options = attributes.get('configuration', {})

        # Setup the parser class
        parser_class = get_parser(parser_name)
        parser = parser_class(options)

        # Get a list of parsed records
        LOGGER.debug('Trying schema: %s', log_name)
        parsed_data = parser.parse(schema, payload.pre_parsed_record)
        if not parsed_data:
            continue

        LOGGER.debug('Parsed %d records with schema %s', len(parsed_data), log_name)

        if SUPPORT_MULTIPLE_SCHEMA_MATCHING:
            schema_matches.append(schema_match(log_name, schema, parser, parsed_data))
            continue

        log_patterns = parser.options.get('log_patterns')
        if all(parser.matched_log_pattern(rec, log_patterns) for rec in parsed_data):
            return [schema_match(log_name, schema, parser, parsed_data)]

    return schema_matches
def pre_parse(self):
    """Pre-parsing method for SNS records. Extracts the SNS payload from the
    record itself and sets it as the `pre_parsed_record` property.

    Yields:
        This object with the pre_parsed_record now set
    """
    LOGGER.debug('Pre-parsing record from SNS. MessageId: %s, EventSubscriptionArn: %s',
                 self.raw_record['Sns']['MessageId'],
                 self.raw_record['EventSubscriptionArn'])

    self.pre_parsed_record = self.raw_record['Sns']['Message']

    yield self
def _check_schema_match(schema_matches):
    """Check to see if the log matches multiple schemas. If so, fall back
    on using log_patterns to look for the proper log. If no log_patterns
    exist, or they do not resolve the problem, fall back on using the
    first matched schema.

    Args:
        schema_matches (list): A list of tuples containing the info for
            schemas that have validly parsed this record. Each tuple is:
                (log_name, parser, parsed_data)

    Returns:
        tuple: The proper tuple to use for parsing from the list of tuples
    """
    # If there is only one parse or we do not have support for multiple schemas
    # enabled, then just return the first parse that was valid
    if len(schema_matches) == 1 or not SUPPORT_MULTIPLE_SCHEMA_MATCHING:
        return schema_matches[0]

    matches = []
    for i, schema_match in enumerate(schema_matches):
        log_patterns = schema_match.parser.options.get('log_patterns', {})
        LOGGER.debug('Log patterns: %s', log_patterns)
        if all(schema_match.parser.matched_log_pattern(data, log_patterns)
               for data in schema_match.parsed_data):
            matches.append(schema_matches[i])
        else:
            if LOGGER_DEBUG_ENABLED:
                LOGGER.debug('Log pattern matching failed for:\n%s',
                             json.dumps(schema_match.parsed_data, indent=2))

    if matches:
        if len(matches) > 1:
            LOGGER.error('Log patterns matched for multiple schemas: %s',
                         ', '.join(match.log_name for match in matches))
            LOGGER.error('Proceeding with schema for: %s', matches[0].log_name)
        return matches[0]

    LOGGER.error('Log classification matched for multiple schemas: %s',
                 ', '.join(match.log_name for match in schema_matches))
    LOGGER.error('Proceeding with schema for: %s', schema_matches[0].log_name)
    return schema_matches[0]
def run(self, input_payload):
    """Process rules on a record.

    Gather a list of rules based on the record's datasource type.
    For each rule, evaluate the record through all listed matchers
    and the rule itself to determine if a match occurs.

    Returns:
        A tuple(list, list).
            First return is a list of Alert instances.
            Second return is a list of payload instances with normalized records.
    """
    alerts = []
    # store normalized records for future processing in Threat Intel
    normalized_records = []
    payload = copy(input_payload)

    rules = Rule.rules_for_log_type(payload.log_source)

    if not rules:
        LOGGER.debug('No rules to process for %s', payload)
        return alerts, normalized_records

    # fetch all datatypes info from rules and run data normalization before
    # rule matching to improve performance. This way one record will be normalized
    # only once against the normalized datatypes from all rules.
    datatypes_set = {datatype for rule in rules if rule.datatypes
                     for datatype in rule.datatypes}

    if datatypes_set:
        for record in payload.records:
            self._apply_normalization(record, normalized_records, datatypes_set, payload)

    for record in payload.records:
        for rule in rules:
            # subkey check
            if not self.process_subkeys(record, payload.type, rule):
                continue

            # matcher check
            if not rule.check_matchers(record):
                continue

            self.rule_analysis(record, rule, payload, alerts)

    return alerts, normalized_records
def _process_alerts(self, payload):
    """Process records for alerts and send them to the correct places

    Args:
        payload (StreamPayload): StreamAlert payload object being processed
    """
    for record in payload.pre_parse():
        # Increment the processed size using the length of this record
        self._processed_size += len(record.pre_parsed_record)
        self.classifier.classify_record(record)
        if not record.valid:
            if self.env['lambda_alias'] != 'development':
                LOGGER.error('Record does not match any defined schemas: %s\n%s',
                             record, record.pre_parsed_record)
            self._failed_record_count += 1
            continue

        LOGGER.debug(
            'Classified and Parsed Payload: <Valid: %s, Log Source: %s, Entity: %s>',
            record.valid, record.log_source, record.entity)

        record_alerts = StreamRules.process(record)

        LOGGER.debug('Processed %d valid record(s) that resulted in %d alert(s).',
                     len(payload.records), len(record_alerts))

        # Add all parsed records to the categorized payload dict
        # only if Firehose is enabled
        if self.firehose_client:
            # Only send payloads with enabled types
            disabled_logs = self.config['global']['infrastructure'].get(
                'firehose', {}).get('disabled_logs', [])
            if payload.log_source.split(':')[0] not in disabled_logs:
                self.categorized_payloads[payload.log_source].extend(payload.records)

        if not record_alerts:
            continue

        # Extend the list of alerts with any new ones so they can be returned
        self._alerts.extend(record_alerts)

        if self.enable_alert_processor:
            self.sinker.sink(record_alerts)
def process(self, input_payload):
    """Process rules on a record.

    Gather a list of rules based on the record's datasource type.
    For each rule, evaluate the record through all listed matchers
    and the rule itself to determine if a match occurs.

    Returns:
        A tuple(list, list).
            First return is a list of alerts.
            Second return is a list of payload instances with normalized records.
    """
    alerts = []
    # store normalized records for future processing in Threat Intel
    normalized_records = []
    payload = copy(input_payload)

    rules = [rule_attrs for rule_attrs in self.__rules.values()
             if rule_attrs.logs is None or payload.log_source in rule_attrs.logs]

    if not rules:
        LOGGER.debug('No rules to process for %s', payload)
        return alerts, normalized_records

    for record in payload.records:
        for rule in rules:
            # subkey check
            has_sub_keys = self.process_subkeys(record, payload.type, rule)
            if not has_sub_keys:
                continue

            # matcher check
            matcher_result = self.match_event(record, rule)
            if not matcher_result:
                continue

            if rule.datatypes:
                # When the rule 'datatypes' option is defined, the rules engine
                # will apply data normalization to the whole record.
                record_copy = self._apply_normalization(record, normalized_records,
                                                        rule, payload)
                self.rule_analysis(record_copy, rule, payload, alerts)
            else:
                self.rule_analysis(record, rule, payload, alerts)

    return alerts, normalized_records
def _download_object(self, region, bucket, key):
    """Download an object from S3.

    Verifies the S3 object is less than or equal to 128MB, and
    downloads it into a temp file.  Lambda can only execute for a
    maximum of 300 seconds, and the file to download greatly impacts
    that time.

    Args:
        region (str): AWS region to use for boto client instance.
        bucket (str): S3 bucket to download object from.
        key (str): Key of s3 object.

    Returns:
        str: The downloaded path of the S3 object.
    """
    size_kb = self.s3_object_size / 1024.0
    size_mb = size_kb / 1024.0
    if size_mb > 128:
        raise S3ObjectSizeError('S3 object to download is above 128MB')

    # Bandit warns about using a shell process, ignore with #nosec
    LOGGER.debug(os.popen('df -h /tmp | tail -1').read().strip())  # nosec

    display_size = '{}MB'.format(size_mb) if size_mb else '{}KB'.format(size_kb)

    LOGGER.info('Starting download from S3: %s/%s [%s]', bucket, key, display_size)

    suffix = key.replace('/', '-')
    _, downloaded_s3_object = tempfile.mkstemp(suffix=suffix)

    with open(downloaded_s3_object, 'wb') as data:
        client = boto3.client('s3', region_name=region)
        start_time = time.time()
        client.download_fileobj(bucket, key, data)

    total_time = time.time() - start_time
    LOGGER.info('Completed download in %s seconds', round(total_time, 2))

    # Log a metric on how long this object took to download
    MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.S3_DOWNLOAD_TIME, total_time)

    return downloaded_s3_object
def _extract_envelope(envelope_schema, json_payload):
    """Extract envelope key/values from the original payload

    Args:
        envelope_schema (dict): Envelope keys to be extracted
        json_payload (dict): The parsed json data

    Returns:
        dict: Key/values extracted from the log to be used as the envelope
    """
    if not isinstance(json_payload, dict):
        json_payload = json.loads(json_payload)

    LOGGER.debug('Parsing envelope keys')
    envelope_keys = envelope_schema.keys()
    envelope_jsonpath = jsonpath_rw.parse("$." + ",".join(envelope_keys))
    envelope_matches = [match.value for match in envelope_jsonpath.find(json_payload)]
    return dict(zip(envelope_keys, envelope_matches))
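# Self-contained sketch of the envelope extraction above. The payload and the
# envelope keys ('host', 'application') are hypothetical; the comma-joined
# JSONPath expression mirrors the pattern used in _extract_envelope.
import jsonpath_rw


def _example_envelope_extraction():
    """Pull top-level envelope fields out of a wrapper record."""
    payload = {'host': 'web-01', 'application': 'nginx', 'message': '{"status": 200}'}
    envelope_keys = ['host', 'application']
    expression = jsonpath_rw.parse('$.' + ','.join(envelope_keys))
    values = [match.value for match in expression.find(payload)]
    return dict(zip(envelope_keys, values))
    # -> {'host': 'web-01', 'application': 'nginx'}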
def pre_parse(self):
    """Pre-parsing method for S3 objects that will download the s3 object,
    open it for reading and iterate over lines (records) in the file.

    This yields back references of this S3Payload instance to the caller
    with a properly set `pre_parsed_record` for this record.

    Yields:
        Instances of `self` back to the caller with the proper `pre_parsed_record` set.
        Conforms to the interface of returning a generator, providing the ability to
        support multi-record sources like this one (s3).
    """
    s3_file_path = self._get_object()
    if not s3_file_path:
        return

    line_num, processed_size = 0, 0
    for line_num, data in self._read_downloaded_s3_object(s3_file_path):
        self._refresh_record(data)
        yield self

        # Only do the extra calculations below if debug logging is enabled
        if not LOGGER.isEnabledFor(LOG_LEVEL_DEBUG):
            continue

        # Add the current data to the total processed size
        # +1 to account for line feed
        processed_size += (len(data) + 1)

        # Log a debug message on every 100 lines processed
        if line_num % 100 == 0:
            avg_record_size = ((processed_size - 1) / line_num)
            if avg_record_size:
                approx_record_count = self.s3_object_size / avg_record_size
                LOGGER.debug(
                    'Processed %s S3 records out of an approximate total of %s '
                    '(average record size: %s bytes, total size: %s bytes)',
                    line_num, approx_record_count, avg_record_size, self.s3_object_size)

    MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.TOTAL_S3_RECORDS, line_num)
def _get_object(self):
    """Given an S3 record, download and parse the data.

    Returns:
        str: Path to the downloaded s3 object.
    """
    # Use the urllib unquote method to decode any url encoded characters
    # (ie - %26 --> &) from the bucket and key names
    def unquoted(data):
        return unquote(data).decode('utf-8')

    region = self.raw_record['awsRegion']
    bucket = unquoted(self.raw_record['s3']['bucket']['name'])
    key = unquoted(self.raw_record['s3']['object']['key'])
    self.s3_object_size = int(self.raw_record['s3']['object']['size'])

    LOGGER.debug('Pre-parsing record from S3. Bucket: %s, Key: %s, Size: %d',
                 bucket, key, self.s3_object_size)

    return self._download_object(region, bucket, key)
def pre_parse(self):
    """Pre-parsing method for Kinesis records. Extracts the base64 encoded
    payload from the record itself, decodes it and sets it as the
    `pre_parsed_record` property.

    Yields:
        This object with the pre_parsed_record now set
    """
    LOGGER.debug('Pre-parsing record from Kinesis. eventID: %s, eventSourceARN: %s',
                 self.raw_record['eventID'], self.raw_record['eventSourceARN'])

    # Kinesis records have the potential to be gzipped, so try to decompress
    record = base64.b64decode(self.raw_record['kinesis']['data'])
    try:
        self.pre_parsed_record = zlib.decompress(record, 47)
    except zlib.error:
        self.pre_parsed_record = record

    yield self
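# Self-contained sketch of the decode-and-maybe-decompress step above. The
# sample data is hypothetical; wbits=47 lets zlib auto-detect gzip or zlib
# wrapped payloads, and the raw bytes are kept when decompression fails.
import base64
import zlib


def _example_decode_kinesis_data(b64_data):
    """Decode a base64 Kinesis data blob, decompressing it when possible."""
    record = base64.b64decode(b64_data)
    try:
        return zlib.decompress(record, 47)
    except zlib.error:
        return record


# Example usage with a compressed payload
_compressed = base64.b64encode(zlib.compress(b'{"key": "value"}'))
assert _example_decode_kinesis_data(_compressed) == b'{"key": "value"}'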
def matched_log_pattern(self, record, log_patterns):
    """Return True if all log patterns of this record match"""
    # Return True immediately if there are no log patterns
    # or if the data being tested is not a dict
    if not log_patterns:
        return True

    pattern_result = []
    for field, pattern_list in log_patterns.iteritems():
        # handle nested log_patterns
        if isinstance(pattern_list, dict):
            return self.matched_log_pattern(record[field], pattern_list)

        if not isinstance(pattern_list, list):
            LOGGER.debug('Configured `log_patterns` should be a \'list\'')
            continue

        # The pattern field value in the record
        try:
            value = record[field]
        except (KeyError, TypeError):
            LOGGER.debug('Declared log pattern field [%s] is not a valid type '
                         'for this record: %s', field, record)
            continue

        # Append the result of any of the log_patterns being True
        pattern_result.append(any(fnmatch(value, pattern) for pattern in pattern_list))

    all_patterns_result = all(pattern_result)
    LOGGER.debug('%s log pattern match result: %s', self.type(), all_patterns_result)

    # if all pattern group results are True
    return all_patterns_result
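# Minimal sketch of how a log_patterns block behaves: for each field, the
# record value must match at least one shell-style wildcard (via fnmatch), and
# every field listed must pass. The field names and patterns are hypothetical.
from fnmatch import fnmatch


def _example_log_pattern_match():
    """Return True when the record satisfies every configured pattern group."""
    log_patterns = {'type': ['*_login'], 'user': ['admin*', 'root']}
    record = {'type': 'console_login', 'user': 'admin-jane'}

    return all(
        any(fnmatch(record[field], pattern) for pattern in patterns)
        for field, patterns in log_patterns.items()
    )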
def _parse(self, payload):
    """Parse a record into a declared type.

    Args:
        payload: A StreamAlert payload object

    Sets:
        payload.log_source: The detected log name from the data_sources config.
        payload.type: The record's type.
        payload.records: The parsed records as a list.

    Returns:
        bool: the success of the parse.
    """
    schema_matches = self._process_log_schemas(payload)
    if not schema_matches:
        return False

    if LOGGER_DEBUG_ENABLED:
        LOGGER.debug('Schema Matched Records:\n%s',
                     json.dumps([schema_match.parsed_data
                                 for schema_match in schema_matches], indent=2))

    schema_match = self._check_schema_match(schema_matches)

    if LOGGER_DEBUG_ENABLED:
        LOGGER.debug('Log name: %s', schema_match.log_name)
        LOGGER.debug('Parsed data:\n%s', json.dumps(schema_match.parsed_data, indent=2))

    for parsed_data_value in schema_match.parsed_data:
        # Convert data types per the schema
        # Use the root schema for the parser due to updates caused by
        # configuration settings such as envelope_keys and optional_keys
        try:
            if not self._convert_type(parsed_data_value, schema_match.root_schema):
                return False
        except KeyError:
            LOGGER.error('The payload is mis-classified. Payload [%s]', parsed_data_value)
            return False

    normalized_types = StreamThreatIntel.normalized_type_mapping()

    payload.log_source = schema_match.log_name
    payload.type = schema_match.parser.type()
    payload.records = schema_match.parsed_data
    payload.normalized_types = normalized_types.get(payload.log_source.split(':')[0])

    return True
def _extract_records(self, json_payload, envelope):
    """Extract records from the original json payload using the JSON configuration

    Args:
        json_payload (dict): The parsed json data
        envelope (dict): Envelope key/values to merge into each extracted record

    Returns:
        list: A list of JSON records extracted via JSON path or regex
    """
    json_records = []
    json_path_expression = self.options.get('json_path')
    json_regex_key = self.options.get('json_regex_key')

    # Handle jsonpath extraction of records
    if json_path_expression:
        LOGGER.debug('Parsing records with JSONPath')

        records_jsonpath = jsonpath_rw.parse(json_path_expression)
        matches = records_jsonpath.find(json_payload)
        if not matches:
            return False

        for match in matches:
            record = match.value
            embedded_json = self.options.get('embedded_json')
            if embedded_json:
                try:
                    record = json.loads(match.value)
                except ValueError:
                    LOGGER.warning('Embedded json is invalid')
                    continue
            if envelope:
                record.update({ENVELOPE_KEY: envelope})
            json_records.append(record)

    # Handle nested json object regex matching
    elif json_regex_key and json_payload.get(json_regex_key):
        LOGGER.debug('Parsing records with JSON Regex Key')
        match = self.__regex.search(str(json_payload[json_regex_key]))
        if not match:
            return False

        match_str = match.groups('json_blob')[0]
        try:
            new_record = json.loads(match_str)
        except ValueError:
            LOGGER.debug('Matched regex string is not valid JSON: %s', match_str)
            return False
        else:
            # Make sure the new_record is a dictionary and not a list.
            # Valid JSON can be either
            if not isinstance(new_record, dict):
                return False
            if envelope:
                new_record.update({ENVELOPE_KEY: envelope})
            json_records.append(new_record)

    return json_records