Esempio n. 1
0
def _objects_to_analyze(
        event: Dict[str, Any]) -> Generator[Tuple[str, str], None, None]:
    """Yield every (bucket, key) pair referenced by the invocation event.

    Two event shapes are recognised:
      * Direct invocation: a dict holding both 'BucketName' and 'ObjectKeys'.
      * SQS invocation: a dict with 'Records', where each record's 'body' is a
        JSON string that itself contains a 'Records' list of S3 notifications.

    Args:
        event: Invocation event (direct dict, or SQS batch wrapping S3 notifications)

    Yields:
        (bucket_name, object_key) string tuples; keys are URL-decoded with unquote_plus.
    """
    decode_key = urllib.parse.unquote_plus

    is_direct_invocation = 'BucketName' in event and 'ObjectKeys' in event
    if is_direct_invocation:
        for encoded_key in event['ObjectKeys']:
            yield event['BucketName'], decode_key(encoded_key)
        return

    # Otherwise treat the event as a batch of SQS messages.
    for message in event['Records']:
        try:
            notifications = json.loads(message['body'])['Records']
        except (KeyError, TypeError, json.JSONDecodeError):
            # A message without a usable body is logged and skipped; the rest still run.
            LOGGER.exception('Skipping invalid SQS message %s',
                             json.dumps(message))
            continue

        for notification in notifications:
            s3_section = notification['s3']
            yield (s3_section['bucket']['name'],
                   decode_key(s3_section['object']['key']))
Esempio n. 2
0
def _s3_objects(
    s3_records: List[Dict[str,
                          Any]]) -> Generator[Tuple[str, str], None, None]:
    """Extract (bucket, key) pairs from a list of S3 event records.

    Args:
        s3_records: List of S3 event records, each shaped like: {
            's3': {
                'object': {'key': (str)},
                'bucket': {'name': (str)}
            }
        }

    Yields:
        (bucket_name, object_key) string tuple, with the key URL-decoded via unquote_plus.
        Records that raise KeyError or TypeError while being read are logged and skipped.
    """
    for entry in s3_records:
        try:
            s3_section = entry['s3']
            pair = (s3_section['bucket']['name'],
                    urllib.parse.unquote_plus(s3_section['object']['key']))
            yield pair
        except (KeyError, TypeError):
            LOGGER.exception('Skipping invalid S3 record %s', entry)
Esempio n. 3
0
    def analyze(self, target_file: str, original_target_path: str = '') -> List[YaraMatch]:
        """Scan a file with yara-python and, on a best-effort basis, with yextend.

        Args:
            target_file: Local path to target file to be analyzed.
            original_target_path: Path where the target file was originally discovered.

        Returns:
            List of YaraMatch tuples. The yara-python matches are always included;
            yextend matches are appended only when yextend ran and its output parsed.
        """
        # Step 1: yara-python matches.
        # TODO: Once yextend is more robust, we may eventually not need yara-python anymore.
        yara_python_matches = []
        for match in self._rules.match(
                target_file, externals=self._yara_variables(original_target_path)):
            matched_strings = {entry[1] for entry in match.strings}
            yara_python_matches.append(
                YaraMatch(match.rule, match.namespace, match.meta, matched_strings))

        # Step 2: yextend matches.
        os.environ['LD_LIBRARY_PATH'] = os.environ['LAMBDA_TASK_ROOT']
        raw_output = None
        try:
            command = ['./yextend', '-r', self._compiled_rules_file, '-t', target_file, '-j']
            raw_output = subprocess.check_output(command)
            parsed = json.loads(raw_output.decode('utf-8'))
            combined = yara_python_matches + _convert_yextend_to_yara_match(parsed[0])
        except Exception:  # pylint: disable=broad-except
            # Any yextend failure falls back to the yara-python results alone.
            LOGGER.exception('Error running yextend or parsing its output')
            if raw_output:
                LOGGER.error('yextend output: <%s>', raw_output)
            combined = yara_python_matches
        return combined
Esempio n. 4
0
    def _yextend_matches(self, target_file: str) -> List[YaraMatch]:
        """Run the yextend binary against a file and convert its JSON output.

        Args:
            target_file: Local path to target file to be analyzed.

        Returns:
            List of YaraMatch tuples. A non-zero yextend exit, output that is not
            valid UTF-8, output that cannot be parsed as JSON, or an unexpected
            result structure is logged and produces an empty list.
        """
        command = [
            './yextend', '-r', self._compiled_rules_file, '-t', target_file, '-j'
        ]
        try:
            # stderr is merged into stdout, so diagnostic text can precede the JSON.
            raw_output = subprocess.check_output(command, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError:
            LOGGER.exception('Yextend invocation failed')
            return []

        try:
            text = raw_output.decode('utf-8')
        except UnicodeDecodeError:
            LOGGER.error('Yextend output could not be decoded to utf-8:\n%s',
                         raw_output)
            return []

        # Try the full output first; if that is not JSON, retry with the first
        # line dropped, since it may be an error message ahead of the JSON result.
        for candidate in (text, text.partition('\n')[2]):
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            break
        else:
            LOGGER.error('Cannot parse yextend output as JSON:\n%s', text)
            return []

        # Parsing succeeded; the first element holds the results to convert.
        try:
            return _convert_yextend_to_yara_match(parsed[0])
        except (KeyError, IndexError):
            LOGGER.exception('Unexpected yextend output format')
            LOGGER.error('Yextend output: %s', text)
            return []
Esempio n. 5
0
def _objects_to_analyze(
        event: Dict[str, Any]) -> Generator[Tuple[str, str], None, None]:
    """Yield the S3 objects referenced by the invocation event.

    An event whose keys are exactly 'messages' and 'queue_url' is treated as a
    dispatcher batch; anything else is handled as a plain S3 event with 'Records'.

    Args:
        event: Invocation event, from either the dispatcher or an S3 bucket

    Yields:
        (bucket_name, object_key) string tuples to analyze
    """
    if set(event) != {'messages', 'queue_url'}:
        LOGGER.info('Invoked with dictionary (S3 Event)')
        yield from _s3_objects(event['Records'])
        return

    sqs_messages = event['messages']
    LOGGER.info('Invoked from dispatcher with %d messages',
                len(sqs_messages))
    for message in sqs_messages:
        try:
            body = json.loads(message['body'])
            records = body['Records']
        except (json.JSONDecodeError, KeyError, TypeError):
            # A message without a usable body is logged and skipped.
            LOGGER.exception('Skipping invalid SQS message %s', message)
            continue
        yield from _s3_objects(records)
Esempio n. 6
0
    def analyze(self,
                target_file: str,
                original_target_path: str = '') -> List[YaraMatch]:
        """Scan a file with yara-python and then with yextend.

        Args:
            target_file: Local path to target file to be analyzed.
            original_target_path: Path where the target file was originally discovered.

        Returns:
            List of YaraMatch tuples: the yara-python matches, followed by the
            yextend matches when yextend exits cleanly and prints parseable JSON.
        """
        # yara-python pass.
        # TODO: Once yextend is more robust, we may eventually not need yara-python anymore.
        yara_python_matches = []
        for match in self._rules.match(
                target_file,
                externals=self._yara_variables(original_target_path)):
            matched_strings = {entry[1] for entry in match.strings}
            yara_python_matches.append(
                YaraMatch(match.rule, match.namespace, match.meta,
                          matched_strings))

        # yextend pass.
        os.environ['LD_LIBRARY_PATH'] = os.environ['LAMBDA_TASK_ROOT']
        command = [
            './yextend', '-r', self._compiled_rules_file, '-t', target_file,
            '-j'
        ]
        try:
            raw_output = subprocess.check_output(command)
            parsed = json.loads(raw_output.decode('utf-8'))
        except (json.JSONDecodeError, subprocess.CalledProcessError):
            # Fall back to the yara-python results only.
            LOGGER.exception('Fatal error when running yextend')
            return yara_python_matches

        return yara_python_matches + _convert_yextend_to_yara_match(parsed[0])
Esempio n. 7
0
def analyze_lambda_handler(event_data: Dict[str, Any],
                           lambda_context) -> Dict[str, Dict[str, Any]]:
    """Lambda entry point: analyze each listed S3 object and report the results.

    Args:
        event_data: [dict] of the form: {
            'S3Objects': [...],  # S3 object keys.
            'SQSReceipts': [...]  # SQS receipt handles (to be deleted after processing).
        }
            There can be any number of S3objects, but no more than 10 SQS receipts.
        lambda_context: LambdaContext object (with .function_version).

    Returns:
        A dict mapping S3 object identifier to a summary of file info and matched YARA rules.
        Example: {
            'S3:bucket:key': {
                'FileInfo': { ... },
                'MatchedRules': { ... },
                'NumMatchedRules': 1
            }
        }
    """
    # A function version that is not an integer is recorded as -1.
    try:
        lambda_version = int(lambda_context.function_version)
    except ValueError:
        lambda_version = -1

    summaries = {}
    analyzed = []  # Every BinaryInfo processed, used for the metrics below.

    encoded_keys = event_data['S3Objects']
    LOGGER.info('Processing %d record(s)', len(encoded_keys))
    for encoded_key in encoded_keys:
        # S3 keys in event notifications are url-encoded.
        object_key = urllib.parse.unquote_plus(encoded_key)
        LOGGER.info('Analyzing "%s"', object_key)

        with binary_info.BinaryInfo(os.environ['S3_BUCKET_NAME'], object_key,
                                    ANALYZER) as binary:
            summaries[binary.s3_identifier] = binary.summary()
            analyzed.append(binary)

            if not binary.yara_matches:
                LOGGER.info('%s did not match any YARA rules', binary)
                continue

            LOGGER.warning('%s matched YARA rules: %s', binary,
                           binary.matched_rule_ids)
            binary.save_matches_and_alert(
                lambda_version,
                os.environ['YARA_MATCHES_DYNAMO_TABLE_NAME'],
                os.environ['YARA_ALERTS_SNS_TOPIC_ARN'])

    # Mark the SQS messages as completed by deleting their receipts.
    analyzer_aws_lib.delete_sqs_messages(os.environ['SQS_QUEUE_URL'],
                                         event_data['SQSReceipts'])

    # Metric publication failures are logged but do not fail the invocation.
    try:
        analyzer_aws_lib.put_metric_data(NUM_YARA_RULES, analyzed)
    except BotoError:
        LOGGER.exception('Error saving metric data')

    return summaries
Esempio n. 8
0
def analyze_lambda_handler(event: Dict[str, Any],
                           lambda_context: Any) -> Dict[str, Dict[str, Any]]:
    """Analyzer Lambda function entry point.

    Args:
        event: SQS message batch sent by the dispatcher: {
            'messages': [
                {
                    'body': (str) JSON-encoded S3 put event: {
                        'Records': [
                            {
                                's3': {
                                    'object': {
                                        'key': (str)
                                    },
                                    'bucket': {
                                        'name': (str)
                                    }
                                }
                            },
                            ...
                        ]
                    },
                    'receipt': (str) SQS message receipt handle,
                    'receive_count': (int) Approx. # of times this has been received
                },
                ...
            ],
            'queue_url': (str) SQS queue url from which the message originated
        }
            Alternatively, the event can be an S3 Put Event dictionary (with no sqs information).
            This allows the analyzer to be linked directly to an S3 bucket notification if needed.
        lambda_context: LambdaContext object (with .function_version).

    Returns:
        A dict mapping S3 object identifier to a summary of file info and matched YARA rules.
        Example: {
            'S3:bucket:key': {
                'FileInfo': { ... },
                'MatchedRules': { ... },
                'NumMatchedRules': 1
            }
        }

    Raises:
        KeyError: If a required environment variable (PATH, LAMBDA_TASK_ROOT, or, when a
            binary matches, the Dynamo table / SNS topic variables) is not set.
    """
    # Executables in the root of the deployment package (upx, pdftotext, etc) are added to PATH.
    # The handler can run many times in the same process, so the task root is appended only
    # when it is not already present; appending unconditionally makes PATH grow on every call.
    task_root = os.environ['LAMBDA_TASK_ROOT']
    if task_root not in os.environ['PATH'].split(':'):
        os.environ['PATH'] = '{}:{}'.format(os.environ['PATH'], task_root)
    os.environ['LD_LIBRARY_PATH'] = task_root

    result = {}
    binaries = []  # List of the BinaryInfo data.

    # The Lambda version must be an integer.
    try:
        lambda_version = int(lambda_context.function_version)
    except ValueError:
        LOGGER.warning('Invoked $LATEST instead of a versioned function')
        lambda_version = -1

    for bucket_name, object_key in _objects_to_analyze(event):
        LOGGER.info('Analyzing "%s:%s"', bucket_name, object_key)

        try:
            with binary_info.BinaryInfo(bucket_name, object_key,
                                        ANALYZER) as binary:
                result[binary.s3_identifier] = binary.summary()
                binaries.append(binary)
        except analyzer_aws_lib.FileDownloadError:
            # A failed download is logged and skipped; the remaining objects still run.
            LOGGER.exception('Unable to download %s from %s', object_key,
                             bucket_name)
            continue

        if binary.yara_matches:
            LOGGER.warning('%s matched YARA rules: %s', binary,
                           binary.matched_rule_ids)
            binary.save_matches_and_alert(
                lambda_version, os.environ['YARA_MATCHES_DYNAMO_TABLE_NAME'],
                os.environ['YARA_ALERTS_SNS_TOPIC_ARN'])

    # Delete all of the SQS receipts (mark them as completed).
    # A plain S3 event has no 'messages', so there is nothing to delete in that case.
    receipts_to_delete = [msg['receipt'] for msg in event.get('messages', [])]
    if receipts_to_delete:
        analyzer_aws_lib.delete_sqs_messages(event['queue_url'],
                                             receipts_to_delete)

    # Publish metrics.
    if binaries:
        try:
            analyzer_aws_lib.put_metric_data(NUM_YARA_RULES, binaries)
        except ClientError:
            LOGGER.exception('Error saving metric data')

    return result
Esempio n. 9
0
def analyze_lambda_handler(event_data: Dict[str, Any],
                           lambda_context) -> Dict[str, Dict[str, Any]]:
    """Lambda entry point: analyze each S3 record and report the results.

    Args:
        event_data: [dict] of the form: {
            'Records': [
                {
                    "s3": {
                        "object": {
                            "key": "FileName.txt"
                        },
                        "bucket": {
                            "name": "mybucket"
                        }
                    }
                }
            ],
            'SQSReceipts': [...]  # SQS receipt handles (to be deleted after processing).
        }
            There can be any number of S3objects, but no more than 10 SQS receipts.
            The Records are the same format as the S3 Put event, which means the analyzer could be
            directly linked to an S3 bucket notification if needed.
        lambda_context: LambdaContext object (with .function_version).

    Returns:
        A dict mapping S3 object identifier to a summary of file info and matched YARA rules.
        Example: {
            'S3:bucket:key': {
                'FileInfo': { ... },
                'MatchedRules': { ... },
                'NumMatchedRules': 1
            }
        }
    """
    # A function version that is not an integer is recorded as -1.
    try:
        lambda_version = int(lambda_context.function_version)
    except ValueError:
        lambda_version = -1

    summaries = {}
    analyzed = []  # Every BinaryInfo processed, used for the metrics below.

    records = event_data['Records']
    LOGGER.info('Processing %d record(s)', len(records))
    for record in records:
        s3_section = record['s3']
        bucket_name = s3_section['bucket']['name']
        # Object keys are URL-decoded before use.
        object_key = urllib.parse.unquote_plus(s3_section['object']['key'])
        LOGGER.info('Analyzing "%s:%s"', bucket_name, object_key)

        with binary_info.BinaryInfo(bucket_name, object_key, ANALYZER) as binary:
            summaries[binary.s3_identifier] = binary.summary()
            analyzed.append(binary)

            if not binary.yara_matches:
                LOGGER.info('%s did not match any YARA rules', binary)
                continue

            LOGGER.warning('%s matched YARA rules: %s', binary,
                           binary.matched_rule_ids)
            binary.save_matches_and_alert(
                lambda_version,
                os.environ['YARA_MATCHES_DYNAMO_TABLE_NAME'],
                os.environ['YARA_ALERTS_SNS_TOPIC_ARN'])

    # Mark the SQS messages as completed; 'SQSReceipts' may be absent from the event.
    receipts = event_data.get('SQSReceipts', [])
    analyzer_aws_lib.delete_sqs_messages(os.environ['SQS_QUEUE_URL'], receipts)

    # Metric publication failures are logged but do not fail the invocation.
    try:
        analyzer_aws_lib.put_metric_data(NUM_YARA_RULES, analyzed)
    except BotoError:
        LOGGER.exception('Error saving metric data')

    return summaries
Esempio n. 10
0
def analyze_lambda_handler(event: Dict[str, Any],
                           lambda_context: Any) -> Dict[str, Any]:
    """Analyzer Lambda function entry point.

    Args:
        event: SQS message batch - each message body is a JSON-encoded S3 notification - {
            'Records': [
                {
                    'body': json.dumps({
                        'Records': [
                            {
                                's3': {
                                    'bucket': {
                                        'name': '...'
                                    },
                                    'object': {
                                        'key': '...'
                                    }
                                }
                            }
                        ]
                    }),
                    'messageId': '...'
                }
            ]
        }
        lambda_context: LambdaContext object (with .function_version).

    Returns:
        A dict mapping S3 object identifier to a summary of file info and matched YARA rules.
        Example: {
            'S3:bucket:key': {
                'FileInfo': { ... },
                'MatchedRules': { ... },
                'NumMatchedRules': 1
            }
        }

    Raises:
        KeyError: If a required environment variable (PATH, LAMBDA_TASK_ROOT, or, when a
            binary matches, the Dynamo table / SNS topic variables) is not set.
            SAFE_SNS_TOPIC_ARN is optional: when unset or empty, no "safe" alert is sent.
    """
    # Executables in the root of the deployment package (upx, pdftotext, etc) are added to PATH.
    # The handler can run many times in the same process, so the task root is appended only
    # when it is not already present; appending unconditionally makes PATH grow on every call.
    task_root = os.environ['LAMBDA_TASK_ROOT']
    if task_root not in os.environ['PATH'].split(':'):
        os.environ['PATH'] = '{}:{}'.format(os.environ['PATH'], task_root)
    os.environ['LD_LIBRARY_PATH'] = task_root

    result = {}
    binaries = []  # List of the BinaryInfo data.

    # The Lambda version must be an integer.
    try:
        lambda_version = int(lambda_context.function_version)
    except ValueError:
        LOGGER.warning('Invoked $LATEST instead of a versioned function')
        lambda_version = -1

    for bucket_name, object_key in _objects_to_analyze(event):
        LOGGER.info('Analyzing "%s:%s"', bucket_name, object_key)

        try:
            with binary_info.BinaryInfo(bucket_name, object_key,
                                        ANALYZER) as binary:
                result[binary.s3_identifier] = binary.summary()
                binaries.append(binary)
        except analyzer_aws_lib.FileDownloadError:
            # A failed download is logged and skipped; the remaining objects still run.
            LOGGER.exception('Unable to download %s from %s', object_key,
                             bucket_name)
            continue

        if binary.yara_matches:
            LOGGER.warning('%s matched YARA rules: %s', binary,
                           binary.matched_rule_ids)
            binary.save_matches_and_alert(
                lambda_version, os.environ['YARA_MATCHES_DYNAMO_TABLE_NAME'],
                os.environ['YARA_ALERTS_SNS_TOPIC_ARN'])
        else:
            LOGGER.info('%s did not match any YARA rules', binary)
            # The "safe" topic is optional: use .get so an unset variable is treated
            # the same as an empty one instead of raising KeyError.
            safe_topic_arn = os.environ.get('SAFE_SNS_TOPIC_ARN')
            if safe_topic_arn:
                binary.safe_alert_only(safe_topic_arn)

    # Publish metrics.
    if binaries:
        try:
            analyzer_aws_lib.put_metric_data(NUM_YARA_RULES, binaries)
        except ClientError:
            LOGGER.exception('Error saving metric data')

    return result