Example #1
def test_depth_limit(core):
    # Make a nested set of files that goes deeper than the max depth by one
    sha, size = ready_body(core)
    for _ in range(core.config.submission.max_extraction_depth + 1):
        sha, size = ready_extract(core, sha)

    core.ingest_queue.push(
        SubmissionInput(
            dict(
                metadata={},
                params=dict(
                    description="file abc123",
                    services=dict(selected=''),
                    submitter='user',
                    groups=['user'],
                    # Make sure we can extract enough files that we will definitely hit the depth limit first
                    max_extracted=core.config.submission.max_extraction_depth + 10),
                notification=dict(queue='test-depth-limit', threshold=0),
                files=[dict(sha256=sha, size=size,
                            name='abc123')])).as_primitives())

    notification_queue = NamedQueue('nq-test-depth-limit', core.redis)
    start = time.time()
    task = notification_queue.pop(timeout=10)
    print("notification time waited", time.time() - start)
    assert task is not None
    task = IngestTask(task)
    sub: Submission = core.ds.submission.get(task.submission.sid)
    assert len(sub.files) == 1
    # We should only get results for each file up to the max depth
    assert len(sub.results) == 4 * core.config.submission.max_extraction_depth
    assert len(sub.errors) == 1
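The expected counts follow from the fixture's service layout. A minimal sketch of the arithmetic, assuming four test services run against every file (which is what the assertions above imply):

SERVICES_PER_FILE = 4  # assumed from the assertions above, not from config

def expected_depth_limit_counts(max_depth):
    # Every file within the depth limit is processed by all services;
    # the single file one level past the limit produces one error.
    results = SERVICES_PER_FILE * max_depth
    errors = 1
    return results, errors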
Example #2
def test_submit_simple(submit_harness):
    datastore, submitter = submit_harness

    # Push a normal ingest task
    submitter.unique_queue.push(
        0,
        IngestTask({
            'submission': {
                'params':
                SubmissionParams({
                    'classification': 'U',
                    'description': 'file abc',
                    'services': {
                        'selected': [],
                        'excluded': [],
                        'resubmit': [],
                    },
                    'submitter': 'user',
                }),
                'files': [{
                    'sha256': '0' * 64,
                    'size': 100,
                    'name': 'abc',
                }],
                'metadata': {}
            },
            'ingest_id': '123abc'
        }).as_primitives())
    submitter.handle_submit()

    # The task has been passed to the submit tool and there are no other submissions
    submitter.submit_client.submit.assert_called()
    assert submitter.unique_queue.pop() is None
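The first argument to unique_queue.push is a priority score. A minimal in-memory sketch of the queue contract these tests assume (illustrative only; the real queue is redis-backed and its ordering details may differ):

import heapq
import itertools

class PriorityQueueSketch:
    def __init__(self):
        self._heap = []
        self._tie = itertools.count()  # tie-breaker so tasks never compare

    def push(self, priority, item):
        heapq.heappush(self._heap, (priority, next(self._tie), item))

    def pop(self):
        # Returns None when empty, matching the assertion style above.
        if not self._heap:
            return None
        return heapq.heappop(self._heap)[2]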
Example #3
def test_max_extracted_in_one(core):
    # Make a set of files that is bigger than max_extracted (3 in this case)
    children = [ready_body(core)[0] for _ in range(5)]
    sha, size = ready_extract(core, children)

    core.ingest_queue.push(
        SubmissionInput(
            dict(metadata={},
                 params=dict(description="file abc123",
                             services=dict(selected=''),
                             submitter='user',
                             groups=['user'],
                             max_extracted=3),
                 notification=dict(queue='test-extracted-in-one', threshold=0),
                 files=[dict(sha256=sha, size=size,
                             name='abc123')])).as_primitives())

    notification_queue = NamedQueue('nq-test-extracted-in-one', core.redis)
    start = time.time()
    task = notification_queue.pop(timeout=10)
    print("notification time waited", time.time() - start)
    assert task is not None
    task = IngestTask(task)
    sub: Submission = core.ds.submission.get(task.submission.sid)
    assert len(sub.files) == 1
    # We should only get results for the files up to the max_extracted limit
    assert len(sub.results) == 4 * (1 + 3)
    assert len(sub.errors) == 2  # The number of children that errored out
Example #4
def test_ingest_retry(core: CoreSession, metrics):
    # -------------------------------------------------------------------------------
    # Make the first submit attempt fail, then check that the ingester retries it
    sha, size = ready_body(core)
    original_retry_delay = assemblyline_core.ingester.ingester._retry_delay
    assemblyline_core.ingester.ingester._retry_delay = 1

    attempts = []
    failures = []
    original_submit = core.ingest.submit

    def fail_once(task):
        attempts.append(task)
        if len(attempts) > 1:
            original_submit(task)
        else:
            failures.append(task)
            raise ValueError()

    core.ingest.submit = fail_once

    try:
        core.ingest_queue.push(
            SubmissionInput(
                dict(metadata={},
                     params=dict(
                         description="file abc123",
                         services=dict(selected=''),
                         submitter='user',
                         groups=['user'],
                     ),
                     notification=dict(queue='output-queue-one', threshold=0),
                     files=[dict(sha256=sha, size=size,
                                 name='abc123')])).as_primitives())

        notification_queue = NamedQueue('nq-output-queue-one', core.redis)
        first_task = notification_queue.pop(timeout=RESPONSE_TIMEOUT)

        # The submission should eventually be processed fully after one retry
        assert first_task is not None
        first_task = IngestTask(first_task)
        first_submission: Submission = core.ds.submission.get(
            first_task.submission.sid)
        assert len(attempts) == 2
        assert len(failures) == 1
        assert first_submission.state == 'completed'
        assert len(first_submission.files) == 1
        assert len(first_submission.errors) == 0
        assert len(first_submission.results) == 4

        metrics.expect('ingester', 'submissions_ingested', 1)
        metrics.expect('ingester', 'submissions_completed', 1)
        metrics.expect('ingester', 'files_completed', 1)
        metrics.expect('ingester', 'duplicates', 0)
        metrics.expect('dispatcher', 'submissions_completed', 1)
        metrics.expect('dispatcher', 'files_completed', 1)

    finally:
        core.ingest.submit = original_submit
        assemblyline_core.ingester.ingester._retry_delay = original_retry_delay
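The hand-rolled fail_once wrapper can also be expressed with unittest.mock, since exceptions in a side_effect iterable are raised rather than returned. A sketch (illustrative; the test above instead delegates to the real submit on retry):

from unittest import mock

# First call raises, later calls return None.
flaky_submit = mock.Mock(side_effect=[ValueError(), None, None])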
Example #5
def test_max_extracted_in_several(core):
    # Make a set of files in a non-trivial tree that adds up to more than 3 (max_extracted) files
    children = [
        ready_extract(
            core,
            [ready_body(core)[0], ready_body(core)[0]])[0],
        ready_extract(
            core,
            [ready_body(core)[0], ready_body(core)[0]])[0]
    ]
    sha, size = ready_extract(core, children)

    core.ingest_queue.push(
        SubmissionInput(
            dict(metadata={},
                 params=dict(description="file abc123",
                             services=dict(selected=''),
                             submitter='user',
                             groups=['user'],
                             max_extracted=3),
                 notification=dict(queue='test-extracted-in-several',
                                   threshold=0),
                 files=[dict(sha256=sha, size=size,
                             name='abc123')])).as_primitives())

    notification_queue = NamedQueue('nq-test-extracted-in-several', core.redis)
    task = notification_queue.pop(timeout=10)
    assert task is not None
    task = IngestTask(task)
    sub: Submission = core.ds.submission.get(task.submission.sid)
    assert len(sub.files) == 1
    # We should only get results for the files up to the max_extracted limit
    assert len(sub.results) == 4 * (
        1 + 3)  # 4 services, 1 original file, 3 extracted files
    assert len(sub.errors) == 3  # The number of children that errored out
Example #6
def test_ingest_stale_score_exists(ingest_harness):
    datastore, ingester, in_queue = ingest_harness
    get_if_exists = datastore.filescore.get_if_exists
    try:
        # Make every filescore lookup return a stale score
        from assemblyline.odm.models.filescore import FileScore
        datastore.filescore.get_if_exists = mock.MagicMock(
            return_value=FileScore(
                dict(psid='000',
                     expiry_ts=0,
                     errors=0,
                     score=10,
                     sid='000',
                     time=0)))

        # Process a message that hits the stale score
        in_queue.push(make_message())
        ingester.handle_ingest()

        # The stale filescore was retrieved
        datastore.filescore.get_if_exists.assert_called_once()

        # but the message was still ingested as a cache miss
        task = ingester.unique_queue.pop()
        assert task
        task = IngestTask(task)
        assert task.submission.files[0].sha256 == '0' * 64

        assert ingester.unique_queue.length() == 0
        assert ingester.ingest_queue.length() == 0
    finally:
        datastore.filescore.get_if_exists = get_if_exists
Example #7
def test_extracted_file(core, metrics):
    sha, size = ready_extract(core, ready_body(core)[0])

    core.ingest_queue.push(
        SubmissionInput(
            dict(metadata={},
                 params=dict(description="file abc123",
                             services=dict(selected=''),
                             submitter='user',
                             groups=['user'],
                             max_extracted=10000),
                 notification=dict(queue='test-extracted-file', threshold=0),
                 files=[dict(sha256=sha, size=size,
                             name='abc123')])).as_primitives())

    notification_queue = NamedQueue('nq-test-extracted-file', core.redis)
    task = notification_queue.pop(timeout=RESPONSE_TIMEOUT)
    assert task
    task = IngestTask(task)
    sub = core.ds.submission.get(task.submission.sid)
    assert len(sub.files) == 1
    assert len(sub.results) == 8
    assert len(sub.errors) == 0

    metrics.expect('ingester', 'submissions_ingested', 1)
    metrics.expect('ingester', 'submissions_completed', 1)
    metrics.expect('dispatcher', 'submissions_completed', 1)
    metrics.expect('dispatcher', 'files_completed', 2)
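A hypothetical sketch of the metrics fixture's expect helper, assuming it tallies counters published on the metrics channel and blocks until the expected count arrives (the project's real fixture may differ):

import time
from collections import Counter

import pytest

class MetricsWatcher:
    def __init__(self):
        self.counts = Counter()  # filled by a channel listener, not shown here

    def expect(self, component, metric, value, timeout=30):
        # Poll until the counter reaches the expected value, or fail the test.
        deadline = time.time() + timeout
        while self.counts[(component, metric)] < value:
            if time.time() > deadline:
                pytest.fail(f'{component}.{metric} never reached {value}')
            time.sleep(0.1)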
Example #8
    def run_once():
        counter.reset_mock()

        core.ingest_queue.push(
            SubmissionInput(
                dict(metadata={},
                     params=dict(
                         description="file abc123",
                         services=dict(selected=''),
                         submitter='user',
                         groups=['user'],
                     ),
                     notification=dict(queue='1', threshold=0),
                     files=[dict(sha256=sha, size=size,
                                 name='abc123')])).as_primitives())

        notification_queue = NamedQueue('nq-1', core.redis)
        first_task = notification_queue.pop(timeout=5)

        # The submission should get processed fully
        assert first_task is not None
        first_task = IngestTask(first_task)
        first_submission: Submission = core.ds.submission.get(
            first_task.submission.sid)
        assert first_submission.state == 'completed'
        assert len(first_submission.files) == 1
        assert len(first_submission.errors) == 0
        assert len(first_submission.results) == 4
        return first_submission.sid
Example #9
def test_dropping_early(core, metrics):
    # -------------------------------------------------------------------------------
    # This time have a file get marked for dropping by a service
    sha, size = ready_body(core, {'pre': {'result': {'drop_file': True}}})

    core.ingest_queue.push(
        SubmissionInput(
            dict(metadata={},
                 params=dict(description="file abc123",
                             services=dict(selected=''),
                             submitter='user',
                             groups=['user'],
                             max_extracted=10000),
                 notification=dict(queue='drop', threshold=0),
                 files=[dict(sha256=sha, size=size,
                             name='abc123')])).as_primitives())

    notification_queue = NamedQueue('nq-drop', core.redis)
    dropped_task = notification_queue.pop(timeout=RESPONSE_TIMEOUT)
    dropped_task = IngestTask(dropped_task)
    sub = core.ds.submission.get(dropped_task.submission.sid)
    assert len(sub.files) == 1
    assert len(sub.results) == 1

    metrics.expect('ingester', 'submissions_ingested', 1)
    metrics.expect('ingester', 'submissions_completed', 1)
    metrics.expect('dispatcher', 'submissions_completed', 1)
    metrics.expect('dispatcher', 'files_completed', 1)
Example #10
def test_service_retry_limit(core, metrics):
    # This time have the service 'crash'
    sha, size = ready_body(core, {'pre': {'drop': 3}})

    core.ingest_queue.push(
        SubmissionInput(
            dict(metadata={},
                 params=dict(description="file abc123",
                             services=dict(selected=''),
                             submitter='user',
                             groups=['user'],
                             max_extracted=10000),
                 notification=dict(queue='watcher-recover', threshold=0),
                 files=[dict(sha256=sha, size=size,
                             name='abc123')])).as_primitives())

    notification_queue = NamedQueue('nq-watcher-recover', core.redis)
    dropped_task = notification_queue.pop(timeout=RESPONSE_TIMEOUT)
    assert dropped_task
    dropped_task = IngestTask(dropped_task)
    sub = core.ds.submission.get(dropped_task.submission.sid)
    assert len(sub.errors) == 1
    assert len(sub.results) == 3
    assert core.pre_service.drops[sha] == 3
    assert core.pre_service.hits[sha] == 3

    # Wait until we get feedback from the metrics channel
    metrics.expect('ingester', 'submissions_ingested', 1)
    metrics.expect('ingester', 'submissions_completed', 1)
    metrics.expect('dispatcher', 'service_timeouts', 3)
    metrics.expect('service', 'fail_recoverable', 3)
    metrics.expect('service', 'fail_nonrecoverable', 1)
    metrics.expect('dispatcher', 'submissions_completed', 1)
    metrics.expect('dispatcher', 'files_completed', 1)
Example #11
def test_service_retry_limit(core):
    watch = WatcherServer(redis=core.redis, redis_persist=core.redis)
    watch.start()
    try:
        # This time have the service 'crash'
        sha, size = ready_body(core, {'pre': {'drop': 3}})

        core.ingest_queue.push(
            SubmissionInput(
                dict(metadata={},
                     params=dict(description="file abc123",
                                 services=dict(selected=''),
                                 submitter='user',
                                 groups=['user'],
                                 max_extracted=10000),
                     notification=dict(queue='watcher-recover', threshold=0),
                     files=[dict(sha256=sha, size=size,
                                 name='abc123')])).as_primitives())

        notification_queue = NamedQueue('nq-watcher-recover', core.redis)
        dropped_task = notification_queue.pop(timeout=16)
        assert dropped_task
        dropped_task = IngestTask(dropped_task)
        sub = core.ds.submission.get(dropped_task.submission.sid)
        assert len(sub.errors) == 1
        assert len(sub.results) == 3
        assert core.pre_service.drops[sha] == 3
        assert core.pre_service.hits[sha] == 3
    finally:
        watch.stop()
        watch.join()
Example #12
def test_plumber_clearing(core, metrics):
    global _global_semaphore
    _global_semaphore = threading.Semaphore(value=0)
    start = time.time()

    try:
        # Have the plumber cancel tasks
        sha, size = ready_body(core, {'pre': {'hold': 60}})

        core.ingest_queue.push(
            SubmissionInput(
                dict(metadata={},
                     params=dict(description="file abc123",
                                 services=dict(selected=''),
                                 submitter='user',
                                 groups=['user'],
                                 max_extracted=10000),
                     notification=dict(queue='test_plumber_clearing',
                                       threshold=0),
                     files=[dict(sha256=sha, size=size,
                                 name='abc123')])).as_primitives())

        metrics.expect('ingester', 'submissions_ingested', 1)
        service_queue = get_service_queue('pre', core.redis)

        start = time.time()
        while service_queue.length() < 1:
            if time.time() - start > RESPONSE_TIMEOUT:
                pytest.fail(f'Service queue never filled, length={service_queue.length()}')
            time.sleep(0.1)

        service_delta = core.ds.service_delta.get('pre')
        service_delta['enabled'] = False
        core.ds.service_delta.save('pre', service_delta)

        notification_queue = NamedQueue('nq-test_plumber_clearing', core.redis)
        dropped_task = notification_queue.pop(timeout=RESPONSE_TIMEOUT)
        dropped_task = IngestTask(dropped_task)
        sub = core.ds.submission.get(dropped_task.submission.sid)
        assert len(sub.files) == 1
        assert len(sub.results) == 3
        assert len(sub.errors) == 1
        error = core.ds.error.get(sub.errors[0])
        assert "disabled" in error.response.message

        metrics.expect('ingester', 'submissions_completed', 1)
        metrics.expect('dispatcher', 'submissions_completed', 1)
        metrics.expect('dispatcher', 'files_completed', 1)
        metrics.expect('service', 'fail_recoverable', 1)

    finally:
        _global_semaphore.release()
        service_delta = core.ds.service_delta.get('pre')
        service_delta['enabled'] = True
        core.ds.service_delta.save('pre', service_delta)
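The busy-wait on the service queue above recurs in several of these tests. A small helper capturing the pattern (a sketch; wait_for_queue is not a name from the source):

import time

import pytest

def wait_for_queue(queue, minimum=1, timeout=30):
    # Poll the queue length until it reaches `minimum`, failing on timeout.
    deadline = time.time() + timeout
    while queue.length() < minimum:
        if time.time() > deadline:
            pytest.fail(f'queue stuck at length {queue.length()}')
        time.sleep(0.1)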
Example #13
def test_submit_duplicate(submit_harness):
    datastore, submitter = submit_harness

    # a normal ingest task
    task = IngestTask({
        'submission': {
            'params':
            SubmissionParams({
                'classification': 'U',
                'description': 'file abc',
                'services': {
                    'selected': [],
                    'excluded': [],
                    'resubmit': [],
                },
                'submitter': 'user',
            }),
            'files': [{
                'sha256': '0' * 64,
                'size': 100,
                'name': 'abc',
            }],
            'metadata': {}
        },
        'ingest_id': 'abc123'
    })
    # Make sure the scan key is correct, this is normally done on ingest
    task.submission.scan_key = task.params.create_filescore_key(
        task.submission.files[0].sha256, [])

    # Add this file to the scanning table so it looks like it has already been submitted, then ingest it again
    submitter.scanning.add(task.submission.scan_key, task.as_primitives())
    submitter.unique_queue.push(0, task.as_primitives())

    submitter.handle_submit()

    # No tasks should be left in the queue
    assert submitter.unique_queue.pop() is None
    # The task should have been pushed to the duplicates queue
    assert submitter.duplicate_queue.length(_dup_prefix +
                                            task.submission.scan_key) == 1
Example #14
def test_ingest_simple(ingest_harness):
    datastore, ingester, in_queue = ingest_harness

    user = random_minimal_obj(User)
    user.name = 'user'
    custom_user_groups = ['users', 'the_user']
    user.groups = list(custom_user_groups)
    datastore.user.save('user', user)

    # Let the ingest loop run an extra time because we send two messages
    ingester.running.counter += 1

    # Send a message with a garbled sha, this should be dropped
    in_queue.push(make_message(files={'sha256': '1' * 10}))

    with pytest.raises(ValueError):
        # Process garbled message
        ingester.try_run(volatile=True)

    # Send a message that is fine, but has an illegal metadata field
    in_queue.push(
        make_message(dict(
            metadata={
                'tobig': 'a' * (ingester.ingester.config.submission.max_metadata_length + 2),
                'small': '100'
            }),
                     params={
                         'submitter': 'user',
                         'groups': []
                     }))

    # Process the remaining valid message
    ingester.try_run(volatile=True)

    mm = ingester.ingester
    # The only task that makes it through should fit these parameters
    task = mm.unique_queue.pop()
    assert task
    task = IngestTask(task)
    assert task.submission.files[0].sha256 == '0' * 64  # Only the valid sha passed through
    assert 'tobig' not in task.submission.metadata  # The bad metadata was stripped
    assert task.submission.metadata['small'] == '100'  # The valid metadata is unchanged
    assert task.submission.params.submitter == 'user'
    assert task.submission.params.groups == custom_user_groups

    # None of the other tasks should reach the end
    assert mm.unique_queue.length() == 0
    assert mm.ingest_queue.length() == 0
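Judging by its call sites, make_message builds a minimal valid submission dict and lets callers override parts of it. A hypothetical sketch (the real helper may differ):

def make_message(message=None, files=None, params=None):
    # Baseline valid submission; overrides are merged in shallowly.
    msg = {
        'metadata': {},
        'params': {
            'description': 'file abc',
            'services': {'selected': []},
            'submitter': 'user',
            'groups': ['users'],
        },
        'files': [{'sha256': '0' * 64, 'size': 100, 'name': 'abc'}],
        'notification': {'queue': 'nq', 'threshold': 0},
    }
    msg.update(message or {})
    if files:
        msg['files'][0].update(files)
    if params:
        msg['params'].update(params)
    return msg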
Example #15
def test_existing_score(submit_harness):
    datastore, submitter = submit_harness
    get_if_exists = datastore.filescore.get_if_exists
    try:
        # Make every filescore lookup return an existing score
        datastore.filescore.get_if_exists = mock.MagicMock(
            return_value=FileScore(
                dict(psid='000',
                     expiry_ts=0,
                     errors=0,
                     score=10,
                     sid='000',
                     time=time.time())))

        # add task to internal queue
        submitter.unique_queue.push(
            0,
            IngestTask({
                'submission': {
                    'params':
                    SubmissionParams({
                        'classification': 'U',
                        'description': 'file abc',
                        'services': {
                            'selected': [],
                            'excluded': [],
                            'resubmit': [],
                        },
                        'submitter': 'user',
                    }),
                    'files': [{
                        'sha256': '0' * 64,
                        'size': 100,
                        'name': 'abc',
                    }],
                    'metadata': {},
                    'notification': {
                        'queue': 'our_queue'
                    }
                },
                'ingest_id': 'abc123'
            }).as_primitives())

        submitter.handle_submit()

        # No tasks should be left in the queue
        assert submitter.unique_queue.pop() is None
        # We should have received a notification about our task, since it was already 'done'
        assert submitter.notification_queues['nq-our_queue'].length() == 1
    finally:
        datastore.filescore.get_if_exists = get_if_exists
Example #16
def test_plumber_clearing(core):
    global _global_semaphore
    _global_semaphore = threading.Semaphore(value=0)

    start = time.time()
    watch = WatcherServer(redis=core.redis, redis_persist=core.redis)
    watch.start()

    try:
        # Have the plumber cancel tasks
        sha, size = ready_body(core, {'pre': {'semaphore': 60}})

        core.ingest_queue.push(
            SubmissionInput(
                dict(metadata={},
                     params=dict(description="file abc123",
                                 services=dict(selected=''),
                                 submitter='user',
                                 groups=['user'],
                                 max_extracted=10000),
                     notification=dict(queue='test_plumber_clearing',
                                       threshold=0),
                     files=[dict(sha256=sha, size=size,
                                 name='abc123')])).as_primitives())

        service_queue = get_service_queue('pre', core.redis)
        time.sleep(0.5)
        while service_queue.length() == 0 and time.time() - start < 20:
            time.sleep(0.1)

        service_delta = core.ds.service_delta.get('pre')
        service_delta['enabled'] = False
        core.ds.service_delta.save('pre', service_delta)

        notification_queue = NamedQueue('nq-test_plumber_clearing', core.redis)
        dropped_task = notification_queue.pop(timeout=5)
        dropped_task = IngestTask(dropped_task)
        sub = core.ds.submission.get(dropped_task.submission.sid)
        assert len(sub.files) == 1
        assert len(sub.results) == 3
        assert len(sub.errors) == 1

        error = core.ds.error.get(sub.errors[0])
        assert "disabled" in error.response.message
    finally:
        _global_semaphore.release()
        service_delta = core.ds.service_delta.get('pre')
        service_delta['enabled'] = True
        core.ds.service_delta.save('pre', service_delta)
        watch.stop()
        watch.join()
Example #17
def test_service_error(core, metrics):
    # -------------------------------------------------------------------------------
    # Have a service produce an error
    # -------------------------------------------------------------------------------
    sha, size = ready_body(
        core, {
            'core-a': {
                'error': {
                    'archive_ts': time.time() + 250,
                    'sha256': 'a' * 64,
                    'response': {
                        'message': 'words',
                        'status': 'FAIL_NONRECOVERABLE',
                        'service_name': 'core-a',
                        'service_tool_version': 0,
                        'service_version': '0'
                    },
                    'expiry_ts': time.time() + 500
                },
                'failure': True,
            }
        })

    core.ingest_queue.push(
        SubmissionInput(
            dict(metadata={},
                 params=dict(description="file abc123",
                             services=dict(selected=''),
                             submitter='user',
                             groups=['user'],
                             max_extracted=10000),
                 notification=dict(queue='error', threshold=0),
                 files=[dict(sha256=sha, size=size,
                             name='abc123')])).as_primitives())

    notification_queue = NamedQueue('nq-error', core.redis)
    task = IngestTask(notification_queue.pop(timeout=RESPONSE_TIMEOUT))
    sub = core.ds.submission.get(task.submission.sid)
    assert len(sub.files) == 1
    assert len(sub.results) == 3
    assert len(sub.errors) == 1

    metrics.expect('ingester', 'submissions_ingested', 1)
    metrics.expect('ingester', 'submissions_completed', 1)
    metrics.expect('dispatcher', 'submissions_completed', 1)
    metrics.expect('dispatcher', 'files_completed', 1)
Example #18
def test_ingest_groups_custom(ingest_harness):
    datastore, ingester, in_queue = ingest_harness

    user = random_minimal_obj(User)
    user.name = 'user'
    custom_user_groups = ['users', 'the_user']
    user.groups = list(custom_user_groups)
    datastore.user.save('user', user)

    in_queue.push(
        make_message(params={
            'submitter': 'user',
            'groups': ['group_b']
        }))
    ingester.handle_ingest()

    task = ingester.unique_queue.pop()
    assert task
    task = IngestTask(task)
    assert task.submission.params.submitter == 'user'
    assert task.submission.params.groups == ['group_b']
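Taken together with test_ingest_simple above, this pins down a simple defaulting rule for groups. A sketch of the behaviour the two tests encode (resolve_groups is illustrative, not a source name):

def resolve_groups(requested_groups, user):
    # An empty list means "use the submitting user's own groups";
    # an explicit list is passed through untouched.
    return list(user.groups) if not requested_groups else list(requested_groups)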
Example #19
    def try_run(self, volatile=False):
        ingester = self.ingester
        logger = self.log

        time_mark, cpu_mark = time.time(), time.process_time()

        while self.running:
            # noinspection PyBroadException
            try:
                self.heartbeat()
                ingester.counter.increment_execution_time(
                    'cpu_seconds',
                    time.process_time() - cpu_mark)
                ingester.counter.increment_execution_time(
                    'busy_seconds',
                    time.time() - time_mark)

                # Check if there is room for more submissions
                length = ingester.scanning.length()
                if length >= ingester.config.core.ingester.max_inflight:
                    time.sleep(0.1)
                    time_mark, cpu_mark = time.time(), time.process_time()
                    continue

                raw = ingester.unique_queue.pop()
                if not raw:
                    time.sleep(0.1)
                    time_mark, cpu_mark = time.time(), time.process_time()
                    continue

                # Start timing 'busy' time, we reset this above after the sleeps so that the sleeps
                # don't get counted as busy
                time_mark, cpu_mark = time.time(), time.process_time()

                # Start of ingest message
                if self.apm_client:
                    self.apm_client.begin_transaction('ingest_msg')

                task = IngestTask(raw)

                if any(len(file.sha256) != 64 for file in task.submission.files):
                    logger.error("Malformed entry on submission queue: %s",
                                 task.ingest_id)
                    # End of ingest message (invalid_hash)
                    if self.apm_client:
                        self.apm_client.end_transaction(
                            'ingest_submit', 'invalid_hash')
                    continue

                # If between the initial ingestion and now the drop/whitelist status
                # of this submission has changed, then drop it now
                if ingester.drop(task):
                    # End of ingest message (dropped)
                    if self.apm_client:
                        self.apm_client.end_transaction(
                            'ingest_submit', 'dropped')
                    continue

                if ingester.is_whitelisted(task):
                    # End of ingest message (whitelisted)
                    if self.apm_client:
                        self.apm_client.end_transaction(
                            'ingest_submit', 'whitelisted')
                    continue

                # Check if this file has been previously processed.
                pprevious, previous, score, scan_key = None, False, None, None
                if not task.submission.params.ignore_cache:
                    pprevious, previous, score, scan_key = ingester.check(task)
                else:
                    scan_key = ingester.stamp_filescore_key(task)

                # If it HAS been previously processed, we are dealing with a resubmission
                # finalize will decide what to do, and put the task back in the queue
                # rewritten properly if we are going to run it again
                if previous:
                    if not task.submission.params.services.resubmit and not pprevious:
                        logger.warning(
                            f"No psid for what looks like a resubmission of "
                            f"{task.submission.files[0].sha256}: {scan_key}")
                    ingester.finalize(pprevious, previous, score, task)
                    # End of ingest message (finalized)
                    if self.apm_client:
                        self.apm_client.end_transaction(
                            'ingest_submit', 'finalized')

                    continue

                # We have decided this file is worth processing

                # Add the task to the scanning table; this is atomic across all
                # submit workers, so if it fails someone beat us to the punch and
                # we record the file as a duplicate.
                if not ingester.scanning.add(scan_key, task.as_primitives()):
                    logger.debug('Duplicate %s',
                                 task.submission.files[0].sha256)
                    ingester.counter.increment('duplicates')
                    ingester.duplicate_queue.push(_dup_prefix + scan_key,
                                                  task.as_primitives())
                    # End of ingest message (duplicate)
                    if self.apm_client:
                        self.apm_client.end_transaction(
                            'ingest_submit', 'duplicate')

                    continue

                # We have managed to add the task to the scan table, so now we go
                # ahead with the submission process
                try:
                    ingester.submit(task)
                    # End of ingest message (submitted)
                    if self.apm_client:
                        self.apm_client.end_transaction(
                            'ingest_submit', 'submitted')

                    continue
                except Exception as _ex:
                    # For some reason (contained in `ex`) we have failed the submission
                    # The rest of this function is error handling/recovery
                    ex = _ex
                    traceback = _ex.__traceback__

                ingester.counter.increment('error')

                should_retry = True
                if isinstance(ex, CorruptedFileStoreException):
                    logger.error(
                        "Submission for file '%s' failed due to corrupted filestore: %s"
                        % (task.sha256, str(ex)))
                    should_retry = False
                elif isinstance(ex, DataStoreException):
                    trace = exceptions.get_stacktrace_info(ex)
                    logger.error(
                        "Submission for file '%s' failed due to data store error:\n%s"
                        % (task.sha256, trace))
                elif not isinstance(ex, FileStoreException):
                    trace = exceptions.get_stacktrace_info(ex)
                    logger.error("Submission for file '%s' failed: %s" %
                                 (task.sha256, trace))

                raw_task = ingester.scanning.pop(scan_key)
                if not raw_task:
                    logger.error('No scanning entry for %s', task.sha256)
                    # End of ingest message (no_scan_entry)
                    if self.apm_client:
                        self.apm_client.end_transaction(
                            'ingest_submit', 'no_scan_entry')

                    continue
                task = IngestTask(raw_task)

                if not should_retry:
                    # End of ingest message (cannot_retry)
                    if self.apm_client:
                        self.apm_client.end_transaction(
                            'ingest_submit', 'cannot_retry')

                    continue

                ingester.retry(task, scan_key, ex)
                # End of ingest message (retry)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit', 'retried')

                if volatile:
                    raise ex.with_traceback(traceback)

            except Exception:
                logger.exception("Unexpected error")
                # End of ingest message (exception)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_submit',
                                                    'exception')

                if volatile:
                    raise
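Each exit path of the loop above ends its APM transaction with a distinct outcome name; collected here for reference:

# Outcomes emitted by the submit worker loop above.
SUBMIT_OUTCOMES = [
    'invalid_hash', 'dropped', 'whitelisted', 'finalized', 'duplicate',
    'submitted', 'no_scan_entry', 'cannot_retry', 'retried', 'exception',
]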
Example #20
    def try_run(self, volatile=False):
        ingester = self.ingester
        cpu_mark = time.process_time()
        time_mark = time.time()

        # Move from ingest to unique and waiting queues.
        # While there are entries in the ingest queue we consume chunk_size
        # entries at a time and move unique entries to uniqueq / queued and
        # duplicates to their own queues / waiting.
        while self.running:
            self.heartbeat()
            while True:
                result = ingester.complete_queue.pop(blocking=False)
                if not result:
                    break
                # Start of ingest message
                if self.apm_client:
                    self.apm_client.begin_transaction('ingest_msg')

                sub = Submission(result)
                ingester.completed(sub)

                # End of ingest message (success)
                if self.apm_client:
                    elasticapm.tag(sid=sub.sid)
                    self.apm_client.end_transaction('ingest_complete', 'success')

            ingester.counter.increment_execution_time('cpu_seconds', time.process_time() - cpu_mark)
            ingester.counter.increment_execution_time('busy_seconds', time.time() - time_mark)

            message = ingester.ingest_queue.pop(timeout=1)

            cpu_mark = time.process_time()
            time_mark = time.time()

            if not message:
                continue

            # Start of ingest message
            if self.apm_client:
                self.apm_client.begin_transaction('ingest_msg')

            try:
                sub = SubmissionInput(message)
                # Write all input to the traffic queue
                ingester.traffic_queue.publish(SubmissionMessage({
                    'msg': sub,
                    'msg_type': 'SubmissionIngested',
                    'sender': 'ingester',
                }).as_primitives())

                task = IngestTask(dict(
                    submission=sub,
                    ingest_id=sub.sid,
                ))
                task.submission.sid = None  # Reset to new random uuid

            except (ValueError, TypeError) as error:
                self.log.exception(f"Dropped ingest submission {message} because {str(error)}")

                # End of ingest message (value_error)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_input', 'value_error')

                if volatile:
                    raise
                continue

            if any(len(file.sha256) != 64 for file in task.submission.files):
                self.log.error(f"Invalid sha256: {[file.sha256 for file in task.submission.files]}")

                # End of ingest message (invalid_hash)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_input', 'invalid_hash')

                continue

            for file in task.submission.files:
                file.sha256 = file.sha256.lower()

            ingester.ingest(task)

            # End of ingest message (success)
            if self.apm_client:
                self.apm_client.end_transaction('ingest_input', 'success')
Example #21
def test_deduplication(core):
    # -------------------------------------------------------------------------------
    # Submit two identical jobs, check that they get deduped by ingester
    sha, size = ready_body(core)

    for _ in range(2):
        core.ingest_queue.push(
            SubmissionInput(
                dict(metadata={},
                     params=dict(
                         description="file abc123",
                         services=dict(selected=''),
                         submitter='user',
                         groups=['user'],
                     ),
                     notification=dict(queue='output-queue-one', threshold=0),
                     files=[dict(sha256=sha, size=size,
                                 name='abc123')])).as_primitives())

    notification_queue = NamedQueue('nq-output-queue-one', core.redis)
    first_task = notification_queue.pop(timeout=5)
    second_task = notification_queue.pop(timeout=5)

    # One of the submissions will get processed fully
    assert first_task is not None
    first_task = IngestTask(first_task)
    first_submission: Submission = core.ds.submission.get(
        first_task.submission.sid)
    assert first_submission.state == 'completed'
    assert len(first_submission.files) == 1
    assert len(first_submission.errors) == 0
    assert len(first_submission.results) == 4

    # The other will get processed as a duplicate
    # (Which one is the 'real' one and which is the duplicate isn't important for our purposes)
    second_task = IngestTask(second_task)
    assert second_task.submission.sid == first_task.submission.sid

    # -------------------------------------------------------------------------------
    # Submit the same body, but change a parameter so the cache key misses.
    core.ingest_queue.push(
        SubmissionInput(
            dict(metadata={},
                 params=dict(description="file abc123",
                             services=dict(selected=''),
                             submitter='user',
                             groups=['user'],
                             max_extracted=10000),
                 notification=dict(queue='2', threshold=0),
                 files=[dict(sha256=sha, size=size,
                             name='abc123')])).as_primitives())

    notification_queue = NamedQueue('nq-2', core.redis)
    third_task = notification_queue.pop(timeout=5)
    assert third_task

    # The third task should not be deduplicated by ingester, so will have a different submission
    third_task = IngestTask(third_task)
    third_submission: Submission = core.ds.submission.get(
        third_task.submission.sid)
    assert third_submission.state == 'completed'
    assert first_submission.sid != third_submission.sid
    assert len(third_submission.files) == 1
    assert len(third_submission.results) == 4
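The final cache miss works because the filescore key is derived from the file hash plus the submission parameters. A sketch of the check, reusing SubmissionParams and create_filescore_key from the examples above and assuming max_extracted participates in the key (which is what this test relies on):

base = {
    'classification': 'U',
    'description': 'file abc123',
    'services': {'selected': [], 'excluded': [], 'resubmit': []},
    'submitter': 'user',
}
params_a = SubmissionParams(dict(base))
params_b = SubmissionParams(dict(base, max_extracted=10000))
# Different parameters yield different keys, so no deduplication (sketch).
assert params_a.create_filescore_key('0' * 64, []) != params_b.create_filescore_key('0' * 64, [])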