Example #1
    def run_once():
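        # Fragment of a larger test: counter, sha and size come from the enclosing scope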
        counter.reset_mock()

        core.ingest_queue.push(
            SubmissionInput(
                dict(metadata={},
                     params=dict(
                         description="file abc123",
                         services=dict(selected=''),
                         submitter='user',
                         groups=['user'],
                     ),
                     notification=dict(queue='1', threshold=0),
                     files=[dict(sha256=sha, size=size,
                                 name='abc123')])).as_primitives())

        notification_queue = NamedQueue('nq-1', core.redis)
        first_task = notification_queue.pop(timeout=5)

        # The submission should get processed fully
        assert first_task is not None
        first_task = IngestTask(first_task)
        first_submission: Submission = core.ds.submission.get(
            first_task.submission.sid)
        assert first_submission.state == 'completed'
        assert len(first_submission.files) == 1
        assert len(first_submission.errors) == 0
        assert len(first_submission.results) == 4
        return first_submission.sid
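Every example here repeats the same round trip: push a SubmissionInput onto core.ingest_queue, pop the matching nq-<queue> notification, wrap it in an IngestTask and fetch the Submission from the datastore. The tests also lean on two fixtures not shown in these excerpts; judging only from their call sites, ready_body(core, config) and ready_extract(core, children) each store a file and return its (sha256, size). A hypothetical helper capturing the shared round trip (the name, defaults and assert message are mine; everything else mirrors the code above):

def ingest_and_wait(core, sha, size, queue, timeout=10, **params):
    """Hypothetical helper (not in the codebase): push one submission and
    block on its notification queue, returning the stored Submission."""
    core.ingest_queue.push(
        SubmissionInput(dict(
            metadata={},
            params=dict(description="file abc123",
                        services=dict(selected=''),
                        submitter='user',
                        groups=['user'],
                        **params),
            notification=dict(queue=queue, threshold=0),
            files=[dict(sha256=sha, size=size, name='abc123')],
        )).as_primitives())

    raw = NamedQueue(f'nq-{queue}', core.redis).pop(timeout=timeout)
    assert raw is not None, f'no notification received on nq-{queue}'
    return core.ds.submission.get(IngestTask(raw).submission.sid)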
Example #2
def test_max_extracted_in_one(core):
    # Make a set of files that is bigger than max_extracted (3 in this case)
    children = [ready_body(core)[0] for _ in range(5)]
    sha, size = ready_extract(core, children)

    core.ingest_queue.push(
        SubmissionInput(
            dict(metadata={},
                 params=dict(description="file abc123",
                             services=dict(selected=''),
                             submitter='user',
                             groups=['user'],
                             max_extracted=3),
                 notification=dict(queue='test-extracted-in-one', threshold=0),
                 files=[dict(sha256=sha, size=size,
                             name='abc123')])).as_primitives())

    notification_queue = NamedQueue('nq-test-extracted-in-one', core.redis)
    start = time.time()
    task = notification_queue.pop(timeout=10)
    print("notification time waited", time.time() - start)
    assert task is not None
    task = IngestTask(task)
    sub: Submission = core.ds.submission.get(task.submission.sid)
    assert len(sub.files) == 1
    # 4 services each produce one result for the original file and for the
    # 3 children that fit under max_extracted
    assert len(sub.results) == 4 * (1 + 3)
    assert len(sub.errors) == 2  # the 2 children past max_extracted errored out
Example #3
def test_max_extracted_in_several(core):
    # Make a non-trivial tree of files that adds up to more than 3 (max_extracted) files
    children = [
        ready_extract(
            core,
            [ready_body(core)[0], ready_body(core)[0]])[0],
        ready_extract(
            core,
            [ready_body(core)[0], ready_body(core)[0]])[0]
    ]
    sha, size = ready_extract(core, children)

    core.ingest_queue.push(
        SubmissionInput(
            dict(metadata={},
                 params=dict(description="file abc123",
                             services=dict(selected=''),
                             submitter='user',
                             groups=['user'],
                             max_extracted=3),
                 notification=dict(queue='test-extracted-in-several',
                                   threshold=0),
                 files=[dict(sha256=sha, size=size,
                             name='abc123')])).as_primitives())

    notification_queue = NamedQueue('nq-test-extracted-in-several', core.redis)
    task = IngestTask(notification_queue.pop(timeout=10))
    sub: Submission = core.ds.submission.get(task.submission.sid)
    assert len(sub.files) == 1
    # We should only get results for files up to the max_extracted limit
    assert len(sub.results) == 4 * (1 + 3)  # 4 services, 1 original file, 3 extracted files
    assert len(sub.errors) == 3  # the 3 descendants past max_extracted errored out
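For reference, the tree built above has six descendants, so max_extracted=3 admits exactly half of them (shape inferred from the ready_extract calls):

# root ─┬─ child_1 ─┬─ leaf
#       │           └─ leaf
#       └─ child_2 ─┬─ leaf
#                   └─ leaf
#
# 6 descendants in total; max_extracted=3 admits 3, the other 3 become errors.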
Example #4
def test_service_retry_limit(core):
    watch = WatcherServer(redis=core.redis, redis_persist=core.redis)
    watch.start()
    try:
        # This time have the service 'crash'
        sha, size = ready_body(core, {'pre': {'drop': 3}})

        core.ingest_queue.push(
            SubmissionInput(
                dict(metadata={},
                     params=dict(description="file abc123",
                                 services=dict(selected=''),
                                 submitter='user',
                                 groups=['user'],
                                 max_extracted=10000),
                     notification=dict(queue='watcher-recover', threshold=0),
                     files=[dict(sha256=sha, size=size,
                                 name='abc123')])).as_primitives())

        notification_queue = NamedQueue('nq-watcher-recover', core.redis)
        dropped_task = notification_queue.pop(timeout=16)
        assert dropped_task
        dropped_task = IngestTask(dropped_task)
        sub = core.ds.submission.get(dropped_task.submission.sid)
        assert len(sub.errors) == 1  # the crashing service hit its retry limit
        assert len(sub.results) == 3  # the other three services still completed
        assert core.pre_service.drops[sha] == 3
        assert core.pre_service.hits[sha] == 3
    finally:
        watch.stop()
        watch.join()
Example #5
def test_depth_limit(core):
    # Make a nested set of files that goes deeper than the max depth by one
    sha, size = ready_body(core)
    for _ in range(core.config.submission.max_extraction_depth + 1):
        sha, size = ready_extract(core, sha)

    core.ingest_queue.push(
        SubmissionInput(
            dict(
                metadata={},
                params=dict(
                    description="file abc123",
                    services=dict(selected=''),
                    submitter='user',
                    groups=['user'],
                    # Make sure we can extract enough files that we will definitely hit the depth limit first
                    max_extracted=core.config.submission.max_extraction_depth + 10),
                notification=dict(queue='test-depth-limit', threshold=0),
                files=[dict(sha256=sha, size=size,
                            name='abc123')])).as_primitives())

    notification_queue = NamedQueue('nq-test-depth-limit', core.redis)
    start = time.time()
    task = notification_queue.pop(timeout=10)
    print("notification time waited", time.time() - start)
    assert task is not None
    task = IngestTask(task)
    sub: Submission = core.ds.submission.get(task.submission.sid)
    assert len(sub.files) == 1
    # We should only get results for each file up to the max depth
    assert len(sub.results) == 4 * core.config.submission.max_extraction_depth
    assert len(sub.errors) == 1
Example #6
def test_extracted_file(core, metrics):
    sha, size = ready_extract(core, ready_body(core)[0])

    core.ingest_queue.push(
        SubmissionInput(
            dict(metadata={},
                 params=dict(description="file abc123",
                             services=dict(selected=''),
                             submitter='user',
                             groups=['user'],
                             max_extracted=10000),
                 notification=dict(queue='test-extracted-file', threshold=0),
                 files=[dict(sha256=sha, size=size,
                             name='abc123')])).as_primitives())

    notification_queue = NamedQueue('nq-test-extracted-file', core.redis)
    task = notification_queue.pop(timeout=RESPONSE_TIMEOUT)
    assert task
    task = IngestTask(task)
    sub = core.ds.submission.get(task.submission.sid)
    assert len(sub.files) == 1
    assert len(sub.results) == 8  # 4 services x 2 files (original + extracted child)
    assert len(sub.errors) == 0

    metrics.expect('ingester', 'submissions_ingested', 1)
    metrics.expect('ingester', 'submissions_completed', 1)
    metrics.expect('dispatcher', 'submissions_completed', 1)
    metrics.expect('dispatcher', 'files_completed', 2)
Example #7
def test_dropping_early(core, metrics):
    # -------------------------------------------------------------------------------
    # This time, have the file marked for dropping by a service
    sha, size = ready_body(core, {'pre': {'result': {'drop_file': True}}})

    core.ingest_queue.push(
        SubmissionInput(
            dict(metadata={},
                 params=dict(description="file abc123",
                             services=dict(selected=''),
                             submitter='user',
                             groups=['user'],
                             max_extracted=10000),
                 notification=dict(queue='drop', threshold=0),
                 files=[dict(sha256=sha, size=size,
                             name='abc123')])).as_primitives())

    notification_queue = NamedQueue('nq-drop', core.redis)
    dropped_task = notification_queue.pop(timeout=RESPONSE_TIMEOUT)
    assert dropped_task
    dropped_task = IngestTask(dropped_task)
    sub = core.ds.submission.get(dropped_task.submission.sid)
    assert len(sub.files) == 1
    assert len(sub.results) == 1  # the file was dropped after a single service result

    metrics.expect('ingester', 'submissions_ingested', 1)
    metrics.expect('ingester', 'submissions_completed', 1)
    metrics.expect('dispatcher', 'submissions_completed', 1)
    metrics.expect('dispatcher', 'files_completed', 1)
Example #8
def test_service_retry_limit(core, metrics):
    # This time have the service 'crash'
    sha, size = ready_body(core, {'pre': {'drop': 3}})

    core.ingest_queue.push(
        SubmissionInput(
            dict(metadata={},
                 params=dict(description="file abc123",
                             services=dict(selected=''),
                             submitter='user',
                             groups=['user'],
                             max_extracted=10000),
                 notification=dict(queue='watcher-recover', threshold=0),
                 files=[dict(sha256=sha, size=size,
                             name='abc123')])).as_primitives())

    notification_queue = NamedQueue('nq-watcher-recover', core.redis)
    dropped_task = notification_queue.pop(timeout=RESPONSE_TIMEOUT)
    assert dropped_task
    dropped_task = IngestTask(dropped_task)
    sub = core.ds.submission.get(dropped_task.submission.sid)
    assert len(sub.errors) == 1
    assert len(sub.results) == 3
    assert core.pre_service.drops[sha] == 3
    assert core.pre_service.hits[sha] == 3

    # Wait until we get feedback from the metrics channel
    metrics.expect('ingester', 'submissions_ingested', 1)
    metrics.expect('ingester', 'submissions_completed', 1)
    metrics.expect('dispatcher', 'service_timeouts', 3)
    metrics.expect('service', 'fail_recoverable', 3)
    metrics.expect('service', 'fail_nonrecoverable', 1)
    metrics.expect('dispatcher', 'submissions_completed', 1)
    metrics.expect('dispatcher', 'files_completed', 1)
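Reading the metrics together with the asserts above gives the retry arithmetic (my summary, not part of the test):

# hits == drops == 3: the 'pre' service was attempted three times and
# crashed three times -> 3 dispatcher service_timeouts, 3 fail_recoverable.
# The retry limit then converts the task into a single nonrecoverable
# error instead of a fourth attempt: fail_nonrecoverable == 1 and
# sub.errors == 1, while the other three services account for sub.results == 3.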
Example #9
def test_ingest_retry(core: CoreSession, metrics):
    # -------------------------------------------------------------------------------
    # Make the first ingester.submit call fail; the ingester should retry it
    sha, size = ready_body(core)
    original_retry_delay = assemblyline_core.ingester.ingester._retry_delay
    assemblyline_core.ingester.ingester._retry_delay = 1

    attempts = []
    failures = []
    original_submit = core.ingest.submit

    def fail_once(task):
        attempts.append(task)
        if len(attempts) > 1:
            original_submit(task)
        else:
            failures.append(task)
            raise ValueError()

    core.ingest.submit = fail_once

    try:
        core.ingest_queue.push(
            SubmissionInput(
                dict(metadata={},
                     params=dict(
                         description="file abc123",
                         services=dict(selected=''),
                         submitter='user',
                         groups=['user'],
                     ),
                     notification=dict(queue='output-queue-one', threshold=0),
                     files=[dict(sha256=sha, size=size,
                                 name='abc123')])).as_primitives())

        notification_queue = NamedQueue('nq-output-queue-one', core.redis)
        first_task = notification_queue.pop(timeout=RESPONSE_TIMEOUT)

        # The submission should be processed fully despite the failed first attempt
        assert first_task is not None
        first_task = IngestTask(first_task)
        first_submission: Submission = core.ds.submission.get(
            first_task.submission.sid)
        assert len(attempts) == 2
        assert len(failures) == 1
        assert first_submission.state == 'completed'
        assert len(first_submission.files) == 1
        assert len(first_submission.errors) == 0
        assert len(first_submission.results) == 4

        metrics.expect('ingester', 'submissions_ingested', 1)
        metrics.expect('ingester', 'submissions_completed', 1)
        metrics.expect('ingester', 'files_completed', 1)
        metrics.expect('ingester', 'duplicates', 0)
        metrics.expect('dispatcher', 'submissions_completed', 1)
        metrics.expect('dispatcher', 'files_completed', 1)

    finally:
        core.ingest.submit = original_submit
        assemblyline_core.ingester.ingester._retry_delay = original_retry_delay
Example #10
def test_plumber_clearing(core, metrics):
    global _global_semaphore
    _global_semaphore = threading.Semaphore(value=0)
    start = time.time()

    try:
        # Have the plumber cancel tasks
        sha, size = ready_body(core, {'pre': {'hold': 60}})

        core.ingest_queue.push(
            SubmissionInput(
                dict(metadata={},
                     params=dict(description="file abc123",
                                 services=dict(selected=''),
                                 submitter='user',
                                 groups=['user'],
                                 max_extracted=10000),
                     notification=dict(queue='test_plumber_clearing',
                                       threshold=0),
                     files=[dict(sha256=sha, size=size,
                                 name='abc123')])).as_primitives())

        metrics.expect('ingester', 'submissions_ingested', 1)
        service_queue = get_service_queue('pre', core.redis)

        start = time.time()
        while service_queue.length() < 1:
            if time.time() - start > RESPONSE_TIMEOUT:
                pytest.fail(f'Service queue never filled (length {service_queue.length()})')
            time.sleep(0.1)

        service_delta = core.ds.service_delta.get('pre')
        service_delta['enabled'] = False
        core.ds.service_delta.save('pre', service_delta)

        notification_queue = NamedQueue('nq-test_plumber_clearing', core.redis)
        dropped_task = notification_queue.pop(timeout=RESPONSE_TIMEOUT)
        dropped_task = IngestTask(dropped_task)
        sub = core.ds.submission.get(dropped_task.submission.sid)
        assert len(sub.files) == 1
        assert len(sub.results) == 3
        assert len(sub.errors) == 1
        error = core.ds.error.get(sub.errors[0])
        assert "disabled" in error.response.message

        metrics.expect('ingester', 'submissions_completed', 1)
        metrics.expect('dispatcher', 'submissions_completed', 1)
        metrics.expect('dispatcher', 'files_completed', 1)
        metrics.expect('service', 'fail_recoverable', 1)

    finally:
        _global_semaphore.release()
        service_delta = core.ds.service_delta.get('pre')
        service_delta['enabled'] = True
        core.ds.service_delta.save('pre', service_delta)
Example #11
def test_plumber_clearing(core):
    global _global_semaphore
    _global_semaphore = threading.Semaphore(value=0)

    start = time.time()
    watch = WatcherServer(redis=core.redis, redis_persist=core.redis)
    watch.start()

    try:
        # Have the plumber cancel tasks
        sha, size = ready_body(core, {'pre': {'semaphore': 60}})

        core.ingest_queue.push(
            SubmissionInput(
                dict(metadata={},
                     params=dict(description="file abc123",
                                 services=dict(selected=''),
                                 submitter='user',
                                 groups=['user'],
                                 max_extracted=10000),
                     notification=dict(queue='test_plumber_clearing',
                                       threshold=0),
                     files=[dict(sha256=sha, size=size,
                                 name='abc123')])).as_primitives())

        service_queue = get_service_queue('pre', core.redis)
        time.sleep(0.5)
        while service_queue.length() == 0 and time.time() - start < 20:
            time.sleep(0.1)

        service_delta = core.ds.service_delta.get('pre')
        service_delta['enabled'] = False
        core.ds.service_delta.save('pre', service_delta)

        notification_queue = NamedQueue('nq-test_plumber_clearing', core.redis)
        dropped_task = notification_queue.pop(timeout=5)
        dropped_task = IngestTask(dropped_task)
        sub = core.ds.submission.get(dropped_task.submission.sid)
        assert len(sub.files) == 1
        assert len(sub.results) == 3
        assert len(sub.errors) == 1

        error = core.ds.error.get(sub.errors[0])
        assert "disabled" in error.response.message
    finally:
        _global_semaphore.release()
        service_delta = core.ds.service_delta.get('pre')
        service_delta['enabled'] = True
        core.ds.service_delta.save('pre', service_delta)
        watch.stop()
        watch.join()
Example #12
def test_service_error(core, metrics):
    # -------------------------------------------------------------------------------
    # Have a service produce a nonrecoverable error for the submitted file
    # -------------------------------------------------------------------------------
    sha, size = ready_body(
        core, {
            'core-a': {
                'error': {
                    'archive_ts': time.time() + 250,
                    'sha256': 'a' * 64,
                    'response': {
                        'message': 'words',
                        'status': 'FAIL_NONRECOVERABLE',
                        'service_name': 'core-a',
                        'service_tool_version': 0,
                        'service_version': '0'
                    },
                    'expiry_ts': time.time() + 500
                },
                'failure': True,
            }
        })

    core.ingest_queue.push(
        SubmissionInput(
            dict(metadata={},
                 params=dict(description="file abc123",
                             services=dict(selected=''),
                             submitter='user',
                             groups=['user'],
                             max_extracted=10000),
                 notification=dict(queue='error', threshold=0),
                 files=[dict(sha256=sha, size=size,
                             name='abc123')])).as_primitives())

    notification_queue = NamedQueue('nq-error', core.redis)
    task = IngestTask(notification_queue.pop(timeout=RESPONSE_TIMEOUT))
    sub = core.ds.submission.get(task.submission.sid)
    assert len(sub.files) == 1
    assert len(sub.results) == 3
    assert len(sub.errors) == 1

    metrics.expect('ingester', 'submissions_ingested', 1)
    metrics.expect('ingester', 'submissions_completed', 1)
    metrics.expect('dispatcher', 'submissions_completed', 1)
    metrics.expect('dispatcher', 'files_completed', 1)
Example #13
def test_ingest_timeout(core: CoreSession, metrics: MetricsCounter):
    # -------------------------------------------------------------------------------
    # Stub out the submit call so the submission stalls in the scanning table
    # and the ingester times it out
    sha, size = ready_body(core)
    original_max_time = assemblyline_core.ingester.ingester._max_time
    assemblyline_core.ingester.ingester._max_time = 1

    attempts = []
    original_submit = core.ingest.submit_client.submit

    def _fail(**args):
        attempts.append(args)

    core.ingest.submit_client.submit = _fail

    try:
        si = SubmissionInput(
            dict(metadata={},
                 params=dict(
                     description="file abc123",
                     services=dict(selected=''),
                     submitter='user',
                     groups=['user'],
                 ),
                 notification=dict(queue='ingest-timeout', threshold=0),
                 files=[dict(sha256=sha, size=size, name='abc123')]))
        core.ingest_queue.push(si.as_primitives())

        sha256 = si.files[0].sha256
        scan_key = si.params.create_filescore_key(sha256)

        # Make sure the scanning table has been cleared
        time.sleep(0.5)
        for _ in range(60):
            if not core.ingest.scanning.exists(scan_key):
                break
            time.sleep(0.1)
        assert not core.ingest.scanning.exists(scan_key)
        assert len(attempts) == 1

        # Wait until we get feedback from the metrics channel
        metrics.expect('ingester', 'submissions_ingested', 1)
        metrics.expect('ingester', 'timed_out', 1)

    finally:
        core.ingest.submit_client.submit = original_submit
        assemblyline_core.ingester.ingester._max_time = original_max_time
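This test and test_plumber_clearing both busy-wait on a condition with a deadline. A hypothetical helper could factor that pattern out (all names here are illustrative, not part of the codebase):

import time

def wait_for(predicate, timeout, interval=0.1):
    """Illustrative only: poll `predicate` until it is truthy or
    `timeout` seconds elapse. Returns the predicate's final value."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        if predicate():
            return True
        time.sleep(interval)
    return bool(predicate())

# The scanning-table wait above could then read:
#   assert wait_for(lambda: not core.ingest.scanning.exists(scan_key), timeout=6)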
Example #14
def test_deduplication(core):
    # -------------------------------------------------------------------------------
    # Submit two identical jobs, check that they get deduped by ingester
    sha, size = ready_body(core)

    for _ in range(2):
        core.ingest_queue.push(
            SubmissionInput(
                dict(metadata={},
                     params=dict(
                         description="file abc123",
                         services=dict(selected=''),
                         submitter='user',
                         groups=['user'],
                     ),
                     notification=dict(queue='output-queue-one', threshold=0),
                     files=[dict(sha256=sha, size=size,
                                 name='abc123')])).as_primitives())

    notification_queue = NamedQueue('nq-output-queue-one', core.redis)
    first_task = notification_queue.pop(timeout=5)
    second_task = notification_queue.pop(timeout=5)

    # One of the submissions will be processed fully
    assert first_task is not None
    first_task = IngestTask(first_task)
    first_submission: Submission = core.ds.submission.get(
        first_task.submission.sid)
    assert first_submission.state == 'completed'
    assert len(first_submission.files) == 1
    assert len(first_submission.errors) == 0
    assert len(first_submission.results) == 4

    # The other will get processed as a duplicate
    # (Which one is the 'real' one and which is the duplicate isn't important for our purposes)
    second_task = IngestTask(second_task)
    assert second_task.submission.sid == first_task.submission.sid

    # -------------------------------------------------------------------------------
    # Submit the same body, but change a parameter so the cache key misses
    core.ingest_queue.push(
        SubmissionInput(
            dict(metadata={},
                 params=dict(description="file abc123",
                             services=dict(selected=''),
                             submitter='user',
                             groups=['user'],
                             max_extracted=10000),
                 notification=dict(queue='2', threshold=0),
                 files=[dict(sha256=sha, size=size,
                             name='abc123')])).as_primitives())

    notification_queue = NamedQueue('nq-2', core.redis)
    third_task = notification_queue.pop(timeout=5)
    assert third_task

    # The third task should not be deduplicated by ingester, so will have a different submission
    third_task = IngestTask(third_task)
    third_submission: Submission = core.ds.submission.get(
        third_task.submission.sid)
    assert third_submission.state == 'completed'
    assert first_submission.sid != third_submission.sid
    assert len(third_submission.files) == 1
    assert len(third_submission.results) == 4
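The dedup behaviour above hinges on the filescore cache key, which test_ingest_timeout derives with si.params.create_filescore_key(sha256). A sketch of the property this test relies on (the helper is illustrative; only create_filescore_key comes from the code above):

def filescore_key(sha, size, **extra_params):
    # Illustrative only: build the same SubmissionInput shape used in the
    # tests above and derive the dedup cache key the ingester consults.
    si = SubmissionInput(dict(
        metadata={},
        params=dict(description="file abc123",
                    services=dict(selected=''),
                    submitter='user',
                    groups=['user'],
                    **extra_params),
        notification=dict(queue='unused', threshold=0),
        files=[dict(sha256=sha, size=size, name='abc123')]))
    return si.params.create_filescore_key(sha)

# Identical parameters hash to the same key, so the second push dedupes:
#   filescore_key(sha, size) == filescore_key(sha, size)
# Changing a scan-relevant parameter produces a cache miss, hence the
# third, separate submission:
#   filescore_key(sha, size) != filescore_key(sha, size, max_extracted=10000)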
Example #15
    def try_run(self, volatile=False):
        ingester = self.ingester
        cpu_mark = time.process_time()
        time_mark = time.time()

        # Move from ingest to unique and waiting queues.
        # While there are entries in the ingest queue we consume chunk_size
        # entries at a time and move unique entries to uniqueq / queued and
        # duplicates to their own queues / waiting.
        while self.running:
            self.heartbeat()
            while True:
                result = ingester.complete_queue.pop(blocking=False)
                if not result:
                    break
                # Start of ingest message
                if self.apm_client:
                    self.apm_client.begin_transaction('ingest_msg')

                sub = Submission(result)
                ingester.completed(sub)

                # End of ingest message (success)
                if self.apm_client:
                    elasticapm.tag(sid=sub.sid)
                    self.apm_client.end_transaction('ingest_complete', 'success')

            ingester.counter.increment_execution_time('cpu_seconds', time.process_time() - cpu_mark)
            ingester.counter.increment_execution_time('busy_seconds', time.time() - time_mark)

            message = ingester.ingest_queue.pop(timeout=1)

            cpu_mark = time.process_time()
            time_mark = time.time()

            if not message:
                continue

            # Start of ingest message
            if self.apm_client:
                self.apm_client.begin_transaction('ingest_msg')

            try:
                sub = SubmissionInput(message)
                # Write all input to the traffic queue
                ingester.traffic_queue.publish(SubmissionMessage({
                    'msg': sub,
                    'msg_type': 'SubmissionIngested',
                    'sender': 'ingester',
                }).as_primitives())

                task = IngestTask(dict(
                    submission=sub,
                    ingest_id=sub.sid,
                ))
                task.submission.sid = None  # Reset to new random uuid

            except (ValueError, TypeError) as error:
                self.log.exception(f"Dropped ingest submission {message} because {str(error)}")

                # End of ingest message (value_error)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_input', 'value_error')

                if volatile:
                    raise
                continue

            if any(len(file.sha256) != 64 for file in task.submission.files):
                self.log.error(f"Invalid sha256: {[file.sha256 for file in task.submission.files]}")

                # End of ingest message (invalid_hash)
                if self.apm_client:
                    self.apm_client.end_transaction('ingest_input', 'invalid_hash')

                continue

            for file in task.submission.files:
                file.sha256 = file.sha256.lower()

            ingester.ingest(task)

            # End of ingest message (success)
            if self.apm_client:
                self.apm_client.end_transaction('ingest_input', 'success')
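To summarize the input half of the loop above (a reading of the code, not new behaviour):

# One accepted message flows through try_run as:
#   1. ingest_queue.pop(timeout=1)  -> raw dict (None just loops again)
#   2. SubmissionInput(raw)         -> validated model; ValueError/TypeError
#                                      drops the message (re-raised if volatile)
#   3. traffic_queue.publish(...)   -> 'SubmissionIngested' event for observers
#   4. IngestTask(submission=sub, ingest_id=sub.sid) with sid reset to a
#      fresh uuid, sha256s validated and lowercased
#   5. ingester.ingest(task), with APM transactions bracketing each message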