def __init__(self, datastore=None, redis=None, redis_persist=None, logger=None):
    super().__init__('assemblyline.dispatcher.file', logger)

    config: Config = forge.get_config()
    datastore: AssemblylineDatastore = datastore or forge.get_datastore(config)

    self.dispatcher = Dispatcher(redis=redis, redis_persist=redis_persist, datastore=datastore, logger=self.log)

    # Export application metrics to Elastic APM when a server URL is configured
    if config.core.metrics.apm_server.server_url is not None:
        self.log.info(f"Exporting application metrics to: {config.core.metrics.apm_server.server_url}")
        elasticapm.instrument()
        self.apm_client = elasticapm.Client(server_url=config.core.metrics.apm_server.server_url,
                                            service_name="dispatcher")
    else:
        self.apm_client = None
def test_dispatch_extracted(clean_redis, clean_datastore):
    redis = clean_redis
    ds = clean_datastore

    # def service_queue(name): return get_service_queue(name, redis)

    # Setup the fake datastore
    file_hash = get_random_hash(64)
    second_file_hash = get_random_hash(64)

    for fh in [file_hash, second_file_hash]:
        obj = random_model_obj(models.file.File)
        obj.sha256 = fh
        ds.file.save(fh, obj)

    # Inject the fake submission
    submission = random_model_obj(models.submission.Submission)
    submission.files = [dict(name='./file', sha256=file_hash)]
    sid = submission.sid = 'first-submission'

    disp = Dispatcher(ds, redis, redis)
    disp.running = ToggleTrue()
    client = DispatchClient(ds, redis, redis)
    client.dispatcher_data_age = time.time()
    client.dispatcher_data.append(disp.instance_id)

    # Launch the submission
    client.dispatch_submission(submission)
    disp.pull_submissions()
    disp.service_worker(disp.process_queue_index(sid))

    # Finish one service extracting a file
    job = client.request_work('0', 'extract', '0')
    assert job.fileinfo.sha256 == file_hash
    assert job.filename == './file'
    new_result: Result = random_minimal_obj(Result)
    new_result.sha256 = file_hash
    new_result.response.service_name = 'extract'
    new_result.response.extracted = [dict(sha256=second_file_hash, name='second-*',
                                          description='abc', classification='U')]
    client.service_finished(sid, 'extracted-done', new_result)

    # process the result
    disp.pull_service_results()
    disp.service_worker(disp.process_queue_index(sid))
    disp.service_worker(disp.process_queue_index(sid))

    # The extracted file should now be queued for the extract service
    job = client.request_work('0', 'extract', '0')
    assert job.fileinfo.sha256 == second_file_hash
    assert job.filename == 'second-*'
def test_dispatch_submission(clean_redis):
    ds = MockDatastore(collections=['submission', 'result', 'service', 'error', 'file'])
    file_hash = get_random_hash(64)

    ds.file.save(file_hash, random_model_obj(models.file.File))
    ds.file.get(file_hash).sha256 = file_hash
    # ds.file.get(file_hash).sha256 = ''

    submission = random_model_obj(models.submission.Submission)
    submission.files.clear()
    submission.files.append(dict(name='./file', sha256=file_hash))
    submission.sid = 'first-submission'

    disp = Dispatcher(ds, logger=logging, redis=clean_redis, redis_persist=clean_redis)

    # Submit a problem, and check that it gets added to the dispatch hash
    # and the right service queues
    task = SubmissionTask(dict(submission=submission))
    disp.dispatch_submission(task)

    file_task = FileTask(disp.file_queue.pop())
    assert file_task.sid == submission.sid
    assert file_task.file_info.sha256 == file_hash
    assert file_task.depth == 0
    assert file_task.file_info.type == ds.file.get(file_hash).type

    dh = DispatchHash(submission.sid, clean_redis)
    for service_name in disp.scheduler.services.keys():
        dh.fail_nonrecoverable(file_hash, service_name, 'error-code')

    disp.dispatch_submission(task)
    assert ds.submission.get(submission.sid).state == 'completed'
    assert ds.submission.get(submission.sid).errors == ['error-code'] * len(disp.scheduler.services)
def test_simple(clean_redis, clean_datastore):
    ds = clean_datastore
    redis = clean_redis

    def service_queue(name):
        return get_service_queue(name, redis)

    file = random_model_obj(File)
    file_hash = file.sha256
    file.type = 'unknown'
    ds.file.save(file_hash, file)

    sub: Submission = random_model_obj(models.submission.Submission)
    sub.sid = sid = 'first-submission'
    sub.params.ignore_cache = False
    sub.params.max_extracted = 5
    sub.params.classification = get_classification().UNRESTRICTED
    sub.params.initial_data = json.dumps({'cats': 'big'})
    sub.files = [dict(sha256=file_hash, name='file')]

    disp = Dispatcher(ds, redis, redis)
    disp.running = ToggleTrue()
    client = DispatchClient(ds, redis, redis)
    client.dispatcher_data_age = time.time()
    client.dispatcher_data.append(disp.instance_id)

    # Submit a problem, and check that it gets added to the dispatch hash
    # and the right service queues
    logger.info('==== first dispatch')
    # task = SubmissionTask(sub.as_primitives(), 'some-completion-queue')
    client.dispatch_submission(sub)
    disp.pull_submissions()
    disp.service_worker(disp.process_queue_index(sid))
    task = disp.tasks.get(sid)
    assert task.queue_keys[(file_hash, 'extract')] is not None
    assert task.queue_keys[(file_hash, 'wrench')] is not None
    assert service_queue('extract').length() == 1
    assert service_queue('wrench').length() == 1

    # Making the same call again will queue it up again
    logger.info('==== second dispatch')
    disp.dispatch_file(task, file_hash)
    assert task.queue_keys[(file_hash, 'extract')] is not None
    assert task.queue_keys[(file_hash, 'wrench')] is not None
    assert service_queue('extract').length() == 1  # the queue doesn't pile up
    assert service_queue('wrench').length() == 1

    logger.info('==== third dispatch')
    job = client.request_work('0', 'extract', '0')
    assert job.temporary_submission_data == [{'name': 'cats', 'value': 'big'}]
    client.service_failed(sid, 'abc123', make_error(file_hash, 'extract'))
    # Deliberately do in the wrong order to make sure that works
    disp.pull_service_results()
    disp.service_worker(disp.process_queue_index(sid))
    assert task.queue_keys[(file_hash, 'extract')] is not None
    assert task.queue_keys[(file_hash, 'wrench')] is not None
    assert service_queue('extract').length() == 1

    # Mark extract as finished, wrench as failed
    logger.info('==== fourth dispatch')
    client.request_work('0', 'extract', '0')
    client.request_work('0', 'wrench', '0')
    client.service_finished(sid, 'extract-result', make_result(file_hash, 'extract'))
    client.service_failed(sid, 'wrench-error', make_error(file_hash, 'wrench', False))
    for _ in range(2):
        disp.pull_service_results()
        disp.service_worker(disp.process_queue_index(sid))
    assert wait_error(task, file_hash, 'wrench')
    assert wait_result(task, file_hash, 'extract')
    assert service_queue('av-a').length() == 1
    assert service_queue('av-b').length() == 1
    assert service_queue('frankenstrings').length() == 1

    # Have the AVs fail, frankenstrings finishes
    logger.info('==== fifth dispatch')
    client.request_work('0', 'av-a', '0')
    client.request_work('0', 'av-b', '0')
    client.request_work('0', 'frankenstrings', '0')
    client.service_failed(sid, 'av-a-error', make_error(file_hash, 'av-a', False))
    client.service_failed(sid, 'av-b-error', make_error(file_hash, 'av-b', False))
    client.service_finished(sid, 'f-result', make_result(file_hash, 'frankenstrings'))
    for _ in range(3):
        disp.pull_service_results()
        disp.service_worker(disp.process_queue_index(sid))
    assert wait_result(task, file_hash, 'frankenstrings')
    assert wait_error(task, file_hash, 'av-a')
    assert wait_error(task, file_hash, 'av-b')
    assert service_queue('xerox').length() == 1

    # Finish the xerox service and check if the submission completion got checked
    logger.info('==== sixth dispatch')
    client.request_work('0', 'xerox', '0')
    client.service_finished(sid, 'xerox-result-key', make_result(file_hash, 'xerox'))
    disp.pull_service_results()
    disp.service_worker(disp.process_queue_index(sid))
    disp.save_submission()
    assert wait_result(task, file_hash, 'xerox')
    assert disp.tasks.get(sid) is None
from assemblyline_core.dispatching.dispatcher import Dispatcher

with Dispatcher() as server:
    server.serve_forever()
def test_dispatch_file(clean_redis):
    service_queue = lambda name: get_service_queue(name, clean_redis)

    ds = MockDatastore(collections=['submission', 'result', 'service', 'error', 'file', 'filescore'])
    file_hash = get_random_hash(64)
    sub = random_model_obj(models.submission.Submission)
    sub.sid = sid = 'first-submission'
    sub.params.ignore_cache = False

    disp = Dispatcher(ds, clean_redis, clean_redis, logging)
    disp.active_submissions.add(sid, SubmissionTask(dict(submission=sub)).as_primitives())
    dh = DispatchHash(sid=sid, client=clean_redis)

    print('==== first dispatch')
    # Submit a problem, and check that it gets added to the dispatch hash
    # and the right service queues
    file_task = FileTask({
        'sid': 'first-submission',
        'min_classification': get_classification().UNRESTRICTED,
        'file_info': dict(sha256=file_hash, type='unknown', magic='a', md5=get_random_hash(32),
                          mime='a', sha1=get_random_hash(40), size=10),
        'depth': 0,
        'max_files': 5
    })
    disp.dispatch_file(file_task)

    assert dh.dispatch_time(file_hash, 'extract') > 0
    assert dh.dispatch_time(file_hash, 'wrench') > 0
    assert service_queue('extract').length() == 1
    assert service_queue('wrench').length() == 1

    # Making the same call again will queue it up again
    print('==== second dispatch')
    disp.dispatch_file(file_task)

    assert dh.dispatch_time(file_hash, 'extract') > 0
    assert dh.dispatch_time(file_hash, 'wrench') > 0
    assert service_queue('extract').length() == 2
    assert service_queue('wrench').length() == 2
    # assert len(mq) == 4

    # Push back the timestamp in the dispatch hash to simulate a timeout,
    # make sure it gets pushed into that service queue again
    print('==== third dispatch')
    [service_queue(name).delete() for name in disp.scheduler.services]
    dh.fail_recoverable(file_hash, 'extract')
    disp.dispatch_file(file_task)

    assert dh.dispatch_time(file_hash, 'extract') > 0
    assert dh.dispatch_time(file_hash, 'wrench') > 0
    assert service_queue('extract').length() == 1
    # assert len(mq) == 1

    # Mark extract as finished, wrench as failed
    print('==== fourth dispatch')
    [service_queue(name).delete() for name in disp.scheduler.services]
    dh.finish(file_hash, 'extract', 'result-key', 0, 'U')
    dh.fail_nonrecoverable(file_hash, 'wrench', 'error-key')
    disp.dispatch_file(file_task)

    assert dh.finished(file_hash, 'extract')
    assert dh.finished(file_hash, 'wrench')
    assert service_queue('av-a').length() == 1
    assert service_queue('av-b').length() == 1
    assert service_queue('frankenstrings').length() == 1

    # Have the AVs fail, frankenstrings finishes
    print('==== fifth dispatch')
    [service_queue(name).delete() for name in disp.scheduler.services]
    dh.fail_nonrecoverable(file_hash, 'av-a', 'error-a')
    dh.fail_nonrecoverable(file_hash, 'av-b', 'error-b')
    dh.finish(file_hash, 'frankenstrings', 'result-key', 0, 'U')
    disp.dispatch_file(file_task)

    assert dh.finished(file_hash, 'av-a')
    assert dh.finished(file_hash, 'av-b')
    assert dh.finished(file_hash, 'frankenstrings')
    assert service_queue('xerox').length() == 1

    # Finish the xerox service and check if the submission completion got checked
    print('==== sixth dispatch')
    [service_queue(name).delete() for name in disp.scheduler.services]
    dh.finish(file_hash, 'xerox', 'result-key', 0, 'U')
    disp.dispatch_file(file_task)

    assert dh.finished(file_hash, 'xerox')
    assert len(disp.submission_queue) == 1
def test_dispatch_extracted(clean_redis):
    # Setup the fake datastore
    ds = MockDatastore(collections=['submission', 'result', 'service', 'error', 'file'])
    file_hash = get_random_hash(64)
    second_file_hash = get_random_hash(64)

    for fh in [file_hash, second_file_hash]:
        ds.file.save(fh, random_model_obj(models.file.File))
        ds.file.get(fh).sha256 = fh

    # Inject the fake submission
    submission = random_model_obj(models.submission.Submission)
    submission.files.clear()
    submission.files.append(dict(name='./file', sha256=file_hash))
    submission.sid = 'first-submission'

    # Launch the dispatcher
    disp = Dispatcher(ds, logger=logging, redis=clean_redis, redis_persist=clean_redis)

    # Launch the submission
    task = SubmissionTask(dict(submission=submission))
    disp.dispatch_submission(task)

    # Check that the right values were sent to the file queue
    file_task = FileTask(disp.file_queue.pop(timeout=1))
    assert file_task.sid == submission.sid
    assert file_task.file_info.sha256 == file_hash
    assert file_task.depth == 0
    assert file_task.file_info.type == ds.file.get(file_hash).type

    # Finish the services
    dh = DispatchHash(submission.sid, clean_redis)
    for service_name in disp.scheduler.services.keys():
        dh.finish(file_hash, service_name, 'error-code', 0, 'U')

    # But one of the services extracted a file
    dh.add_file(second_file_hash, 10, file_hash)

    # But meanwhile, dispatch_submission has been recalled on the submission
    disp.dispatch_submission(task)

    # It should see the missing file, and we should get a new file dispatch message for it
    # to make sure it is getting processed properly, this should be at depth 1, the first layer of
    # extracted files
    file_task = disp.file_queue.pop(timeout=1)
    assert file_task is not None
    file_task = FileTask(file_task)
    assert file_task.sid == submission.sid
    assert file_task.file_info.sha256 == second_file_hash
    assert file_task.depth == 1
    assert file_task.file_info.type == ds.file.get(second_file_hash).type

    # Finish the second file
    for service_name in disp.scheduler.services.keys():
        dh.finish(second_file_hash, service_name, 'error-code', 0, 'U')

    # And now we should get the finished submission
    disp.dispatch_submission(task)
    submission = ds.submission.get(submission.sid)
    assert submission.state == 'completed'
    assert submission.errors == []
    assert len(submission.results) == 2 * len(disp.scheduler.services)
def core(request, redis, filestore, config, clean_datastore: AssemblylineDatastore):
    # Block logs from being initialized, it breaks under pytest if you create new stream handlers
    from assemblyline.common import log as al_log
    al_log.init_logging = lambda *args: None
    dispatcher.TIMEOUT_EXTRA_TIME = 1
    dispatcher.TIMEOUT_TEST_INTERVAL = 3
    # al_log.init_logging("simulation")

    ds = clean_datastore

    # Register services
    stages = get_service_stage_hash(redis)
    services = []
    for svc, stage in [('pre', 'EXTRACT'), ('core-a', 'CORE'), ('core-b', 'CORE'), ('finish', 'POST')]:
        ds.service.save(f'{svc}_0', dummy_service(svc, stage, docid=f'{svc}_0'))
        ds.service_delta.save(svc, ServiceDelta({
            'name': svc,
            'version': '0',
            'enabled': True
        }))
        stages.set(svc, ServiceStage.Running)
        services.append(MockService(svc, ds, redis, filestore))
    ds.service.commit()
    ds.service_delta.commit()

    listed_services = ds.list_all_services(full=True)
    assert len(listed_services) == 4

    ingester = Ingester(datastore=ds, redis=redis, persistent_redis=redis, config=config)

    fields = CoreSession(config, ingester)
    fields.redis = redis
    fields.ds = ds
    fields.config = config
    forge.config_cache[None] = fields.config

    fields.filestore = filestore
    fields.pre_service = services[0]

    threads: list[ServerBase] = [
        # Start the ingester components
        ingester,

        # Start the dispatcher
        Dispatcher(datastore=ds, redis=redis, redis_persist=redis, config=config),

        # Start plumber
        Plumber(datastore=ds, redis=redis, redis_persist=redis, delay=0.5, config=config),
    ]

    threads = threads + services

    for t in threads:
        t.daemon = True
        t.start()

    def stop_core():
        [tr.stop() for tr in threads]
        [tr.raising_join() for tr in threads]
    request.addfinalizer(stop_core)

    return fields