def cancel_service_task(self, task_key, worker):
    # We believe a service task has timed out, try and read it from running tasks
    # If we can't find the task in running tasks, it finished JUST before timing out, let it go
    task = self.running_tasks.pop(task_key)
    if not task:
        return

    # We can confirm that the task is ours now, even if the worker finished, the result will be ignored
    task = Task(task)
    self.log.info(f"[{task.sid}] Service {task.service_name} timed out on {task.fileinfo.sha256}.")

    # Mark the previous attempt as invalid and redispatch it
    dispatch_table = DispatchHash(task.sid, self.redis)
    dispatch_table.fail_recoverable(task.fileinfo.sha256, task.service_name)
    dispatch_table.dispatch(task.fileinfo.sha256, task.service_name)
    get_service_queue(task.service_name, self.redis).push(task.priority, task.as_primitives())

    # We push the task of killing the container off on the scaler, which already has root access
    # the scaler can also double check that the service name and container id match, to be sure
    # we aren't accidentally killing the wrong container
    self.scaler_timeout_queue.push({
        'service': task.service_name,
        'container': worker
    })

    # Report to the metrics system that a recoverable error has occurred for that service
    export_metrics_once(task.service_name, ServiceMetrics, dict(fail_recoverable=1),
                        host=worker, counter_type='service')
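# Hedged sketch (not from the source): one way cancel_service_task might be
# driven by a periodic timeout sweep. `timed_out_entries()` and the
# (task_key, worker) pairs it yields are hypothetical; the real watcher
# wiring lives elsewhere in the dispatcher.
def handle_service_timeouts(self):
    for task_key, worker in self.timed_out_entries():  # hypothetical helper
        # cancel_service_task re-queues the task and asks the scaler to kill
        # the stalled container, as defined above
        self.cancel_service_task(task_key, worker)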
def watch_service(self, service_name):
    service_queue = get_service_queue(service_name, self.redis)
    while self.running and not self.stop_signals[service_name].is_set():
        while service_queue.length() > self.service_limit[service_name]:
            task = self.dispatch_client.request_work('plumber', service_name=service_name,
                                                     service_version='0', blocking=False,
                                                     low_priority=True)
            if task is None:
                break

            error = Error(dict(
                archive_ts=now_as_iso(self.config.datastore.ilm.days_until_archive * 24 * 60 * 60),
                created='NOW',
                expiry_ts=now_as_iso(task.ttl * 24 * 60 * 60) if task.ttl else None,
                response=dict(
                    message="Task canceled due to excessive queuing.",
                    service_name=task.service_name,
                    service_version='0',
                    status='FAIL_NONRECOVERABLE',
                ),
                sha256=task.fileinfo.sha256,
                type="TASK PRE-EMPTED",
            ))

            error_key = error.build_key(task=task)
            self.dispatch_client.service_failed(task.sid, error_key, error)
        self.sleep(2)
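# Hedged sketch: launching watch_service in one thread per service. The
# service_limit/stop_signals setup shown here is an assumption about how the
# plumber is wired, not the project's actual startup code.
import threading

def start_watchers(self, service_names, limit=100):
    for name in service_names:
        self.service_limit[name] = limit             # queue depth before trimming starts
        self.stop_signals[name] = threading.Event()  # lets us stop a single watcher
        threading.Thread(target=self.watch_service, args=(name,), daemon=True).start()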
def perform_check():
    # If the service is privileged, test connectivity to core
    if environ.get('PRIVILEGED', 'false').lower() == 'true':
        forge.get_datastore()
        forge.get_filestore(connection_attempts=1)
        forge.get_service_queue(service=environ['AL_SERVICE_NAME'])
    else:
        # Otherwise, perform a test for service-server availability
        if not requests.get(f"{environ['SERVICE_API_HOST']}/healthz/live").ok:
            raise Exception('Unable to reach service-server')

    # If running with an updater, check for availability. Make sure the test doesn't run on the actual updater.
    if environ.get('updates_host') and not environ['HOSTNAME'].startswith(environ['updates_host']):
        if not requests.get(f"http://{environ['updates_host']}:{environ['updates_port']}/healthz/live").ok:
            raise Exception('Unable to reach local update server')

    exit()
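# perform_check() calls exit() on success, which suggests it is meant to be a
# container healthcheck entry point. A minimal sketch of that assumed wiring,
# where any raised exception becomes a non-zero (unhealthy) exit code:
if __name__ == '__main__':
    try:
        perform_check()
    except Exception as error:
        print(f'Healthcheck failed: {error}')
        exit(1)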
def __init__(self, name, datastore, redis, filestore):
    super().__init__('assemblyline.service.' + name)
    self.service_name = name
    self.datastore = datastore
    self.filestore = filestore
    self.queue = get_service_queue(name, redis)
    self.dispatch_client = DispatchClient(self.datastore, redis)
    self.hits = dict()
    self.drops = dict()
def test_plumber_clearing(core, metrics):
    global _global_semaphore
    _global_semaphore = threading.Semaphore(value=0)
    start = time.time()

    try:
        # Have the plumber cancel tasks
        sha, size = ready_body(core, {'pre': {'hold': 60}})

        core.ingest_queue.push(SubmissionInput(dict(
            metadata={},
            params=dict(
                description="file abc123",
                services=dict(selected=''),
                submitter='user',
                groups=['user'],
                max_extracted=10000
            ),
            notification=dict(queue='test_plumber_clearing', threshold=0),
            files=[dict(sha256=sha, size=size, name='abc123')]
        )).as_primitives())

        metrics.expect('ingester', 'submissions_ingested', 1)
        service_queue = get_service_queue('pre', core.redis)

        start = time.time()
        while service_queue.length() < 1:
            if time.time() - start > RESPONSE_TIMEOUT:
                pytest.fail(f'Found {service_queue.length()}')
            time.sleep(0.1)

        service_delta = core.ds.service_delta.get('pre')
        service_delta['enabled'] = False
        core.ds.service_delta.save('pre', service_delta)

        notification_queue = NamedQueue('nq-test_plumber_clearing', core.redis)
        dropped_task = notification_queue.pop(timeout=RESPONSE_TIMEOUT)
        dropped_task = IngestTask(dropped_task)
        sub = core.ds.submission.get(dropped_task.submission.sid)
        assert len(sub.files) == 1
        assert len(sub.results) == 3
        assert len(sub.errors) == 1
        error = core.ds.error.get(sub.errors[0])
        assert "disabled" in error.response.message

        metrics.expect('ingester', 'submissions_completed', 1)
        metrics.expect('dispatcher', 'submissions_completed', 1)
        metrics.expect('dispatcher', 'files_completed', 1)
        metrics.expect('service', 'fail_recoverable', 1)
    finally:
        _global_semaphore.release()
        service_delta = core.ds.service_delta.get('pre')
        service_delta['enabled'] = True
        core.ds.service_delta.save('pre', service_delta)
def test_plumber_clearing(core):
    global _global_semaphore
    _global_semaphore = threading.Semaphore(value=0)
    start = time.time()
    watch = WatcherServer(redis=core.redis, redis_persist=core.redis)
    watch.start()

    try:
        # Have the plumber cancel tasks
        sha, size = ready_body(core, {'pre': {'semaphore': 60}})

        core.ingest_queue.push(SubmissionInput(dict(
            metadata={},
            params=dict(
                description="file abc123",
                services=dict(selected=''),
                submitter='user',
                groups=['user'],
                max_extracted=10000
            ),
            notification=dict(queue='test_plumber_clearing', threshold=0),
            files=[dict(sha256=sha, size=size, name='abc123')]
        )).as_primitives())

        service_queue = get_service_queue('pre', core.redis)
        time.sleep(0.5)
        while service_queue.length() == 0 and time.time() - start < 20:
            time.sleep(0.1)

        service_delta = core.ds.service_delta.get('pre')
        service_delta['enabled'] = False
        core.ds.service_delta.save('pre', service_delta)

        notification_queue = NamedQueue('nq-test_plumber_clearing', core.redis)
        dropped_task = notification_queue.pop(timeout=5)
        dropped_task = IngestTask(dropped_task)
        sub = core.ds.submission.get(dropped_task.submission.sid)
        assert len(sub.files) == 1
        assert len(sub.results) == 3
        assert len(sub.errors) == 1
        error = core.ds.error.get(sub.errors[0])
        assert "disabled" in error.response.message
    finally:
        _global_semaphore.release()
        service_delta = core.ds.service_delta.get('pre')
        service_delta['enabled'] = True
        core.ds.service_delta.save('pre', service_delta)
        watch.stop()
        watch.join()
def __init__(self, datastore=None, filestore=None):
    super().__init__('assemblyline.randomservice')
    self.config = forge.get_config()
    self.datastore = datastore or forge.get_datastore()
    self.filestore = filestore or forge.get_filestore()
    self.client_id = get_random_id()
    self.service_state_hash = ExpiringHash(SERVICE_STATE_HASH, ttl=30 * 60)

    self.counters = {
        n: MetricsFactory('service', Metrics, name=n, config=self.config)
        for n in self.datastore.service_delta.keys()
    }
    self.queues = [forge.get_service_queue(name) for name in self.datastore.service_delta.keys()]
    self.dispatch_client = DispatchClient(self.datastore)
    self.service_info = CachedObject(self.datastore.list_all_services, kwargs={'as_obj': False})
def _request_work(self, worker_id, service_name, service_version,
                  timeout, blocking, low_priority=False) -> Optional[ServiceTask]:
    # For when we recursively retry on bad task dequeue-ing
    if int(timeout) <= 0:
        self.log.info(f"{service_name}:{worker_id} no task returned [timeout]")
        return None

    # Get work from the queue
    work_queue = get_service_queue(service_name, self.redis)
    if blocking:
        result = work_queue.blocking_pop(timeout=int(timeout), low_priority=low_priority)
    else:
        if low_priority:
            result = work_queue.unpush(1)
        else:
            result = work_queue.pop(1)
        if result:
            result = result[0]

    if not result:
        self.log.info(f"{service_name}:{worker_id} no task returned: [empty message]")
        return None

    task = ServiceTask(result)
    task.metadata['worker__'] = worker_id
    dispatcher = task.metadata['dispatcher__']

    if not self.is_dispatcher(dispatcher):
        self.log.info(f"{service_name}:{worker_id} no task returned: [task from dead dispatcher]")
        return None

    if self.running_tasks.add(task.key(), task.as_primitives()):
        self.log.info(f"[{task.sid}/{task.fileinfo.sha256}] {service_name}:{worker_id} task found")
        start_queue = self._get_queue_from_cache(DISPATCH_START_EVENTS + dispatcher)
        start_queue.push((task.sid, task.fileinfo.sha256, service_name, worker_id))
        return task
    return None
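# Hedged sketch of the consumer side of _request_work: a worker polling for
# tasks through the public request_work call. The '0' service version and
# 3-second timeout mirror the mock service further down; handle_task() is a
# hypothetical per-task handler.
def worker_loop(dispatch_client, service_name, worker_id='worker'):
    while True:
        task = dispatch_client.request_work(worker_id, service_name, '0', timeout=3)
        if task is None:
            continue  # queue was empty or the dispatcher is gone, poll again
        handle_task(task)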
def send_heartbeat(self, m_type, m_name, m_data, instances):
    if m_type == "dispatcher":
        try:
            instances = sorted(Dispatcher.all_instances(self.redis_persist))
            inflight = {_i: Dispatcher.instance_assignment_size(self.redis_persist, _i)
                        for _i in instances}
            queues = {_i: Dispatcher.all_queue_lengths(self.redis, _i)
                      for _i in instances}

            msg = {
                "sender": self.sender,
                "msg": {
                    "inflight": {
                        "max": self.config.core.dispatcher.max_inflight,
                        "outstanding": self.dispatch_active_hash.length(),
                        "per_instance": [inflight[_i] for _i in instances]
                    },
                    "instances": len(instances),
                    "metrics": m_data,
                    "queues": {
                        "ingest": self.dispatcher_submission_queue.length(),
                        "start": [queues[_i]['start'] for _i in instances],
                        "result": [queues[_i]['result'] for _i in instances],
                        "command": [queues[_i]['command'] for _i in instances]
                    },
                    "component": m_name,
                }
            }
            self.status_queue.publish(DispatcherMessage(msg).as_primitives())
            self.log.info(f"Sent dispatcher heartbeat: {msg['msg']}")
        except Exception:
            self.log.exception("An exception occurred while generating DispatcherMessage")

    elif m_type == "ingester":
        try:
            c_q_len = self.ingest_unique_queue.count(*self.c_rng)
            h_q_len = self.ingest_unique_queue.count(*self.h_rng)
            m_q_len = self.ingest_unique_queue.count(*self.m_rng)
            l_q_len = self.ingest_unique_queue.count(*self.l_rng)

            msg = {
                "sender": self.sender,
                "msg": {
                    "instances": instances,
                    "metrics": m_data,
                    "processing": {
                        "inflight": self.ingest_scanning.length()
                    },
                    "processing_chance": {
                        "critical": 1 - drop_chance(c_q_len, self.c_s_at),
                        "high": 1 - drop_chance(h_q_len, self.h_s_at),
                        "low": 1 - drop_chance(l_q_len, self.l_s_at),
                        "medium": 1 - drop_chance(m_q_len, self.m_s_at)
                    },
                    "queues": {
                        "critical": c_q_len,
                        "high": h_q_len,
                        "ingest": self.ingest_queue.length(),
                        "complete": self.ingest_complete_queue.length(),
                        "low": l_q_len,
                        "medium": m_q_len
                    }
                }
            }
            self.status_queue.publish(IngestMessage(msg).as_primitives())
            self.log.info(f"Sent ingester heartbeat: {msg['msg']}")
        except Exception:
            self.log.exception("An exception occurred while generating IngestMessage")

    elif m_type == "alerter":
        try:
            msg = {
                "sender": self.sender,
                "msg": {
                    "instances": instances,
                    "metrics": m_data,
                    "queues": {
                        "alert": self.alert_queue.length()
                    }
                }
            }
            self.status_queue.publish(AlerterMessage(msg).as_primitives())
            self.log.info(f"Sent alerter heartbeat: {msg['msg']}")
        except Exception:
            self.log.exception("An exception occurred while generating AlerterMessage")

    elif m_type == "expiry":
        try:
            msg = {
                "sender": self.sender,
                "msg": {
                    "instances": instances,
                    "metrics": m_data,
                    "queues": self.to_expire
                }
            }
            self.status_queue.publish(ExpiryMessage(msg).as_primitives())
            self.log.info(f"Sent expiry heartbeat: {msg['msg']}")
        except Exception:
            self.log.exception("An exception occurred while generating ExpiryMessage")

    elif m_type == "archive":
        try:
            msg = {
                "sender": self.sender,
                "msg": {
                    "instances": instances,
                    "metrics": m_data
                }
            }
            self.status_queue.publish(ArchiveMessage(msg).as_primitives())
            self.log.info(f"Sent archive heartbeat: {msg['msg']}")
        except Exception:
            self.log.exception("An exception occurred while generating ArchiveMessage")

    elif m_type == "scaler":
        try:
            msg = {
                "sender": self.sender,
                "msg": {
                    "instances": instances,
                    "metrics": m_data,
                }
            }
            self.status_queue.publish(ScalerMessage(msg).as_primitives())
            self.log.info(f"Sent scaler heartbeat: {msg['msg']}")
        except Exception:
            self.log.exception("An exception occurred while generating ScalerMessage")

    elif m_type == "scaler_status":
        try:
            msg = {
                "sender": self.sender,
                "msg": {
                    "service_name": m_name,
                    "metrics": m_data,
                }
            }
            self.status_queue.publish(ScalerStatusMessage(msg).as_primitives())
            self.log.info(f"Sent scaler status heartbeat: {msg['msg']}")
        except Exception:
            self.log.exception("An exception occurred while generating ScalerStatusMessage")

    elif m_type == "service":
        try:
            busy, idle = get_working_and_idle(self.redis, m_name)
            msg = {
                "sender": self.sender,
                "msg": {
                    "instances": len(busy) + len(idle),
                    "metrics": m_data,
                    "activity": {
                        'busy': len(busy),
                        'idle': len(idle)
                    },
                    "queue": get_service_queue(m_name, self.redis).length(),
                    "service_name": m_name
                }
            }
            self.status_queue.publish(ServiceMessage(msg).as_primitives())
            self.log.info(f"Sent service heartbeat: {msg['msg']}")
        except Exception:
            self.log.exception("An exception occurred while generating ServiceMessage")

    else:
        self.log.warning(f"Skipping unknown counter: {m_name} [{m_type}] ==> {m_data}")
def sync_services(self):
    self.scheduler.enter(SERVICE_SYNC_INTERVAL, 0, self.sync_services)
    default_settings = self.config.core.scaler.service_defaults
    image_variables = defaultdict(str)
    image_variables.update(self.config.services.image_variables)
    current_services = set(self.profiles.keys())
    discovered_services = []

    # Get all the service data
    for service in self.datastore.list_all_services(full=True):
        service: Service = service
        name = service.name
        stage = self.get_service_stage(service.name)
        discovered_services.append(name)

        # noinspection PyBroadException
        try:
            if service.enabled and stage == ServiceStage.Off:
                # Enable this service's dependencies
                self.controller.prepare_network(service.name, service.docker_config.allow_internet_access)
                for _n, dependency in service.dependencies.items():
                    self.controller.start_stateful_container(
                        service_name=service.name,
                        container_name=_n,
                        spec=dependency,
                        labels={'dependency_for': service.name}
                    )

                # Move to the next service stage
                if service.update_config and service.update_config.wait_for_update:
                    self._service_stage_hash.set(name, ServiceStage.Update)
                else:
                    self._service_stage_hash.set(name, ServiceStage.Running)

            if not service.enabled:
                self.stop_service(service.name, stage)
                continue

            # Deploy or update containers for enabled services that have reached the running stage
            if service.enabled and stage == ServiceStage.Running:
                # Compute a hash of service properties not included in the docker config, that
                # should still result in a service being restarted when changed
                config_hash = hash(str(sorted(service.config.items())))
                config_hash = hash((config_hash, str(service.submission_params)))

                # Build the docker config for the service, we are going to either create it or
                # update it so we need to know what the current configuration is either way
                docker_config = service.docker_config
                docker_config.image = Template(docker_config.image).safe_substitute(image_variables)
                set_keys = set(var.name for var in docker_config.environment)
                for var in default_settings.environment:
                    if var.name not in set_keys:
                        docker_config.environment.append(var)

                # Add the service to the list of services being scaled
                if name not in self.profiles:
                    self.log.info(f'Adding {service.name} to scaling')
                    self.add_service(ServiceProfile(
                        name=name,
                        min_instances=default_settings.min_instances,
                        growth=default_settings.growth,
                        shrink=default_settings.shrink,
                        config_hash=config_hash,
                        backlog=default_settings.backlog,
                        max_instances=service.licence_count,
                        container_config=docker_config,
                        queue=get_service_queue(name, self.redis),
                        # Give service an extra 30 seconds to upload results
                        shutdown_seconds=service.timeout + 30,
                    ))

                # Update RAM, CPU, licence requirements for running services
                else:
                    profile = self.profiles[name]

                    if profile.container_config != docker_config or profile.config_hash != config_hash:
                        self.log.info(f"Updating deployment information for {name}")
                        profile.container_config = docker_config
                        profile.config_hash = config_hash
                        self.controller.restart(profile)
                        self.log.info(f"Deployment information for {name} replaced")

                    if service.licence_count == 0:
                        profile._max_instances = float('inf')
                    else:
                        profile._max_instances = service.licence_count
        except Exception:
            self.log.exception(f"Error applying service settings from: {service.name}")
            self.handle_service_error(service.name)

    # Find any services we have running, that are no longer in the database and remove them
    for stray_service in current_services - set(discovered_services):
        stage = self.get_service_stage(stray_service)
        self.stop_service(stray_service, stage)
def service_queue(name):
    return get_service_queue(name, redis)
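# Hedged usage sketch of the helper above; the queue API (push/pop/length) is
# inferred from call sites elsewhere in these snippets, not from library docs.
queue = service_queue('extract')
queue.push(10, {'example': 'payload'})  # enqueue a payload dict with priority 10
assert queue.length() == 1
item = queue.pop(1)[0]                  # pop(n) returns a list of payloads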
def _request_work(self, worker_id, service_name, service_version,
                  timeout, blocking) -> Optional[ServiceTask]:
    # For when we recursively retry on bad task dequeue-ing
    if int(timeout) <= 0:
        self.log.info(f"{service_name}:{worker_id} no task returned [timeout]")
        return None

    # Get work from the queue
    work_queue = get_service_queue(service_name, self.redis)
    if blocking:
        result = work_queue.blocking_pop(timeout=int(timeout))
    else:
        result = work_queue.pop(1)
        if result:
            result = result[0]

    if not result:
        self.log.info(f"{service_name}:{worker_id} no task returned: [empty message]")
        return None

    task = ServiceTask(result)

    # If someone is supposed to be working on this task right now, we won't be able to add it
    if self.running_tasks.add(task.key(), task.as_primitives()):
        self.log.info(f"[{task.sid}/{task.fileinfo.sha256}] {service_name}:{worker_id} task found")

        process_table = DispatchHash(task.sid, self.redis)

        abandoned = process_table.dispatch_time(file_hash=task.fileinfo.sha256,
                                                service=task.service_name) == 0
        finished = process_table.finished(file_hash=task.fileinfo.sha256,
                                          service=task.service_name) is not None

        # A service might be re-dispatched as it finishes, when that is the case it can be marked as
        # both finished and dispatched, if that is the case, drop the dispatch from the table
        if finished and not abandoned:
            process_table.drop_dispatch(file_hash=task.fileinfo.sha256, service=task.service_name)

        if abandoned or finished:
            self.log.info(f"[{task.sid}/{task.fileinfo.sha256}] {service_name}:{worker_id} task already complete")
            self.running_tasks.pop(task.key())
            raise RetryRequestWork()

        # Check if this task has reached the retry limit
        attempt_record = ExpiringHash(f'dispatch-hash-attempts-{task.sid}', host=self.redis)
        total_attempts = attempt_record.increment(task.key())
        self.log.info(f"[{task.sid}/{task.fileinfo.sha256}] {service_name}:{worker_id} "
                      f"task attempt {total_attempts}/3")

        if total_attempts > 3:
            self.log.warning(f"[{task.sid}/{task.fileinfo.sha256}] "
                             f"{service_name}:{worker_id} marking task failed: TASK PREEMPTED")
            error = Error(dict(
                archive_ts=now_as_iso(self.config.datastore.ilm.days_until_archive * 24 * 60 * 60),
                created='NOW',
                expiry_ts=now_as_iso(task.ttl * 24 * 60 * 60) if task.ttl else None,
                response=dict(
                    message='The number of retries has passed the limit.',
                    service_name=task.service_name,
                    service_version=service_version,
                    status='FAIL_NONRECOVERABLE',
                ),
                sha256=task.fileinfo.sha256,
                type="TASK PRE-EMPTED",
            ))
            error_key = error.build_key(task=task)
            self.service_failed(task.sid, error_key, error)
            export_metrics_once(service_name, Metrics, dict(fail_nonrecoverable=1),
                                host=worker_id, counter_type='service')
            raise RetryRequestWork()

        # Get the service information
        service_data = self.service_data[task.service_name]
        self.timeout_watcher.touch_task(timeout=int(service_data.timeout),
                                        key=f'{task.sid}-{task.key()}',
                                        worker=worker_id,
                                        task_key=task.key())
        return task
    raise RetryRequestWork()
def dispatch_file(self, task: FileTask):
    """ Handle a message describing a file to be processed.

    This file may be:
        - A new submission or extracted file.
        - A file that has just completed a stage of processing.
        - A file that has not completed a stage of processing, but this
          call has been triggered by a timeout or similar.

    If the file is totally new, we will setup a dispatch table, and fill it in.

    Once we make/load a dispatch table, we will dispatch whichever group the table
    shows us hasn't been completed yet.

    When we dispatch to a service, we check if the task is already in the dispatch
    queue. If it isn't, proceed normally. If it is, check that the service is still online.
    """
    # Read the message content
    file_hash = task.file_info.sha256
    active_task = self.active_submissions.get(task.sid)

    if active_task is None:
        self.log.warning(f"[{task.sid}] Untracked submission is being processed")
        return

    submission_task = SubmissionTask(active_task)
    submission = submission_task.submission

    # Refresh the watch on the submission, we are still working on it
    self.timeout_watcher.touch(key=task.sid, timeout=int(self.config.core.dispatcher.timeout),
                               queue=SUBMISSION_QUEUE, message={'sid': task.sid})

    # Open up the file/service table for this submission
    dispatch_table = DispatchHash(task.sid, self.redis, fetch_results=True)

    # Load things that we will need to fill out the file
    file_tags = ExpiringSet(task.get_tag_set_name(), host=self.redis)
    file_tags_data = file_tags.members()
    temporary_submission_data = ExpiringHash(task.get_temporary_submission_data_name(), host=self.redis)
    # ExpiringHash.items() returns a dict of the hash contents; iterate its key/value pairs
    temporary_data = [dict(name=row[0], value=row[1]) for row in temporary_submission_data.items().items()]

    # Calculate the schedule for the file
    schedule = self.build_schedule(dispatch_table, submission, file_hash, task.file_info.type)
    started_stages = []

    # Go through each round of the schedule removing complete/failed services
    # Break when we find a stage that still needs processing
    outstanding = {}
    score = 0
    errors = 0
    while schedule and not outstanding:
        stage = schedule.pop(0)
        started_stages.append(stage)

        for service_name in stage:
            service = self.scheduler.services.get(service_name)
            if not service:
                continue

            # Load the results, if there are no results, then the service must be dispatched later
            # Don't look at if it has been dispatched, as multiple dispatches are fine,
            # but missing a dispatch isn't.
            finished = dispatch_table.finished(file_hash, service_name)
            if not finished:
                outstanding[service_name] = service
                continue

            # If the service terminated in an error, count the error and continue
            if finished.is_error:
                errors += 1
                continue

            # If the service finished, count the score, and check if the file has been dropped
            score += finished.score
            if not submission.params.ignore_filtering and finished.drop:
                schedule.clear()

    if schedule:  # If there are still stages in the schedule, overwrite them for next time
        dispatch_table.schedules.set(file_hash, started_stages)

    # Try to retry/dispatch any outstanding services
    if outstanding:
        self.log.info(f"[{task.sid}] File {file_hash} sent to services : {', '.join(list(outstanding.keys()))}")

        for service_name, service in outstanding.items():
            # Find the actual file name from the list of files in submission
            filename = None
            for file in submission.files:
                if task.file_info.sha256 == file.sha256:
                    filename = file.name
                    break

            # Build the actual service dispatch message
            config = self.build_service_config(service, submission)
            service_task = ServiceTask(dict(
                sid=task.sid,
                metadata=submission.metadata,
                min_classification=task.min_classification,
                service_name=service_name,
                service_config=config,
                fileinfo=task.file_info,
                filename=filename or task.file_info.sha256,
                depth=task.depth,
                max_files=task.max_files,
                ttl=submission.params.ttl,
                ignore_cache=submission.params.ignore_cache,
                ignore_dynamic_recursion_prevention=submission.params.ignore_dynamic_recursion_prevention,
                tags=file_tags_data,
                temporary_submission_data=temporary_data,
                deep_scan=submission.params.deep_scan,
                priority=submission.params.priority,
            ))
            dispatch_table.dispatch(file_hash, service_name)
            queue = get_service_queue(service_name, self.redis)
            queue.push(service_task.priority, service_task.as_primitives())

    else:
        # There are no outstanding services, this file is done
        # Clean up the tags
        file_tags.delete()

        # If there are no outstanding ANYTHING for this submission,
        # send a message to the submission dispatcher to finalize
        self.counter.increment('files_completed')
        if dispatch_table.all_finished():
            self.log.info(f"[{task.sid}] Finished processing file '{file_hash}' starting submission finalization.")
            self.submission_queue.push({'sid': submission.sid})
        else:
            self.log.info(f"[{task.sid}] Finished processing file '{file_hash}'. Other files are not finished.")
def _sync_service(self, service: Service):
    name = service.name
    stage = self.get_service_stage(service.name)
    default_settings = self.config.core.scaler.service_defaults
    image_variables: defaultdict[str, str] = defaultdict(str)
    image_variables.update(self.config.services.image_variables)

    def prepare_container(docker_config: DockerConfig) -> DockerConfig:
        docker_config.image = Template(docker_config.image).safe_substitute(image_variables)
        set_keys = set(var.name for var in docker_config.environment)
        for var in default_settings.environment:
            if var.name not in set_keys:
                docker_config.environment.append(var)
        return docker_config

    # noinspection PyBroadException
    try:
        def disable_incompatible_service():
            service.enabled = False
            if self.datastore.service_delta.update(service.name, [
                (self.datastore.service_delta.UPDATE_SET, 'enabled', False)
            ]):
                # Raise awareness to other components by sending an event for the service
                self.service_event_sender.send(service.name, {
                    'operation': Operation.Incompatible,
                    'name': service.name
                })

        # Check if the service is considered compatible to run on this Assemblyline system
        system_spec = f'{FRAMEWORK_VERSION}.{SYSTEM_VERSION}'
        if not service.version.startswith(system_spec):
            # If the framework and system versions don't prefix the service version,
            # we can't guarantee the service is compatible.
            # Disable and treat it as incompatible due to service version.
            self.log.warning("Disabling service with incompatible version. "
                             f"[{service.version} != '{system_spec}.X.{service.update_channel}Y'].")
            disable_incompatible_service()
        elif service.update_config and service.update_config.wait_for_update and not service.update_config.sources:
            # All signature sources for a signature-dependent service were removed
            # Disable and treat it as incompatible due to service configuration relative to source management
            self.log.warning("Disabling service with incompatible service configuration. "
                             "Signature-dependent service has no signature sources.")
            disable_incompatible_service()

        if not service.enabled:
            self.stop_service(service.name, stage)
            return

        # Build the docker config for the dependencies. For now the dependency blob values
        # aren't set for the change key going to kubernetes because everything about
        # the dependency config should be captured in the change key that the function generates
        # internally. A change key is set for the service deployment as that includes
        # things like the submission params
        dependency_config: dict[str, Any] = {}
        dependency_blobs: dict[str, str] = {}
        for _n, dependency in service.dependencies.items():
            dependency.container = prepare_container(dependency.container)
            dependency_config[_n] = dependency
            dep_hash = get_id_from_data(dependency, length=16)
            dependency_blobs[_n] = f"dh={dep_hash}v={service.version}p={service.privileged}"

        # Check if the service dependencies have been deployed.
        dependency_keys = []
        updater_ready = stage == ServiceStage.Running
        if service.update_config:
            for _n, dependency in dependency_config.items():
                key = self.controller.stateful_container_key(service.name, _n, dependency, '')
                if key:
                    dependency_keys.append(_n + key)
                else:
                    updater_ready = False

        # If the stage is not set to running or a dependency container is missing, start the setup process
        if not updater_ready:
            self.log.info(f'Preparing environment for {service.name}')
            # Move to the next service stage (do this first because the container we are starting may care)
            if service.update_config and service.update_config.wait_for_update:
                self._service_stage_hash.set(name, ServiceStage.Update)
                stage = ServiceStage.Update
            else:
                self._service_stage_hash.set(name, ServiceStage.Running)
                stage = ServiceStage.Running

            # Enable this service's dependencies before trying to launch the service containers
            dependency_internet = [(name, dependency.container.allow_internet_access)
                                   for name, dependency in dependency_config.items()]
            self.controller.prepare_network(service.name, service.docker_config.allow_internet_access,
                                            dependency_internet)
            for _n, dependency in dependency_config.items():
                self.log.info(f'Launching {service.name} dependency {_n}')
                self.controller.start_stateful_container(
                    service_name=service.name,
                    container_name=_n,
                    spec=dependency,
                    labels={'dependency_for': service.name},
                    change_key=dependency_blobs.get(_n, '')
                )

        # If the conditions for running are met, deploy or update service containers
        if stage == ServiceStage.Running:
            # Build the docker config for the service, we are going to either create it or
            # update it so we need to know what the current configuration is either way
            docker_config = prepare_container(service.docker_config)

            # Compute a blob of service properties not included in the docker config, that
            # should still result in a service being restarted when changed
            cfg_items = get_recursive_sorted_tuples(service.config)
            dep_keys = ''.join(sorted(dependency_keys))
            config_blob = (f"c={cfg_items}sp={service.submission_params}"
                           f"dk={dep_keys}p={service.privileged}d={docker_config}")

            # Add the service to the list of services being scaled
            with self.profiles_lock:
                if name not in self.profiles:
                    self.log.info(f"Adding {f'privileged {service.name}' if service.privileged else service.name} to scaling")
                    self.add_service(ServiceProfile(
                        name=name,
                        min_instances=default_settings.min_instances,
                        growth=default_settings.growth,
                        shrink=default_settings.shrink,
                        config_blob=config_blob,
                        dependency_blobs=dependency_blobs,
                        backlog=default_settings.backlog,
                        max_instances=service.licence_count,
                        container_config=docker_config,
                        queue=get_service_queue(name, self.redis),
                        # Give service an extra 30 seconds to upload results
                        shutdown_seconds=service.timeout + 30,
                        privileged=service.privileged
                    ))

                # Update RAM, CPU, licence requirements for running services
                else:
                    profile = self.profiles[name]
                    profile.max_instances = service.licence_count
                    profile.privileged = service.privileged

                    for dependency_name, dependency_blob in dependency_blobs.items():
                        if profile.dependency_blobs[dependency_name] != dependency_blob:
                            self.log.info(f"Updating deployment information for {name}/{dependency_name}")
                            profile.dependency_blobs[dependency_name] = dependency_blob
                            self.controller.start_stateful_container(
                                service_name=service.name,
                                container_name=dependency_name,
                                spec=dependency_config[dependency_name],
                                labels={'dependency_for': service.name},
                                change_key=dependency_blob
                            )

                    if profile.config_blob != config_blob:
                        self.log.info(f"Updating deployment information for {name}")
                        profile.container_config = docker_config
                        profile.config_blob = config_blob
                        self.controller.restart(profile)
                        self.log.info(f"Deployment information for {name} replaced")
    except Exception:
        self.log.exception(f"Error applying service settings from: {service.name}")
        self.handle_service_error(service.name)
def test_dispatch_file(clean_redis):
    service_queue = lambda name: get_service_queue(name, clean_redis)

    ds = MockDatastore(collections=['submission', 'result', 'service', 'error', 'file', 'filescore'])
    file_hash = get_random_hash(64)
    sub = random_model_obj(models.submission.Submission)
    sub.sid = sid = 'first-submission'
    sub.params.ignore_cache = False

    disp = Dispatcher(ds, clean_redis, clean_redis, logging)
    disp.active_submissions.add(sid, SubmissionTask(dict(submission=sub)).as_primitives())
    dh = DispatchHash(sid=sid, client=clean_redis)

    print('==== first dispatch')
    # Submit a problem, and check that it gets added to the dispatch hash
    # and the right service queues
    file_task = FileTask({
        'sid': 'first-submission',
        'min_classification': get_classification().UNRESTRICTED,
        'file_info': dict(sha256=file_hash, type='unknown', magic='a', md5=get_random_hash(32),
                          mime='a', sha1=get_random_hash(40), size=10),
        'depth': 0,
        'max_files': 5
    })
    disp.dispatch_file(file_task)

    assert dh.dispatch_time(file_hash, 'extract') > 0
    assert dh.dispatch_time(file_hash, 'wrench') > 0
    assert service_queue('extract').length() == 1
    assert service_queue('wrench').length() == 1

    # Making the same call again will queue it up again
    print('==== second dispatch')
    disp.dispatch_file(file_task)

    assert dh.dispatch_time(file_hash, 'extract') > 0
    assert dh.dispatch_time(file_hash, 'wrench') > 0
    assert service_queue('extract').length() == 2
    assert service_queue('wrench').length() == 2
    # assert len(mq) == 4

    # Push back the timestamp in the dispatch hash to simulate a timeout,
    # make sure it gets pushed into that service queue again
    print('==== third dispatch')
    [service_queue(name).delete() for name in disp.scheduler.services]
    dh.fail_recoverable(file_hash, 'extract')
    disp.dispatch_file(file_task)

    assert dh.dispatch_time(file_hash, 'extract') > 0
    assert dh.dispatch_time(file_hash, 'wrench') > 0
    assert service_queue('extract').length() == 1
    # assert len(mq) == 1

    # Mark extract as finished, wrench as failed
    print('==== fourth dispatch')
    [service_queue(name).delete() for name in disp.scheduler.services]
    dh.finish(file_hash, 'extract', 'result-key', 0, 'U')
    dh.fail_nonrecoverable(file_hash, 'wrench', 'error-key')
    disp.dispatch_file(file_task)

    assert dh.finished(file_hash, 'extract')
    assert dh.finished(file_hash, 'wrench')
    assert service_queue('av-a').length() == 1
    assert service_queue('av-b').length() == 1
    assert service_queue('frankenstrings').length() == 1

    # Have the AVs fail, frankenstrings finishes
    print('==== fifth dispatch')
    [service_queue(name).delete() for name in disp.scheduler.services]
    dh.fail_nonrecoverable(file_hash, 'av-a', 'error-a')
    dh.fail_nonrecoverable(file_hash, 'av-b', 'error-b')
    dh.finish(file_hash, 'frankenstrings', 'result-key', 0, 'U')
    disp.dispatch_file(file_task)

    assert dh.finished(file_hash, 'av-a')
    assert dh.finished(file_hash, 'av-b')
    assert dh.finished(file_hash, 'frankenstrings')
    assert service_queue('xerox').length() == 1

    # Finish the xerox service and check if the submission completion got checked
    print('==== sixth dispatch')
    [service_queue(name).delete() for name in disp.scheduler.services]
    dh.finish(file_hash, 'xerox', 'result-key', 0, 'U')
    disp.dispatch_file(file_task)

    assert dh.finished(file_hash, 'xerox')
    assert len(disp.submission_queue) == 1
def try_run(self):
    while self.running:
        task = self.dispatch_client.request_work('worker', self.service_name, '0', timeout=3)
        if not task:
            continue

        self.log.info(f"{self.service_name} has received a job {task.sid}")

        file = self.filestore.get(task.fileinfo.sha256)

        instructions = json.loads(file)
        instructions = instructions.get(self.service_name, {})
        self.log.info(f"{self.service_name} following instruction: {instructions}")
        hits = self.hits[task.fileinfo.sha256] = self.hits.get(task.fileinfo.sha256, 0) + 1

        if instructions.get('hold', False):
            queue = get_service_queue(self.service_name, self.dispatch_client.redis)
            queue.push(0, task.as_primitives())
            self.log.info(f"{self.service_name} Requeued task to {queue.name} holding for {instructions['hold']}")
            _global_semaphore.acquire(blocking=True, timeout=instructions['hold'])
            continue

        if instructions.get('lock', False):
            _global_semaphore.acquire(blocking=True, timeout=instructions['lock'])

        if 'drop' in instructions:
            if instructions['drop'] >= hits:
                self.drops[task.fileinfo.sha256] = self.drops.get(task.fileinfo.sha256, 0) + 1
                continue

        if instructions.get('failure', False):
            error = Error(instructions['error'])
            error.sha256 = task.fileinfo.sha256
            self.dispatch_client.service_failed(task.sid, error=error, error_key=get_random_id())
            continue

        result_data = {
            'archive_ts': time.time() + 300,
            'classification': 'U',
            'response': {
                'service_version': '0',
                'service_tool_version': '0',
                'service_name': self.service_name,
            },
            'result': {},
            'sha256': task.fileinfo.sha256,
            'expiry_ts': time.time() + 600
        }
        result_data.update(instructions.get('result', {}))
        result_data['response'].update(instructions.get('response', {}))

        result = Result(result_data)
        result_key = instructions.get('result_key', get_random_id())
        self.dispatch_client.service_finished(task.sid, result_key, result)
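# Hedged example of the instruction blob the mock service above reads from the
# filestore: a JSON document keyed by service name. The keys shown mirror the
# ones consumed in try_run ('hold', 'lock', 'drop', 'failure'/'error',
# 'result', 'response', 'result_key'); the values here are illustrative only.
import json

instructions_blob = json.dumps({
    'pre': {
        'hold': 60,                          # requeue the task, then block on the semaphore
        'result': {'classification': 'U'},   # merged over the default result body
    }
})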