def cancel_service_task(self, task_key, worker):
        """Handle a service task that is believed to have timed out.

        The task is removed from the running-task table, its previous attempt
        is marked as a recoverable failure, and it is dispatched and queued
        again. A kill request for the worker's container is then pushed to the
        scaler and a ``fail_recoverable`` metric is exported for the service.

        :param task_key: key of the task inside ``self.running_tasks``
        :param worker: identifier of the worker/container that held the task
        :return: None
        """
        # A missing entry means the task finished just before the timeout
        # fired, so there is nothing to cancel.
        raw_task = self.running_tasks.pop(task_key)
        if not raw_task:
            return

        # The task is confirmed as ours now; a late result from the worker
        # will be ignored.
        task = Task(raw_task)
        self.log.info(
            f"[{task.sid}] Service {task.service_name} timed out on {task.fileinfo.sha256}."
        )
        sha256 = task.fileinfo.sha256
        service_name = task.service_name

        # Invalidate the previous attempt, then record and enqueue a new one
        table = DispatchHash(task.sid, self.redis)
        table.fail_recoverable(sha256, service_name)
        table.dispatch(sha256, service_name)
        queue = get_service_queue(service_name, self.redis)
        queue.push(task.priority, task.as_primitives())

        # Killing the container is delegated to the scaler, which already has
        # root access and can double check that the service name and container
        # id match before killing anything.
        kill_request = {'service': service_name, 'container': worker}
        self.scaler_timeout_queue.push(kill_request)

        # Let the metrics system know a recoverable error occurred
        export_metrics_once(service_name,
                            ServiceMetrics,
                            dict(fail_recoverable=1),
                            host=worker,
                            counter_type='service')
    def watch_service(self, service_name):
        """Cancel queued tasks for a service while its queue exceeds its limit.

        Loops until ``self.running`` is false or the stop signal for
        ``service_name`` is set. On each pass, while the service queue is
        longer than ``self.service_limit[service_name]``, a low priority task
        is pulled through the dispatch client and failed with a
        non-recoverable "TASK PRE-EMPTED" error. The loop then sleeps for two
        seconds.

        :param service_name: name of the service whose queue is watched
        """
        service_queue = get_service_queue(service_name, self.redis)
        while self.running and not self.stop_signals[service_name].is_set():
            while service_queue.length() > self.service_limit[service_name]:
                # Pull a task without blocking, acting as the 'plumber' worker
                task = self.dispatch_client.request_work(
                    'plumber',
                    service_name=service_name,
                    service_version='0',
                    blocking=False,
                    low_priority=True)
                if task is None:
                    # Nothing could be pulled right now; retry after the sleep
                    break

                error = Error(
                    dict(
                        archive_ts=now_as_iso(
                            self.config.datastore.ilm.days_until_archive * 24 *
                            60 * 60),
                        created='NOW',
                        # task.ttl is converted from days to seconds
                        expiry_ts=now_as_iso(task.ttl * 24 * 60 *
                                             60) if task.ttl else None,
                        response=dict(
                            # Spelling fixed: was "execesive"
                            message="Task canceled due to excessive queuing.",
                            service_name=task.service_name,
                            service_version='0',
                            status='FAIL_NONRECOVERABLE',
                        ),
                        sha256=task.fileinfo.sha256,
                        type="TASK PRE-EMPTED",
                    ))

                error_key = error.build_key(task=task)
                self.dispatch_client.service_failed(task.sid, error_key, error)
            self.sleep(2)
Beispiel #3
0
def perform_check():
    """Probe the dependencies this container needs, then exit the process.

    When ``PRIVILEGED`` is ``true`` the datastore, filestore and the service's
    own queue are opened directly; otherwise the service-server liveness
    endpoint is queried. When ``updates_host`` is set and this host's name
    does not start with it, the update server's liveness endpoint is queried
    as well.

    :raises Exception: if a queried liveness endpoint does not report OK
    """
    is_privileged = environ.get('PRIVILEGED', 'false').lower() == 'true'
    if is_privileged:
        # Privileged services test connectivity to core directly
        forge.get_datastore()
        forge.get_filestore(connection_attempts=1)
        forge.get_service_queue(service=environ['AL_SERVICE_NAME'])
    else:
        # Everyone else checks that the service-server is available
        api_response = requests.get(
            f"{environ['SERVICE_API_HOST']}/healthz/live")
        if not api_response.ok:
            raise Exception('Unable to reach service-server')

    # With an updater configured, check it is reachable. The hostname test
    # keeps the check from running on the updater itself.
    updates_host = environ.get('updates_host')
    if updates_host and not environ['HOSTNAME'].startswith(updates_host):
        update_url = f"http://{updates_host}:{environ['updates_port']}/healthz/live"
        if not requests.get(update_url).ok:
            raise Exception('Unable to reach local update server')
    exit()
Beispiel #4
0
 def __init__(self, name, datastore, redis, filestore):
     """Set up the state for the service called ``name``.

     :param name: service name; also used as the suffix of the name passed
         to the parent constructor
     :param datastore: datastore handle kept on the instance and handed to
         the dispatch client
     :param redis: redis connection used for the service queue and the
         dispatch client
     :param filestore: filestore handle kept on the instance
     """
     super().__init__('assemblyline.service.' + name)
     self.service_name = name
     self.datastore, self.filestore = datastore, filestore

     # Queue the service reads from, and the client used to talk to dispatch
     self.queue = get_service_queue(name, redis)
     self.dispatch_client = DispatchClient(self.datastore, redis)

     # Both tables start out empty
     self.hits = {}
     self.drops = {}
def test_plumber_clearing(core, metrics):
    """Check that a queued task is cleared once its service is disabled.

    A file is ingested, the test waits for a task to appear on the 'pre'
    service queue, and the service is then disabled. The completed submission
    must hold one file, three results and a single error whose message
    mentions "disabled". The service is re-enabled on the way out.
    """
    global _global_semaphore
    _global_semaphore = threading.Semaphore(value=0)
    start = time.time()

    def set_pre_enabled(enabled):
        # Rewrite the 'enabled' flag on the stored delta of the 'pre' service
        delta = core.ds.service_delta.get('pre')
        delta['enabled'] = enabled
        core.ds.service_delta.save('pre', delta)

    try:
        # Have the plumber cancel tasks
        sha, size = ready_body(core, {'pre': {'hold': 60}})

        submission = SubmissionInput(dict(
            metadata={},
            params=dict(description="file abc123",
                        services=dict(selected=''),
                        submitter='user',
                        groups=['user'],
                        max_extracted=10000),
            notification=dict(queue='test_plumber_clearing', threshold=0),
            files=[dict(sha256=sha, size=size, name='abc123')],
        ))
        core.ingest_queue.push(submission.as_primitives())

        metrics.expect('ingester', 'submissions_ingested', 1)
        service_queue = get_service_queue('pre', core.redis)

        # Poll until the task reaches the service queue, or give up
        start = time.time()
        while service_queue.length() < 1:
            if time.time() - start > RESPONSE_TIMEOUT:
                pytest.fail(f'Found { service_queue.length()}')
            time.sleep(0.1)

        set_pre_enabled(False)

        # The finished submission is announced on the notification queue
        notification_queue = NamedQueue('nq-test_plumber_clearing', core.redis)
        dropped_task = IngestTask(
            notification_queue.pop(timeout=RESPONSE_TIMEOUT))
        sub = core.ds.submission.get(dropped_task.submission.sid)
        assert len(sub.files) == 1
        assert len(sub.results) == 3
        assert len(sub.errors) == 1
        error = core.ds.error.get(sub.errors[0])
        assert "disabled" in error.response.message

        for component, counter in (('ingester', 'submissions_completed'),
                                   ('dispatcher', 'submissions_completed'),
                                   ('dispatcher', 'files_completed'),
                                   ('service', 'fail_recoverable')):
            metrics.expect(component, counter, 1)

    finally:
        _global_semaphore.release()
        set_pre_enabled(True)
Beispiel #6
0
def test_plumber_clearing(core):
    """Check that a queued task is cleared once its service is disabled.

    A watcher server runs for the duration of the test. A file is ingested,
    the test waits (up to 20 seconds from the start) for the 'pre' service
    queue to fill, and the service is then disabled. The completed submission
    must hold one file, three results and a single error whose message
    mentions "disabled". The service is re-enabled and the watcher stopped on
    the way out.
    """
    global _global_semaphore
    _global_semaphore = threading.Semaphore(value=0)

    start = time.time()
    watch = WatcherServer(redis=core.redis, redis_persist=core.redis)
    watch.start()

    def set_pre_enabled(enabled):
        # Rewrite the 'enabled' flag on the stored delta of the 'pre' service
        delta = core.ds.service_delta.get('pre')
        delta['enabled'] = enabled
        core.ds.service_delta.save('pre', delta)

    try:
        # Have the plumber cancel tasks
        sha, size = ready_body(core, {'pre': {'semaphore': 60}})

        submission = SubmissionInput(dict(
            metadata={},
            params=dict(description="file abc123",
                        services=dict(selected=''),
                        submitter='user',
                        groups=['user'],
                        max_extracted=10000),
            notification=dict(queue='test_plumber_clearing', threshold=0),
            files=[dict(sha256=sha, size=size, name='abc123')],
        ))
        core.ingest_queue.push(submission.as_primitives())

        # Give the task a moment, then poll the queue until the deadline
        service_queue = get_service_queue('pre', core.redis)
        time.sleep(0.5)
        while service_queue.length() == 0 and time.time() - start < 20:
            time.sleep(0.1)

        set_pre_enabled(False)

        # The finished submission is announced on the notification queue
        notification_queue = NamedQueue('nq-test_plumber_clearing', core.redis)
        dropped_task = IngestTask(notification_queue.pop(timeout=5))
        sub = core.ds.submission.get(dropped_task.submission.sid)
        assert len(sub.files) == 1
        assert len(sub.results) == 3
        assert len(sub.errors) == 1

        error = core.ds.error.get(sub.errors[0])
        assert "disabled" in error.response.message
    finally:
        _global_semaphore.release()
        set_pre_enabled(True)
        watch.stop()
        watch.join()
Beispiel #7
0
    def __init__(self, datastore=None, filestore=None):
        """Set up configuration, storage handles, counters and queues.

        :param datastore: datastore to use; when falsy one is obtained from
            ``forge.get_datastore()``
        :param filestore: filestore to use; when falsy one is obtained from
            ``forge.get_filestore()``
        """
        super().__init__('assemblyline.randomservice')
        self.config = forge.get_config()
        self.datastore = datastore if datastore else forge.get_datastore()
        self.filestore = filestore if filestore else forge.get_filestore()
        self.client_id = get_random_id()
        self.service_state_hash = ExpiringHash(SERVICE_STATE_HASH, ttl=30 * 60)

        # One metrics counter per key found in the service_delta collection
        counters = {}
        for service_name in self.datastore.service_delta.keys():
            counters[service_name] = MetricsFactory(
                'service', Metrics, name=service_name, config=self.config)
        self.counters = counters

        # One service queue per key found in the service_delta collection
        queues = []
        for service_name in self.datastore.service_delta.keys():
            queues.append(forge.get_service_queue(service_name))
        self.queues = queues

        self.dispatch_client = DispatchClient(self.datastore)
        self.service_info = CachedObject(self.datastore.list_all_services,
                                         kwargs={'as_obj': False})
    def _request_work(self, worker_id, service_name, service_version,
                      timeout, blocking, low_priority=False) -> Optional[ServiceTask]:
        """Try to pull one task for a worker from the service queue.

        :param worker_id: identifier of the requesting worker; stored in the
            task metadata under ``worker__``
        :param service_name: name of the service whose queue is read
        :param service_version: not used by this method
        :param timeout: time budget; anything that is not positive after
            ``int()`` returns immediately
        :param blocking: use a blocking pop with ``timeout`` when true
        :param low_priority: take the task from the low priority end of the
            queue
        :return: the task if one was claimed, otherwise None
        """
        # For when we recursively retry on bad task dequeue-ing
        wait_seconds = int(timeout)
        if wait_seconds <= 0:
            self.log.info(f"{service_name}:{worker_id} no task returned [timeout]")
            return None

        # Get work from the queue
        queue = get_service_queue(service_name, self.redis)
        if blocking:
            message = queue.blocking_pop(timeout=wait_seconds, low_priority=low_priority)
        else:
            batch = queue.unpush(1) if low_priority else queue.pop(1)
            message = batch[0] if batch else batch

        if not message:
            self.log.info(f"{service_name}:{worker_id} no task returned: [empty message]")
            return None

        task = ServiceTask(message)
        task.metadata['worker__'] = worker_id
        dispatcher = task.metadata['dispatcher__']

        # Tasks issued by a dispatcher that is no longer known are dropped
        if not self.is_dispatcher(dispatcher):
            self.log.info(f"{service_name}:{worker_id} no task returned: [task from dead dispatcher]")
            return None

        # A failed add means the task could not be claimed for this worker
        if not self.running_tasks.add(task.key(), task.as_primitives()):
            return None

        self.log.info(f"[{task.sid}/{task.fileinfo.sha256}] {service_name}:{worker_id} task found")
        # Tell the owning dispatcher that the task has started
        start_queue = self._get_queue_from_cache(DISPATCH_START_EVENTS + dispatcher)
        start_queue.push((task.sid, task.fileinfo.sha256, service_name, worker_id))
        return task
    def send_heartbeat(self, m_type, m_name, m_data, instances):
        """Build and publish the heartbeat message for one component type.

        The message layout depends on ``m_type``. Any exception raised while
        building or publishing a message is logged and swallowed, so one
        failing heartbeat does not stop the caller. Unknown types only produce
        a warning.

        :param m_type: component type ("dispatcher", "ingester", "alerter",
            "expiry", "archive", "scaler", "scaler_status" or "service")
        :param m_name: component or service name the metrics belong to
        :param m_data: metrics payload placed under the ``metrics`` key
        :param instances: instance count reported for most types; ignored for
            "dispatcher" and "service", which compute their own value
        """
        if m_type == "dispatcher":
            try:
                # The dispatcher reports per-instance details, so the
                # ``instances`` argument is replaced by the live instance list
                instances = sorted(Dispatcher.all_instances(
                    self.redis_persist))
                inflight = {
                    _i: Dispatcher.instance_assignment_size(
                        self.redis_persist, _i)
                    for _i in instances
                }
                queues = {
                    _i: Dispatcher.all_queue_lengths(self.redis, _i)
                    for _i in instances
                }

                msg = {
                    "sender": self.sender,
                    "msg": {
                        "inflight": {
                            "max": self.config.core.dispatcher.max_inflight,
                            "outstanding": self.dispatch_active_hash.length(),
                            "per_instance": [inflight[_i] for _i in instances]
                        },
                        "instances": len(instances),
                        "metrics": m_data,
                        "queues": {
                            "ingest":
                            self.dispatcher_submission_queue.length(),
                            "start": [queues[_i]['start'] for _i in instances],
                            "result":
                            [queues[_i]['result'] for _i in instances],
                            "command":
                            [queues[_i]['command'] for _i in instances]
                        },
                        "component": m_name,
                    }
                }
                self.status_queue.publish(
                    DispatcherMessage(msg).as_primitives())
                self.log.info(f"Sent dispatcher heartbeat: {msg['msg']}")
            except Exception:
                self.log.exception(
                    "An exception occurred while generating DispatcherMessage")

        elif m_type == "ingester":
            try:
                # Queue length for each priority range of the unique queue
                c_q_len = self.ingest_unique_queue.count(*self.c_rng)
                h_q_len = self.ingest_unique_queue.count(*self.h_rng)
                m_q_len = self.ingest_unique_queue.count(*self.m_rng)
                l_q_len = self.ingest_unique_queue.count(*self.l_rng)

                msg = {
                    "sender": self.sender,
                    "msg": {
                        "instances": instances,
                        "metrics": m_data,
                        "processing": {
                            "inflight": self.ingest_scanning.length()
                        },
                        "processing_chance": {
                            "critical": 1 - drop_chance(c_q_len, self.c_s_at),
                            "high": 1 - drop_chance(h_q_len, self.h_s_at),
                            "low": 1 - drop_chance(l_q_len, self.l_s_at),
                            "medium": 1 - drop_chance(m_q_len, self.m_s_at)
                        },
                        "queues": {
                            "critical": c_q_len,
                            "high": h_q_len,
                            "ingest": self.ingest_queue.length(),
                            "complete": self.ingest_complete_queue.length(),
                            "low": l_q_len,
                            "medium": m_q_len
                        }
                    }
                }
                self.status_queue.publish(IngestMessage(msg).as_primitives())
                self.log.info(f"Sent ingester heartbeat: {msg['msg']}")
            except Exception:
                self.log.exception(
                    "An exception occurred while generating IngestMessage")

        elif m_type == "alerter":
            try:
                msg = {
                    "sender": self.sender,
                    "msg": {
                        "instances": instances,
                        "metrics": m_data,
                        "queues": {
                            "alert": self.alert_queue.length()
                        }
                    }
                }
                self.status_queue.publish(AlerterMessage(msg).as_primitives())
                self.log.info(f"Sent alerter heartbeat: {msg['msg']}")
            except Exception:
                self.log.exception(
                    "An exception occurred while generating AlerterMessage")

        elif m_type == "expiry":
            try:
                msg = {
                    "sender": self.sender,
                    "msg": {
                        "instances": instances,
                        "metrics": m_data,
                        "queues": self.to_expire
                    }
                }
                self.status_queue.publish(ExpiryMessage(msg).as_primitives())
                self.log.info(f"Sent expiry heartbeat: {msg['msg']}")
            except Exception:
                self.log.exception(
                    "An exception occurred while generating ExpiryMessage")

        elif m_type == "archive":
            try:
                msg = {
                    "sender": self.sender,
                    "msg": {
                        "instances": instances,
                        "metrics": m_data
                    }
                }
                self.status_queue.publish(ArchiveMessage(msg).as_primitives())
                self.log.info(f"Sent archive heartbeat: {msg['msg']}")
            except Exception:
                self.log.exception(
                    "An exception occurred while generating ArchiveMessage")

        elif m_type == "scaler":
            try:
                msg = {
                    "sender": self.sender,
                    "msg": {
                        "instances": instances,
                        "metrics": m_data,
                    }
                }
                self.status_queue.publish(ScalerMessage(msg).as_primitives())
                self.log.info(f"Sent scaler heartbeat: {msg['msg']}")
            except Exception:
                # Name the message type actually built in this branch
                self.log.exception(
                    "An exception occurred while generating ScalerMessage")

        elif m_type == "scaler_status":
            try:
                msg = {
                    "sender": self.sender,
                    "msg": {
                        "service_name": m_name,
                        "metrics": m_data,
                    }
                }
                self.status_queue.publish(
                    ScalerStatusMessage(msg).as_primitives())
                self.log.info(f"Sent scaler status heartbeat: {msg['msg']}")
            except Exception:
                # Name the message type actually built in this branch
                self.log.exception(
                    "An exception occurred while generating ScalerStatusMessage")

        elif m_type == "service":
            try:
                # The instance count is derived from the busy and idle
                # workers, not taken from the ``instances`` argument
                busy, idle = get_working_and_idle(self.redis, m_name)
                msg = {
                    "sender": self.sender,
                    "msg": {
                        "instances": len(busy) + len(idle),
                        "metrics": m_data,
                        "activity": {
                            'busy': len(busy),
                            'idle': len(idle)
                        },
                        "queue": get_service_queue(m_name,
                                                   self.redis).length(),
                        "service_name": m_name
                    }
                }
                self.status_queue.publish(ServiceMessage(msg).as_primitives())
                self.log.info(f"Sent service heartbeat: {msg['msg']}")
            except Exception:
                self.log.exception(
                    "An exception occurred while generating ServiceMessage")

        else:
            self.log.warning(
                f"Skipping unknown counter: {m_name} [{m_type}] ==> {m_data}")
    def sync_services(self):
        """Reconcile the scaler's service profiles with the datastore.

        The method first re-schedules itself on ``self.scheduler`` so it runs
        again after ``SERVICE_SYNC_INTERVAL``. For every service returned by
        the datastore it then:

        - prepares the network, starts dependency containers and advances the
          stage when an enabled service is in the ``Off`` stage;
        - stops the service when it is disabled;
        - adds a scaling profile for a ``Running`` service that has none, or
          restarts the deployment when its container config or config hash
          changed, and refreshes the instance limit from the licence count.

        Exceptions raised while handling one service are logged and passed to
        ``handle_service_error`` so the remaining services are still
        processed. Services that have a profile but were not returned by the
        datastore are stopped at the end.
        """
        # Schedule the next run before doing any work
        self.scheduler.enter(SERVICE_SYNC_INTERVAL, 0, self.sync_services)
        default_settings = self.config.core.scaler.service_defaults
        # defaultdict(str) makes unknown image variables resolve to ''
        image_variables = defaultdict(str)
        image_variables.update(self.config.services.image_variables)
        current_services = set(self.profiles.keys())
        discovered_services = []

        # Get all the service data
        for service in self.datastore.list_all_services(full=True):
            # Self-assignment only exists to attach a type annotation
            service: Service = service
            name = service.name
            stage = self.get_service_stage(service.name)
            discovered_services.append(name)

            # noinspection PyBroadException
            try:
                if service.enabled and stage == ServiceStage.Off:
                    # Enable this service's dependencies
                    self.controller.prepare_network(
                        service.name,
                        service.docker_config.allow_internet_access)
                    for _n, dependency in service.dependencies.items():
                        self.controller.start_stateful_container(
                            service_name=service.name,
                            container_name=_n,
                            spec=dependency,
                            labels={'dependency_for': service.name})

                    # Move to the next service stage
                    if service.update_config and service.update_config.wait_for_update:
                        self._service_stage_hash.set(name, ServiceStage.Update)
                    else:
                        self._service_stage_hash.set(name,
                                                     ServiceStage.Running)

                if not service.enabled:
                    self.stop_service(service.name, stage)
                    continue

                # Check that all enabled services are enabled
                # NOTE: `stage` was read before the transition above, so a
                # service that just left Off is not picked up by this branch
                # during the same pass.
                if service.enabled and stage == ServiceStage.Running:
                    # Compute a hash of service properties not include in the docker config, that
                    # should still result in a service being restarted when changed
                    config_hash = hash(str(sorted(service.config.items())))
                    config_hash = hash(
                        (config_hash, str(service.submission_params)))

                    # Build the docker config for the service, we are going to either create it or
                    # update it so we need to know what the current configuration is either way
                    docker_config = service.docker_config
                    docker_config.image = Template(
                        docker_config.image).safe_substitute(image_variables)
                    # Append default environment variables the service did not set itself
                    set_keys = set(var.name
                                   for var in docker_config.environment)
                    for var in default_settings.environment:
                        if var.name not in set_keys:
                            docker_config.environment.append(var)

                    # Add the service to the list of services being scaled
                    if name not in self.profiles:
                        self.log.info(f'Adding {service.name} to scaling')
                        self.add_service(
                            ServiceProfile(
                                name=name,
                                min_instances=default_settings.min_instances,
                                growth=default_settings.growth,
                                shrink=default_settings.shrink,
                                config_hash=config_hash,
                                backlog=default_settings.backlog,
                                max_instances=service.licence_count,
                                container_config=docker_config,
                                queue=get_service_queue(name, self.redis),
                                shutdown_seconds=service.timeout +
                                30,  # Give service an extra 30 seconds to upload results
                            ))

                    # Update RAM, CPU, licence requirements for running services
                    else:
                        profile = self.profiles[name]

                        if profile.container_config != docker_config or profile.config_hash != config_hash:
                            self.log.info(
                                f"Updating deployment information for {name}")
                            profile.container_config = docker_config
                            profile.config_hash = config_hash
                            self.controller.restart(profile)
                            self.log.info(
                                f"Deployment information for {name} replaced")

                        # A licence count of 0 is treated as "no instance limit"
                        if service.licence_count == 0:
                            profile._max_instances = float('inf')
                        else:
                            profile._max_instances = service.licence_count
            except Exception:
                self.log.exception(
                    f"Error applying service settings from: {service.name}")
                self.handle_service_error(service.name)

        # Find any services we have running, that are no longer in the database and remove them
        for stray_service in current_services - set(discovered_services):
            stage = self.get_service_stage(stray_service)
            self.stop_service(stray_service, stage)
 def service_queue(name):
     """Return the queue for service ``name``, using the ``redis`` name from the surrounding scope."""
     queue = get_service_queue(name, redis)
     return queue
    def _request_work(self, worker_id, service_name, service_version, timeout,
                      blocking) -> Optional[ServiceTask]:
        """Pull one task for a worker from the service queue and claim it.

        :param worker_id: identifier of the requesting worker; used in log
            lines, metrics and the timeout watch
        :param service_name: name of the service whose queue is read
        :param service_version: version recorded in the error raised when the
            retry limit is exceeded
        :param timeout: time budget; anything that is not positive after
            ``int()`` returns immediately
        :param blocking: use a blocking pop with ``timeout`` when true,
            otherwise a single non-blocking pop
        :return: the claimed task, or None when the time budget is spent or
            the queue yielded nothing
        :raises RetryRequestWork: when the popped task cannot be handed out
            (already running elsewhere, already finished or abandoned, or
            past the retry limit) and the caller should try again
        """
        # For when we recursively retry on bad task dequeue-ing
        if int(timeout) <= 0:
            self.log.info(
                f"{service_name}:{worker_id} no task returned [timeout]")
            return None

        # Get work from the queue
        work_queue = get_service_queue(service_name, self.redis)
        if blocking:
            result = work_queue.blocking_pop(timeout=int(timeout))
        else:
            # pop(1) returns a sequence; unwrap its single element if present
            result = work_queue.pop(1)
            if result:
                result = result[0]

        if not result:
            self.log.info(
                f"{service_name}:{worker_id} no task returned: [empty message]"
            )
            return None
        task = ServiceTask(result)

        # If someone is supposed to be working on this task right now, we won't be able to add it
        if self.running_tasks.add(task.key(), task.as_primitives()):
            self.log.info(
                f"[{task.sid}/{task.fileinfo.sha256}] {service_name}:{worker_id} task found"
            )

            process_table = DispatchHash(task.sid, self.redis)

            # A dispatch time of 0 is treated as "no longer dispatched"
            abandoned = process_table.dispatch_time(
                file_hash=task.fileinfo.sha256, service=task.service_name) == 0
            finished = process_table.finished(
                file_hash=task.fileinfo.sha256,
                service=task.service_name) is not None

            # A service might be re-dispatched as it finishes, when that is the case it can be marked as
            # both finished and dispatched, if that is the case, drop the dispatch from the table
            if finished and not abandoned:
                process_table.drop_dispatch(file_hash=task.fileinfo.sha256,
                                            service=task.service_name)

            if abandoned or finished:
                self.log.info(
                    f"[{task.sid}/{task.fileinfo.sha256}] {service_name}:{worker_id} task already complete"
                )
                # Release the claim taken above before asking for a retry
                self.running_tasks.pop(task.key())
                raise RetryRequestWork()

            # Check if this task has reached the retry limit
            attempt_record = ExpiringHash(f'dispatch-hash-attempts-{task.sid}',
                                          host=self.redis)
            total_attempts = attempt_record.increment(task.key())
            self.log.info(
                f"[{task.sid}/{task.fileinfo.sha256}] {service_name}:{worker_id} "
                f"task attempt {total_attempts}/3")
            if total_attempts > 3:
                self.log.warning(
                    f"[{task.sid}/{task.fileinfo.sha256}] "
                    f"{service_name}:{worker_id} marking task failed: TASK PREEMPTED "
                )
                error = Error(
                    dict(
                        archive_ts=now_as_iso(
                            self.config.datastore.ilm.days_until_archive * 24 *
                            60 * 60),
                        created='NOW',
                        # task.ttl is converted from days to seconds
                        expiry_ts=now_as_iso(task.ttl * 24 * 60 *
                                             60) if task.ttl else None,
                        response=dict(
                            message=
                            f'The number of retries has passed the limit.',
                            service_name=task.service_name,
                            service_version=service_version,
                            status='FAIL_NONRECOVERABLE',
                        ),
                        sha256=task.fileinfo.sha256,
                        type="TASK PRE-EMPTED",
                    ))
                error_key = error.build_key(task=task)
                self.service_failed(task.sid, error_key, error)
                export_metrics_once(service_name,
                                    Metrics,
                                    dict(fail_nonrecoverable=1),
                                    host=worker_id,
                                    counter_type='service')
                # NOTE(review): the running_tasks entry added above is not
                # popped here; presumably service_failed cleans it up - confirm.
                raise RetryRequestWork()

            # Get the service information
            service_data = self.service_data[task.service_name]
            # Start the timeout watch for this task using the service's timeout
            self.timeout_watcher.touch_task(timeout=int(service_data.timeout),
                                            key=f'{task.sid}-{task.key()}',
                                            worker=worker_id,
                                            task_key=task.key())
            return task
        # The task is already being worked on by someone else; ask for a retry
        raise RetryRequestWork()
    def dispatch_file(self, task: FileTask):
        """ Handle a message describing a file to be processed.

        This file may be:
            - A new submission or extracted file.
            - A file that has just completed a stage of processing.
            - A file that has not completed a a stage of processing, but this
              call has been triggered by a timeout or similar.

        If the file is totally new, we will setup a dispatch table, and fill it in.

        Once we make/load a dispatch table, we will dispatch whichever group the table
        shows us hasn't been completed yet.

        When we dispatch to a service, we check if the task is already in the dispatch
        queue. If it isn't proceed normally. If it is, check that the service is still online.
        """
        # Read the message content
        file_hash = task.file_info.sha256
        active_task = self.active_submissions.get(task.sid)

        if active_task is None:
            self.log.warning(f"[{task.sid}] Untracked submission is being processed")
            return

        submission_task = SubmissionTask(active_task)
        submission = submission_task.submission

        # Refresh the watch on the submission, we are still working on it
        self.timeout_watcher.touch(key=task.sid, timeout=int(self.config.core.dispatcher.timeout),
                                   queue=SUBMISSION_QUEUE, message={'sid': task.sid})

        # Open up the file/service table for this submission
        dispatch_table = DispatchHash(task.sid, self.redis, fetch_results=True)

        # Load things that we will need to fill out the
        file_tags = ExpiringSet(task.get_tag_set_name(), host=self.redis)
        file_tags_data = file_tags.members()
        temporary_submission_data = ExpiringHash(task.get_temporary_submission_data_name(), host=self.redis)
        temporary_data = [dict(name=row[0], value=row[1]) for row in temporary_submission_data.items().items()]

        # Calculate the schedule for the file
        schedule = self.build_schedule(dispatch_table, submission, file_hash, task.file_info.type)
        started_stages = []

        # Go through each round of the schedule removing complete/failed services
        # Break when we find a stage that still needs processing
        outstanding = {}
        score = 0
        errors = 0
        while schedule and not outstanding:
            stage = schedule.pop(0)
            started_stages.append(stage)

            for service_name in stage:
                service = self.scheduler.services.get(service_name)
                if not service:
                    continue

                # Load the results, if there are no results, then the service must be dispatched later
                # Don't look at if it has been dispatched, as multiple dispatches are fine,
                # but missing a dispatch isn't.
                finished = dispatch_table.finished(file_hash, service_name)
                if not finished:
                    outstanding[service_name] = service
                    continue

                # If the service terminated in an error, count the error and continue
                if finished.is_error:
                    errors += 1
                    continue

                # if the service finished, count the score, and check if the file has been dropped
                score += finished.score
                if not submission.params.ignore_filtering and finished.drop:
                    schedule.clear()
                    if schedule:  # If there are still stages in the schedule, over write them for next time
                        dispatch_table.schedules.set(file_hash, started_stages)

        # Try to retry/dispatch any outstanding services
        if outstanding:
            self.log.info(f"[{task.sid}] File {file_hash} sent to services : {', '.join(list(outstanding.keys()))}")

            for service_name, service in outstanding.items():

                # Find the actual file name from the list of files in submission
                filename = None
                for file in submission.files:
                    if task.file_info.sha256 == file.sha256:
                        filename = file.name
                        break

                # Build the actual service dispatch message
                config = self.build_service_config(service, submission)
                service_task = ServiceTask(dict(
                    sid=task.sid,
                    metadata=submission.metadata,
                    min_classification=task.min_classification,
                    service_name=service_name,
                    service_config=config,
                    fileinfo=task.file_info,
                    filename=filename or task.file_info.sha256,
                    depth=task.depth,
                    max_files=task.max_files,
                    ttl=submission.params.ttl,
                    ignore_cache=submission.params.ignore_cache,
                    ignore_dynamic_recursion_prevention=submission.params.ignore_dynamic_recursion_prevention,
                    tags=file_tags_data,
                    temporary_submission_data=temporary_data,
                    deep_scan=submission.params.deep_scan,
                    priority=submission.params.priority,
                ))
                dispatch_table.dispatch(file_hash, service_name)
                queue = get_service_queue(service_name, self.redis)
                queue.push(service_task.priority, service_task.as_primitives())

        else:
            # There are no outstanding services, this file is done
            # clean up the tags
            file_tags.delete()

            # If there are no outstanding ANYTHING for this submission,
            # send a message to the submission dispatcher to finalize
            self.counter.increment('files_completed')
            if dispatch_table.all_finished():
                self.log.info(f"[{task.sid}] Finished processing file '{file_hash}' starting submission finalization.")
                self.submission_queue.push({'sid': submission.sid})
            else:
                self.log.info(f"[{task.sid}] Finished processing file '{file_hash}'. Other files are not finished.")
# Example #14
# 0
    def _sync_service(self, service: Service):
        """Reconcile the scaler's state for one service with its definition.

        In order, this method:

        1. Disables the service if its version does not start with the
           framework/system version, or if it waits for updates but has no
           update sources configured.
        2. Stops the service and returns if it is not enabled.
        3. Launches the service's dependency containers when the stage is not
           ``Running`` or a dependency container key is missing.
        4. When the stage is ``Running``, registers a new scaling profile for
           the service or refreshes the existing one, restarting dependency
           or service containers whose change key differs.

        Any exception raised while doing so is logged and passed to
        ``handle_service_error`` instead of propagating to the caller.
        """
        name = service.name
        stage = self.get_service_stage(service.name)
        default_settings = self.config.core.scaler.service_defaults
        # Unknown template variables resolve to an empty string via defaultdict(str)
        image_variables: defaultdict[str, str] = defaultdict(str)
        image_variables.update(self.config.services.image_variables)

        def prepare_container(docker_config: DockerConfig) -> DockerConfig:
            """Fill in the image template and add default environment variables not already set."""
            docker_config.image = Template(
                docker_config.image).safe_substitute(image_variables)
            set_keys = set(var.name for var in docker_config.environment)
            for var in default_settings.environment:
                if var.name not in set_keys:
                    docker_config.environment.append(var)
            return docker_config

        # noinspection PyBroadException
        try:

            def disable_incompatible_service():
                """Mark the service disabled locally and in the datastore, then notify listeners."""
                service.enabled = False
                if self.datastore.service_delta.update(service.name, [
                    (self.datastore.service_delta.UPDATE_SET, 'enabled', False)
                ]):
                    # Raise awareness to other components by sending an event for the service
                    self.service_event_sender.send(service.name, {
                        'operation': Operation.Incompatible,
                        'name': service.name
                    })

            # Check if service considered compatible to run on Assemblyline?
            system_spec = f'{FRAMEWORK_VERSION}.{SYSTEM_VERSION}'
            if not service.version.startswith(system_spec):
                # If FW and SYS version don't prefix in the service version, we can't guarantee the service is compatible
                # Disable and treat it as incompatible due to service version.
                self.log.warning(
                    "Disabling service with incompatible version. "
                    f"[{service.version} != '{system_spec}.X.{service.update_channel}Y']."
                )
                disable_incompatible_service()
            elif service.update_config and service.update_config.wait_for_update and not service.update_config.sources:
                # All signatures sources from a signature-dependent service was removed
                # Disable and treat it as incompatible due to service configuration relative to source management
                self.log.warning(
                    "Disabling service with incompatible service configuration. "
                    "Signature-dependent service has no signature sources.")
                disable_incompatible_service()

            if not service.enabled:
                self.stop_service(service.name, stage)
                return

            # Build the docker config for the dependencies. For now the dependency blob values
            # aren't set for the change key going to kubernetes because everything about
            # the dependency config should be captured in change key that the function generates
            # internally. A change key is set for the service deployment as that includes
            # things like the submission params
            dependency_config: dict[str, Any] = {}
            dependency_blobs: dict[str, str] = {}
            for _n, dependency in service.dependencies.items():
                dependency.container = prepare_container(dependency.container)
                dependency_config[_n] = dependency
                dep_hash = get_id_from_data(dependency, length=16)
                dependency_blobs[
                    _n] = f"dh={dep_hash}v={service.version}p={service.privileged}"

            # Check if the service dependencies have been deployed.
            dependency_keys = []
            updater_ready = stage == ServiceStage.Running
            if service.update_config:
                for _n, dependency in dependency_config.items():
                    key = self.controller.stateful_container_key(
                        service.name, _n, dependency, '')
                    if key:
                        dependency_keys.append(_n + key)
                    else:
                        updater_ready = False

            # If stage is not set to running or a dependency container is missing start the setup process
            if not updater_ready:
                self.log.info(f'Preparing environment for {service.name}')
                # Move to the next service stage (do this first because the container we are starting may care)
                if service.update_config and service.update_config.wait_for_update:
                    self._service_stage_hash.set(name, ServiceStage.Update)
                    stage = ServiceStage.Update
                else:
                    self._service_stage_hash.set(name, ServiceStage.Running)
                    stage = ServiceStage.Running

                # Enable this service's dependencies before trying to launch the service containers
                dependency_internet = [
                    (dep_name, dep.container.allow_internet_access)
                    for dep_name, dep in dependency_config.items()
                ]

                self.controller.prepare_network(
                    service.name, service.docker_config.allow_internet_access,
                    dependency_internet)
                for _n, dependency in dependency_config.items():
                    self.log.info(f'Launching {service.name} dependency {_n}')
                    self.controller.start_stateful_container(
                        service_name=service.name,
                        container_name=_n,
                        spec=dependency,
                        labels={'dependency_for': service.name},
                        change_key=dependency_blobs.get(_n, ''))

            # If the conditions for running are met deploy or update service containers
            if stage == ServiceStage.Running:
                # Build the docker config for the service, we are going to either create it or
                # update it so we need to know what the current configuration is either way
                docker_config = prepare_container(service.docker_config)

                # Compute a blob of service properties not include in the docker config, that
                # should still result in a service being restarted when changed
                cfg_items = get_recursive_sorted_tuples(service.config)
                dep_keys = ''.join(sorted(dependency_keys))
                config_blob = (
                    f"c={cfg_items}sp={service.submission_params}"
                    f"dk={dep_keys}p={service.privileged}d={docker_config}")

                # Add the service to the list of services being scaled
                with self.profiles_lock:
                    if name not in self.profiles:
                        self.log.info(
                            f"Adding "
                            f"{f'privileged {service.name}' if service.privileged else service.name}"
                            " to scaling")
                        self.add_service(
                            ServiceProfile(
                                name=name,
                                min_instances=default_settings.min_instances,
                                growth=default_settings.growth,
                                shrink=default_settings.shrink,
                                config_blob=config_blob,
                                dependency_blobs=dependency_blobs,
                                backlog=default_settings.backlog,
                                max_instances=service.licence_count,
                                container_config=docker_config,
                                queue=get_service_queue(name, self.redis),
                                # Give service an extra 30 seconds to upload results
                                shutdown_seconds=service.timeout + 30,
                                privileged=service.privileged))

                    # Update RAM, CPU, licence requirements for running services
                    else:
                        profile = self.profiles[name]
                        profile.max_instances = service.licence_count
                        profile.privileged = service.privileged

                        for dependency_name, dependency_blob in dependency_blobs.items():
                            # A dependency added to the service after its profile was created has
                            # no recorded blob yet. Look it up with .get() so that case is treated
                            # as "changed" (and the container started) instead of raising KeyError
                            # and being reported as a service error by the handler below.
                            # NOTE(review): assumes profile.dependency_blobs is the plain dict
                            # handed to ServiceProfile above - confirm.
                            if profile.dependency_blobs.get(dependency_name) != dependency_blob:
                                self.log.info(
                                    f"Updating deployment information for {name}/{dependency_name}"
                                )
                                profile.dependency_blobs[
                                    dependency_name] = dependency_blob
                                self.controller.start_stateful_container(
                                    service_name=service.name,
                                    container_name=dependency_name,
                                    spec=dependency_config[dependency_name],
                                    labels={'dependency_for': service.name},
                                    change_key=dependency_blob)

                        if profile.config_blob != config_blob:
                            self.log.info(
                                f"Updating deployment information for {name}")
                            profile.container_config = docker_config
                            profile.config_blob = config_blob
                            self.controller.restart(profile)
                            self.log.info(
                                f"Deployment information for {name} replaced")

        except Exception:
            self.log.exception(
                f"Error applying service settings from: {service.name}")
            self.handle_service_error(service.name)
def test_dispatch_file(clean_redis):
    """Drive a single file through repeated ``dispatch_file`` calls.

    Between calls the dispatch hash is updated by hand (finish / fail) and the
    test checks which service queues receive a task next: first ``extract``
    and ``wrench``, then ``av-a``, ``av-b`` and ``frankenstrings``, then
    ``xerox``, and finally that one message lands on the submission queue.
    """

    def service_queue(name):
        """Return the service queue for ``name`` on the test redis instance."""
        return get_service_queue(name, clean_redis)

    collections = ['submission', 'result', 'service', 'error', 'file', 'filescore']
    ds = MockDatastore(collections=collections)
    file_hash = get_random_hash(64)
    sub = random_model_obj(models.submission.Submission)
    sid = 'first-submission'
    sub.sid = sid
    sub.params.ignore_cache = False

    disp = Dispatcher(ds, clean_redis, clean_redis, logging)
    tracked_submission = SubmissionTask(dict(submission=sub)).as_primitives()
    disp.active_submissions.add(sid, tracked_submission)
    dh = DispatchHash(sid=sid, client=clean_redis)

    def clear_service_queues():
        """Empty the queue of every service known to the dispatcher's scheduler."""
        for name in disp.scheduler.services:
            service_queue(name).delete()

    print('==== first dispatch')
    # Submit a problem, and check that it gets added to the dispatch hash
    # and the right service queues
    unrestricted = get_classification().UNRESTRICTED
    file_info = {
        'sha256': file_hash,
        'type': 'unknown',
        'magic': 'a',
        'md5': get_random_hash(32),
        'mime': 'a',
        'sha1': get_random_hash(40),
        'size': 10,
    }
    file_task = FileTask({
        'sid': sid,
        'min_classification': unrestricted,
        'file_info': file_info,
        'depth': 0,
        'max_files': 5,
    })
    disp.dispatch_file(file_task)

    for first_stage_service in ('extract', 'wrench'):
        assert dh.dispatch_time(file_hash, first_stage_service) > 0
    for first_stage_service in ('extract', 'wrench'):
        assert service_queue(first_stage_service).length() == 1

    # Making the same call again will queue it up again
    print('==== second dispatch')
    disp.dispatch_file(file_task)

    for first_stage_service in ('extract', 'wrench'):
        assert dh.dispatch_time(file_hash, first_stage_service) > 0
    for first_stage_service in ('extract', 'wrench'):
        assert service_queue(first_stage_service).length() == 2

    # Flag extract as a recoverable failure and make sure it gets pushed
    # into that service queue again
    print('==== third dispatch')
    clear_service_queues()
    dh.fail_recoverable(file_hash, 'extract')

    disp.dispatch_file(file_task)

    for first_stage_service in ('extract', 'wrench'):
        assert dh.dispatch_time(file_hash, first_stage_service) > 0
    assert service_queue('extract').length() == 1

    # Mark extract as finished, wrench as failed
    print('==== fourth dispatch')
    clear_service_queues()
    dh.finish(file_hash, 'extract', 'result-key', 0, 'U')
    dh.fail_nonrecoverable(file_hash, 'wrench', 'error-key')

    disp.dispatch_file(file_task)

    assert dh.finished(file_hash, 'extract')
    assert dh.finished(file_hash, 'wrench')
    for second_stage_service in ('av-a', 'av-b', 'frankenstrings'):
        assert service_queue(second_stage_service).length() == 1

    # Have the AVs fail, frankenstrings finishes
    print('==== fifth dispatch')
    clear_service_queues()
    dh.fail_nonrecoverable(file_hash, 'av-a', 'error-a')
    dh.fail_nonrecoverable(file_hash, 'av-b', 'error-b')
    dh.finish(file_hash, 'frankenstrings', 'result-key', 0, 'U')

    disp.dispatch_file(file_task)

    for second_stage_service in ('av-a', 'av-b', 'frankenstrings'):
        assert dh.finished(file_hash, second_stage_service)
    assert service_queue('xerox').length() == 1

    # Finish the xerox service and check if the submission completion got checked
    print('==== sixth dispatch')
    clear_service_queues()
    dh.finish(file_hash, 'xerox', 'result-key', 0, 'U')

    disp.dispatch_file(file_task)

    assert dh.finished(file_hash, 'xerox')
    assert len(disp.submission_queue) == 1
    def try_run(self):
        """Pull tasks for this service and act on the instructions in each file.

        The file fetched from the filestore is parsed as JSON, and the entry
        under this service's name decides what happens to the task:

        * ``hold``: push the task back on the service queue, wait on the
          global semaphore for up to that many seconds, then move on.
        * ``lock``: wait on the global semaphore for up to that many seconds
          before continuing with the task.
        * ``drop``: silently discard the task while the hit count for the
          file has not exceeded this value.
        * ``failure``: report the error described by ``error``.
        * otherwise: report a result built from defaults, overlaid with
          ``result`` / ``response`` and keyed by ``result_key`` if given.
        """
        while self.running:
            task = self.dispatch_client.request_work(
                'worker', self.service_name, '0', timeout=3)
            if not task:
                continue
            self.log.info(f"{self.service_name} has received a job {task.sid}")

            sha256 = task.fileinfo.sha256
            raw_file = self.filestore.get(sha256)

            # Only the section addressed to this service matters
            instructions = json.loads(raw_file).get(self.service_name, {})
            self.log.info(
                f"{self.service_name} following instruction: {instructions}")

            # Count how many times this file has been handed to the service
            hits = self.hits.get(sha256, 0) + 1
            self.hits[sha256] = hits

            if instructions.get('hold', False):
                queue = get_service_queue(self.service_name,
                                          self.dispatch_client.redis)
                queue.push(0, task.as_primitives())
                self.log.info(
                    f"{self.service_name} Requeued task to {queue.name} holding for {instructions['hold']}"
                )
                _global_semaphore.acquire(blocking=True,
                                          timeout=instructions['hold'])
                continue

            if instructions.get('lock', False):
                _global_semaphore.acquire(blocking=True,
                                          timeout=instructions['lock'])

            if 'drop' in instructions and instructions['drop'] >= hits:
                self.drops[sha256] = self.drops.get(sha256, 0) + 1
                continue

            if instructions.get('failure', False):
                error = Error(instructions['error'])
                error.sha256 = sha256
                self.dispatch_client.service_failed(
                    task.sid, error=error, error_key=get_random_id())
                continue

            # Default result, overridden below by anything in the instructions
            result_data = {
                'archive_ts': time.time() + 300,
                'classification': 'U',
                'response': {
                    'service_version': '0',
                    'service_tool_version': '0',
                    'service_name': self.service_name,
                },
                'result': {},
                'sha256': sha256,
                'expiry_ts': time.time() + 600
            }
            result_data.update(instructions.get('result', {}))
            result_data['response'].update(instructions.get('response', {}))

            result = Result(result_data)
            result_key = instructions.get('result_key', get_random_id())
            self.dispatch_client.service_finished(task.sid, result_key, result)