def _cleanup_submission(self, task: SubmissionTask, file_list: List[str]):
        """Clean up code that is the same for canceled and finished submissions"""
        submission = task.submission
        sid = submission.sid

        # Erase the temporary data which may have accumulated during processing
        for file_hash in file_list:
            hash_name = get_temporary_submission_data_name(sid, file_hash=file_hash)
            ExpiringHash(hash_name, host=self.redis).delete()

        if submission.params.quota_item and submission.params.submitter:
            self.log.info(f"[{sid}] Submission no longer counts toward {submission.params.submitter.upper()} quota")
            Hash('submissions-' + submission.params.submitter, self.redis_persist).pop(sid)

        if task.completed_queue:
            self.volatile_named_queue(task.completed_queue).push(submission.as_primitives())

        # Send complete message to any watchers.
        watcher_list = ExpiringSet(make_watcher_list_name(sid), host=self.redis)
        for w in watcher_list.members():
            NamedQueue(w).push(WatchQueueMessage({'status': 'STOP'}).as_primitives())

        # Clear the timeout watcher
        watcher_list.delete()
        self.timeout_watcher.clear(sid)
        self.active_submissions.pop(sid)

        # Count the submission as 'complete' either way
        self.counter.increment('submissions_completed')
    def __init__(self,
                 sid: str,
                 client: Union[Redis, StrictRedis],
                 fetch_results=False):
        """

        :param sid:
        :param client:
        :param fetch_results: Preload all the results on the redis server.
        """
        self.client = client
        self.sid = sid
        self._dispatch_key = f'{sid}{dispatch_tail}'
        self._finish_key = f'{sid}{finished_tail}'
        self._finish = self.client.register_script(finish_script)

        # cache the schedules calculated for the dispatcher, used to prevent rebuilding the
        # schedule repeatedly, and for telling the UI what services are pending
        self.schedules = ExpiringHash(f'dispatch-hash-schedules-{sid}',
                                      host=self.client)

        # How many services are outstanding for each file in the submission
        self._outstanding_service_count = ExpiringHash(
            f'dispatch-hash-files-{sid}', host=self.client)
        # Track which files have been extracted by what, in order to rebuild the file tree later
        self._file_tree = ExpiringSet(f'dispatch-hash-parents-{sid}',
                                      host=self.client)
        self._attempts = ExpiringHash(f'dispatch-hash-attempts-{sid}',
                                      host=self.client)

        # Local caches for _files and finished table
        self._cached_files = set(self._outstanding_service_count.keys())
        self._cached_results = dict()
        if fetch_results:
            self._cached_results = self.all_results()

        # Errors that are related to a submission, but not the terminal errors of a service
        self._other_errors = ExpiringSet(f'dispatch-hash-errors-{sid}',
                                         host=self.client)

        # TODO set these expire times from the global time limit for submissions
        retry_call(self.client.expire, self._dispatch_key, 60 * 60)
        retry_call(self.client.expire, self._finish_key, 60 * 60)
Example #3
0
def test_expiring_sets(redis_connection):
    if redis_connection:
        from assemblyline.remote.datatypes.set import ExpiringSet
        with ExpiringSet('test-expiring-set', ttl=1) as es:
            es.delete()

            values = ['a', 'b', 1, 2]
            assert es.add(*values) == 4
            assert es.length() == 4
            assert es.exist(values[2])
            for x in es.members():
                assert x in values
            time.sleep(1.1)
            assert es.length() == 0
            assert not es.exist(values[2])
class DispatchHash:
    def __init__(self,
                 sid: str,
                 client: Union[Redis, StrictRedis],
                 fetch_results=False):
        """

        :param sid:
        :param client:
        :param fetch_results: Preload all the results on the redis server.
        """
        self.client = client
        self.sid = sid
        self._dispatch_key = f'{sid}{dispatch_tail}'
        self._finish_key = f'{sid}{finished_tail}'
        self._finish = self.client.register_script(finish_script)

        # cache the schedules calculated for the dispatcher, used to prevent rebuilding the
        # schedule repeatedly, and for telling the UI what services are pending
        self.schedules = ExpiringHash(f'dispatch-hash-schedules-{sid}',
                                      host=self.client)

        # How many services are outstanding for each file in the submission
        self._outstanding_service_count = ExpiringHash(
            f'dispatch-hash-files-{sid}', host=self.client)
        # Track which files have been extracted by what, in order to rebuild the file tree later
        self._file_tree = ExpiringSet(f'dispatch-hash-parents-{sid}',
                                      host=self.client)
        self._attempts = ExpiringHash(f'dispatch-hash-attempts-{sid}',
                                      host=self.client)

        # Local caches for _files and finished table
        self._cached_files = set(self._outstanding_service_count.keys())
        self._cached_results = dict()
        if fetch_results:
            self._cached_results = self.all_results()

        # Errors that are related to a submission, but not the terminal errors of a service
        self._other_errors = ExpiringSet(f'dispatch-hash-errors-{sid}',
                                         host=self.client)

        # TODO set these expire times from the global time limit for submissions
        retry_call(self.client.expire, self._dispatch_key, 60 * 60)
        retry_call(self.client.expire, self._finish_key, 60 * 60)

    def add_file(self, file_hash: str, file_limit, parent_hash) -> bool:
        """Add a file to a submission.

        Returns: Whether the file could be added to the submission or has been rejected.
        """
        if parent_hash:
            self._file_tree.add(f'{file_hash}-{parent_hash}')
        else:
            self._file_tree.add(file_hash)

        # If it was already in the set, we don't need to check remotely
        if file_hash in self._cached_files:
            return True

        # If the set is already full, and its not in the set, then we don't need to check remotely
        if len(self._cached_files) >= file_limit:
            return False

        # Our local checks are unclear, check remotely,
        # 0 => already exists, still want to return true
        # 1 => didn't exist before
        # None => over size limit, return false
        if self._outstanding_service_count.limited_add(file_hash, 0,
                                                       file_limit) is not None:
            # If it was added, add it to the local cache so we don't need to check again
            self._cached_files.add(file_hash)
            return True
        return False

    def add_error(self, error_key: str) -> bool:
        """Add an error to a submission.

        NOTE: This method is for errors occuring outside of any errors handled via 'fail_*recoverable'

        Returns true if the error is new, false if the error is a duplicate.
        """
        return self._other_errors.add(error_key) > 0

    def dispatch(self, file_hash: str, service: str):
        """Mark that a service has been dispatched for the given sha."""
        if retry_call(self.client.hset, self._dispatch_key,
                      f"{file_hash}-{service}", time.time()):
            self._outstanding_service_count.increment(file_hash, 1)

    def drop_dispatch(self, file_hash: str, service: str):
        """If a dispatch has been found to be un-needed remove the counters."""
        if retry_call(self.client.hdel, self._dispatch_key,
                      f"{file_hash}-{service}"):
            self._outstanding_service_count.increment(file_hash, -1)

    def dispatch_count(self):
        """How many tasks have been dispatched for this submission."""
        return retry_call(self.client.hlen, self._dispatch_key)

    def dispatch_time(self, file_hash: str, service: str) -> float:
        """When was dispatch called for this sha/service pair."""
        result = retry_call(self.client.hget, self._dispatch_key,
                            f"{file_hash}-{service}")
        if result is None:
            return 0
        return float(result)

    def all_dispatches(self) -> Dict[str, Dict[str, float]]:
        """Load the entire table of things that should currently be running."""
        rows = retry_call(self.client.hgetall, self._dispatch_key)
        output = {}
        for key, timestamp in rows.items():
            file_hash, service = key.split(b'-', maxsplit=1)
            file_hash = file_hash.decode()
            service = service.decode()
            if file_hash not in output:
                output[file_hash] = {}
            output[file_hash][service] = float(timestamp)
        return output

    def fail_recoverable(self,
                         file_hash: str,
                         service: str,
                         error_key: str = None):
        """A service task has failed, but should be retried, clear that it has been dispatched.

        After this call, the service is in a non-dispatched state, and the status can't be update
        until it is dispatched again.
        """
        if error_key:
            self._other_errors.add(error_key)
        retry_call(self.client.hdel, self._dispatch_key,
                   f"{file_hash}-{service}")
        self._outstanding_service_count.increment(file_hash, -1)

    def fail_nonrecoverable(self, file_hash: str, service,
                            error_key) -> Tuple[int, bool]:
        """A service task has failed and should not be retried, entry the error as the result.

        Has exactly the same semantics as `finish` but for errors.
        """
        return retry_call(self._finish,
                          args=[
                              self.sid, file_hash, service,
                              json.dumps(['error', error_key, 0, False, ''])
                          ])

    def finish(self,
               file_hash,
               service,
               result_key,
               score,
               classification,
               drop=False) -> Tuple[int, bool]:
        """
        As a single transaction:
         - Remove the service from the dispatched list
         - Add the file to the finished list, with the given result key
         - return the number of items in the dispatched list and if this was a duplicate call to finish
        """
        return retry_call(self._finish,
                          args=[
                              self.sid, file_hash, service,
                              json.dumps([
                                  'result', result_key, score, drop,
                                  str(classification)
                              ])
                          ])

    def finished_count(self) -> int:
        """How many tasks have been finished for this submission."""
        return retry_call(self.client.hlen, self._finish_key)

    def finished(self, file_hash, service) -> Union[DispatchRow, None]:
        """If a service has been finished, return the key of the result document."""
        # Try the local cache
        result = self._cached_results.get(file_hash, {}).get(service, None)
        if result:
            return result
        # Try the server
        result = retry_call(self.client.hget, self._finish_key,
                            f"{file_hash}-{service}")
        if result:
            return DispatchRow(*json.loads(result))
        return None

    def all_finished(self) -> bool:
        """Are there no outstanding tasks, and at least one finished task."""
        return self.finished_count() > 0 and self.dispatch_count() == 0

    def all_results(self) -> Dict[str, Dict[str, DispatchRow]]:
        """Get all the records stored in the dispatch table.

        :return: output[file_hash][service_name] -> DispatchRow
        """
        rows = retry_call(self.client.hgetall, self._finish_key)
        output = {}
        for key, status in rows.items():
            file_hash, service = key.split(b'-', maxsplit=1)
            file_hash = file_hash.decode()
            service = service.decode()
            if file_hash not in output:
                output[file_hash] = {}
            output[file_hash][service] = DispatchRow(*json.loads(status))
        return output

    def all_extra_errors(self):
        """Return the set of errors not part of the dispatch table itself."""
        return self._other_errors.members()

    def all_files(self):
        return self._outstanding_service_count.keys()

    def file_tree(self):
        """Returns a mapping from file, to a list of files that are that file's parents.

        A none value being in the list indicates that the file is one of the root files of the submission.
        """
        edges = self._file_tree.members()
        output = {}
        for string in edges:
            if '-' in string:
                child, parent = string.split('-')
            else:
                child, parent = string, None

            if child not in output:
                output[child] = []
            output[child].append(parent)
        return output

    def delete(self):
        """Clear the tables from the redis server."""
        retry_call(self.client.delete, self._dispatch_key)
        retry_call(self.client.delete, self._finish_key)
        self.schedules.delete()
        self._outstanding_service_count.delete()
        self._file_tree.delete()
        self._other_errors.delete()
        self._attempts.delete()
Example #5
0
 def _get_watcher_list(self, sid):
     return ExpiringSet(make_watcher_list_name(sid), host=self.redis)
Example #6
0
    def service_finished(self,
                         sid: str,
                         result_key: str,
                         result: Result,
                         temporary_data: Optional[Dict[str, Any]] = None):
        """Notifies the dispatcher of service completion, and possible new files to dispatch."""
        # Make sure the dispatcher knows we were working on this task
        task_key = ServiceTask.make_key(
            sid=sid,
            service_name=result.response.service_name,
            sha=result.sha256)
        task = self.running_tasks.pop(task_key)
        if not task:
            self.log.warning(
                f"[{sid}/{result.sha256}] {result.response.service_name} could not find the specified "
                f"task in its set of running tasks while processing successful results."
            )
            return
        task = ServiceTask(task)

        # Check if the service is a candidate for dynamic recursion prevention
        if not task.ignore_dynamic_recursion_prevention:
            service_info = self.service_data.get(result.response.service_name,
                                                 None)
            if service_info and service_info.category == "Dynamic Analysis":
                # TODO: This should be done in lua because it can introduce race condition in the future
                #       but in the meantime it will remain this way while we can confirm it work as expected
                submission = self.active_submissions.get(sid)
                submission['submission']['params']['services'][
                    'runtime_excluded'].append(result.response.service_name)
                self.active_submissions.set(sid, submission)

        # Save or freshen the result, the CONTENT of the result shouldn't change, but we need to keep the
        # most distant expiry time to prevent pulling it out from under another submission too early
        if result.is_empty():
            # Empty Result will not be archived therefore result.archive_ts drives their deletion
            self.ds.emptyresult.save(result_key,
                                     {"expiry_ts": result.archive_ts})
        else:
            with Lock(f"lock-{result_key}", 5, self.redis):
                old = self.ds.result.get(result_key)
                if old:
                    if old.expiry_ts and result.expiry_ts:
                        result.expiry_ts = max(result.expiry_ts, old.expiry_ts)
                    else:
                        result.expiry_ts = None
                self.ds.result.save(result_key, result)

        # Let the logs know we have received a result for this task
        if result.drop_file:
            self.log.debug(
                f"[{sid}/{result.sha256}] {task.service_name} succeeded. "
                f"Result will be stored in {result_key} but processing will stop after this service."
            )
        else:
            self.log.debug(
                f"[{sid}/{result.sha256}] {task.service_name} succeeded. "
                f"Result will be stored in {result_key}")

        # Store the result object and mark the service finished in the global table
        process_table = DispatchHash(task.sid, self.redis)
        remaining, duplicate = process_table.finish(
            task.fileinfo.sha256, task.service_name, result_key,
            result.result.score, result.classification, result.drop_file)
        self.timeout_watcher.clear(f'{task.sid}-{task.key()}')
        if duplicate:
            self.log.warning(
                f"[{sid}/{result.sha256}] {result.response.service_name}'s current task was already "
                f"completed in the global processing table.")
            return

        # Push the result tags into redis
        new_tags = []
        for section in result.result.sections:
            new_tags.extend(tag_dict_to_list(section.tags.as_primitives()))
        if new_tags:
            tag_set = ExpiringSet(get_tag_set_name(
                sid=task.sid, file_hash=task.fileinfo.sha256),
                                  host=self.redis)
            tag_set.add(*new_tags)

        # Update the temporary data table for this file
        temp_data_hash = ExpiringHash(get_temporary_submission_data_name(
            sid=task.sid, file_hash=task.fileinfo.sha256),
                                      host=self.redis)
        for key, value in (temporary_data or {}).items():
            temp_data_hash.set(key, value)

        # Send the extracted files to the dispatcher
        depth_limit = self.config.submission.max_extraction_depth
        new_depth = task.depth + 1
        if new_depth < depth_limit:
            # Prepare the temporary data from the parent to build the temporary data table for
            # these newly extract files
            parent_data = dict(temp_data_hash.items())

            for extracted_data in result.response.extracted:
                if not process_table.add_file(
                        extracted_data.sha256,
                        task.max_files,
                        parent_hash=task.fileinfo.sha256):
                    if parent_data:
                        child_hash_name = get_temporary_submission_data_name(
                            task.sid, extracted_data.sha256)
                        ExpiringHash(child_hash_name,
                                     host=self.redis).multi_set(parent_data)

                    self._dispatching_error(
                        task, process_table,
                        Error({
                            'archive_ts': result.archive_ts,
                            'expiry_ts': result.expiry_ts,
                            'response': {
                                'message':
                                f"Too many files extracted for submission {task.sid} "
                                f"{extracted_data.sha256} extracted by "
                                f"{task.service_name} will be dropped",
                                'service_name':
                                task.service_name,
                                'service_tool_version':
                                result.response.service_tool_version,
                                'service_version':
                                result.response.service_version,
                                'status':
                                'FAIL_NONRECOVERABLE'
                            },
                            'sha256': extracted_data.sha256,
                            'type': 'MAX FILES REACHED'
                        }))
                    continue
                file_data = self.files.get(extracted_data.sha256)
                self.file_queue.push(
                    FileTask(
                        dict(sid=task.sid,
                             min_classification=task.min_classification.max(
                                 extracted_data.classification).value,
                             file_info=dict(
                                 magic=file_data.magic,
                                 md5=file_data.md5,
                                 mime=file_data.mime,
                                 sha1=file_data.sha1,
                                 sha256=file_data.sha256,
                                 size=file_data.size,
                                 type=file_data.type,
                             ),
                             depth=new_depth,
                             parent_hash=task.fileinfo.sha256,
                             max_files=task.max_files)).as_primitives())
        else:
            for extracted_data in result.response.extracted:
                self._dispatching_error(
                    task, process_table,
                    Error({
                        'archive_ts': result.archive_ts,
                        'expiry_ts': result.expiry_ts,
                        'response': {
                            'message':
                            f"{task.service_name} has extracted a file "
                            f"{extracted_data.sha256} beyond the depth limits",
                            'service_name':
                            result.response.service_name,
                            'service_tool_version':
                            result.response.service_tool_version,
                            'service_version':
                            result.response.service_version,
                            'status':
                            'FAIL_NONRECOVERABLE'
                        },
                        'sha256': extracted_data.sha256,
                        'type': 'MAX DEPTH REACHED'
                    }))

        # If the global table said that this was the last outstanding service,
        # send a message to the dispatchers.
        if remaining <= 0:
            self.file_queue.push(
                FileTask(
                    dict(sid=task.sid,
                         min_classification=task.min_classification.value,
                         file_info=task.fileinfo,
                         depth=task.depth,
                         max_files=task.max_files)).as_primitives())

        # Send the result key to any watching systems
        msg = {'status': 'OK', 'cache_key': result_key}
        for w in self._get_watcher_list(task.sid).members():
            NamedQueue(w).push(msg)
    def dispatch_file(self, task: FileTask):
        """ Handle a message describing a file to be processed.

        This file may be:
            - A new submission or extracted file.
            - A file that has just completed a stage of processing.
            - A file that has not completed a a stage of processing, but this
              call has been triggered by a timeout or similar.

        If the file is totally new, we will setup a dispatch table, and fill it in.

        Once we make/load a dispatch table, we will dispatch whichever group the table
        shows us hasn't been completed yet.

        When we dispatch to a service, we check if the task is already in the dispatch
        queue. If it isn't proceed normally. If it is, check that the service is still online.
        """
        # Read the message content
        file_hash = task.file_info.sha256
        active_task = self.active_submissions.get(task.sid)

        if active_task is None:
            self.log.warning(f"[{task.sid}] Untracked submission is being processed")
            return

        submission_task = SubmissionTask(active_task)
        submission = submission_task.submission

        # Refresh the watch on the submission, we are still working on it
        self.timeout_watcher.touch(key=task.sid, timeout=int(self.config.core.dispatcher.timeout),
                                   queue=SUBMISSION_QUEUE, message={'sid': task.sid})

        # Open up the file/service table for this submission
        dispatch_table = DispatchHash(task.sid, self.redis, fetch_results=True)

        # Load things that we will need to fill out the
        file_tags = ExpiringSet(task.get_tag_set_name(), host=self.redis)
        file_tags_data = file_tags.members()
        temporary_submission_data = ExpiringHash(task.get_temporary_submission_data_name(), host=self.redis)
        temporary_data = [dict(name=row[0], value=row[1]) for row in temporary_submission_data.items().items()]

        # Calculate the schedule for the file
        schedule = self.build_schedule(dispatch_table, submission, file_hash, task.file_info.type)
        started_stages = []

        # Go through each round of the schedule removing complete/failed services
        # Break when we find a stage that still needs processing
        outstanding = {}
        score = 0
        errors = 0
        while schedule and not outstanding:
            stage = schedule.pop(0)
            started_stages.append(stage)

            for service_name in stage:
                service = self.scheduler.services.get(service_name)
                if not service:
                    continue

                # Load the results, if there are no results, then the service must be dispatched later
                # Don't look at if it has been dispatched, as multiple dispatches are fine,
                # but missing a dispatch isn't.
                finished = dispatch_table.finished(file_hash, service_name)
                if not finished:
                    outstanding[service_name] = service
                    continue

                # If the service terminated in an error, count the error and continue
                if finished.is_error:
                    errors += 1
                    continue

                # if the service finished, count the score, and check if the file has been dropped
                score += finished.score
                if not submission.params.ignore_filtering and finished.drop:
                    schedule.clear()
                    if schedule:  # If there are still stages in the schedule, over write them for next time
                        dispatch_table.schedules.set(file_hash, started_stages)

        # Try to retry/dispatch any outstanding services
        if outstanding:
            self.log.info(f"[{task.sid}] File {file_hash} sent to services : {', '.join(list(outstanding.keys()))}")

            for service_name, service in outstanding.items():

                # Find the actual file name from the list of files in submission
                filename = None
                for file in submission.files:
                    if task.file_info.sha256 == file.sha256:
                        filename = file.name
                        break

                # Build the actual service dispatch message
                config = self.build_service_config(service, submission)
                service_task = ServiceTask(dict(
                    sid=task.sid,
                    metadata=submission.metadata,
                    min_classification=task.min_classification,
                    service_name=service_name,
                    service_config=config,
                    fileinfo=task.file_info,
                    filename=filename or task.file_info.sha256,
                    depth=task.depth,
                    max_files=task.max_files,
                    ttl=submission.params.ttl,
                    ignore_cache=submission.params.ignore_cache,
                    ignore_dynamic_recursion_prevention=submission.params.ignore_dynamic_recursion_prevention,
                    tags=file_tags_data,
                    temporary_submission_data=temporary_data,
                    deep_scan=submission.params.deep_scan,
                    priority=submission.params.priority,
                ))
                dispatch_table.dispatch(file_hash, service_name)
                queue = get_service_queue(service_name, self.redis)
                queue.push(service_task.priority, service_task.as_primitives())

        else:
            # There are no outstanding services, this file is done
            # clean up the tags
            file_tags.delete()

            # If there are no outstanding ANYTHING for this submission,
            # send a message to the submission dispatcher to finalize
            self.counter.increment('files_completed')
            if dispatch_table.all_finished():
                self.log.info(f"[{task.sid}] Finished processing file '{file_hash}' starting submission finalization.")
                self.submission_queue.push({'sid': submission.sid})
            else:
                self.log.info(f"[{task.sid}] Finished processing file '{file_hash}'. Other files are not finished.")
Example #8
0
def get_signup_queue(key):
    return ExpiringSet(f"signup_id_{key}",
                       host=config.core.redis.nonpersistent.host,
                       port=config.core.redis.nonpersistent.port,
                       ttl=60 * 15)
Example #9
0
def get_token_store(key):
    return ExpiringSet(f"oauth_token_{key}",
                       host=config.core.redis.nonpersistent.host,
                       port=config.core.redis.nonpersistent.port,
                       ttl=60 * 2)