def test_dispatch_extracted(clean_redis, clean_datastore):
    """Check that a file reported as extracted by a service is dispatched too.

    A one-file submission is dispatched, the 'extract' service reports a
    second file in its result, and the next piece of work handed out for
    'extract' must be for that second file.
    """
    datastore = clean_datastore
    redis_conn = clean_redis

    # Register both files in the fake datastore
    parent_sha256 = get_random_hash(64)
    child_sha256 = get_random_hash(64)

    for sha256 in (parent_sha256, child_sha256):
        file_record = random_model_obj(models.file.File)
        file_record.sha256 = sha256
        datastore.file.save(sha256, file_record)

    # Build the fake submission; only the parent file is listed in it
    sid = 'first-submission'
    submission = random_model_obj(models.submission.Submission)
    submission.files = [{'name': './file', 'sha256': parent_sha256}]
    submission.sid = sid

    # One dispatcher instance, and a client that already knows about it
    dispatcher = Dispatcher(datastore, redis_conn, redis_conn)
    dispatcher.running = ToggleTrue()
    dispatch_client = DispatchClient(datastore, redis_conn, redis_conn)
    dispatch_client.dispatcher_data_age = time.time()
    dispatch_client.dispatcher_data.append(dispatcher.instance_id)

    # Launch the submission and let the dispatcher pick it up
    dispatch_client.dispatch_submission(submission)
    dispatcher.pull_submissions()
    dispatcher.service_worker(dispatcher.process_queue_index(sid))

    # The first 'extract' job must be for the submitted file
    first_job = dispatch_client.request_work('0', 'extract', '0')
    assert first_job.fileinfo.sha256 == parent_sha256
    assert first_job.filename == './file'

    # Finish that job with a result that lists the second file as extracted
    extract_result: Result = random_minimal_obj(Result)
    extract_result.sha256 = parent_sha256
    extract_result.response.service_name = 'extract'
    extract_result.response.extracted = [{
        'sha256': child_sha256,
        'name': 'second-*',
        'description': 'abc',
        'classification': 'U',
    }]
    dispatch_client.service_finished(sid, 'extracted-done', extract_result)

    # Process the result; the worker is run twice on the submission's queue
    dispatcher.pull_service_results()
    dispatcher.service_worker(dispatcher.process_queue_index(sid))
    dispatcher.service_worker(dispatcher.process_queue_index(sid))

    # The next 'extract' job must now be for the extracted file
    second_job = dispatch_client.request_work('0', 'extract', '0')
    assert second_job.fileinfo.sha256 == child_sha256
    assert second_job.filename == 'second-*'
class SubmissionClient:
    """A helper class to simplify submitting files from internal or external sources.

    This tool helps take care of interactions between the filestore,
    datastore, dispatcher, and any sources of files to be processed.

    Instances can be used as context managers; leaving the ``with`` block
    calls :meth:`stop`.
    """
    def __init__(self,
                 datastore: AssemblylineDatastore = None,
                 filestore: FileStore = None,
                 config=None,
                 redis=None,
                 identify=None):
        """Wire up the client, falling back to forge-provided defaults.

        Any of ``config``, ``datastore``, ``filestore`` and ``identify`` that
        is not supplied is obtained from ``forge``. ``redis`` is stored and
        passed on as given.
        """
        self.log = logging.getLogger('assemblyline.submission_client')
        self.config = config or forge.CachedObject(forge.get_config)
        self.datastore = datastore or forge.get_datastore(self.config)
        self.filestore = filestore or forge.get_filestore(self.config)
        self.redis = redis

        # Only stop the identify helper in stop() when this client created it;
        # a caller-supplied helper is left for the caller to manage.
        self.cleanup = not identify
        self.identify = identify or forge.get_identify(
            config=self.config, datastore=self.datastore, use_cache=True)

        # A client for interacting with the dispatcher. It receives the
        # datastore resolved above rather than the raw argument, which may
        # be None.
        self.dispatcher = DispatchClient(self.datastore, redis)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.stop()

    def stop(self):
        """Stop the identify helper if it was created by this client."""
        if self.cleanup:
            self.identify.stop()

    @elasticapm.capture_span(span_type='submission_client')
    def rescan(self, submission: Submission, results: Dict[str, Result],
               file_infos: Dict[str, FileInfo], file_tree, errors: List[str],
               rescan_services: List[str]):
        """
        Rescan a submission started on another system.

        ``submission`` is read and modified through mapping-style item
        access: its completion time is dropped, its state is reset to
        'submitted' and ``rescan_services`` is stored as the list of services
        to rescan. A ``Submission`` object built from it is saved to the
        datastore and dispatched together with ``results``, ``file_infos``,
        ``file_tree`` and ``errors``.

        Returns the (modified) ``submission`` that was passed in.

        Raises SubmissionException when the submission has no files, or when
        one of its files is not present in the datastore.
        """
        # Reset submission processing data. A submission that never finished
        # has no completion time recorded, so tolerate the key being absent.
        submission['times'].pop('completed', None)
        submission['state'] = 'submitted'

        # Set the list of service to rescan
        submission['params']['services']['rescan'] = rescan_services

        # Create the submission object
        submission_obj = Submission(submission)

        if len(submission_obj.files) == 0:
            raise SubmissionException("No files found to submit.")

        for f in submission_obj.files:
            if not self.datastore.file.exists(f.sha256):
                raise SubmissionException(
                    f"File {f.sha256} does not exist, cannot continue submission."
                )

        # Set the new expiry (ttl is expressed in days)
        if submission_obj.params.ttl:
            submission_obj.expiry_ts = epoch_to_iso(now() +
                                                    submission_obj.params.ttl *
                                                    24 * 60 * 60)

        # Clearing runtime_excluded on initial submit or resubmit
        submission_obj.params.services.runtime_excluded = []

        # Save the submission
        self.datastore.submission.save(submission_obj.sid, submission_obj)

        # Dispatch the submission
        self.log.debug("Submission complete. Dispatching: %s",
                       submission_obj.sid)
        self.dispatcher.dispatch_bundle(submission_obj, results, file_infos,
                                        file_tree, errors)

        return submission

    @elasticapm.capture_span(span_type='submission_client')
    def submit(self,
               submission_obj: SubmissionObject,
               local_files: List = None,
               completed_queue=None):
        """Submit several files in a single submission.

        Every path in ``local_files`` is prepared with :meth:`_ready_file`
        and appended to ``submission_obj.files``; metadata found while
        preparing a file is merged into ``submission_obj.metadata``, and a
        classification found there is merged with the submission's own.
        The resulting ``Submission`` is saved to the datastore, handed to the
        dispatcher and returned.

        Note: nothing in this class deletes the paths in ``local_files``;
        only the temporary extracted copy made while preparing a file is
        removed.

        Raises SubmissionException when there is nothing to submit, a file is
        empty or larger than the configured maximum (unless
        ``params.ignore_size`` is set), or the classifications cannot be
        merged.
        """
        if local_files is None:
            local_files = []

        if len(submission_obj.files) == 0 and len(local_files) == 0:
            raise SubmissionException("No files found to submit...")

        # ttl is expressed in days, counted from the submission time
        if submission_obj.params.ttl:
            expiry = epoch_to_iso(submission_obj.time.timestamp() +
                                  submission_obj.params.ttl * 24 * 60 * 60)
        else:
            expiry = None
        max_size = self.config.submission.max_file_size

        for local_file in local_files:
            # Upload/download, extract, analyze files.
            # Re-read on every iteration: an earlier file may have raised
            # the submission's classification.
            original_classification = str(submission_obj.params.classification)
            file_hash, size, new_metadata = self._ready_file(
                local_file, expiry, original_classification)
            new_name = new_metadata.pop('name',
                                        safe_str(os.path.basename(local_file)))
            meta_classification = new_metadata.pop('classification',
                                                   original_classification)
            if meta_classification != original_classification:
                try:
                    submission_obj.params.classification = Classification.max_classification(
                        meta_classification, original_classification)
                except InvalidClassification as ic:
                    # Keep the original error attached as the cause
                    raise SubmissionException(
                        "The classification found inside the cart file cannot be merged with "
                        f"the classification the file was submitted as: {str(ic)}"
                    ) from ic

            submission_obj.metadata.update(**flatten(new_metadata))

            # Check that after we have resolved exactly what to pass on, that it
            # remains a valid target for scanning
            if size > max_size and not submission_obj.params.ignore_size:
                msg = "File too large (%d > %d). Submission failed" % (
                    size, max_size)
                raise SubmissionException(msg)
            elif size == 0:
                msg = "File empty. Submission failed"
                raise SubmissionException(msg)

            submission_obj.files.append(
                File({
                    'name': new_name,
                    'size': size,
                    'sha256': file_hash,
                }))

        # Clearing runtime_excluded on initial submit or resubmit
        submission_obj.params.services.runtime_excluded = []

        # We should now have all the information we need to construct a submission object
        sub = Submission(
            dict(
                archive_ts=now_as_iso(
                    self.config.datastore.ilm.days_until_archive * 24 * 60 *
                    60),
                classification=submission_obj.params.classification,
                error_count=0,
                errors=[],
                expiry_ts=expiry,
                file_count=len(submission_obj.files),
                files=submission_obj.files,
                max_score=0,
                metadata=submission_obj.metadata,
                params=submission_obj.params,
                results=[],
                sid=submission_obj.sid,
                state='submitted',
                scan_key=submission_obj.scan_key,
            ))

        # Record the submitter's "malicious" hint as a verdict when enabled
        if self.config.ui.allow_malicious_hinting and submission_obj.params.malicious:
            sub.verdict = {"malicious": [submission_obj.params.submitter]}

        self.datastore.submission.save(sub.sid, sub)

        self.log.debug("Submission complete. Dispatching: %s", sub.sid)
        self.dispatcher.dispatch_submission(sub,
                                            completed_queue=completed_queue)

        return sub

    def _ready_file(self, local_path: str, expiry,
                    classification) -> Tuple[str, int, dict]:
        """Take a file from local storage and prepare it for submission.

        The file is identified, passed through ``decode_file``, recorded in
        the datastore and uploaded to the filestore. When ``decode_file``
        yields an extracted file, that file is the one stored and its
        temporary copy is deleted afterwards; the file at ``local_path``
        itself is not deleted here.

        Returns a ``(sha256, size, metadata)`` tuple, where ``metadata`` is
        what ``decode_file`` returned with 'classification' filled in
        (defaulting to ``classification``).

        Raises SubmissionException when the file is empty or the resulting
        classification is not valid.
        """
        extracted_path = None
        try:
            # Analyze the file and make sure the file table is up to date
            fileinfo = self.identify.fileinfo(local_path)

            if fileinfo['size'] == 0:
                raise SubmissionException("File empty. Submission failed")

            # Check if there is an integrated decode process for this file
            # eg. files that are packaged, and the contained file (not the package
            # that local_path points to) should be passed into the system.
            extracted_path, fileinfo, al_meta = decode_file(
                local_path, fileinfo, self.identify)
            al_meta['classification'] = al_meta.get('classification',
                                                    classification)
            if not Classification.is_valid(al_meta['classification']):
                raise SubmissionException(
                    f"{al_meta['classification']} is not a valid classification for this system"
                    ", submission is cancelled...")

            if extracted_path:
                local_path = extracted_path

            self.datastore.save_or_freshen_file(fileinfo['sha256'],
                                                fileinfo,
                                                expiry,
                                                al_meta['classification'],
                                                redis=self.redis)
            self.filestore.upload(local_path, fileinfo['sha256'])
            return fileinfo['sha256'], fileinfo['size'], al_meta

        finally:
            # If we extracted anything delete it
            if extracted_path:
                if os.path.exists(extracted_path):
                    os.unlink(extracted_path)
def test_simple(clean_redis, clean_datastore):
    """Walk one single-file submission through the dispatcher, step by step.

    After each step the test checks which services the file is queued for
    and which results or errors have been recorded, until the submission is
    finished and no longer tracked by the dispatcher.
    """
    datastore = clean_datastore
    redis_conn = clean_redis

    def queue_length(service_name):
        # Number of entries currently in the named service queue
        return get_service_queue(service_name, redis_conn).length()

    # The single file being analysed
    file_record = random_model_obj(File)
    file_hash = file_record.sha256
    file_record.type = 'unknown'
    datastore.file.save(file_hash, file_record)

    # The submission wrapping that file
    sid = 'first-submission'
    submission: Submission = random_model_obj(models.submission.Submission)
    submission.sid = sid
    submission.params.ignore_cache = False
    submission.params.max_extracted = 5
    submission.params.classification = get_classification().UNRESTRICTED
    submission.params.initial_data = json.dumps({'cats': 'big'})
    submission.files = [{'sha256': file_hash, 'name': 'file'}]

    # One dispatcher instance, and a client that already knows about it
    dispatcher = Dispatcher(datastore, redis_conn, redis_conn)
    dispatcher.running = ToggleTrue()
    dispatch_client = DispatchClient(datastore, redis_conn, redis_conn)
    dispatch_client.dispatcher_data_age = time.time()
    dispatch_client.dispatcher_data.append(dispatcher.instance_id)

    def process_results(rounds=1):
        # Pull pending service results and run the worker, `rounds` times
        for _ in range(rounds):
            dispatcher.pull_service_results()
            dispatcher.service_worker(dispatcher.process_queue_index(sid))

    # Submit a problem, and check that it gets added to the dispatch hash
    # and the right service queues
    logger.info('==== first dispatch')
    dispatch_client.dispatch_submission(submission)
    dispatcher.pull_submissions()
    dispatcher.service_worker(dispatcher.process_queue_index(sid))
    task = dispatcher.tasks.get(sid)

    for service in ('extract', 'wrench'):
        assert task.queue_keys[(file_hash, service)] is not None
    for service in ('extract', 'wrench'):
        assert queue_length(service) == 1

    # Making the same call again will queue it up again
    logger.info('==== second dispatch')
    dispatcher.dispatch_file(task, file_hash)

    for service in ('extract', 'wrench'):
        assert task.queue_keys[(file_hash, service)] is not None
    for service in ('extract', 'wrench'):
        # the queue doesn't pile up
        assert queue_length(service) == 1

    logger.info('==== third dispatch')
    extract_job = dispatch_client.request_work('0', 'extract', '0')
    expected_temp_data = [{'name': 'cats', 'value': 'big'}]
    assert extract_job.temporary_submission_data == expected_temp_data
    dispatch_client.service_failed(sid, 'abc123',
                                   make_error(file_hash, 'extract'))
    # Deliberately do in the wrong order to make sure that works
    process_results()

    for service in ('extract', 'wrench'):
        assert task.queue_keys[(file_hash, service)] is not None
    assert queue_length('extract') == 1

    # Mark extract as finished, wrench as failed
    logger.info('==== fourth dispatch')
    for service in ('extract', 'wrench'):
        dispatch_client.request_work('0', service, '0')
    dispatch_client.service_finished(sid, 'extract-result',
                                     make_result(file_hash, 'extract'))
    dispatch_client.service_failed(sid, 'wrench-error',
                                   make_error(file_hash, 'wrench', False))
    process_results(rounds=2)

    assert wait_error(task, file_hash, 'wrench')
    assert wait_result(task, file_hash, 'extract')
    for service in ('av-a', 'av-b', 'frankenstrings'):
        assert queue_length(service) == 1

    # Have the AVs fail, frankenstrings finishes
    logger.info('==== fifth dispatch')
    for service in ('av-a', 'av-b', 'frankenstrings'):
        dispatch_client.request_work('0', service, '0')
    dispatch_client.service_failed(sid, 'av-a-error',
                                   make_error(file_hash, 'av-a', False))
    dispatch_client.service_failed(sid, 'av-b-error',
                                   make_error(file_hash, 'av-b', False))
    dispatch_client.service_finished(sid, 'f-result',
                                     make_result(file_hash, 'frankenstrings'))
    process_results(rounds=3)

    assert wait_result(task, file_hash, 'frankenstrings')
    for service in ('av-a', 'av-b'):
        assert wait_error(task, file_hash, service)
    assert queue_length('xerox') == 1

    # Finish the xerox service and check if the submission completion got checked
    logger.info('==== sixth dispatch')
    dispatch_client.request_work('0', 'xerox', '0')
    dispatch_client.service_finished(sid, 'xerox-result-key',
                                     make_result(file_hash, 'xerox'))
    process_results()
    dispatcher.save_submission()

    assert wait_result(task, file_hash, 'xerox')
    assert dispatcher.tasks.get(sid) is None
# Example #4
# 0
class SubmissionClient:
    """A helper class to simplify submitting files from internal or external sources.

    This tool helps take care of interactions between the filestore,
    datastore, dispatcher, and any sources of files to be processed.
    """

    def __init__(self, datastore: AssemblylineDatastore = None, filestore: FileStore = None,
                 config=None, redis=None):
        """Wire up the client, falling back to forge-provided defaults.

        Any of ``config``, ``datastore`` and ``filestore`` that is not
        supplied is obtained from ``forge``. ``redis`` is stored and passed
        on as given.
        """
        self.log = logging.getLogger('assemblyline.submission_client')
        self.config = config or forge.CachedObject(forge.get_config)
        self.datastore = datastore or forge.get_datastore(self.config)
        self.filestore = filestore or forge.get_filestore(self.config)
        self.redis = redis

        # A client for interacting with the dispatcher. It receives the
        # datastore resolved above rather than the raw argument, which may
        # be None.
        self.dispatcher = DispatchClient(self.datastore, redis)

    @staticmethod
    def _check_file_size(size, max_size, ignore_size):
        """Raise SubmissionException if a file is empty or over the size limit."""
        if size > max_size and not ignore_size:
            msg = "File too large (%d > %d). Submission failed" % (size, max_size)
            raise SubmissionException(msg)
        elif size == 0:
            msg = "File empty. Submission failed"
            raise SubmissionException(msg)

    def submit(self, submission_obj: SubmissionObject, local_files: List = None, cleanup=True, completed_queue=None):
        """Submit several files in a single submission.

        When ``submission_obj.files`` is empty, every path in ``local_files``
        is prepared and uploaded, then appended to ``submission_obj.files``.
        Otherwise each file already listed is downloaded from the filestore
        to a temporary path, re-checked against its sha256 and updated in
        place. The resulting ``Submission`` is saved to the datastore, handed
        to the dispatcher and returned.

        With ``cleanup`` set (the default) the paths in ``local_files`` are
        deleted before this method returns, whether it succeeds or fails, so
        there should be no local copies of the files left.

        Raises SubmissionException when there is nothing to submit, or a file
        is empty or larger than the configured maximum (unless
        ``params.ignore_size`` is set).
        """
        if local_files is None:
            local_files = []

        try:
            # ttl is expressed in days
            expiry = now_as_iso(submission_obj.params.ttl * 24 * 60 * 60) if submission_obj.params.ttl else None
            max_size = self.config.submission.max_file_size

            if len(submission_obj.files) == 0:
                if len(local_files) == 0:
                    raise SubmissionException("No files found to submit...")

                for local_file in local_files:
                    # Upload/download, extract, analyze files
                    file_hash, size, new_metadata = self._ready_file(local_file, expiry,
                                                                     str(submission_obj.params.classification),
                                                                     cleanup, upload=True)
                    new_name = new_metadata.pop('name', safe_str(os.path.basename(local_file)))
                    submission_obj.params.classification = new_metadata.pop('classification',
                                                                            submission_obj.params.classification)
                    submission_obj.metadata.update(**flatten(new_metadata))

                    # Check that after we have resolved exactly what to pass on, that it
                    # remains a valid target for scanning
                    self._check_file_size(size, max_size, submission_obj.params.ignore_size)

                    submission_obj.files.append(File({
                        'name': new_name,
                        'size': size,
                        'sha256': file_hash,
                    }))
            else:
                for f in submission_obj.files:
                    temporary_path = None
                    try:
                        fd, temporary_path = tempfile.mkstemp(prefix="submission.submit")
                        os.close(fd)  # We don't need the file descriptor open
                        self.filestore.download(f.sha256, temporary_path)
                        file_hash, size, new_metadata = self._ready_file(temporary_path, expiry,
                                                                         str(submission_obj.params.classification),
                                                                         cleanup, sha256=f.sha256)

                        new_name = new_metadata.pop('name', f.name)
                        submission_obj.params.classification = new_metadata.pop('classification',
                                                                                submission_obj.params.classification)
                        submission_obj.metadata.update(**flatten(new_metadata))

                        # Check that after we have resolved exactly what to pass on, that it
                        # remains a valid target for scanning
                        self._check_file_size(size, max_size, submission_obj.params.ignore_size)

                        if f.size is None:
                            f.size = size

                        # The decoded file may differ from the one originally listed
                        f.name = new_name
                        f.sha256 = file_hash

                    finally:
                        # The temporary download is removed regardless of `cleanup`
                        if temporary_path:
                            if os.path.exists(temporary_path):
                                os.unlink(temporary_path)

            # Initialize the temporary data from the submission parameter
            if submission_obj.params.initial_data:
                try:
                    temp_hash_name = get_temporary_submission_data_name(submission_obj.sid,
                                                                        submission_obj.files[0].sha256)
                    temporary_submission_data = ExpiringHash(temp_hash_name, host=self.redis)
                    temporary_submission_data.multi_set(json.loads(submission_obj.params.initial_data))
                except ValueError as err:
                    # Bad initial data is logged and ignored rather than failing the submission
                    self.log.warning(f"[{submission_obj.sid}] could not process initialization data: {err}")

            # Clearing runtime_excluded on initial submit or resubmit
            submission_obj.params.services.runtime_excluded = []

            # We should now have all the information we need to construct a submission object
            sub = Submission(dict(
                archive_ts=now_as_iso(self.config.datastore.ilm.days_until_archive * 24 * 60 * 60),
                classification=submission_obj.params.classification,
                error_count=0,
                errors=[],
                expiry_ts=expiry,
                file_count=len(submission_obj.files),
                files=submission_obj.files,
                max_score=0,
                metadata=submission_obj.metadata,
                params=submission_obj.params,
                results=[],
                sid=submission_obj.sid,
                state='submitted'
            ))
            self.datastore.submission.save(sub.sid, sub)

            self.log.debug("Submission complete. Dispatching: %s", sub.sid)
            self.dispatcher.dispatch_submission(sub, completed_queue=completed_queue)

            return sub
        finally:
            # Just in case this method fails clean up local files
            if cleanup:
                for path in local_files:
                    if path and os.path.exists(path):
                        # Best effort: a file that cannot be removed is only logged
                        # noinspection PyBroadException
                        try:
                            os.unlink(path)
                        except Exception:
                            self.log.error("Couldn't delete dangling file %s", path)

    def _ready_file(self, local_path: str, expiry, classification, cleanup,
                    sha256=None, upload=False) -> Tuple[str, int, dict]:
        """Take a file from local storage and prepare it for submission.

        The file is identified, optionally checked against an expected
        ``sha256``, passed through ``decode_file`` and recorded in the
        datastore. An extracted file is always uploaded to the filestore;
        the file at ``local_path`` is uploaded only when ``upload`` is set.

        With ``cleanup`` set, the file at ``local_path`` is deleted before
        returning, so afterwards it will ONLY exist on the filestore, not
        locally. Any temporary extracted copy is always deleted.

        Returns a ``(sha256, size, metadata)`` tuple, where ``metadata`` is
        what ``decode_file`` returned with 'classification' filled in
        (defaulting to ``classification``).

        Raises SubmissionException when the file is empty, and
        CorruptedFileStoreException when the calculated hash does not match
        ``sha256``.
        """
        # Remember the caller's path: local_path is re-pointed at the extracted
        # file below, and the cleanup must still remove the original.
        original_path = local_path
        extracted_path = None
        try:
            # Analyze the file and make sure the file table is up to date
            fileinfo = identify.fileinfo(local_path)

            if fileinfo['size'] == 0:
                raise SubmissionException("File empty. Submission failed")

            if sha256 is not None and fileinfo['sha256'] != sha256:
                raise CorruptedFileStoreException(f"SHA256 mismatch between received and calculated "
                                                  f"sha256. {sha256} != {fileinfo['sha256']}")

            # Check if there is an integrated decode process for this file
            # eg. files that are packaged, and the contained file (not the package
            # that local_path points to) should be passed into the system.
            extracted_path, fileinfo, al_meta = decode_file(local_path, fileinfo)
            al_meta['classification'] = al_meta.get('classification', classification)

            if extracted_path:
                local_path = extracted_path
                self.filestore.upload(local_path, fileinfo['sha256'])
            elif upload:
                self.filestore.upload(local_path, fileinfo['sha256'])

            self.datastore.save_or_freshen_file(fileinfo['sha256'], fileinfo, expiry,
                                                al_meta['classification'], redis=self.redis)
            return fileinfo['sha256'], fileinfo['size'], al_meta

        finally:
            # If we extracted anything delete it
            if extracted_path and os.path.exists(extracted_path):
                os.unlink(extracted_path)

            # Delete the file we were handed as well when asked to clean up
            if cleanup and original_path and os.path.exists(original_path):
                os.unlink(original_path)