Example #1
0
def test_submit_duplicate(submit_harness):
    """A task whose scan key is already in the scanning table goes to the duplicates queue.

    The test registers an ingest task in the scanning table, pushes the same
    task onto the unique queue, runs one ``handle_submit`` pass and then checks
    that the unique queue is empty and the duplicates queue holds one entry.
    """
    _datastore, submitter = submit_harness

    # Build up the pieces of a normal ingest task
    submission_params = SubmissionParams({
        'classification': 'U',
        'description': 'file abc',
        'services': {
            'selected': [],
            'excluded': [],
            'resubmit': [],
        },
        'submitter': 'user',
    })
    file_entry = {
        'sha256': '0' * 64,
        'size': 100,
        'name': 'abc',
    }
    ingest_task = IngestTask({
        'submission': {
            'params': submission_params,
            'files': [file_entry],
            'metadata': {}
        },
        'ingest_id': 'abc123'
    })

    # The scan key is normally stamped during ingest, so set it by hand here
    first_sha256 = ingest_task.submission.files[0].sha256
    ingest_task.submission.scan_key = ingest_task.params.create_filescore_key(
        first_sha256, [])
    scan_key = ingest_task.submission.scan_key

    # Pretend the file is already being scanned, then ingest it a second time
    submitter.scanning.add(scan_key, ingest_task.as_primitives())
    submitter.unique_queue.push(0, ingest_task.as_primitives())

    submitter.handle_submit()

    # The unique queue must have been drained
    leftover = submitter.unique_queue.pop()
    assert leftover is None
    # ... and the task must now be sitting in the duplicates queue
    duplicate_count = submitter.duplicate_queue.length(_dup_prefix + scan_key)
    assert duplicate_count == 1
    def try_run(self, volatile=False):
        """Consume the ingester's unique queue and submit tasks while ``self.running`` is set.

        Every iteration sends a heartbeat, records the cpu/busy time spent since
        the last timing mark, and then tries to pop one task from the unique
        queue.  Nothing is popped while the scanning table already holds
        ``max_inflight`` entries.  A popped task ends in exactly one of these
        APM transaction results: ``invalid_hash``, ``dropped``, ``whitelisted``,
        ``finalized``, ``duplicate``, ``submitted``, ``no_scan_entry``,
        ``cannot_retry``, ``retried`` or ``exception``.

        Args:
            volatile: When true, a failed submission (after it has been handed
                to ``ingester.retry``) and any unexpected error are re-raised
                instead of being swallowed by the loop.
        """
        ingester = self.ingester
        logger = self.log

        def end_transaction(result):
            # Close the current APM transaction, if an APM client is configured
            if self.apm_client:
                self.apm_client.end_transaction('ingest_submit', result)

        time_mark, cpu_mark = time.time(), time.process_time()

        while self.running:
            # noinspection PyBroadException
            try:
                self.heartbeat()
                ingester.counter.increment_execution_time(
                    'cpu_seconds',
                    time.process_time() - cpu_mark)
                ingester.counter.increment_execution_time(
                    'busy_seconds',
                    time.time() - time_mark)

                # Check if there is room for more submissions
                length = ingester.scanning.length()
                if length >= ingester.config.core.ingester.max_inflight:
                    time.sleep(0.1)
                    time_mark, cpu_mark = time.time(), time.process_time()
                    continue

                raw = ingester.unique_queue.pop()
                if not raw:
                    time.sleep(0.1)
                    time_mark, cpu_mark = time.time(), time.process_time()
                    continue

                # Start timing 'busy' time, we reset this above after the sleeps so that the sleeps
                # don't get counted as busy
                time_mark, cpu_mark = time.time(), time.process_time()

                # Start of ingest message
                if self.apm_client:
                    self.apm_client.begin_transaction('ingest_msg')

                task = IngestTask(raw)

                # Every file in the submission must carry a 64 character sha256
                if any(
                        len(file.sha256) != 64
                        for file in task.submission.files):
                    logger.error("Malformed entry on submission queue: %s",
                                 task.ingest_id)
                    end_transaction('invalid_hash')
                    continue

                # If between the initial ingestion and now the drop/whitelist status
                # of this submission has changed, then drop it now
                if ingester.drop(task):
                    end_transaction('dropped')
                    continue

                if ingester.is_whitelisted(task):
                    end_transaction('whitelisted')
                    continue

                # Check if this file has been previously processed.
                pprevious, previous, score, scan_key = None, False, None, None
                if not task.submission.params.ignore_cache:
                    pprevious, previous, score, scan_key = ingester.check(task)
                else:
                    scan_key = ingester.stamp_filescore_key(task)

                # If it HAS been previously processed, we are dealing with a resubmission
                # finalize will decide what to do, and put the task back in the queue
                # rewritten properly if we are going to run it again
                if previous:
                    if not task.submission.params.services.resubmit and not pprevious:
                        logger.warning(
                            f"No psid for what looks like a resubmission of "
                            f"{task.submission.files[0].sha256}: {scan_key}")
                    ingester.finalize(pprevious, previous, score, task)
                    end_transaction('finalized')
                    continue

                # We have decided this file is worth processing

                # Add the task to the scanning table, this is atomic across all submit
                # workers, so if it fails, someone beat us to the punch, record the file
                # as a duplicate then.
                if not ingester.scanning.add(scan_key, task.as_primitives()):
                    logger.debug('Duplicate %s',
                                 task.submission.files[0].sha256)
                    ingester.counter.increment('duplicates')
                    ingester.duplicate_queue.push(_dup_prefix + scan_key,
                                                  task.as_primitives())
                    end_transaction('duplicate')
                    continue

                # We have managed to add the task to the scan table, so now we go
                # ahead with the submission process
                try:
                    ingester.submit(task)
                    end_transaction('submitted')
                    continue
                except Exception as _ex:
                    # For some reason (contained in `ex`) we have failed the submission
                    # The rest of this function is error handling/recovery.
                    # Rebind the exception: the `as` name is cleared when the
                    # except clause ends.
                    ex = _ex
                    ex_traceback = _ex.__traceback__

                ingester.counter.increment('error')

                should_retry = True
                if isinstance(ex, CorruptedFileStoreException):
                    logger.error(
                        "Submission for file '%s' failed due to corrupted filestore: %s",
                        task.sha256, str(ex))
                    should_retry = False
                elif isinstance(ex, DataStoreException):
                    trace = exceptions.get_stacktrace_info(ex)
                    logger.error(
                        "Submission for file '%s' failed due to data store error:\n%s",
                        task.sha256, trace)
                elif not isinstance(ex, FileStoreException):
                    trace = exceptions.get_stacktrace_info(ex)
                    logger.error("Submission for file '%s' failed: %s",
                                 task.sha256, trace)

                # Take our entry back out of the scanning table.  Check the raw
                # value BEFORE building an IngestTask from it, so a missing entry
                # is reported as such (using the task we still hold) rather than
                # blowing up inside the IngestTask constructor.
                scanning_entry = ingester.scanning.pop(scan_key)
                if not scanning_entry:
                    logger.error('No scanning entry for %s', task.sha256)
                    end_transaction('no_scan_entry')
                    continue
                task = IngestTask(scanning_entry)

                if not should_retry:
                    end_transaction('cannot_retry')
                    continue

                ingester.retry(task, scan_key, ex)
                end_transaction('retried')

                if volatile:
                    raise ex.with_traceback(ex_traceback)

            except Exception:
                logger.exception("Unexpected error")
                end_transaction('exception')

                if volatile:
                    raise