Python RulesetEngine Beispiele, peekaboo.ruleset.engine.RulesetEngine Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: test.py Projekt: heikipikker/PeekabooAV

    def test_unknown_rule_enabled(self):
        """ Test that correct error is shown if an unknown rule is enabled. """
        config = CreatingConfigParser('''[rules]
rule.1: foo''')
        with self.assertRaisesRegex(PeekabooRulesetConfigError,
                                    r'Unknown rule\(s\) enabled: foo'):
            RulesetEngine(ruleset_config=config, db_con=None)

Beispiel #2

0

Datei anzeigen

Datei: test.py Projekt: heikipikker/PeekabooAV

 def test_no_rules_configured(self):
     """ A ruleset configuration without any content must be rejected with
     a PeekabooRulesetConfigError about missing enabled rules. """
     empty_ruleset = CreatingConfigParser()
     expected_message = r'No enabled rules found, check ruleset config.'
     with self.assertRaisesRegex(PeekabooRulesetConfigError,
                                 expected_message):
         RulesetEngine(ruleset_config=empty_ruleset, db_con=None)

Beispiel #3

0

Datei anzeigen

Datei: test.py Projekt: heikipikker/PeekabooAV

    def test_disabled_config(self):
        """ Test that no error is shown if disabled rule has config. """

        config = CreatingConfigParser('''[rules]
rule.1: known
#rule.2: cuckoo_score

[cuckoo_score]
higher_than: 4.0''')
        RulesetEngine(ruleset_config=config, db_con=None)

Beispiel #4

0

Datei anzeigen

Datei: test.py Projekt: heikipikker/PeekabooAV

    def test_invalid_type(self):
        """ Test that correct error is shown if rule config option has wrong
        type. """

        config = CreatingConfigParser('''[rules]
rule.1: cuckoo_score

[cuckoo_score]
higher_than: foo''')
        with self.assertRaisesRegex(
                ValueError, r"could not convert string to float: '?foo'?"):
            RulesetEngine(ruleset_config=config, db_con=None)

Beispiel #5

0

Datei anzeigen

Datei: queuing.py Projekt: kamarit/PeekabooAV

    def run(self):
        """ Worker main loop: take samples off the job queue and run the
        ruleset engine on them until shutdown is requested.

        running_flag is set for as long as this method is executing and is
        cleared again on the way out, also if an unexpected exception
        escapes the loop. """
        self.running_flag.set()
        try:
            while not self.shutdown_requested.is_set():
                try:
                    # wait blocking for next job (thread safe) with timeout
                    sample = self.job_queue.dequeue(self.dequeue_timeout)
                except Empty:
                    # nothing arrived within the timeout: re-check the
                    # shutdown request and wait again
                    continue

                logger.info('Worker %d: Processing sample %s',
                            self.worker_id, sample)

                sample.init()

                try:
                    engine = RulesetEngine(sample, self.ruleset_config)
                    engine.run()
                    engine.report()
                    self.job_queue.done(sample.sha256sum)
                except CuckooReportPendingException:
                    # the sample is deliberately not reported as done in
                    # this case
                    logger.debug("Report for sample %s still pending", sample)
                except Exception as e:
                    logger.exception(e)
                    # it's no longer in-flight even though processing seems
                    # to have failed
                    self.job_queue.done(sample.sha256sum)

                logger.debug('Worker is ready')

            logger.info('Worker %d: Stopped', self.worker_id)
        finally:
            # never leave the flag set once this method is no longer running
            self.running_flag.clear()

Beispiel #6

0

Datei anzeigen

Datei: queuing.py Projekt: janschmid/PeekabooAV

    def run(self):
        """ Process samples from the job queue for as long as the worker is
        marked active. """
        while self.active:
            logger.debug('Worker is ready')
            # wait blocking for next job (thread safe)
            sample = JobQueue.jobs.get(True)
            processing_message = 'Worker %d: Processing sample %s' % (
                self.worker_id, sample)
            logger.info(processing_message)

            sample.init()

            try:
                ruleset_engine = RulesetEngine(sample)
                ruleset_engine.run()
                ruleset_engine.report()
            except CuckooReportPendingException:
                # deliberately ignored, carry on with the next sample
                continue
            except Exception as error:
                logger.exception(error)

Beispiel #7

0

Datei anzeigen

Datei: queuing.py Projekt: scVENUS/PeekabooAV

    def __init__(self,
                 ruleset_config,
                 db_con,
                 analyzer_config,
                 worker_count=4,
                 cluster_duplicate_check_interval=5,
                 threadpool=None):
        """ Set up the job queue: its backlogs, the ruleset engine shared by
        all workers, the workers themselves and, optionally, the cluster
        duplicate handler.

        @param ruleset_config: the ruleset configuration, handed to the
                               ruleset engine
        @type ruleset_config: PeekabooConfigParser
        @param db_con: Database connection object for cluster instance
                       coordination, i.e. saving sample info.
        @type db_con: PeekabooDatabase
        @param analyzer_config: handed to the ruleset engine unchanged
        @param worker_count: The number of workers to create. Defaults to 4.
        @type worker_count: int
        @param cluster_duplicate_check_interval: How long to wait inbetween
                                                 checks for stale cluster
                                                 duplicate locks. A false
                                                 value such as 0 disables the
                                                 cluster duplicate handler.
        @type cluster_duplicate_check_interval: int
        @param threadpool: stored on the queue and handed to the ruleset
                           engine. Defaults to None.
        @raises PeekabooConfigException: if an error occurred in
                                         configuration.
        """
        self.db_con = db_con
        self.worker_count = worker_count
        self.threadpool = threadpool
        self.workers = []
        self.jobs = asyncio.Queue()

        # Backlog of samples whose identity equals that of a sample which is
        # currently in analysis. Holding them back avoids analysing identical
        # samples simultaneously; once the first analysis has finished they
        # get submitted and the ruleset will notice that the result is
        # already known.
        self.duplock = asyncio.Lock()
        self.duplicates = {}

        # Same idea for samples which other instances are currently working
        # on: they are held here so we can regularly try to resubmit them and
        # re-use the other instances' cached results from the database.
        self.cluster_duplicates = {}

        self.ruleset_engine = RulesetEngine(
            ruleset_config, self, db_con, analyzer_config, threadpool)

        # Workers are created right here because they do no lengthy init and
        # creating them can not fail. This avoids races in startup vs.
        # shutdown by signal which could leave us running in a
        # half-inited/half-shutdown state.
        for worker_number in range(self.worker_count):
            logger.debug("Create Worker %d", worker_number)
            self.workers.append(
                Worker(worker_number, self, self.ruleset_engine, db_con))

        logger.info('Created %d Workers.', self.worker_count)

        self.cluster_duplicate_handler = None
        if not cluster_duplicate_check_interval:
            logger.debug("Disabling cluster duplicate handler.")
        else:
            logger.debug(
                "Creating cluster duplicate handler with check "
                "interval %d.", cluster_duplicate_check_interval)
            self.cluster_duplicate_handler = ClusterDuplicateHandler(
                self, cluster_duplicate_check_interval)

Beispiel #8

0

Datei anzeigen

Datei: queuing.py Projekt: scVENUS/PeekabooAV

class JobQueue:
    """ Peekaboo's queuing system. """
    def __init__(self,
                 ruleset_config,
                 db_con,
                 analyzer_config,
                 worker_count=4,
                 cluster_duplicate_check_interval=5,
                 threadpool=None):
        """ Initialise job queue by creating n Peekaboo workers to process
        samples.

        @param ruleset_config: the ruleset configuration
        @type ruleset_config: PeekabooConfigParser
        @param db_con: Database connection object for cluster instance
                       coordination, i.e. saving sample info.
        @type db_con: PeekabooDatabase
        @param analyzer_config: handed to the ruleset engine unchanged
        @param worker_count: The number of workers to create. Defaults to 4.
        @type worker_count: int
        @param cluster_duplicate_check_interval: How long to wait inbetween
                                                 checks for stale cluster
                                                 duplicate locks. A false
                                                 value such as 0 disables the
                                                 cluster duplicate handler.
        @type cluster_duplicate_check_interval: int
        @param threadpool: stored on the queue and handed to the ruleset
                           engine. Defaults to None.
        @raises PeekabooConfigException: if an error occurred in
                                         configuration.
        """
        self.db_con = db_con
        self.jobs = asyncio.Queue()
        self.workers = []
        self.worker_count = worker_count
        self.threadpool = threadpool

        # keep a backlog of samples with identities identical to samples
        # currently in analysis to avoid analysing multiple identical samples
        # simultaneously. Once one analysis has finished, we can submit the
        # others and the ruleset will notice that we already know the result.
        self.duplicates = {}
        self.duplock = asyncio.Lock()

        # keep a similar backlog of samples currently being processed by
        # other instances so we can regularly try to resubmit them and re-use
        # the other instances' cached results from the database
        self.cluster_duplicates = {}

        self.ruleset_engine = RulesetEngine(ruleset_config, self, db_con,
                                            analyzer_config, threadpool)

        # we create these here because they do no lengthy init and creating
        # can not fail. We need this here to avoid races in startup vs.
        # shutdown by signal to avoid continuing running in a
        # half-inited/half-shutdown state.
        for wno in range(0, self.worker_count):
            logger.debug("Create Worker %d", wno)
            worker = Worker(wno, self, self.ruleset_engine, db_con)
            self.workers.append(worker)

        logger.info('Created %d Workers.', self.worker_count)

        self.cluster_duplicate_handler = None
        if cluster_duplicate_check_interval:
            logger.debug(
                "Creating cluster duplicate handler with check "
                "interval %d.", cluster_duplicate_check_interval)
            self.cluster_duplicate_handler = ClusterDuplicateHandler(
                self, cluster_duplicate_check_interval)
        else:
            logger.debug("Disabling cluster duplicate handler.")

    async def start(self):
        """ Start up the job queue including resource initialisation.

        Starts all workers, then the cluster duplicate handler (if enabled)
        and finally the ruleset engine. If starting the ruleset engine fails
        with a configuration error, the queue is shut down and closed down
        again before the error is re-raised as PeekabooConfigException.

        @returns: list of the objects returned by the start() routines of
                  workers, cluster duplicate handler and ruleset engine
                  (presumably awaitables for the caller to wait on).
        @raises PeekabooConfigException: if the ruleset engine could not be
                                         started because of a configuration
                                         error.
        """
        awaitables = []
        for worker in self.workers:
            awaitables.append(await worker.start())

        if self.cluster_duplicate_handler:
            awaitables.append(await self.cluster_duplicate_handler.start())

        # start the single ruleset engine used by all workers, instantiates
        # all the rules based on the ruleset configuration, may start up
        # long-lived analyzer instances which are shared as well, is otherwise
        # stateless to allow concurrent use by multiple worker
        try:
            awaitables.extend(await self.ruleset_engine.start())
        except (KeyError, ValueError, PeekabooConfigException) as error:
            self.shut_down()
            await self.close_down()
            raise PeekabooConfigException('Ruleset configuration error: %s' %
                                          error) from error
        except PeekabooRulesetConfigError as error:
            self.shut_down()
            await self.close_down()
            raise PeekabooConfigException(error) from error

        return awaitables

    async def submit(self, sample):
        """
        Adds a Sample object to the job queue or to one of the duplicate
        backlogs.

        A sample whose identity is already in flight locally is held as a
        duplicate (unless it is the in-flight sample itself being
        resubmitted). A sample which another instance has marked as in
        flight in the database is held as a cluster duplicate. Everything
        else is queued for processing.

        @param sample: The Sample object to add to the queue.
        @returns: False if marking the sample as in flight failed with a
                  database error, True otherwise.
        """
        identity = await sample.identity
        duplicate = None
        cluster_duplicate = None
        resubmit = None

        # we have to lock this down because async routines called from here may
        # allow us to be called again concurrently from the event loop
        async with self.duplock:
            # check if a sample with same identity is currently in flight
            duplicates = self.duplicates.get(identity)
            if duplicates is not None:
                # we are regularly resubmitting samples, e.g. after we've
                # noticed that cuckoo is finished analysing them. This
                # obviously isn't a duplicate but continued processing of the
                # same sample.
                if duplicates['master'] == sample:
                    resubmit = sample.id
                    await self.jobs.put(sample)
                else:
                    # record the to-be-submitted sample as duplicate and do
                    # nothing
                    duplicate = sample.id
                    duplicates['duplicates'].append(sample)
            else:
                # are we the first of potentially multiple instances working on
                # this sample?
                try:
                    locked = await self.db_con.mark_sample_in_flight(sample)
                except PeekabooDatabaseError as dberr:
                    logger.error(dberr)
                    return False

                if locked:
                    # initialise a per-duplicate backlog for this sample which
                    # also serves as in-flight marker and submit to queue
                    self.duplicates[identity] = {
                        'master': sample,
                        'duplicates': [],
                    }
                    await self.jobs.put(sample)
                else:
                    # another instance is working on this
                    if self.cluster_duplicates.get(identity) is None:
                        self.cluster_duplicates[identity] = []

                    cluster_duplicate = sample.id
                    self.cluster_duplicates[identity].append(sample)

        # logging is done outside of the lock to keep the locked section short
        if duplicate is not None:
            logger.debug(
                "%d: Sample is duplicate and waiting for running analysis "
                "to finish", duplicate)
        elif cluster_duplicate is not None:
            logger.debug(
                "%d: Sample is concurrently processed by another instance "
                "and held", cluster_duplicate)
        elif resubmit is not None:
            logger.debug("%d: Resubmitted sample to job queue", resubmit)
        else:
            logger.debug("%d: New sample submitted to job queue", sample.id)

        return True

    async def submit_cluster_duplicates(self):
        """ Submit samples held while being processed by another cluster
        instance back into the job queue if they have finished processing.

        @returns: False if marking a sample as in flight failed with a
                  database error, True otherwise.
        """
        if not self.cluster_duplicates:
            return True

        submitted_cluster_duplicates = []

        async with self.duplock:
            # try to submit *all* samples which have been marked as being
            # processed by another instance concurrently
            # get the items view on a copy of the cluster duplicate backlog
            # because we will change it by removing entries which would raise a
            # RuntimeError
            cluster_duplicates = self.cluster_duplicates.copy().items()
            for identity, sample_duplicates in cluster_duplicates:
                # try to mark as in-flight
                try:
                    locked = await self.db_con.mark_sample_in_flight(
                        sample_duplicates[0])
                except PeekabooDatabaseError as dberr:
                    logger.error(dberr)
                    return False

                if locked:
                    if self.duplicates.get(identity) is not None:
                        # report the sample we just tried to lock; no sample
                        # has been popped off the backlog at this point
                        # NOTE(review): the in-flight marker just obtained
                        # from the database is not released on this path -
                        # confirm whether that is intended
                        logger.error(
                            "Possible backlog corruption for sample %d! "
                            "Please file a bug report. Trying to continue...",
                            sample_duplicates[0].id)
                        continue

                    # submit one of the held-back samples as a new master
                    # analysis in case the analysis on the other instance
                    # failed and we have no result in the database yet. If all
                    # is well, this master should finish analysis very quickly
                    # using the stored result, causing all the duplicates to be
                    # submitted and finish quickly as well.
                    sample = sample_duplicates.pop()
                    self.duplicates[identity] = {
                        'master': sample,
                        'duplicates': sample_duplicates,
                    }
                    submitted_cluster_duplicates.append(sample.id)
                    await self.jobs.put(sample)
                    del self.cluster_duplicates[identity]

        if submitted_cluster_duplicates:
            logger.debug(
                "Submitted cluster duplicates (and potentially their "
                "duplicates) from backlog: %s", submitted_cluster_duplicates)

        return True

    async def clear_stale_in_flight_samples(self):
        """ Clear any stale in-flight sample logs from the database.

        @returns: whatever the database routine returned or False if it
                  failed with a database error.
        """
        try:
            cleared = await self.db_con.clear_stale_in_flight_samples()
        except PeekabooDatabaseError as dberr:
            logger.error(dberr)
            cleared = False

        return cleared

    async def submit_duplicates(self, identity):
        """ Check if any samples have been held from processing as duplicates
        and submit them now. Clear the original sample whose duplicates have
        been submitted from the in-flight list.

        @param identity: identity of sample to check for duplicates
        """
        submitted_duplicates = []

        async with self.duplock:
            # duplicates which have been submitted from the backlog still
            # report done but do not get registered as potentially having
            # duplicates because we expect the ruleset to identify them as
            # already known and process them quickly now that the first
            # instance has gone through full analysis. Therefore we can ignore
            # them here.
            if identity not in self.duplicates:
                return

            # submit all samples which have accumulated in the backlog
            for sample in self.duplicates[identity]['duplicates']:
                submitted_duplicates.append(sample.id)
                await self.jobs.put(sample)

            sample = self.duplicates[identity]['master']
            try:
                await self.db_con.clear_sample_in_flight(sample)
            except PeekabooDatabaseError as dberr:
                # a failure to clear the database marker is only logged, the
                # local in-flight entry is removed regardless
                logger.error(dberr)

            del self.duplicates[identity]

        logger.debug("%d: Cleared sample from in-flight list", sample.id)
        if submitted_duplicates:
            logger.debug("Submitted duplicates from backlog: %s",
                         submitted_duplicates)

    async def done(self, sample):
        """ Perform cleanup actions after sample processing is done, i.e.
        submit held duplicates of the sample and clear it from the in-flight
        list.

        @param sample: The Sample object to post-process. """
        await self.submit_duplicates(await sample.identity)

    async def dequeue(self):
        """ Remove a sample from the queue. Used by the workers to get their
        work. Blocks indefinitely until some work is available. """
        return await self.jobs.get()

    def shut_down(self):
        """ Trigger a shutdown of the queue including the workers. """
        logger.info("Queue shutdown requested. Signalling workers.")

        if self.ruleset_engine is not None:
            self.ruleset_engine.shut_down()

        if self.cluster_duplicate_handler is not None:
            self.cluster_duplicate_handler.shut_down()

        # tell all workers to shut down
        for worker in self.workers:
            worker.shut_down()

    async def close_down(self):
        """ Wait for workers to stop and free up resources. """
        for worker in self.workers:
            await worker.close_down()

        if self.cluster_duplicate_handler is not None:
            await self.cluster_duplicate_handler.close_down()

        if self.ruleset_engine is not None:
            await self.ruleset_engine.close_down()

        logger.info("Queue shut down.")

Beispiel #9

0

Datei anzeigen

def run():
    """ Runs the Peekaboo daemon.

    Parses the command line, loads the configuration, installs the report
    message translations, connects to the database, verifies the ruleset
    configuration and then sets up job queue, Cuckoo frontend and server.
    The process is always left via sys.exit(): with exit code 1 on any
    startup failure, otherwise with the value returned by the Cuckoo main
    loop (or 1 if that loop aborted with an exception). """
    arg_parser = ArgumentParser(
        description=
        'Peekaboo Extended Email Attachment Behavior Observation Owl')
    arg_parser.add_argument('-c',
                            '--config',
                            action='store',
                            help='The configuration file for Peekaboo.')
    arg_parser.add_argument(
        '-d',
        '--debug',
        action='store_true',
        help=
        "Run Peekaboo in debug mode regardless of what's specified in the configuration."
    )
    arg_parser.add_argument(
        '-D',
        '--daemon',
        action='store_true',
        help=
        'Run Peekaboo in daemon mode (suppresses the logo to be written to STDOUT).'
    )
    args = arg_parser.parse_args()

    print('Starting Peekaboo %s.' % __version__)
    if not args.daemon:
        print(PEEKABOO_OWL)

    # Check if CLI arguments override the configuration
    log_level = None
    if args.debug:
        log_level = logging.DEBUG

    try:
        config = PeekabooConfig(config_file=args.config, log_level=log_level)
        logger.debug(config)
    except PeekabooConfigException as error:
        logging.critical(error)
        sys.exit(1)

    # find localisation in our package directory
    locale_domain = 'peekaboo'
    locale_dir = os.path.join(os.path.dirname(__file__), 'locale')
    languages = None
    if config.report_locale:
        logger.debug('Looking for translations for preconfigured locale "%s"',
                     config.report_locale)
        languages = [config.report_locale]
        if not gettext.find(locale_domain, locale_dir, languages):
            logger.warning('Translation file not found - falling back to '
                           'system configuration.')
            languages = None

    logger.debug('Installing report message translations')
    translation = gettext.translation(locale_domain,
                                      locale_dir,
                                      languages,
                                      fallback=True)
    # python2's gettext needs to be told explicitly to return unicode strings
    loc_kwargs = {}
    if sys.version_info[0] < 3:
        loc_kwargs = {'unicode': True}
    # unpack so the options arrive as keyword arguments instead of the whole
    # dict being passed as the first positional parameter
    translation.install(**loc_kwargs)

    # establish a connection to the database
    try:
        db_con = PeekabooDatabase(
            db_url=config.db_url,
            instance_id=config.cluster_instance_id,
            stale_in_flight_threshold=config.cluster_stale_in_flight_threshold,
            log_level=config.db_log_level)
    except PeekabooDatabaseError as error:
        logging.critical(error)
        sys.exit(1)
    except SQLAlchemyError as dberr:
        logger.critical(
            'Failed to establish a connection to the database '
            'at %s: %s', config.db_url, dberr)
        sys.exit(1)

    # Import debug module if we are in debug mode
    debugger = None
    if config.use_debug_module:
        from peekaboo.debug import PeekabooDebugger
        debugger = PeekabooDebugger()
        debugger.start()

    # initialize the daemon infrastructure such as PID file and dropping
    # privileges, automatically cleans up after itself when going out of scope
    daemon_infrastructure = PeekabooDaemonInfrastructure(
        config.pid_file, config.sock_file, config.user, config.group)
    daemon_infrastructure.init()

    systemd = SystemdNotifier()

    # clear all our in flight samples and all instances' stale in flight
    # samples
    db_con.clear_in_flight_samples()
    db_con.clear_stale_in_flight_samples()

    # a cluster duplicate interval of 0 disables the handler thread which is
    # what we want if we don't have an instance_id and therefore are alone
    cldup_check_interval = 0
    if config.cluster_instance_id > 0:
        cldup_check_interval = config.cluster_duplicate_check_interval
        if cldup_check_interval < 5:
            cldup_check_interval = 5
            logger.warning(
                "Raising excessively low cluster duplicate check "
                "interval to %d seconds.", cldup_check_interval)

    # workers of the job queue need the ruleset configuration to create the
    # ruleset engine with it
    try:
        ruleset_config = PeekabooConfigParser(config.ruleset_config)
    except PeekabooConfigException as error:
        logging.critical(error)
        sys.exit(1)

    # verify the ruleset configuration by spawning a ruleset engine and having
    # it verify it. The engine object itself is not used any further here.
    try:
        engine = RulesetEngine(ruleset_config, db_con)
    except (KeyError, ValueError, PeekabooConfigException) as error:
        logging.critical('Ruleset configuration error: %s', error)
        sys.exit(1)
    except PeekabooRulesetConfigError as error:
        logging.critical(error)
        sys.exit(1)

    job_queue = JobQueue(worker_count=config.worker_count,
                         ruleset_config=ruleset_config,
                         db_con=db_con,
                         cluster_duplicate_check_interval=cldup_check_interval)

    if config.cuckoo_mode == "embed":
        cuckoo = CuckooEmbed(job_queue, config.cuckoo_exec,
                             config.cuckoo_submit, config.cuckoo_storage,
                             config.interpreter)
    # otherwise it's the new API method and default
    else:
        cuckoo = CuckooApi(job_queue, config.cuckoo_url,
                           config.cuckoo_api_token,
                           config.cuckoo_poll_interval)

    sig_handler = SignalHandler()
    sig_handler.register_listener(cuckoo)

    # Factory producing almost identical samples providing them with global
    # config values and references to other objects they need, such as cuckoo,
    # database connection and connection map.
    sample_factory = SampleFactory(cuckoo, config.sample_base_dir,
                                   config.job_hash_regex,
                                   config.keep_mail_data,
                                   config.processing_info_dir)

    # We only want to accept 2 * worker_count connections.
    try:
        server = PeekabooServer(sock_file=config.sock_file,
                                job_queue=job_queue,
                                sample_factory=sample_factory,
                                request_queue_size=config.worker_count * 2)
    except Exception as error:
        logger.critical('Failed to start Peekaboo Server: %s', error)
        job_queue.shut_down()
        if debugger is not None:
            debugger.shut_down()
        sys.exit(1)

    # stays at 1 (failure) unless the Cuckoo main loop returns a value
    exit_code = 1
    try:
        systemd.notify("READY=1")
        # If this dies Peekaboo dies, since this is the main thread. (legacy)
        exit_code = cuckoo.do()
    except Exception as error:
        logger.critical('Main thread aborted: %s', error)
    finally:
        server.shutdown()
        job_queue.shut_down()
        try:
            db_con.clear_in_flight_samples()
            db_con.clear_stale_in_flight_samples()
        except PeekabooDatabaseError as dberr:
            logger.error(dberr)

        if debugger is not None:
            debugger.shut_down()

    sys.exit(exit_code)

Beispiel #10

0

Datei anzeigen

class JobQueue:
    """ Sample queue of Peekaboo together with the worker threads that drain
    it and the bookkeeping needed to hold back duplicate samples. """
    def __init__(self,
                 ruleset_config,
                 db_con,
                 analyzer_config,
                 worker_count=4,
                 queue_timeout=300,
                 shutdown_timeout=60,
                 cluster_duplicate_check_interval=5):
        """ Set up the queue, the shared ruleset engine, the worker objects
        and (optionally) the cluster duplicate handler. Nothing is started
        here, see start().

        @param ruleset_config: the ruleset configuration
        @type ruleset_config: PeekabooConfigParser
        @param db_con: Database connection object for cluster instance
                       coordination, i.e. saving sample info.
        @type db_con: PeekabooDatabase
        @param analyzer_config: handed through unchanged to the ruleset
                                engine.
        @param worker_count: Number of worker objects to create. Defaults
                             to 4.
        @type worker_count: int
        @param queue_timeout: Timeout passed along whenever a sample is put
                              on the queue.
        @type queue_timeout: int
        @param shutdown_timeout: Default number of seconds close_down() waits
                                 for workers to stop.
        @type shutdown_timeout: int
        @param cluster_duplicate_check_interval: Check interval handed to the
                                                 cluster duplicate handler. A
                                                 false value disables the
                                                 handler altogether.
        @type cluster_duplicate_check_interval: int
        @raises PeekabooConfigException: if an error occured in configuration.
        """
        self.db_con = db_con
        self.jobs = queue.Queue()
        self.workers = []
        self.worker_count = worker_count
        self.queue_timeout = queue_timeout
        self.shutdown_timeout = shutdown_timeout

        # Per-hash backlog of local samples whose hash equals that of a sample
        # already in analysis. An entry doubles as the in-flight marker: it
        # stores the 'master' sample being analysed and the 'duplicates'
        # waiting for it. All access is serialised through duplock.
        self.duplicates = {}
        self.duplock = threading.Lock()

        # Per-hash backlog of samples another instance is currently working
        # on. They get retried regularly so the other instance's cached
        # result can be reused.
        self.cluster_duplicates = {}

        self.ruleset_engine = RulesetEngine(ruleset_config, self, db_con,
                                            analyzer_config)

        # Workers are created (not started) right away: creation does no
        # lengthy init and can not fail, and having them around early avoids
        # running half-initialised if a shutdown signal races with startup.
        for worker_no in range(self.worker_count):
            logger.debug("Create Worker %d", worker_no)
            self.workers.append(
                Worker(worker_no, self, self.ruleset_engine, db_con))

        logger.info('Created %d Workers.', self.worker_count)

        self.cluster_duplicate_handler = None
        if not cluster_duplicate_check_interval:
            logger.debug("Disabling cluster duplicate handler thread.")
        else:
            logger.debug(
                "Starting cluster duplicate handler thread with check "
                "interval %d.", cluster_duplicate_check_interval)
            self.cluster_duplicate_handler = ClusterDuplicateHandler(
                self, cluster_duplicate_check_interval)

    def start(self):
        """ Start workers, the cluster duplicate handler (if enabled) and
        finally the ruleset engine.

        @raises PeekabooConfigException: if starting the ruleset engine fails
                                         with a configuration problem. The
                                         queue is shut down and closed down
                                         before the exception is raised.
        """
        for worker in self.workers:
            worker.start()

        if self.cluster_duplicate_handler:
            self.cluster_duplicate_handler.start()

        # One engine is shared by all workers. Starting it instantiates the
        # configured rules and may bring up long-lived analyzers, which is
        # where configuration problems surface.
        try:
            self.ruleset_engine.start()
        except (KeyError, ValueError, PeekabooConfigException,
                PeekabooRulesetConfigError) as error:
            self.shut_down()
            self.close_down()

            # generic errors get a prefix, ruleset config errors are wrapped
            # as they are
            if isinstance(error,
                          (KeyError, ValueError, PeekabooConfigException)):
                raise PeekabooConfigException(
                    'Ruleset configuration error: %s' % error)

            raise PeekabooConfigException(error)

    def submit(self, sample, submitter):
        """
        Hand a sample to the queue. Depending on what is already in flight
        the sample is queued, re-queued, or parked as a (cluster) duplicate.

        @param sample: The Sample object to add to the queue.
        @param submitter: The name of the class / module that wants to submit
                          the sample.
        @returns: False if the database could not be asked for the in-flight
                  lock, True otherwise.
        @raises Full: if the queue is full.
        """
        sample_hash = sample.sha256sum
        sample_str = "%s" % sample
        parked_as_duplicate = None
        parked_for_cluster = None
        requeued = None

        # Workers call back into us and so does the
        # ThreadingUnixStreamServer, hence the lock.
        with self.duplock:
            in_flight = self.duplicates.get(sample_hash)
            if in_flight is None:
                # nothing with this hash is running locally - find out if we
                # are the first instance of the cluster to see it
                try:
                    locked = self.db_con.mark_sample_in_flight(sample)
                except PeekabooDatabaseError as dberr:
                    logger.error(dberr)
                    return False

                if not locked:
                    # some other instance owns this sample, hold it back
                    held = self.cluster_duplicates.get(sample_hash)
                    if held is None:
                        held = []
                        self.cluster_duplicates[sample_hash] = held

                    parked_for_cluster = sample_str
                    held.append(sample)
                else:
                    # we own it: create the backlog entry, which is the
                    # in-flight marker at the same time, and queue the sample
                    self.duplicates[sample_hash] = {
                        'master': sample,
                        'duplicates': [],
                    }
                    self.jobs.put(sample, True, self.queue_timeout)
            elif in_flight['master'] == sample:
                # the very same sample coming around again, e.g. because
                # cuckoo has finished with it - continued processing, not a
                # duplicate
                requeued = sample_str
                self.jobs.put(sample, True, self.queue_timeout)
            else:
                # same hash, different sample: park it until the master is
                # through
                parked_as_duplicate = sample_str
                in_flight['duplicates'].append(sample)

        if parked_as_duplicate:
            logger.debug(
                "Sample from %s is duplicate and waiting for running "
                "analysis to finish: %s", submitter, parked_as_duplicate)
        elif parked_for_cluster:
            logger.debug(
                "Sample from %s is concurrently processed by another "
                "instance and held: %s", submitter, parked_for_cluster)
        elif requeued:
            logger.debug("Resubmitted sample to job queue for %s: %s",
                         submitter, requeued)
        else:
            logger.debug("New sample submitted to job queue by %s. %s",
                         submitter, sample_str)

        return True

    def submit_cluster_duplicates(self):
        """ Retry all samples parked because another cluster instance was
        processing them and queue those for which the in-flight lock can now
        be obtained.

        @returns: False if the database lock request failed, True otherwise.
        """
        if not self.cluster_duplicates:
            return True

        resubmitted = []

        with self.duplock:
            # Walk a snapshot of the backlog since entries are deleted from
            # the real one as we go.
            for sample_hash, held in list(self.cluster_duplicates.items()):
                try:
                    locked = self.db_con.mark_sample_in_flight(held[0])
                except PeekabooDatabaseError as dberr:
                    logger.error(dberr)
                    return False

                if not locked:
                    continue

                sample_str = "%s" % held[0]
                if self.duplicates.get(sample_hash) is not None:
                    logger.error(
                        "Possible backlog corruption for sample %s! "
                        "Please file a bug report. Trying to continue...",
                        sample_str)
                    continue

                # Promote one held sample to master in case the other
                # instance failed and left no result behind. If a result is
                # there, the master completes quickly from the stored result
                # and its duplicates follow suit.
                master = held.pop()
                self.duplicates[sample_hash] = {
                    'master': master,
                    'duplicates': held,
                }
                resubmitted.append(sample_str)
                self.jobs.put(master, True, self.queue_timeout)
                del self.cluster_duplicates[sample_hash]

        if resubmitted:
            logger.debug(
                "Submitted cluster duplicates (and potentially their "
                "duplicates) from backlog: %s", resubmitted)

        return True

    def clear_stale_in_flight_samples(self):
        """ Ask the database to clear stale in-flight sample markers.

        @returns: whatever the database reports, or False if the database
                  raised an error.
        """
        try:
            return self.db_con.clear_stale_in_flight_samples()
        except PeekabooDatabaseError as dberr:
            logger.error(dberr)
            return False

    def submit_duplicates(self, sample_hash):
        """ Queue all samples parked as duplicates of the given hash and drop
        the hash's master sample from the in-flight bookkeeping, locally and
        in the database.

        @param sample_hash: Hash of sample to check for duplicates
        """
        queued = []
        with self.duplock:
            # Samples released from the backlog report done as well but never
            # got an entry of their own, because the ruleset is expected to
            # recognise them as known. Nothing to do for those.
            if sample_hash not in self.duplicates:
                return

            entry = self.duplicates[sample_hash]

            # release everything that piled up behind the master
            for parked in entry['duplicates']:
                queued.append("%s" % parked)
                self.jobs.put(parked, True, self.queue_timeout)

            master = entry['master']
            try:
                self.db_con.clear_sample_in_flight(master)
            except PeekabooDatabaseError as dberr:
                logger.error(dberr)

            master_str = "%s" % master
            del self.duplicates[sample_hash]

        logger.debug("Cleared sample %s from in-flight list", master_str)
        if queued:
            logger.debug("Submitted duplicates from backlog: %s",
                         queued)

    def done(self, sample):
        """ Post-process a sample whose processing has ended: release its
        held duplicates first, then mark the sample itself as done.

        @param sample: The Sample object to post-process. """
        self.submit_duplicates(sample.sha256sum)

        # only after the sample is cleared from the in-flight bookkeeping do
        # we flag it as done
        sample.mark_done()

    def dequeue(self):
        """ Fetch the next item from the queue, blocking without timeout
        until there is one. Workers use this to obtain work. A None item is
        a mere wake-up ping, see shut_down(). """
        return self.jobs.get(block=True)

    def shut_down(self):
        """ Request shutdown of the ruleset engine, the cluster duplicate
        handler and all workers. Does not wait for them, see close_down(). """
        logger.info("Queue shutdown requested. Signalling workers.")

        if self.ruleset_engine is not None:
            self.ruleset_engine.shut_down()

        if self.cluster_duplicate_handler is not None:
            self.cluster_duplicate_handler.shut_down()

        # first let every worker know it is supposed to stop ...
        for worker in self.workers:
            worker.shut_down()

        # ... and only then wake them with one ping each. As all of them
        # already know about the shutdown, each takes a single item and
        # exits, leaving the rest to the others. That is why the two loops
        # must stay separate.
        for _ in self.workers:
            self.jobs.put(None)

    def close_down(self, timeout=None):
        """ Wait for the workers to stop, then join the cluster duplicate
        handler and close down the ruleset engine.

        @param timeout: Seconds to wait for the workers. A false value
                        selects the shutdown timeout given at construction.
        """
        timeout = timeout or self.shutdown_timeout

        logger.info("Closing down. Giving workers %d seconds to stop", timeout)

        # poll once per interval until no worker is alive or time is up
        interval = 1
        for attempt in range(1, timeout // interval + 1):
            self.workers = [
                worker for worker in self.workers if worker.is_alive()]
            if not self.workers:
                break

            time.sleep(interval)
            logger.debug('%d: %d workers still running', attempt,
                         len(self.workers))

        if self.workers:
            logger.error("Some workers refused to stop.")

        if self.cluster_duplicate_handler is not None:
            self.cluster_duplicate_handler.join()
        if self.ruleset_engine is not None:
            self.ruleset_engine.close_down()

Beispiel #11

0

Datei anzeigen

Datei: queuing.py Projekt: seclab-int-dev-group/PeekabooAV

    def run(self):
        """ Main loop of the worker: fetch samples from the job queue and
        run them through the ruleset until shutdown is requested.

        For each sample the loop initialises it, runs a ruleset engine on
        it, dumps processing info and saves the result to the database
        depending on the outcome, cleans the sample up and reports it as
        done to the job queue. A sample whose analysis is deferred is left
        alone without being reported as done. """
        self.running_flag.set()
        while not self.shutdown_requested.is_set():
            logger.debug('Worker %d: Ready', self.worker_id)

            try:
                # block until the queue hands out the next job. Empty is
                # tolerated in case the queue applies a timeout.
                sample = self.job_queue.dequeue()
            except Empty:
                continue

            if sample is None:
                # we just got pinged, go back and re-check the shutdown flag
                continue

            logger.info('Worker %d: Processing sample %s', self.worker_id,
                        sample)

            # The following used to be one big try/except block catching any
            # exception. This got complicated because in the case of
            # CuckooReportPending we use exceptions for control flow as well
            # (which might be questionable in itself). Instead of catching,
            # logging and ignoring errors here if workers start to die again
            # because of uncaught exceptions we should improve error handling
            # in the subroutines causing it.

            if not sample.init():
                logger.error('Sample initialization failed')
                sample.add_rule_result(
                    RuleResult("Worker",
                               result=Result.failed,
                               reason=_("Sample initialization failed"),
                               further_analysis=False))
                # report the Sample object itself, exactly like the regular
                # completion path at the end of the loop does, not its hash
                self.job_queue.done(sample)
                continue

            engine = RulesetEngine(self.ruleset_config, self.db_con)
            try:
                engine.run(sample)
            except PeekabooAnalysisDeferred:
                # not finished yet: do not save, clean up or report done
                logger.debug("Report for sample %s still pending", sample)
                continue

            if sample.result >= Result.failed:
                sample.dump_processing_info()

            if sample.result != Result.failed:
                logger.debug('Saving results to database')
                try:
                    self.db_con.analysis_save(sample)
                except PeekabooDatabaseError as dberr:
                    logger.error(
                        'Failed to save analysis result to '
                        'database: %s', dberr)
                    # no showstopper, we can limp on without caching in DB
            else:
                logger.debug('Not saving results of failed analysis')

            sample.cleanup()
            self.job_queue.done(sample)

        # pass the argument lazily like all other log calls in this method
        logger.info('Worker %d: Stopped', self.worker_id)
        self.running_flag.clear()