Example #1
    def sample_info_update(self, sample):
        """
        Update sample information.

        :param sample: The sample object containing the info to update.
        """
        with self.__lock:
            session = self.__Session()
            sample_info = PeekabooDatabase.__get(
                session,
                SampleInfo,
                sha256sum=sample.sha256sum,
                file_extension=sample.file_extension)
            if sample_info is not None:
                sample_info.result = PeekabooDatabase.__get_or_create(
                    session, AnalysisResult, name=sample.get_result().name)
                sample_info.reason = sample.reason
                try:
                    session.commit()
                    logger.debug(
                        'Updated sample info in the database for sample %s.' %
                        sample)
                except SQLAlchemyError as e:
                    session.rollback()
                    raise PeekabooDatabaseError(
                        'Failed to update info for sample %s in the database: %s'
                        % (sample, e))
                finally:
                    session.close()
            else:
                raise PeekabooDatabaseError(
                    'No info found in the database for sample %s' % sample)
Example #2
    def clear_sample_in_flight(self, sample, instance_id=None):
        """
        Clear the mark that a sample is being processed by an instance.

        @param sample: The sample to clear from in-flight list.
        @param instance_id: (optionally) The ID of the instance that is
                            handling this sample. Default: Us.
        """
        # an instance id of 0 denotes that we're alone and don't need to track
        # in-flight samples in the database
        if self.instance_id == 0:
            return

        # use our own instance id if none is given
        if instance_id is None:
            instance_id = self.instance_id

        sha256sum = sample.sha256sum

        attempt = 1
        cleared = 0
        while attempt <= self.retries:
            session = self.__session()

            # clear in-flight marker from database
            query = session.query(InFlightSample).filter(
                InFlightSample.sha256sum == sha256sum).filter(
                    InFlightSample.instance_id == instance_id)

            try:
                # delete() is not queued and goes to the DB before commit()
                cleared = query.delete()
                session.commit()
            except (OperationalError, DBAPIError, SQLAlchemyError) as error:
                session.rollback()

                attempt = self.was_transient_error(
                    error, attempt,
                    'clearing in-flight status of sample %s' % sha256sum)
                if attempt > 0:
                    continue

                raise PeekabooDatabaseError('Unable to clear in-flight status '
                                            'of sample: %s' % error)
            finally:
                session.close()

            break

        if cleared == 0:
            raise PeekabooDatabaseError('Unexpected inconsistency: Sample %s '
                                        'not recorded as in-flight upon '
                                        'clearing flag.' % sha256sum)
        elif cleared > 1:
            raise PeekabooDatabaseError('Unexpected inconsistency: Multiple '
                                        'instances of sample %s in-flight '
                                        'status cleared against database '
                                        'constraints!?' % sha256sum)

        logger.debug('Cleared sample %s from in-flight list', sha256sum)
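
The retry loops in this and several of the following examples delegate to a
was_transient_error helper that is not part of the snippets. Judging only from
the synchronous call sites above (the return value is the next attempt number,
and a value of 0 or less makes the caller give up and raise), a compatible
stand-in might look roughly like the sketch below; the transience check and the
backoff arithmetic are assumptions, not Peekaboo's actual implementation.

import random
import time


def was_transient_error_sketch(error, attempt, retries=5, backoff_base=10):
    """ Return the next attempt number if the error looks transient and
    another retry is allowed, otherwise -1 so the caller gives up. """
    # DBAPIError and its subclasses expose connection_invalidated for
    # connection-level failures; treat only those as transient here
    if not getattr(error, 'connection_invalidated', False):
        return -1

    if attempt >= retries:
        return -1

    # back off briefly with jitter before the caller retries
    # (attempt 1: 10-20 msecs, attempt 2: 20-40 msecs, ...)
    time.sleep(backoff_base * 2 ** attempt * random.uniform(0.5, 1) / 1000)
    return attempt + 1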
Example #3
    async def clear_sample_in_flight(self, sample, instance_id=None):
        """
        Clear the mark that a sample is being processed by an instance.

        @param sample: The sample to clear from in-flight list.
        @param instance_id: (optionally) The ID of the instance that is
                            handling this sample. Default: Us.
        """
        # an instance id of 0 denotes that we're alone and don't need to track
        # in-flight samples in the database
        if self.instance_id == 0:
            return

        # use our own instance id if none is given
        if instance_id is None:
            instance_id = self.instance_id

        statement = sqlalchemy.sql.expression.delete(InFlightSample).where(
            InFlightSample.identity == await sample.identity).where(
                InFlightSample.instance_id == instance_id)

        attempt = 1
        cleared = 0
        while attempt <= self.retries:
            async with self.__session_factory() as session:
                try:
                    # clear in-flight marker from database
                    marker = await session.execute(statement)
                    await session.commit()
                    cleared = marker.rowcount
                    break
                except (OperationalError, DBAPIError,
                        SQLAlchemyError) as error:
                    await session.rollback()

                    attempt, delay = self.was_transient_error(
                        error, attempt, 'clearing in-flight status of '
                        'sample %d' % sample.id)

                    if attempt < 0:
                        raise PeekabooDatabaseError(
                            '%d: Unable to clear in-flight status of sample: '
                            '%s' % (sample.id, error))

            await asyncio.sleep(delay)

        if cleared == 0:
            raise PeekabooDatabaseError(
                '%d: Unexpected inconsistency: Sample not recorded as '
                'in-flight upon clearing flag.' % sample.id)
        elif cleared > 1:
            raise PeekabooDatabaseError(
                '%d: Unexpected inconsistency: Multiple instances of sample '
                'in-flight status cleared against database constraints!?' %
                sample.id)
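
Note the different helper contract in the asyncio variants: here
was_transient_error evidently returns an (attempt, delay) pair and leaves the
sleeping to the caller via await asyncio.sleep(delay), so the event loop is
never blocked. Under the same assumptions as the synchronous sketch above, an
async-friendly stand-in could be shaped like this:

import random


def was_transient_error_async_sketch(error, attempt, retries=5,
                                     backoff_base=10):
    """ Return (next_attempt, delay_in_seconds); a negative next_attempt
    tells the caller to give up and raise. """
    if not getattr(error, 'connection_invalidated', False):
        return -1, 0

    if attempt >= retries:
        return -1, 0

    # attempt 1: 10-20 msecs, attempt 2: 20-40 msecs, ...
    delay = backoff_base * 2 ** attempt * random.uniform(0.5, 1) / 1000
    return attempt + 1, delay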
Example #4
    def analysis_save(self, sample):
        """
        Save an analysis task to the analysis journal in the database.

        @param sample: The sample object for this analysis task.
        """
        sample_info = SampleInfo(sha256sum=sample.sha256sum,
                                 file_extension=sample.file_extension,
                                 analysis_time=datetime.now(),
                                 result=sample.result,
                                 reason=sample.reason)

        with self.__lock:
            attempt = 1
            while attempt <= self.retries:
                session = self.__session()
                session.add(sample_info)
                try:
                    session.commit()
                except (OperationalError, DBAPIError,
                        SQLAlchemyError) as error:
                    session.rollback()

                    attempt = self.was_transient_error(
                        error, attempt, 'saving analysis result')
                    if attempt > 0:
                        continue

                    raise PeekabooDatabaseError(
                        'Failed to add analysis task to the database: %s' %
                        error)
                finally:
                    session.close()

                break
Example #5
    async def analysis_update(self, sample):
        """
        Update an analysis task in the analysis journal in the database.

        @param sample: The sample object for this analysis task.
        """
        statement = sqlalchemy.sql.expression.update(SampleInfo).where(
            SampleInfo.id == sample.id).values(state=sample.state,
                                               result=sample.result,
                                               reason=sample.reason)

        attempt = 1
        delay = 0
        while attempt <= self.retries:
            async with self.__session_factory() as session:
                try:
                    await session.execute(statement)
                    await session.commit()
                    break
                except (OperationalError, DBAPIError,
                        SQLAlchemyError) as error:
                    await session.rollback()

                    attempt, delay = self.was_transient_error(
                        error, attempt, 'updating analysis')

                    if attempt < 0:
                        raise PeekabooDatabaseError(
                            'Failed to update analysis task in the database: %s'
                            % error)

            await asyncio.sleep(delay)
Example #6
    def drop(self):
        """ Drop all tables of the database. """
        try:
            Base.metadata.drop_all(self.__engine)
        except SQLAlchemyError as error:
            raise PeekabooDatabaseError(
                'Unable to drop all tables of the database: %s' % error)
Example #7
    def _init_db(self):
        """
        Initializes the Peekaboo database by creating tables and
        writing meta information to the '_meta' table.
        """
        Base.metadata.create_all(self.__engine)
        meta = PeekabooMetadata()
        meta.peekaboo_version = __version__
        meta.db_schema_version = self.db_schema_version
        # TODO: Get Cuckoo version.
        meta.cuckoo_version = '2.0'
        session = self.__Session()
        session.add(meta)
        '''
        session.add_all([
            AnalysisResult(name='inProgress'),
            AnalysisResult(name='unchecked'),
            AnalysisResult(name='unknown'),
            AnalysisResult(name='ignored'),
            AnalysisResult(name='checked'),
            AnalysisResult(name='good'),
            AnalysisResult(name='bad'),
        ])
        '''
        try:
            session.commit()
        except SQLAlchemyError as e:
            session.rollback()
            raise PeekabooDatabaseError('Cannot initialize the database: %s' %
                                        e)
        finally:
            session.close()
Example #8
    def analysis_save(self, sample):
        """
        Save an analysis task to the analysis journal in the database.

        @param sample: The sample object for this analysis task.
        """
        analysis = AnalysisJournal()
        analysis.job_hash = sample.job_hash
        analysis.cuckoo_job_id = sample.job_id
        analysis.filename = sample.filename
        analysis.analysis_time = datetime.now()
        sample_info = self.sample_info_fetch(sample)
        if sample_info is None:
            sample_info = SampleInfo(sha256sum=sample.sha256sum,
                                     file_extension=sample.file_extension,
                                     result=sample.result,
                                     reason=sample.reason)

        analysis.sample = sample_info

        with self.__lock:
            session = self.__session()
            session.add(analysis)
            try:
                session.commit()
            except SQLAlchemyError as error:
                session.rollback()
                raise PeekabooDatabaseError(
                    'Failed to add analysis task to the database: %s' % error)
            finally:
                session.close()
Example #9
    def clear_in_progress(self):
        """ Remove all samples with the result 'inProgress'. """
        session = self.__Session()
        in_progress = PeekabooDatabase.__get(session,
                                             AnalysisResult,
                                             name='inProgress')
        in_progress_samples = session.query(SampleInfo).filter_by(
            result=in_progress)
        # The direct approach does not work currently with message:
        #   in_() not yet supported for relationships.  For a simple many-to-one,
        #   use in_() against the set of foreign key values.
        # This is what we do below.
        #session.query(AnalysisJournal).filter(
        #        AnalysisJournal.sample.in_(in_progress_samples)).delete()
        sample_ids = [s.id for s in in_progress_samples.all()]
        if sample_ids:
            session.query(AnalysisJournal).filter(
                AnalysisJournal.sample_id.in_(sample_ids)).delete(
                    synchronize_session=False)
            in_progress_samples.delete()
        try:
            session.commit()
            logger.debug('Cleared the database from "inProgress" entries.')
        except SQLAlchemyError as e:
            session.rollback()
            raise PeekabooDatabaseError(
                'Unable to clear the database from "inProgress" entries: %s' %
                e)
        finally:
            session.close()
Example #10
    def clear_in_progress(self):
        """ Remove all samples with the result 'inProgress'. """
        session = self.__Session()
        in_progress = PeekabooDatabase.__get(
            session,
            AnalysisResult,
            name='inProgress'
        )
        in_progress_samples = session.query(SampleInfo).filter_by(
            result=in_progress
        ).all()
        for in_progress_sample in in_progress_samples:
            session.query(AnalysisJournal).filter_by(
                sample=in_progress_sample
            ).delete()
        try:
            session.commit()
            logger.debug('Cleared the database from "inProgress" entries.')
        except SQLAlchemyError as e:
            session.rollback()
            raise PeekabooDatabaseError(
                'Unable to clear the database from "inProgress" entries: %s'
                % e
            )
        finally:
            session.close()
Example #11
    def analysis_update(self, sample):
        """
        Update an analysis task in the database.
        This method is called if a sample object was processed by Cuckoo and
        therefore has a Cuckoo job ID, which we want to store in the database.

        :param sample: The sample object containing the info to update.
        """
        with self.__lock:
            session = self.__Session()
            analysis = self.__get(session,
                                  AnalysisJournal,
                                  job_hash=sample.get_job_hash(),
                                  filename=sample.get_filename())
            if analysis:
                analysis.cuckoo_job_id = sample.job_id
                session.add(analysis)
                try:
                    session.commit()
                except SQLAlchemyError as e:
                    session.rollback()
                    raise PeekabooDatabaseError(
                        'Failed to update analysis task in the database: %s' %
                        e)
                finally:
                    session.close()
Example #12
    def clear_stale_in_flight_samples(self):
        """
        Clear all in-flight markers that are too old and therefore stale. This
        detects instances which are locked up, crashed or shut down.
        """
        # an instance id of 0 denotes that we're alone and don't need to track
        # in-flight samples in the database
        if self.instance_id == 0:
            return True

        logger.debug(
            'Clearing database of all stale in-flight samples '
            '(%d seconds)', self.stale_in_flight_threshold)

        attempt = 1
        cleared = 0
        while attempt <= self.retries:
            session = self.__session()

            # select in-flight markers older than the staleness threshold
            query = session.query(InFlightSample).filter(
                InFlightSample.start_time <= datetime.utcnow() -
                timedelta(seconds=self.stale_in_flight_threshold))
            try:
                # the loop triggers the query, so only do it if debugging is
                # enabled
                if logger.isEnabledFor(logging.DEBUG):
                    # obviously there's a race between logging and actual
                    # delete here, use with caution, compare with actual number
                    # of markers cleared below before relying on it for
                    # debugging
                    for stale in query:
                        logger.debug('Stale in-flight marker to clear: %s',
                                     stale)

                # delete() is not queued and goes to the DB before commit()
                cleared = query.delete()
                session.commit()
                if cleared > 0:
                    logger.warning('%d stale in-flight samples cleared.',
                                   cleared)
            except (OperationalError, DBAPIError, SQLAlchemyError) as error:
                session.rollback()

                attempt = self.was_transient_error(
                    error, attempt,
                    'clearing the database of stale in-flight samples')
                if attempt > 0:
                    continue

                raise PeekabooDatabaseError(
                    'Unable to clear the database of stale in-flight '
                    'samples: %s' % error)
            finally:
                session.close()

            break

        return cleared > 0
Example #13
    def clear_in_flight_samples(self, instance_id=None):
        """
        Clear all in-flight markers left over by previous runs or other
        instances by removing them from the lock table.

        @param instance_id: Clear our own (None), another instance's (positive
                            integer) or all instances' (negative integer) locks.
                            Since an instance_id of 0 disables in-flight sample
                            tracking, no instance will ever set a marker with
                            that ID so that specifying 0 here will amount to a
                            no-op or rather clean-up of invalid entries.
        """
        # an instance id of 0 denotes that we're alone and don't need to track
        # in-flight samples
        if self.instance_id == 0:
            return

        # use our own instance id if none is given
        if instance_id is None:
            instance_id = self.instance_id

        attempt = 1
        while attempt <= self.retries:
            session = self.__session()

            if instance_id < 0:
                # delete all locks
                query = session.query(InFlightSample)
                logger.debug('Clearing database of all in-flight samples.')
            else:
                # delete only the locks of a specific instance
                query = session.query(InFlightSample).filter(
                    InFlightSample.instance_id == instance_id)
                logger.debug(
                    'Clearing database of all in-flight samples of '
                    'instance %d.', instance_id)
            try:
                # delete() is not queued and goes to the DB before commit()
                query.delete()
                session.commit()
            except (OperationalError, DBAPIError, SQLAlchemyError) as error:
                session.rollback()

                attempt = self.was_transient_error(
                    error, attempt, 'clearing database of in-flight samples')
                if attempt > 0:
                    continue

                raise PeekabooDatabaseError('Unable to clear the database of '
                                            'in-flight samples: %s' % error)
            finally:
                session.close()

            break
Example #14
    async def clear_in_flight_samples(self, instance_id=None):
        """
        Clear all in-flight markers left over by previous runs or other
        instances by removing them from the lock table.

        @param instance_id: Clear our own (None), another instance's (positive
                            integer) or all instances' (negative integer) locks.
                            Since an instance_id of 0 disables in-flight sample
                            tracking, no instance will ever set a marker with
                            that ID so that specifying 0 here will amount to a
                            no-op or rather clean-up of invalid entries.
        """
        # an instance id of 0 denotes that we're alone and don't need to track
        # in-flight samples
        if self.instance_id == 0:
            return

        # use our own instance id if none is given
        if instance_id is None:
            instance_id = self.instance_id

        if instance_id < 0:
            # delete all locks
            statement = sqlalchemy.sql.expression.delete(InFlightSample)
            logger.debug('Clearing database of all in-flight samples.')
        else:
            # delete only the locks of a specific instance
            statement = sqlalchemy.sql.expression.delete(InFlightSample).where(
                InFlightSample.instance_id == instance_id)
            logger.debug(
                'Clearing database of all in-flight samples of '
                'instance %d.', instance_id)

        attempt = 1
        while attempt <= self.retries:
            async with self.__session_factory() as session:
                try:
                    await session.execute(statement)
                    await session.commit()
                    break
                except (OperationalError, DBAPIError,
                        SQLAlchemyError) as error:
                    await session.rollback()

                    attempt, delay = self.was_transient_error(
                        error, attempt,
                        'clearing database of in-flight samples')

                    if attempt < 0:
                        raise PeekabooDatabaseError(
                            'Unable to clear the database of in-flight '
                            'samples: %s' % error)

            await asyncio.sleep(delay)
Example #15
    def mark_sample_in_flight(self, sample, instance_id=None, start_time=None):
        """
        Mark a sample as in flight, i.e. being worked on by an instance.

        @param sample: The sample to mark as in flight.
        @param instance_id: (optionally) The ID of the instance that is
                            handling this sample. Default: Us.
        @param start_time: Override the time the marker was placed for
                           debugging purposes.
        """
        # an instance id of 0 denotes that we're alone and don't need to track
        # in-flight samples in the database
        if self.instance_id == 0:
            return True

        # use our own instance id if none is given
        if instance_id is None:
            instance_id = self.instance_id

        if start_time is None:
            start_time = datetime.utcnow()

        session = self.__session()

        # try to mark this sample as in flight in an atomic insert operation
        sha256sum = sample.sha256sum
        session.add(
            InFlightSample(sha256sum=sha256sum,
                           instance_id=instance_id,
                           start_time=start_time))

        locked = False
        try:
            session.commit()
            locked = True
            logger.debug('Marked sample %s as in flight', sha256sum)
        # duplicate primary key == entry already exists
        except IntegrityError:
            session.rollback()
            logger.debug('Sample %s is already in flight on another instance',
                         sha256sum)
        except SQLAlchemyError as error:
            session.rollback()
            raise PeekabooDatabaseError('Unable to mark sample as in flight: '
                                        '%s' % error)
        finally:
            session.close()

        return locked
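
Together with clear_sample_in_flight from Example #2, this marker acts as a
cross-instance lock around the processing of a sample. A hedged usage sketch,
assuming a PeekabooDatabase instance named db and a process_sample callable
standing in for the caller's actual analysis work:

def process_if_free(db, sample, process_sample):
    # only proceed if no other instance has already claimed the sample
    if not db.mark_sample_in_flight(sample):
        return False

    try:
        process_sample(sample)
    finally:
        # always release the marker, even if processing failed
        db.clear_sample_in_flight(sample)

    return True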
Example #16
    async def start(self):
        """ Create the database schema, retrying on transient errors. """
        attempt = 1
        delay = 0
        while attempt <= self.retries:
            async with self.__engine.begin() as conn:
                try:
                    await conn.run_sync(Base.metadata.create_all)
                    break
                except (OperationalError, DBAPIError,
                        SQLAlchemyError) as error:
                    attempt, delay = self.was_transient_error(
                        error, attempt, 'create metadata')

                    if attempt < 0:
                        raise PeekabooDatabaseError(
                            'Failed to create schema in database: %s' % error)

            await asyncio.sleep(delay)
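
Since start() is what creates the tables, an application would presumably await
it once before issuing any other calls against the asyncio API. A minimal
sketch, with db being an instance of the class these coroutines belong to and
analysis_add taken from Example #17 below:

async def boot_and_register(db, sample):
    # create the schema (with retries) before the first real database access
    await db.start()

    # register the analysis task; analysis_add returns the job ID and also
    # stores it in the sample
    return await db.analysis_add(sample)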
Example #17
    async def analysis_add(self, sample):
        """
        Add an analysis task to the analysis journal in the database.

        @param sample: The sample object for this analysis task.
        @returns: ID of the newly created analysis task (also updated
                  in the sample)
        """
        sample_info = SampleInfo(state=sample.state,
                                 sha256sum=sample.sha256sum,
                                 file_extension=sample.file_extension,
                                 analysis_time=datetime.now(),
                                 result=sample.result,
                                 reason=sample.reason)

        job_id = None
        attempt = 1
        delay = 0
        while attempt <= self.retries:
            async with self.__async_session_factory_modify() as session:
                session.add(sample_info)
                try:
                    # flush to retrieve the automatically assigned primary
                    # key value
                    await session.flush()
                    job_id = sample_info.id
                    await session.commit()
                    break
                except (OperationalError, DBAPIError,
                        SQLAlchemyError) as error:
                    await session.rollback()

                    attempt, delay = self.was_transient_error(
                        error, attempt, 'adding analysis')

                    if attempt < 0:
                        raise PeekabooDatabaseError(
                            'Failed to add analysis task to the database: %s' %
                            error)

            await asyncio.sleep(delay)

        sample.update_id(job_id)
        return job_id
Example #18
    def analysis2db(self, sample):
        """
        Save an analysis task to the analysis journal in the database.

        :param sample: The sample object for this analysis task.
        """
        with self.__lock:
            session = self.__Session()
            analysis = AnalysisJournal()
            analysis.job_hash = sample.get_job_hash()
            analysis.cuckoo_job_id = sample.job_id
            analysis.filename = sample.get_filename()
            analysis.analyses_time = datetime.strptime(sample.analyses_time,
                                                       "%Y%m%dT%H%M%S")
            analysis_result = PeekabooDatabase.__get_or_create(
                session,
                AnalysisResult,
                name=sample.get_result().name
            )
            # NOTE: We cannot determine if a known sample is inProgress again.
            s = PeekabooDatabase.__get(
                session,
                SampleInfo,
                sha256sum=sample.sha256sum,
                file_extension=sample.file_extension,
            )
            if s is None:
                s = PeekabooDatabase.__create(
                    SampleInfo,
                    sha256sum=sample.sha256sum,
                    file_extension=sample.file_extension,
                    result=analysis_result
                )
            analysis.sample = s
            session.add(analysis)
            try:
                session.commit()
            except SQLAlchemyError as e:
                session.rollback()
                raise PeekabooDatabaseError(
                    'Failed to add analysis task to the database: %s' % e
                )
            finally:
                session.close()
Example #19
    def analysis_journal_fetch_journal(self, sample):
        """
        Fetch information stored in the database about a given sample object.

        @param sample: The sample object of which the information shall be
                       fetched from the database.
        @return: A sorted list of (analysis_time, result, reason) of the
                 requested sample.
        """
        statement = sqlalchemy.sql.expression.select(
            SampleInfo.analysis_time, SampleInfo.result,
            SampleInfo.reason).where(SampleInfo.id != sample.id).where(
                SampleInfo.result != Result.failed).filter_by(
                    state=JobState.FINISHED,
                    sha256sum=sample.sha256sum,
                    file_extension=sample.file_extension).order_by(
                        SampleInfo.analysis_time)

        sample_journal = None
        attempt = 1
        delay = 0
        while attempt <= self.retries:
            with self.__session() as session:
                try:
                    sample_journal = session.execute(statement).all()
                    break
                except (OperationalError, DBAPIError,
                        SQLAlchemyError) as error:
                    session.rollback()

                    attempt, delay = self.was_transient_error(
                        error, attempt, 'fetching analysis journal')

                    if attempt < 0:
                        raise PeekabooDatabaseError(
                            'Failed to fetch analysis journal from the database: %s'
                            % error)

            time.sleep(delay)

        return sample_journal
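
Because the statement above already filters out failed and unfinished analyses
and sorts by analysis_time, a caller can treat the last row as the most recent
usable verdict for an identical sample. A sketch under that assumption, with db
again being a database handler instance:

def latest_known_verdict(db, sample):
    # each row carries the three selected columns:
    # analysis_time, result and reason
    journal = db.analysis_journal_fetch_journal(sample)
    if not journal:
        return None

    analysis_time, result, reason = journal[-1]
    return result, reason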
Example #20
    def _init_db(self):
        """
        Initializes the Peekaboo database by creating tables and
        writing meta information to the '_meta' table.
        """
        Base.metadata.create_all(self.__engine)
        meta = PeekabooMetadata()
        meta.peekaboo_version = __version__
        meta.db_schema_version = DB_SCHEMA_VERSION
        # TODO: Get Cuckoo version.
        meta.cuckoo_version = '2.0'
        session = self.__session()
        session.add(meta)
        try:
            session.commit()
        except SQLAlchemyError as error:
            session.rollback()
            raise PeekabooDatabaseError('Cannot initialize the database: %s' %
                                        error)
        finally:
            session.close()
Example #21
    async def analysis_retrieve(self, job_id):
        """
        Fetch reason and result stored in the database for an analysis task.

        @param job_id: ID of the analysis to retrieve
        @type job_id: int
        @return: reason and result for the given analysis task
        """
        statement = sqlalchemy.sql.expression.select(
            SampleInfo.reason,
            SampleInfo.result).filter_by(id=job_id, state=JobState.FINISHED)

        result = None
        attempt = 1
        delay = 0
        while attempt <= self.retries:
            async with self.__session_factory() as session:
                try:
                    proxy = await session.execute(statement)
                    result = proxy.first()
                    break
                except (OperationalError, DBAPIError,
                        SQLAlchemyError) as error:
                    await session.rollback()

                    attempt, delay = self.was_transient_error(
                        error, attempt, 'retrieving analysis result')

                    if attempt < 0:
                        raise PeekabooDatabaseError(
                            'Failed to retrieve analysis from the database: %s'
                            % error)

            await asyncio.sleep(delay)

        return result
Example #22
    async def clear_stale_in_flight_samples(self):
        """
        Clear all in-flight markers that are too old and therefore stale. This
        detects instances which are locked up, crashed or shut down.
        """
        # an instance id of 0 denotes that we're alone and don't need to track
        # in-flight samples in the database
        if self.instance_id == 0:
            return True

        logger.debug(
            'Clearing database of all stale in-flight samples '
            '(%d seconds)', self.stale_in_flight_threshold)

        def clear_statement(statement_class):
            # match in-flight markers older than the staleness threshold
            return statement_class(InFlightSample).where(
                InFlightSample.start_time <= datetime.utcnow() -
                timedelta(seconds=self.stale_in_flight_threshold))

        delete_statement = clear_statement(sqlalchemy.sql.expression.delete)
        select_statement = clear_statement(sqlalchemy.sql.expression.select)

        attempt = 1
        cleared = 0
        while attempt <= self.retries:
            async with self.__session_factory() as session:
                try:
                    # only do the query if debugging is enabled
                    if logger.isEnabledFor(logging.DEBUG):
                        # obviously there's a race between logging and actual
                        # delete here, use with caution, compare with actual
                        # number of markers cleared below before relying on it
                        # for debugging
                        markers = await session.execute(select_statement)
                        for stale in markers:
                            logger.debug('Stale in-flight marker to clear: %s',
                                         stale)

                    markers = await session.execute(delete_statement)
                    await session.commit()

                    cleared = markers.rowcount
                    if cleared > 0:
                        logger.warning('%d stale in-flight samples cleared.',
                                       cleared)

                    break
                except (OperationalError, DBAPIError,
                        SQLAlchemyError) as error:
                    await session.rollback()

                    attempt, delay = self.was_transient_error(
                        error, attempt,
                        'clearing the database of stale in-flight samples')

                    if attempt < 0:
                        raise PeekabooDatabaseError(
                            'Unable to clear the database of stale in-flight '
                            'samples: %s' % error)

            await asyncio.sleep(delay)

        return cleared > 0
Example #23
    def __init__(self,
                 db_url,
                 instance_id=0,
                 stale_in_flight_threshold=15 * 60,
                 log_level=logging.WARNING,
                 async_driver=None):
        """
        Initialize the Peekaboo database handler.

        @param db_url: An RFC 1738 URL that points to the database.
        @param instance_id: A positive, unique ID differentiating this Peekaboo
                            instance from any other instance using the same
                            database for concurrency coordination. Value of 0
                            means that we're alone and have no other instances
                            to worry about.
        @param stale_in_flight_threshold: Number of seconds after which an
                                          in-flight marker is considered stale
                                          and deleted or ignored.
        @param log_level: Overrides the log level of the database modules. The
                          idea is for the database to be silent by default and
                          only emit log messages if switched on explicitly and
                          independently of the Peekaboo log level.
        @param async_driver: last resort override of the asyncio driver
                             auto-detection
        """
        logging.getLogger('sqlalchemy.engine').setLevel(log_level)
        logging.getLogger('sqlalchemy.pool').setLevel(log_level)
        # aiosqlite picks up the global log level unconditionally so we need to
        # override it as well and explicitly
        logging.getLogger('aiosqlite').setLevel(log_level)

        # <backend>[+<driver>]:// -> <backend>
        backend = db_url.split(':')[0].split('+')[0]

        connect_args = {}
        if backend == 'sqlite':
            connect_args['timeout'] = 0

        self.__engine = create_engine(db_url,
                                      future=True,
                                      connect_args=connect_args)
        session_factory = sessionmaker(bind=self.__engine)
        self.__session = scoped_session(session_factory)
        self.__lock = threading.RLock()

        asyncio_drivers = {
            'sqlite': ['aiosqlite'],
            'mysql': ['asyncmy', 'aiomysql'],
            'postgresql': ['asyncpg'],
        }

        if async_driver is not None:
            drivers = [async_driver]
        else:
            drivers = asyncio_drivers.get(backend)
            if drivers is None:
                raise PeekabooDatabaseError(
                    'Unknown database backend configured: %s' % backend)

        async_engine = None
        async_db_url = None
        for driver in drivers:
            # replace backend and driver with our asyncio alternative
            scheme = "%s+%s" % (backend, driver)
            async_db_url = ':'.join([scheme] + db_url.split(':')[1:])

            try:
                async_engine = sqlalchemy.ext.asyncio.create_async_engine(
                    async_db_url, connect_args=connect_args)
            except ModuleNotFoundError:
                continue

            logger.debug(
                'Auto-detected %s SQLAlchemy backend+driver for '
                'asyncio database accesses', scheme)
            break

        if async_engine is None:
            raise PeekabooDatabaseError(
                'None of the asyncio drivers for backend %s could be '
                'found: %s' % (backend, drivers))

        self.__async_session_factory = sessionmaker(
            bind=async_engine, class_=sqlalchemy.ext.asyncio.AsyncSession)
        # no scoping necessary as we're not using asyncio across threads

        # special handling for sqlite: since it does not respond well to
        # multiple modify operations in parallel to the same database, we
        # serialise them through a QueuePool with only one connection
        self.__async_session_factory_modify = self.__async_session_factory
        if backend in ['sqlite']:
            async_engine_modify = sqlalchemy.ext.asyncio.create_async_engine(
                async_db_url,
                poolclass=sqlalchemy.pool.AsyncAdaptedQueuePool,
                pool_size=1,
                max_overflow=0)
            self.__async_session_factory_modify = sessionmaker(
                bind=async_engine_modify,
                class_=sqlalchemy.ext.asyncio.AsyncSession)

        self.instance_id = instance_id
        self.stale_in_flight_threshold = stale_in_flight_threshold
        self.retries = 5
        # ultra-simple exponential backoff with jitter:
        # attempt 1: 10 * 2**(1) == 10-20msecs
        # attempt 2: 10 * 2**(2) == 20-40msecs
        # attempt 3: 10 * 2**(3) == 40-80msecs
        # attempt 4: 10 * 2**(4) == 80-160msecs
        self.deadlock_backoff_base = 10
        self.connect_backoff_base = 2000

        attempt = 1
        delay = 0
        while attempt <= self.retries:
            with self.__lock:
                try:
                    Base.metadata.create_all(self.__engine)
                    break
                except (OperationalError, DBAPIError,
                        SQLAlchemyError) as error:
                    attempt, delay = self.was_transient_error(
                        error, attempt, 'create metadata')

                    if attempt < 0:
                        raise PeekabooDatabaseError(
                            'Failed to create schema in database: %s' % error)

            time.sleep(delay)
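
A hedged instantiation sketch for this hybrid handler (host, credentials and
file names below are placeholders, not values taken from the examples): with a
bare mysql:// URL the asyncio driver auto-detection above picks asyncmy or
aiomysql for the async engine, while async_driver can pin a specific one.

# coordinate with other instances sharing the same database
db = PeekabooDatabase('mysql://peekaboo:secret@dbhost/peekaboo',
                      instance_id=1,
                      stale_in_flight_threshold=15 * 60)

# single-instance SQLite setup with the asyncio driver forced explicitly
db_local = PeekabooDatabase('sqlite:///peekaboo.db',
                            instance_id=0,
                            async_driver='aiosqlite')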
Example #24
    async def mark_sample_in_flight(self,
                                    sample,
                                    instance_id=None,
                                    start_time=None):
        """
        Mark a sample as in flight, i.e. being worked on by an instance.

        @param sample: The sample to mark as in flight.
        @param instance_id: (optionally) The ID of the instance that is
                            handling this sample. Default: Us.
        @param start_time: Override the time the marker was placed for
                           debugging purposes.
        """
        # an instance id of 0 denotes that we're alone and don't need to track
        # in-flight samples in the database
        if self.instance_id == 0:
            return True

        # use our own instance id if none is given
        if instance_id is None:
            instance_id = self.instance_id

        if start_time is None:
            start_time = datetime.utcnow()

        in_flight_marker = InFlightSample(identity=await sample.identity,
                                          instance_id=instance_id,
                                          start_time=start_time)

        attempt = 1
        delay = 0
        while attempt <= self.retries:
            # a new session needs to be constructed on each attempt
            async with self.__session_factory() as session:
                # try to mark this sample as in flight in an atomic insert
                # operation (modulo possible deadlocks with various RDBMS)
                session.add(in_flight_marker)

                try:
                    await session.commit()
                    logger.debug('%d: Marked sample in flight', sample.id)
                    return True
                # duplicate primary key == entry already exists
                except IntegrityError:
                    await session.rollback()
                    logger.debug(
                        '%d: Sample is already in flight on another '
                        'instance', sample.id)
                    return False
                except (OperationalError, DBAPIError,
                        SQLAlchemyError) as error:
                    await session.rollback()

                    attempt, delay = self.was_transient_error(
                        error, attempt,
                        'marking sample %d in flight' % sample.id)

                    if attempt < 0:
                        raise PeekabooDatabaseError(
                            '%d: Unable to mark sample as in flight: %s' %
                            (sample.id, error))

            await asyncio.sleep(delay)

        return False
Example #25
    def mark_sample_in_flight(self, sample, instance_id=None, start_time=None):
        """
        Mark a sample as in flight, i.e. being worked on by an instance.

        @param sample: The sample to mark as in flight.
        @param instance_id: (optionally) The ID of the instance that is
                            handling this sample. Default: Us.
        @param start_time: Override the time the marker was placed for
                           debugging purposes.
        """
        # an instance id of 0 denotes that we're alone and don't need to track
        # in-flight samples in the database
        if self.instance_id == 0:
            return True

        # use our own instance id if none is given
        if instance_id is None:
            instance_id = self.instance_id

        if start_time is None:
            start_time = datetime.utcnow()

        sha256sum = sample.sha256sum
        in_flight_marker = InFlightSample(sha256sum=sha256sum,
                                          instance_id=instance_id,
                                          start_time=start_time)
        attempt = 1
        locked = False
        while attempt <= self.retries:
            # a new session needs to be constructed on each attempt
            session = self.__session()

            # try to mark this sample as in flight in an atomic insert
            # operation (modulo possible deadlocks with various RDBMS)
            session.add(in_flight_marker)

            try:
                session.commit()
                locked = True
                logger.debug('Marked sample %s as in flight', sha256sum)
            # duplicate primary key == entry already exists
            except IntegrityError:
                session.rollback()
                logger.debug(
                    'Sample %s is already in flight on another '
                    'instance', sha256sum)
            except (OperationalError, DBAPIError, SQLAlchemyError) as error:
                session.rollback()

                attempt = self.was_transient_error(
                    error, attempt,
                    'marking sample %s as in flight' % sha256sum)
                if attempt > 0:
                    continue

                raise PeekabooDatabaseError(
                    'Unable to mark sample as in flight: %s' % error)
            finally:
                session.close()

            return locked

        return False
Example #26
    def __init__(self,
                 db_url,
                 instance_id=0,
                 stale_in_flight_threshold=15 * 60,
                 log_level=logging.WARNING):
        """
        Initialize the Peekaboo database handler.

        @param db_url: An RFC 1738 URL that points to the database.
        @param instance_id: A positive, unique ID differentiating this Peekaboo
                            instance from any other instance using the same
                            database for concurrency coordination. Value of 0
                            means that we're alone and have no other instances
                            to worry about.
        @param stale_in_flight_threshold: Number of seconds after which an
                                          in-flight marker is considered stale
                                          and deleted or ignored.
        @param log_level: Overrides the log level of the database modules. The
                          idea is for the database to be silent by default and
                          only emit log messages if switched on explicitly and
                          independently of the Peekaboo log level.
        """
        logging.getLogger('sqlalchemy.engine').setLevel(log_level)
        logging.getLogger('sqlalchemy.pool').setLevel(log_level)
        # aiosqlite picks up the global log level unconditionally so we need to
        # override it as well and explicitly
        logging.getLogger('aiosqlite').setLevel(log_level)

        # <backend>[+<driver>]:// -> <backend>
        url_parts = db_url.split(':')
        scheme_parts = url_parts[0].split('+')
        backend = scheme_parts[0]

        engine_kwargs = {}
        if backend == 'sqlite':
            engine_kwargs.update(
                dict(poolclass=sqlalchemy.pool.AsyncAdaptedQueuePool,
                     pool_size=1,
                     max_overflow=0,
                     connect_args={'timeout': 0}))

        # if there is no driver specified or it's a known non-asyncio driver,
        # try to fall back to a known-good asyncio driver
        sync_drivers = {
            'sqlite': ['pysqlite'],
            'mysql': ['mysqldb', 'pymysql'],
            'postgresql':
            ['psycopg2', 'pg8000', 'psycopg2cffi', 'pypostgresql', 'pygresql'],
        }

        asyncio_drivers = {
            'sqlite': ['aiosqlite'],
            'mysql': ['asyncmy', 'aiomysql'],
            'postgresql': ['asyncpg'],
        }

        backend_async_drivers = asyncio_drivers.get(backend)

        # if there seems to be a driver specified, look more closely
        if len(scheme_parts) > 1:
            driver = scheme_parts[1]

            backend_sync_drivers = sync_drivers.get(backend)
            if (backend_sync_drivers is not None
                    and driver in backend_sync_drivers):
                logger.warning(
                    'Configuration specifies a synchronous database driver '
                    '"%s". Please update your configuration to use an '
                    'asynchronous driver, preferably out of: %s', driver,
                    backend_async_drivers)
            elif driver not in backend_async_drivers:
                logger.warning(
                    'Configuration specifies unknown asynchronous driver "%s". '
                    'Trying to use anyway.', driver)
                backend_async_drivers = [driver]

        self.__engine = None
        for driver in backend_async_drivers:
            scheme = f'{backend}+{driver}'
            db_url = ':'.join([scheme] + url_parts[1:])

            try:
                logger.debug('Trying SQLAlchemy backend+driver "%s"', scheme)
                self.__engine = sqlalchemy.ext.asyncio.create_async_engine(
                    db_url, **engine_kwargs)
            except ModuleNotFoundError:
                continue

            logger.info(
                'Using "%s" SQLAlchemy backend+driver for '
                'database accesses', scheme)
            break

        if self.__engine is None:
            raise PeekabooDatabaseError(
                f'None of the drivers for backend "{backend}" could be found: '
                f'{backend_async_drivers}')

        self.__session_factory = sessionmaker(
            bind=self.__engine, class_=sqlalchemy.ext.asyncio.AsyncSession)

        self.instance_id = instance_id
        self.stale_in_flight_threshold = stale_in_flight_threshold
        self.retries = 5
        # ultra-simple exponential backoff with jitter:
        # attempt 1: 10 * 2**(1) == 10-20msecs
        # attempt 2: 10 * 2**(2) == 20-40msecs
        # attempt 3: 10 * 2**(3) == 40-80msecs
        # attempt 4: 10 * 2**(4) == 80-160msecs
        self.deadlock_backoff_base = 10
        self.connect_backoff_base = 2000
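
Unlike Example #23, this constructor inspects any driver given in the URL: a
scheme such as postgresql+psycopg2:// is recognised as synchronous, the warning
above is logged, and the handler still falls back to the known asyncio drivers
for that backend. A hedged sketch with placeholder credentials:

# psycopg2 is listed as a synchronous driver; a warning is logged and the
# handler tries asyncpg for the postgresql backend instead
db_url = 'postgresql+psycopg2://peekaboo:secret@dbhost/peekaboo'
db = PeekabooDatabase(db_url, instance_id=2)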