def sample_info_update(self, sample):
    """
    Update sample information.

    :param sample: The sample object containing the info to update.
    """
    with self.__lock:
        session = self.__Session()
        sample_info = PeekabooDatabase.__get(
            session, SampleInfo,
            sha256sum=sample.sha256sum,
            file_extension=sample.file_extension)
        if sample_info is not None:
            sample_info.result = PeekabooDatabase.__get_or_create(
                session, AnalysisResult, name=sample.get_result().name)
            sample_info.reason = sample.reason
            try:
                session.commit()
                logger.debug('Updated sample info in the database for '
                             'sample %s.' % sample)
            except SQLAlchemyError as e:
                session.rollback()
                raise PeekabooDatabaseError(
                    'Failed to update info for sample %s in the database: '
                    '%s' % (sample, e))
            finally:
                session.close()
        else:
            # close the session on this error path as well
            session.close()
            raise PeekabooDatabaseError(
                'No info found in the database for sample %s' % sample)
def clear_sample_in_flight(self, sample, instance_id=None):
    """
    Clear the mark that a sample is being processed by an instance.

    @param sample: The sample to clear from the in-flight list.
    @param instance_id: (optionally) The ID of the instance that is
                        handling this sample. Default: Us.
    """
    # an instance id of 0 denotes that we're alone and don't need to track
    # in-flight samples in the database
    if self.instance_id == 0:
        return

    # use our own instance id if none is given
    if instance_id is None:
        instance_id = self.instance_id

    sha256sum = sample.sha256sum
    attempt = 1
    cleared = 0
    while attempt <= self.retries:
        session = self.__session()

        # clear in-flight marker from database
        query = session.query(InFlightSample).filter(
            InFlightSample.sha256sum == sha256sum).filter(
                InFlightSample.instance_id == instance_id)

        try:
            # delete() is not queued and goes to the DB before commit()
            cleared = query.delete()
            session.commit()
        except (OperationalError, DBAPIError, SQLAlchemyError) as error:
            session.rollback()
            attempt = self.was_transient_error(
                error, attempt,
                'clearing in-flight status of sample %s' % sha256sum)
            if attempt > 0:
                continue

            raise PeekabooDatabaseError('Unable to clear in-flight status '
                                        'of sample: %s' % error)
        finally:
            session.close()

        break

    if cleared == 0:
        raise PeekabooDatabaseError('Unexpected inconsistency: Sample %s '
                                    'not recorded as in-flight upon '
                                    'clearing flag.' % sha256sum)
    elif cleared > 1:
        raise PeekabooDatabaseError('Unexpected inconsistency: Multiple '
                                    'instances of sample %s in-flight '
                                    'status cleared against database '
                                    'constraints!?' % sha256sum)

    logger.debug('Cleared sample %s from in-flight list', sha256sum)
async def clear_sample_in_flight(self, sample, instance_id=None):
    """
    Clear the mark that a sample is being processed by an instance.

    @param sample: The sample to clear from the in-flight list.
    @param instance_id: (optionally) The ID of the instance that is
                        handling this sample. Default: Us.
    """
    # an instance id of 0 denotes that we're alone and don't need to track
    # in-flight samples in the database
    if self.instance_id == 0:
        return

    # use our own instance id if none is given
    if instance_id is None:
        instance_id = self.instance_id

    statement = sqlalchemy.sql.expression.delete(InFlightSample).where(
        InFlightSample.identity == await sample.identity).where(
            InFlightSample.instance_id == instance_id)

    attempt = 1
    cleared = 0
    while attempt <= self.retries:
        async with self.__session_factory() as session:
            try:
                # clear in-flight marker from database
                marker = await session.execute(statement)
                await session.commit()
                cleared = marker.rowcount
                break
            except (OperationalError, DBAPIError,
                    SQLAlchemyError) as error:
                await session.rollback()
                attempt, delay = self.was_transient_error(
                    error, attempt,
                    'clearing in-flight status of sample %d' % sample.id)
                if attempt < 0:
                    raise PeekabooDatabaseError(
                        '%d: Unable to clear in-flight status of sample: '
                        '%s' % (sample.id, error))

        await asyncio.sleep(delay)

    if cleared == 0:
        raise PeekabooDatabaseError(
            '%d: Unexpected inconsistency: Sample not recorded as '
            'in-flight upon clearing flag.' % sample.id)
    elif cleared > 1:
        raise PeekabooDatabaseError(
            '%d: Unexpected inconsistency: Multiple instances of sample '
            'in-flight status cleared against database constraints!?'
            % sample.id)
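# A minimal sketch (an assumption, not the project's actual schema) of the
# InFlightSample model the in-flight helpers rely on: the sample identity is
# the primary key, so a duplicate insert fails atomically and the resulting
# IntegrityError doubles as the "already locked elsewhere" signal used by
# mark_sample_in_flight() below.
from sqlalchemy import Column, DateTime, Integer, String

class InFlightSample(Base):
    __tablename__ = 'in_flight_samples'
    # hex digest identifying the sample; the column width is a guess
    identity = Column(String(64), primary_key=True)
    # which instance holds the lock
    instance_id = Column(Integer, nullable=False)
    # when the lock was taken, for staleness detection
    start_time = Column(DateTime, nullable=False)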
def analysis_save(self, sample):
    """
    Save an analysis task to the analysis journal in the database.

    @param sample: The sample object for this analysis task.
    """
    sample_info = SampleInfo(sha256sum=sample.sha256sum,
                             file_extension=sample.file_extension,
                             analysis_time=datetime.now(),
                             result=sample.result,
                             reason=sample.reason)

    with self.__lock:
        attempt = 1
        while attempt <= self.retries:
            session = self.__session()
            session.add(sample_info)
            try:
                session.commit()
            except (OperationalError, DBAPIError,
                    SQLAlchemyError) as error:
                session.rollback()
                attempt = self.was_transient_error(
                    error, attempt, 'saving analysis result')
                if attempt > 0:
                    continue

                raise PeekabooDatabaseError(
                    'Failed to add analysis task to the database: %s'
                    % error)
            finally:
                session.close()

            break
async def analysis_update(self, sample):
    """
    Update an analysis task in the analysis journal in the database.

    @param sample: The sample object for this analysis task.
    """
    statement = sqlalchemy.sql.expression.update(SampleInfo).where(
        SampleInfo.id == sample.id).values(
            state=sample.state,
            result=sample.result,
            reason=sample.reason)

    attempt = 1
    delay = 0
    while attempt <= self.retries:
        async with self.__session_factory() as session:
            try:
                await session.execute(statement)
                await session.commit()
                break
            except (OperationalError, DBAPIError,
                    SQLAlchemyError) as error:
                await session.rollback()
                attempt, delay = self.was_transient_error(
                    error, attempt, 'updating analysis')
                if attempt < 0:
                    raise PeekabooDatabaseError(
                        'Failed to update analysis task in the database: '
                        '%s' % error)

        await asyncio.sleep(delay)
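# Hypothetical completion path (a sketch, not part of the module): persist
# the final verdict through analysis_update() once processing is done. Plain
# attribute assignment on the sample is an assumption about its interface.
async def record_verdict(db, sample, state, result, reason):
    sample.state = state
    sample.result = result
    sample.reason = reason
    await db.analysis_update(sample)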
def drop(self):
    """ Drop all tables of the database. """
    try:
        Base.metadata.drop_all(self.__engine)
    except SQLAlchemyError as error:
        raise PeekabooDatabaseError(
            'Unable to drop all tables of the database: %s' % error)
def _init_db(self):
    """
    Initializes the Peekaboo database by creating tables and writing meta
    information to the '_meta' table.
    """
    Base.metadata.create_all(self.__engine)
    meta = PeekabooMetadata()
    meta.peekaboo_version = __version__
    meta.db_schema_version = self.db_schema_version
    # TODO: Get Cuckoo version.
    meta.cuckoo_version = '2.0'
    session = self.__Session()
    session.add(meta)
    '''
    session.add_all([
        AnalysisResult(name='inProgress'),
        AnalysisResult(name='unchecked'),
        AnalysisResult(name='unknown'),
        AnalysisResult(name='ignored'),
        AnalysisResult(name='checked'),
        AnalysisResult(name='good'),
        AnalysisResult(name='bad'),
    ])
    '''
    try:
        session.commit()
    except SQLAlchemyError as e:
        session.rollback()
        raise PeekabooDatabaseError('Cannot initialize the database: %s'
                                    % e)
    finally:
        session.close()
def analysis_save(self, sample):
    """
    Save an analysis task to the analysis journal in the database.

    @param sample: The sample object for this analysis task.
    """
    analysis = AnalysisJournal()
    analysis.job_hash = sample.job_hash
    analysis.cuckoo_job_id = sample.job_id
    analysis.filename = sample.filename
    analysis.analysis_time = datetime.now()

    sample_info = self.sample_info_fetch(sample)
    if sample_info is None:
        sample_info = SampleInfo(sha256sum=sample.sha256sum,
                                 file_extension=sample.file_extension,
                                 result=sample.result,
                                 reason=sample.reason)

    analysis.sample = sample_info
    with self.__lock:
        session = self.__session()
        session.add(analysis)
        try:
            session.commit()
        except SQLAlchemyError as error:
            session.rollback()
            raise PeekabooDatabaseError(
                'Failed to add analysis task to the database: %s' % error)
        finally:
            session.close()
def clear_in_progress(self):
    """ Remove all samples with the result 'inProgress'. """
    session = self.__Session()
    in_progress = PeekabooDatabase.__get(session, AnalysisResult,
                                         name='inProgress')
    in_progress_samples = session.query(SampleInfo).filter_by(
        result=in_progress)

    # The direct approach does not work currently with message:
    # in_() not yet supported for relationships. For a simple many-to-one,
    # use in_() against the set of foreign key values.
    # This is what we do below.
    #session.query(AnalysisJournal).filter(
    #    AnalysisJournal.sample.in_(in_progress_samples)).delete()

    sample_ids = [s.id for s in in_progress_samples.all()]
    if sample_ids:
        session.query(AnalysisJournal).filter(
            AnalysisJournal.sample_id.in_(sample_ids)).delete(
                synchronize_session=False)

    in_progress_samples.delete()
    try:
        session.commit()
        logger.debug('Cleared the database from "inProgress" entries.')
    except SQLAlchemyError as e:
        session.rollback()
        raise PeekabooDatabaseError(
            'Unable to clear the database from "inProgress" entries: %s'
            % e)
    finally:
        session.close()
def clear_in_progress(self):
    """ Remove all samples with the result 'inProgress'. """
    session = self.__Session()
    in_progress = PeekabooDatabase.__get(
        session, AnalysisResult, name='inProgress')
    in_progress_samples = session.query(SampleInfo).filter_by(
        result=in_progress).all()
    for in_progress_sample in in_progress_samples:
        session.query(AnalysisJournal).filter_by(
            sample=in_progress_sample).delete()
    try:
        session.commit()
        logger.debug('Cleared the database from "inProgress" entries.')
    except SQLAlchemyError as e:
        session.rollback()
        raise PeekabooDatabaseError(
            'Unable to clear the database from "inProgress" entries: %s'
            % e)
    finally:
        session.close()
def analysis_update(self, sample):
    """
    Update an analysis task in the database. This method is called if a
    sample object was processed by Cuckoo and therefore has a Cuckoo job
    ID, which we want to store in the database.

    :param sample: The sample object containing the info to update.
    """
    with self.__lock:
        session = self.__Session()
        analysis = self.__get(session, AnalysisJournal,
                              job_hash=sample.get_job_hash(),
                              filename=sample.get_filename())
        if analysis:
            analysis.cuckoo_job_id = sample.job_id
            session.add(analysis)

        # commit and close even if no analysis was found so the session
        # does not leak
        try:
            session.commit()
        except SQLAlchemyError as e:
            session.rollback()
            raise PeekabooDatabaseError(
                'Failed to update analysis task in the database: %s' % e)
        finally:
            session.close()
def clear_stale_in_flight_samples(self):
    """
    Clear all in-flight markers that are too old and therefore stale. This
    detects instances which are locked up, crashed or shut down.
    """
    # an instance id of 0 denotes that we're alone and don't need to track
    # in-flight samples in the database
    if self.instance_id == 0:
        return True

    logger.debug(
        'Clearing database of all stale in-flight samples '
        '(%d seconds)', self.stale_in_flight_threshold)

    attempt = 1
    cleared = 0
    while attempt <= self.retries:
        session = self.__session()

        # delete only the markers which are older than the threshold
        query = session.query(InFlightSample).filter(
            InFlightSample.start_time <= datetime.utcnow() - timedelta(
                seconds=self.stale_in_flight_threshold))

        try:
            # the loop triggers the query, so only do it if debugging is
            # enabled
            if logger.isEnabledFor(logging.DEBUG):
                # obviously there's a race between logging and actual
                # delete here, use with caution, compare with actual number
                # of markers cleared below before relying on it for
                # debugging
                for stale in query:
                    logger.debug('Stale in-flight marker to clear: %s',
                                 stale)

            # delete() is not queued and goes to the DB before commit()
            cleared = query.delete()
            session.commit()
            if cleared > 0:
                logger.warning('%d stale in-flight samples cleared.',
                               cleared)
        except (OperationalError, DBAPIError, SQLAlchemyError) as error:
            session.rollback()
            attempt = self.was_transient_error(
                error, attempt,
                'clearing the database of stale in-flight samples')
            if attempt > 0:
                continue

            raise PeekabooDatabaseError(
                'Unable to clear the database of stale in-flight '
                'samples: %s' % error)
        finally:
            session.close()

        break

    return cleared > 0
def clear_in_flight_samples(self, instance_id=None):
    """
    Clear all in-flight markers left over by previous runs or other
    instances by removing them from the lock table.

    @param instance_id: Clear our own (None), another instance's (positive
                        integer) or all instances' (negative integer)
                        locks. Since an instance_id of 0 disables in-flight
                        sample tracking, no instance will ever set a marker
                        with that ID so that specifying 0 here will amount
                        to a no-op or rather clean-up of invalid entries.
    """
    # an instance id of 0 denotes that we're alone and don't need to track
    # in-flight samples
    if self.instance_id == 0:
        return

    # use our own instance id if none is given
    if instance_id is None:
        instance_id = self.instance_id

    attempt = 1
    while attempt <= self.retries:
        session = self.__session()

        if instance_id < 0:
            # delete all locks
            query = session.query(InFlightSample)
            logger.debug('Clearing database of all in-flight samples.')
        else:
            # delete only the locks of a specific instance
            query = session.query(InFlightSample).filter(
                InFlightSample.instance_id == instance_id)
            logger.debug(
                'Clearing database of all in-flight samples of '
                'instance %d.', instance_id)

        try:
            # delete() is not queued and goes to the DB before commit()
            query.delete()
            session.commit()
        except (OperationalError, DBAPIError, SQLAlchemyError) as error:
            session.rollback()
            attempt = self.was_transient_error(
                error, attempt, 'clearing database of in-flight samples')
            if attempt > 0:
                continue

            raise PeekabooDatabaseError('Unable to clear the database of '
                                        'in-flight samples: %s' % error)
        finally:
            session.close()

        break
async def clear_in_flight_samples(self, instance_id=None):
    """
    Clear all in-flight markers left over by previous runs or other
    instances by removing them from the lock table.

    @param instance_id: Clear our own (None), another instance's (positive
                        integer) or all instances' (negative integer)
                        locks. Since an instance_id of 0 disables in-flight
                        sample tracking, no instance will ever set a marker
                        with that ID so that specifying 0 here will amount
                        to a no-op or rather clean-up of invalid entries.
    """
    # an instance id of 0 denotes that we're alone and don't need to track
    # in-flight samples
    if self.instance_id == 0:
        return

    # use our own instance id if none is given
    if instance_id is None:
        instance_id = self.instance_id

    if instance_id < 0:
        # delete all locks
        statement = sqlalchemy.sql.expression.delete(InFlightSample)
        logger.debug('Clearing database of all in-flight samples.')
    else:
        # delete only the locks of a specific instance
        statement = sqlalchemy.sql.expression.delete(InFlightSample).where(
            InFlightSample.instance_id == instance_id)
        logger.debug(
            'Clearing database of all in-flight samples of '
            'instance %d.', instance_id)

    attempt = 1
    while attempt <= self.retries:
        async with self.__session_factory() as session:
            try:
                await session.execute(statement)
                await session.commit()
                break
            except (OperationalError, DBAPIError,
                    SQLAlchemyError) as error:
                await session.rollback()
                attempt, delay = self.was_transient_error(
                    error, attempt,
                    'clearing database of in-flight samples')
                if attempt < 0:
                    raise PeekabooDatabaseError(
                        'Unable to clear the database of in-flight '
                        'samples: %s' % error)

        await asyncio.sleep(delay)
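# Hypothetical startup cleanup (a sketch, not part of the module): drop
# leftover markers from a previous run of this instance, or from all
# instances when running a one-off maintenance job, before accepting work.
async def cleanup_on_start(db, everything=False):
    # None clears our own markers, a negative ID clears everyone's
    await db.clear_in_flight_samples(-1 if everything else None)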
def mark_sample_in_flight(self, sample, instance_id=None, start_time=None):
    """
    Mark a sample as in flight, i.e. being worked on by an instance.

    @param sample: The sample to mark as in flight.
    @param instance_id: (optionally) The ID of the instance that is
                        handling this sample. Default: Us.
    @param start_time: Override the time the marker was placed for
                       debugging purposes.
    """
    # an instance id of 0 denotes that we're alone and don't need to track
    # in-flight samples in the database
    if self.instance_id == 0:
        return True

    # use our own instance id if none is given
    if instance_id is None:
        instance_id = self.instance_id

    if start_time is None:
        start_time = datetime.utcnow()

    session = self.__session()

    # try to mark this sample as in flight in an atomic insert operation
    sha256sum = sample.sha256sum
    session.add(InFlightSample(sha256sum=sha256sum,
                               instance_id=instance_id,
                               start_time=start_time))

    locked = False
    try:
        session.commit()
        locked = True
        logger.debug('Marked sample %s as in flight', sha256sum)
    # duplicate primary key == entry already exists
    except IntegrityError:
        session.rollback()
        logger.debug('Sample %s is already in flight on another instance',
                     sha256sum)
    except SQLAlchemyError as error:
        session.rollback()
        raise PeekabooDatabaseError('Unable to mark sample as in flight: '
                                    '%s' % error)
    finally:
        session.close()

    return locked
async def start(self):
    """ Create the database schema, retrying on transient errors. """
    attempt = 1
    delay = 0
    while attempt <= self.retries:
        async with self.__engine.begin() as conn:
            try:
                await conn.run_sync(Base.metadata.create_all)
                break
            except (OperationalError, DBAPIError,
                    SQLAlchemyError) as error:
                attempt, delay = self.was_transient_error(
                    error, attempt, 'create metadata')
                if attempt < 0:
                    raise PeekabooDatabaseError(
                        'Failed to create schema in database: %s' % error)

        await asyncio.sleep(delay)
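# Hypothetical daemon startup (a sketch, not part of the module): the URL
# and instance ID are placeholders. start() must complete before any
# analysis is accepted so that the schema exists.
import asyncio

async def main():
    db = PeekabooDatabase('sqlite:///peekaboo.db', instance_id=1)
    await db.start()

asyncio.run(main())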
async def analysis_add(self, sample):
    """
    Add an analysis task to the analysis journal in the database.

    @param sample: The sample object for this analysis task.
    @returns: ID of the newly created analysis task (also updated in the
              sample)
    """
    sample_info = SampleInfo(state=sample.state,
                             sha256sum=sample.sha256sum,
                             file_extension=sample.file_extension,
                             analysis_time=datetime.now(),
                             result=sample.result,
                             reason=sample.reason)

    job_id = None
    attempt = 1
    delay = 0
    while attempt <= self.retries:
        async with self.__async_session_factory_modify() as session:
            session.add(sample_info)
            try:
                # flush to retrieve the automatically assigned primary
                # key value
                await session.flush()
                job_id = sample_info.id
                await session.commit()
                break
            except (OperationalError, DBAPIError,
                    SQLAlchemyError) as error:
                await session.rollback()
                attempt, delay = self.was_transient_error(
                    error, attempt, 'adding analysis')
                if attempt < 0:
                    raise PeekabooDatabaseError(
                        'Failed to add analysis task to the database: %s'
                        % error)

        await asyncio.sleep(delay)

    sample.update_id(job_id)
    return job_id
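# Hypothetical intake path (a sketch, not part of the module): persist the
# task first so a job ID exists before the sample is handed to the worker
# queue. The queue name and shape are assumptions.
async def submit(db, sample, job_queue):
    job_id = await db.analysis_add(sample)
    await job_queue.put(sample)
    return job_id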
def analysis2db(self, sample):
    """
    Save an analysis task to the analysis journal in the database.

    :param sample: The sample object for this analysis task.
    """
    with self.__lock:
        session = self.__Session()
        analysis = AnalysisJournal()
        analysis.job_hash = sample.get_job_hash()
        analysis.cuckoo_job_id = sample.job_id
        analysis.filename = sample.get_filename()
        analysis.analyses_time = datetime.strptime(sample.analyses_time,
                                                   "%Y%m%dT%H%M%S")
        analysis_result = PeekabooDatabase.__get_or_create(
            session, AnalysisResult, name=sample.get_result().name)

        # NOTE: We cannot determine if a known sample is inProgress again.
        s = PeekabooDatabase.__get(
            session, SampleInfo,
            sha256sum=sample.sha256sum,
            file_extension=sample.file_extension)
        if s is None:
            s = PeekabooDatabase.__create(
                SampleInfo,
                sha256sum=sample.sha256sum,
                file_extension=sample.file_extension,
                result=analysis_result)

        analysis.sample = s
        session.add(analysis)
        try:
            session.commit()
        except SQLAlchemyError as e:
            session.rollback()
            raise PeekabooDatabaseError(
                'Failed to add analysis task to the database: %s' % e)
        finally:
            session.close()
def analysis_journal_fetch_journal(self, sample):
    """
    Fetch information stored in the database about a given sample object.

    @param sample: The sample object of which the information shall be
                   fetched from the database.
    @return: A sorted list of (analysis_time, result, reason) of the
             requested sample.
    """
    statement = sqlalchemy.sql.expression.select(
        SampleInfo.analysis_time, SampleInfo.result,
        SampleInfo.reason).where(
            SampleInfo.id != sample.id).where(
                SampleInfo.result != Result.failed).filter_by(
                    state=JobState.FINISHED,
                    sha256sum=sample.sha256sum,
                    file_extension=sample.file_extension).order_by(
                        SampleInfo.analysis_time)

    sample_journal = None
    attempt = 1
    delay = 0
    while attempt <= self.retries:
        with self.__session() as session:
            try:
                sample_journal = session.execute(statement).all()
                break
            except (OperationalError, DBAPIError,
                    SQLAlchemyError) as error:
                session.rollback()
                attempt, delay = self.was_transient_error(
                    error, attempt, 'fetching analysis journal')
                if attempt < 0:
                    raise PeekabooDatabaseError(
                        'Failed to fetch analysis journal from the '
                        'database: %s' % error)

        time.sleep(delay)

    return sample_journal
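# Hypothetical consumer (a sketch, not part of the module) of the journal
# rows returned above; each row unpacks into (analysis_time, result, reason)
# per the select() column list, oldest entry first.
def log_journal(journal):
    for analysis_time, result, reason in journal:
        logger.debug('%s: %s (%s)', analysis_time, result, reason)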
def _init_db(self):
    """
    Initializes the Peekaboo database by creating tables and writing meta
    information to the '_meta' table.
    """
    Base.metadata.create_all(self.__engine)
    meta = PeekabooMetadata()
    meta.peekaboo_version = __version__
    meta.db_schema_version = DB_SCHEMA_VERSION
    # TODO: Get Cuckoo version.
    meta.cuckoo_version = '2.0'
    session = self.__session()
    session.add(meta)
    try:
        session.commit()
    except SQLAlchemyError as error:
        session.rollback()
        raise PeekabooDatabaseError('Cannot initialize the database: %s'
                                    % error)
    finally:
        session.close()
async def analysis_retrieve(self, job_id):
    """
    Fetch information stored in the database about a given sample object.

    @param job_id: ID of the analysis to retrieve
    @type job_id: int
    @return: reason and result for the given analysis task
    """
    statement = sqlalchemy.sql.expression.select(
        SampleInfo.reason, SampleInfo.result).filter_by(
            id=job_id, state=JobState.FINISHED)

    result = None
    attempt = 1
    delay = 0
    while attempt <= self.retries:
        async with self.__session_factory() as session:
            try:
                proxy = await session.execute(statement)
                result = proxy.first()
                break
            except (OperationalError, DBAPIError,
                    SQLAlchemyError) as error:
                await session.rollback()
                attempt, delay = self.was_transient_error(
                    error, attempt, 'retrieving analysis result')
                if attempt < 0:
                    raise PeekabooDatabaseError(
                        'Failed to retrieve analysis from the database: '
                        '%s' % error)

        await asyncio.sleep(delay)

    return result
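# Hypothetical API handler (a sketch, not part of the module):
# analysis_retrieve() yields None for unknown or unfinished jobs, otherwise
# a (reason, result) row. The dict shape and the result.name attribute are
# assumptions.
async def report(db, job_id):
    row = await db.analysis_retrieve(job_id)
    if row is None:
        return {'status': 'pending'}

    reason, result = row
    return {'status': 'done', 'result': result.name, 'reason': reason}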
async def clear_stale_in_flight_samples(self):
    """
    Clear all in-flight markers that are too old and therefore stale. This
    detects instances which are locked up, crashed or shut down.
    """
    # an instance id of 0 denotes that we're alone and don't need to track
    # in-flight samples in the database
    if self.instance_id == 0:
        return True

    logger.debug(
        'Clearing database of all stale in-flight samples '
        '(%d seconds)', self.stale_in_flight_threshold)

    def clear_statement(statement_class):
        # select only the markers which are older than the threshold
        return statement_class(InFlightSample).where(
            InFlightSample.start_time <= datetime.utcnow() - timedelta(
                seconds=self.stale_in_flight_threshold))

    delete_statement = clear_statement(sqlalchemy.sql.expression.delete)
    select_statement = clear_statement(sqlalchemy.sql.expression.select)

    attempt = 1
    cleared = 0
    while attempt <= self.retries:
        async with self.__session_factory() as session:
            try:
                # only do the query if debugging is enabled
                if logger.isEnabledFor(logging.DEBUG):
                    # obviously there's a race between logging and actual
                    # delete here, use with caution, compare with actual
                    # number of markers cleared below before relying on it
                    # for debugging
                    markers = await session.execute(select_statement)
                    for stale in markers:
                        logger.debug('Stale in-flight marker to clear: %s',
                                     stale)

                markers = await session.execute(delete_statement)
                await session.commit()
                cleared = markers.rowcount
                if cleared > 0:
                    logger.warning('%d stale in-flight samples cleared.',
                                   cleared)
                break
            except (OperationalError, DBAPIError,
                    SQLAlchemyError) as error:
                await session.rollback()
                attempt, delay = self.was_transient_error(
                    error, attempt,
                    'clearing the database of stale in-flight samples')
                if attempt < 0:
                    raise PeekabooDatabaseError(
                        'Unable to clear the database of stale in-flight '
                        'samples: %s' % error)

        await asyncio.sleep(delay)

    return cleared > 0
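# Hypothetical housekeeping task (a sketch, not part of the module): sweep
# stale markers regularly so a crashed instance cannot block its samples
# forever. The interval is an assumption; in practice it should relate to
# stale_in_flight_threshold.
async def housekeeping(db, interval=60):
    while True:
        await db.clear_stale_in_flight_samples()
        await asyncio.sleep(interval)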
def __init__(self, db_url, instance_id=0,
             stale_in_flight_threshold=15 * 60,
             log_level=logging.WARNING, async_driver=None):
    """
    Initialize the Peekaboo database handler.

    @param db_url: An RFC 1738 URL that points to the database.
    @param instance_id: A positive, unique ID differentiating this Peekaboo
                        instance from any other instance using the same
                        database for concurrency coordination. Value of 0
                        means that we're alone and have no other instances
                        to worry about.
    @param stale_in_flight_threshold: Number of seconds after which an
                                      in-flight marker is considered stale
                                      and deleted or ignored.
    @param log_level: Overrides the log level of the database modules. The
                      idea is for the database to be silent by default and
                      only emit log messages if switched on explicitly and
                      independently of the Peekaboo log level.
    @param async_driver: last resort override of the asyncio driver
                         auto-detection
    """
    logging.getLogger('sqlalchemy.engine').setLevel(log_level)
    logging.getLogger('sqlalchemy.pool').setLevel(log_level)
    # aiosqlite picks up the global log level unconditionally so we need
    # to override it as well and explicitly
    logging.getLogger('aiosqlite').setLevel(log_level)

    # <backend>[+<driver>]:// -> <backend>
    backend = db_url.split(':')[0].split('+')[0]

    connect_args = {}
    if backend == 'sqlite':
        connect_args['timeout'] = 0

    self.__engine = create_engine(db_url, future=True,
                                  connect_args=connect_args)
    session_factory = sessionmaker(bind=self.__engine)
    self.__session = scoped_session(session_factory)
    self.__lock = threading.RLock()

    asyncio_drivers = {
        'sqlite': ['aiosqlite'],
        'mysql': ['asyncmy', 'aiomysql'],
        'postgresql': ['asyncpg'],
    }

    if async_driver is not None:
        drivers = [async_driver]
    else:
        drivers = asyncio_drivers.get(backend)
        if drivers is None:
            raise PeekabooDatabaseError(
                'Unknown database backend configured: %s' % backend)

    async_engine = None
    async_db_url = None
    for driver in drivers:
        # replace backend and driver with our asyncio alternative
        scheme = "%s+%s" % (backend, driver)
        async_db_url = ':'.join([scheme] + db_url.split(':')[1:])

        try:
            async_engine = sqlalchemy.ext.asyncio.create_async_engine(
                async_db_url, connect_args=connect_args)
        except ModuleNotFoundError:
            continue

        logger.debug(
            'Auto-detected %s SQLAlchemy backend+driver for '
            'asyncio database accesses', scheme)
        break

    if async_engine is None:
        raise PeekabooDatabaseError(
            'None of the asyncio drivers for backend %s could be '
            'found: %s' % (backend, drivers))

    self.__async_session_factory = sessionmaker(
        bind=async_engine, class_=sqlalchemy.ext.asyncio.AsyncSession)
    # no scoping necessary as we're not using asyncio across threads

    # special handling for sqlite: since it does not respond well to
    # multiple modify operations in parallel to the same database, we
    # serialise them through a QueuePool with only one connection
    self.__async_session_factory_modify = self.__async_session_factory
    if backend in ['sqlite']:
        async_engine_modify = sqlalchemy.ext.asyncio.create_async_engine(
            async_db_url,
            poolclass=sqlalchemy.pool.AsyncAdaptedQueuePool,
            pool_size=1, max_overflow=0)
        self.__async_session_factory_modify = sessionmaker(
            bind=async_engine_modify,
            class_=sqlalchemy.ext.asyncio.AsyncSession)

    self.instance_id = instance_id
    self.stale_in_flight_threshold = stale_in_flight_threshold
    self.retries = 5

    # ultra-simple quadratic backoff:
    # attempt 1: 10 * 2**(1) == 10-20msecs
    # attempt 2: 10 * 2**(2) == 20-40msecs
    # attempt 3: 10 * 2**(3) == 40-80msecs
    # attempt 4: 10 * 2**(4) == 80-160msecs
    self.deadlock_backoff_base = 10
    self.connect_backoff_base = 2000
attempt = 1
delay = 0
while attempt <= self.retries:
    with self.__lock:
        try:
            Base.metadata.create_all(self.__engine)
            break
        except (OperationalError, DBAPIError, SQLAlchemyError) as error:
            attempt, delay = self.was_transient_error(
                error, attempt, 'create metadata')
            if attempt < 0:
                raise PeekabooDatabaseError(
                    'Failed to create schema in database: %s' % error)

    time.sleep(delay)
async def mark_sample_in_flight(self, sample, instance_id=None,
                                start_time=None):
    """
    Mark a sample as in flight, i.e. being worked on by an instance.

    @param sample: The sample to mark as in flight.
    @param instance_id: (optionally) The ID of the instance that is
                        handling this sample. Default: Us.
    @param start_time: Override the time the marker was placed for
                       debugging purposes.
    """
    # an instance id of 0 denotes that we're alone and don't need to track
    # in-flight samples in the database
    if self.instance_id == 0:
        return True

    # use our own instance id if none is given
    if instance_id is None:
        instance_id = self.instance_id

    if start_time is None:
        start_time = datetime.utcnow()

    in_flight_marker = InFlightSample(identity=await sample.identity,
                                      instance_id=instance_id,
                                      start_time=start_time)

    attempt = 1
    delay = 0
    while attempt <= self.retries:
        # a new session needs to be constructed on each attempt
        async with self.__session_factory() as session:
            # try to mark this sample as in flight in an atomic insert
            # operation (modulo possible deadlocks with various RDBMS)
            session.add(in_flight_marker)
            try:
                await session.commit()
                logger.debug('%d: Marked sample in flight', sample.id)
                return True
            # duplicate primary key == entry already exists
            except IntegrityError:
                await session.rollback()
                logger.debug(
                    '%d: Sample is already in flight on another '
                    'instance', sample.id)
                return False
            except (OperationalError, DBAPIError,
                    SQLAlchemyError) as error:
                await session.rollback()
                attempt, delay = self.was_transient_error(
                    error, attempt,
                    'marking sample %d in flight' % sample.id)
                if attempt < 0:
                    raise PeekabooDatabaseError(
                        '%d: Unable to mark sample as in flight: %s'
                        % (sample.id, error))

        await asyncio.sleep(delay)

    return False
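# Hypothetical coordination pattern (a sketch, not part of the module) built
# from the primitives above: process the sample only if we won the in-flight
# race, and always clear the marker afterwards, even on error.
async def process_exclusively(db, sample, worker):
    if not await db.mark_sample_in_flight(sample):
        return False  # another instance holds the lock

    try:
        await worker(sample)
    finally:
        await db.clear_sample_in_flight(sample)

    return True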
def mark_sample_in_flight(self, sample, instance_id=None, start_time=None):
    """
    Mark a sample as in flight, i.e. being worked on by an instance.

    @param sample: The sample to mark as in flight.
    @param instance_id: (optionally) The ID of the instance that is
                        handling this sample. Default: Us.
    @param start_time: Override the time the marker was placed for
                       debugging purposes.
    """
    # an instance id of 0 denotes that we're alone and don't need to track
    # in-flight samples in the database
    if self.instance_id == 0:
        return True

    # use our own instance id if none is given
    if instance_id is None:
        instance_id = self.instance_id

    if start_time is None:
        start_time = datetime.utcnow()

    sha256sum = sample.sha256sum
    in_flight_marker = InFlightSample(sha256sum=sha256sum,
                                      instance_id=instance_id,
                                      start_time=start_time)

    attempt = 1
    locked = False
    while attempt <= self.retries:
        # a new session needs to be constructed on each attempt
        session = self.__session()

        # try to mark this sample as in flight in an atomic insert
        # operation (modulo possible deadlocks with various RDBMS)
        session.add(in_flight_marker)
        try:
            session.commit()
            locked = True
            logger.debug('Marked sample %s as in flight', sha256sum)
        # duplicate primary key == entry already exists
        except IntegrityError:
            session.rollback()
            logger.debug(
                'Sample %s is already in flight on another '
                'instance', sha256sum)
        except (OperationalError, DBAPIError, SQLAlchemyError) as error:
            session.rollback()
            attempt = self.was_transient_error(
                error, attempt,
                'marking sample %s as in flight' % sha256sum)
            if attempt > 0:
                continue

            raise PeekabooDatabaseError(
                'Unable to mark sample as in flight: %s' % error)
        finally:
            session.close()

        return locked

    return False
def __init__(self, db_url, instance_id=0,
             stale_in_flight_threshold=15 * 60,
             log_level=logging.WARNING):
    """
    Initialize the Peekaboo database handler.

    @param db_url: An RFC 1738 URL that points to the database.
    @param instance_id: A positive, unique ID differentiating this Peekaboo
                        instance from any other instance using the same
                        database for concurrency coordination. Value of 0
                        means that we're alone and have no other instances
                        to worry about.
    @param stale_in_flight_threshold: Number of seconds after which an
                                      in-flight marker is considered stale
                                      and deleted or ignored.
    @param log_level: Overrides the log level of the database modules. The
                      idea is for the database to be silent by default and
                      only emit log messages if switched on explicitly and
                      independently of the Peekaboo log level.
    """
    logging.getLogger('sqlalchemy.engine').setLevel(log_level)
    logging.getLogger('sqlalchemy.pool').setLevel(log_level)
    # aiosqlite picks up the global log level unconditionally so we need
    # to override it as well and explicitly
    logging.getLogger('aiosqlite').setLevel(log_level)

    # <backend>[+<driver>]:// -> <backend>
    url_parts = db_url.split(':')
    scheme_parts = url_parts[0].split('+')
    backend = scheme_parts[0]

    engine_kwargs = {}
    if backend == 'sqlite':
        engine_kwargs.update(
            dict(poolclass=sqlalchemy.pool.AsyncAdaptedQueuePool,
                 pool_size=1, max_overflow=0,
                 connect_args={'timeout': 0}))

    # if there is no driver specified or it's a known non-asyncio driver,
    # try to find a known-good asyncio driver
    sync_drivers = {
        'sqlite': ['pysqlite'],
        'mysql': ['mysqldb', 'pymysql'],
        'postgresql': ['psycopg2', 'pg8000', 'psycopg2cffi',
                       'pypostgresql', 'pygresql'],
    }
    asyncio_drivers = {
        'sqlite': ['aiosqlite'],
        'mysql': ['asyncmy', 'aiomysql'],
        'postgresql': ['asyncpg'],
    }

    # default to an empty candidate list for unknown backends so an
    # explicitly specified driver can still be tried below
    backend_async_drivers = asyncio_drivers.get(backend, [])

    # if there seems to be a driver specified, look more closely
    if len(scheme_parts) > 1:
        driver = scheme_parts[1]

        backend_sync_drivers = sync_drivers.get(backend)
        if (backend_sync_drivers is not None
                and driver in backend_sync_drivers):
            logger.warning(
                'Configuration specifies a synchronous database driver '
                '"%s". Please update your configuration to use an '
                'asynchronous driver, preferably out of: %s',
                driver, backend_async_drivers)
        elif driver not in backend_async_drivers:
            logger.warning(
                'Configuration specifies unknown asynchronous driver '
                '"%s". Trying to use it anyway.', driver)
            backend_async_drivers = [driver]

    self.__engine = None
    for driver in backend_async_drivers:
        scheme = f'{backend}+{driver}'
        db_url = ':'.join([scheme] + url_parts[1:])

        try:
            logger.debug('Trying SQLAlchemy backend+driver "%s"', scheme)
            self.__engine = sqlalchemy.ext.asyncio.create_async_engine(
                db_url, **engine_kwargs)
        except ModuleNotFoundError:
            continue

        logger.info(
            'Using "%s" SQLAlchemy backend+driver for '
            'database accesses', scheme)
        break

    if self.__engine is None:
        raise PeekabooDatabaseError(
            f'None of the drivers for backend "{backend}" could be found: '
            f'{backend_async_drivers}')

    self.__session_factory = sessionmaker(
        bind=self.__engine, class_=sqlalchemy.ext.asyncio.AsyncSession)

    self.instance_id = instance_id
    self.stale_in_flight_threshold = stale_in_flight_threshold
    self.retries = 5

    # ultra-simple quadratic backoff:
    # attempt 1: 10 * 2**(1) == 10-20msecs
    # attempt 2: 10 * 2**(2) == 20-40msecs
    # attempt 3: 10 * 2**(3) == 40-80msecs
    # attempt 4: 10 * 2**(4) == 80-160msecs
    self.deadlock_backoff_base = 10
    self.connect_backoff_base = 2000
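# A minimal sketch (an assumption, not the project's actual helper) of the
# was_transient_error() contract that the asyncio methods above rely on: it
# returns (next_attempt, delay_in_seconds) for a retryable error and a
# negative attempt number when the caller should give up. The error
# classification below is illustrative only; the real helper may also use
# connect_backoff_base for connection failures.
import random

def was_transient_error(self, error, attempt, activity):
    if attempt >= self.retries:
        return -1, 0

    # hypothetical classification: treat MySQL deadlock (1213) and lock
    # wait timeout (1205) as transient, everything else as permanent
    args = getattr(getattr(error, 'orig', None), 'args', None) or (None,)
    if args[0] not in (1205, 1213):
        return -1, 0

    # quadratic backoff with jitter, per the comment in __init__ above:
    # attempt 1 yields 10-20msecs, attempt 2 yields 20-40msecs, ...
    backoff = self.deadlock_backoff_base * 2 ** attempt
    delay = random.uniform(backoff / 2, backoff) / 1000
    logger.debug('Transient error while %s, retrying in %.3fs: %s',
                 activity, delay, error)
    return attempt + 1, delay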