class Scheduler(SynergyProcess):
    """Encapsulates logic for starting the aggregators/alerts/other readers.

    Reads per-process configuration from MongoDB and registers a RepeatTimer
    per process; timers fire one of the fire_* methods below.
    """

    def __init__(self, process_name):
        super(Scheduler, self).__init__(process_name)
        self.logger.info('Starting %s' % self.process_name)
        self.publishers = PublishersPool(self.logger)
        self.thread_handlers = dict()   # process_name -> RepeatTimer
        self.lock = Lock()              # serializes all fire_* callbacks
        self.timetable = TimeTable(self.logger)
        self.regular_pipeline = RegularPipeline(self, self.timetable)
        self.hadoop_pipeline = HadoopPipeline(self, self.timetable)
        self.logger.info('Started %s' % self.process_name)

    def __del__(self):
        # BUGFIX: iterate the timer objects, not the dict keys.
        # Iterating the dict directly yields process-name strings, and
        # str has no cancel() method.
        for handler in self.thread_handlers.values():
            handler.cancel()
        self.thread_handlers.clear()
        super(Scheduler, self).__del__()

    def _log_message(self, level, process_name, time_record, msg):
        """Log *msg* both to the log file and to the TimeTable node."""
        self.timetable.add_log_entry(process_name, time_record, datetime.utcnow(), msg)
        self.logger.log(level, msg)

    # **************** Scheduler Methods ************************
    @with_reconnect
    def start(self):
        """Read scheduler configuration entries and start timers to trigger events.

        :raises LookupError: when the configuration collection is empty.
        """
        collection = CollectionContext.get_collection(self.logger, COLLECTION_SCHEDULER_CONFIGURATION)
        cursor = collection.find({})
        if cursor.count() == 0:
            raise LookupError('MongoDB has no scheduler configuration entries')

        for entry in cursor:
            document = SchedulerConfigurationEntry(entry)
            interval = document.get_interval()
            is_active = document.get_process_state() == SchedulerConfigurationEntry.STATE_ON
            # renamed from "type" to avoid shadowing the builtin
            process_type = ProcessContext.get_type(document.get_process_name())
            parameters = [document.get_process_name(), document]

            if process_type == TYPE_ALERT:
                function = self.fire_alert
            elif process_type in (TYPE_HORIZONTAL_AGGREGATOR, TYPE_VERTICAL_AGGREGATOR):
                function = self.fire_worker
            elif process_type == TYPE_GARBAGE_COLLECTOR:
                function = self.fire_garbage_collector
            else:
                self.logger.error('Can not start scheduler for %s since it has no processing function'
                                  % process_type)
                continue

            handler = RepeatTimer(interval, function, args=parameters)
            self.thread_handlers[document.get_process_name()] = handler

            if is_active:
                handler.start()
                self.logger.info('Started scheduler for %s:%s, triggering every %d seconds'
                                 % (process_type, document.get_process_name(), interval))
            else:
                self.logger.info('Handler for %s:%s registered in Scheduler. Idle until activated.'
                                 % (process_type, document.get_process_name()))

        # as Scheduler is now initialized and running - we can safely start its MX
        self.start_mx()

    def start_mx(self):
        """Import the MX module (which back-references the scheduler) and start it.

        The import is deferred to this method to break the circular import.
        """
        from mx.mx import MX
        self.mx = MX(self)
        self.mx.start_mx_thread()

    def fire_worker(self, *args):
        """Request a vertical aggregator (hourly site, daily variant, etc.) to start up.

        :param args: args[0] is the process name; remaining args are ignored here.
        """
        process_name = args[0]
        # "with" guarantees the lock is only released when it was actually
        # acquired (the old acquire-inside-try could release an unheld lock)
        with self.lock:
            try:
                self.logger.info('%s {' % process_name)
                time_record = self.timetable.get_next_timetable_record(process_name)
                time_qualifier = ProcessContext.get_time_qualifier(process_name)
                if time_qualifier == ProcessContext.QUALIFIER_HOURLY:
                    self.regular_pipeline.manage_pipeline_for_process(process_name, time_record)
                else:
                    self.hadoop_pipeline.manage_pipeline_for_process(process_name, time_record)
            except (AMQPException, IOError) as e:
                self.logger.error('AMQPException: %s' % str(e), exc_info=True)
                self.publishers.reset_all_publishers(suppress_logging=True)
            except Exception as e:
                self.logger.error('Exception: %s' % str(e), exc_info=True)
            finally:
                self.logger.info('}')

    def fire_alert(self, *args):
        """Trigger AlertWorker.

        Makes sure its <dependent on> trees have finalized corresponding
        timeperiods prior to that.

        :param args: args[0] is the process name; remaining args are ignored here.
        """
        process_name = args[0]
        with self.lock:
            try:
                self.logger.info('%s {' % process_name)
                time_record = self.timetable.get_next_timetable_record(process_name)
                self.hadoop_pipeline.manage_pipeline_with_blocking_dependencies(process_name, time_record)
            except (AMQPException, IOError) as e:
                self.logger.error('AMQPException: %s' % str(e), exc_info=True)
                self.publishers.reset_all_publishers(suppress_logging=True)
            except Exception as e:
                self.logger.error('Exception: %s' % str(e), exc_info=True)
            finally:
                self.logger.info('}')

    def fire_garbage_collector(self, *args):
        """Fire the garbage collector to re-run all invalid records.

        Publishes an (empty) trigger message, then rebuilds and validates
        the Timetable trees.

        :param args: args[0] is the process name; remaining args are ignored here.
        """
        process_name = args[0]
        with self.lock:
            try:
                self.logger.info('%s {' % process_name)
                self.publishers.get_publisher(process_name).publish({})
                self.logger.info('Publishing trigger for garbage_collector')
                self.timetable.build_tree()
                self.timetable.validate()
                self.logger.info('Validated Timetable for all trees')
            except (AMQPException, IOError) as e:
                self.logger.error('AMQPException: %s' % str(e), exc_info=True)
                self.publishers.reset_all_publishers(suppress_logging=True)
            except Exception as e:
                self.logger.error('fire_garbage_collector: %s' % str(e))
            finally:
                self.logger.info('}')
class GarbageCollectorWorker(AbstractWorker):
    """Inspects units of work and re-posts the stale or invalid ones.

    On each MQ trigger it scans the units-of-work collection for records in
    STATE_INVALID / STATE_IN_PROGRESS / STATE_REQUESTED and either re-posts
    them for processing or cancels them once they outlive LIFE_SUPPORT_HOURS.
    """

    def __init__(self, process_name):
        super(GarbageCollectorWorker, self).__init__(process_name)
        self.publishers = PublishersPool(self.logger)
        self.collection = CollectionContext.get_collection(self.logger, COLLECTION_UNITS_OF_WORK)
        self.lock = Lock()

    def __del__(self):
        super(GarbageCollectorWorker, self).__del__()

    @thread_safe
    def _mq_callback(self, message):
        """Look for units of work in non-terminal states and re-run the stale ones.

        :param message: MQ message; acknowledged unconditionally in finally.
        """
        try:
            query = {
                UnitOfWorkEntry.STATE: {
                    "$in": [
                        UnitOfWorkEntry.STATE_IN_PROGRESS,
                        UnitOfWorkEntry.STATE_INVALID,
                        UnitOfWorkEntry.STATE_REQUESTED,
                    ]
                }
            }
            # iterating an empty cursor is a no-op, so no need for the extra
            # count() round-trip before the loop
            for document in self.collection.find(query).sort("_id", ASCENDING):
                self._process_single_document(document)
        except Exception as e:
            self.logger.error("_mq_callback: %s" % str(e), exc_info=True)
        finally:
            self.consumer.acknowledge(message.delivery_tag)

    def _process_single_document(self, document):
        """Inspect a single UOW document and re-post or cancel it.

        Re-post criteria: STATE_INVALID, or STATE_IN_PROGRESS/STATE_REQUESTED
        with no activity for more than REPOST_AFTER_HOURS. A re-posted UOW
        older than LIFE_SUPPORT_HOURS is cancelled instead.
        """
        unit_of_work = UnitOfWorkEntry(document)
        process_name = unit_of_work.get_process_name()
        state = unit_of_work.get_state()

        repost = False
        if state == UnitOfWorkEntry.STATE_INVALID:
            repost = True
        elif state in (UnitOfWorkEntry.STATE_IN_PROGRESS, UnitOfWorkEntry.STATE_REQUESTED):
            # fall back to creation time when the UOW never started
            last_activity = unit_of_work.get_started_at()
            if last_activity is None:
                last_activity = unit_of_work.get_created_at()
            if datetime.utcnow() - last_activity > timedelta(hours=REPOST_AFTER_HOURS):
                repost = True

        if not repost:
            return

        creation_time = unit_of_work.get_created_at()
        if datetime.utcnow() - creation_time < timedelta(hours=LIFE_SUPPORT_HOURS):
            unit_of_work.set_state(UnitOfWorkEntry.STATE_REQUESTED)
            unit_of_work.set_number_of_retries(unit_of_work.get_number_of_retries() + 1)
            unit_of_work_helper.update(self.logger, unit_of_work)
            self.publishers.get_publisher(process_name).publish(str(document["_id"]))
            self.logger.info(
                "UOW marked for re-processing: process %s; id %s; attempt %d"
                % (process_name, str(document["_id"]), unit_of_work.get_number_of_retries())
            )
            self.performance_ticker.increment()
        else:
            unit_of_work.set_state(UnitOfWorkEntry.STATE_CANCELED)
            unit_of_work_helper.update(self.logger, unit_of_work)
            self.logger.info(
                "UOW transfered to STATE_CANCELED: process %s; id %s; attempt %d"
                % (process_name, str(document["_id"]), unit_of_work.get_number_of_retries())
            )