def test_wait_while():
    """wait_while(BOOT) must return once the worker moves the state on to STEADY."""
    shared_state = ProcessingState()
    runner = Worker(shared_state)
    runner.start()
    shared_state.wait_while(State.BOOT)
    assert shared_state.has_value(State.STEADY)
    runner.join()
def test_wait_until_has_timeout():
    """wait_until accepts a timeout and still observes the STEADY transition."""
    shared_state = ProcessingState()
    runner = Worker(shared_state)
    runner.start()
    shared_state.wait_until(State.STEADY, 4)
    assert shared_state.has_value(State.STEADY)
    runner.join()
def test_wait_until_list():
    """wait_until accepts a collection of target states, not just a single one."""
    shared_state = ProcessingState()
    runner = Worker(shared_state)
    runner.start()
    targets = (State.STEADY, State.IDLE)
    shared_state.wait_until(targets)
    assert shared_state.has_value(State.STEADY)
    runner.join()
def test_shared_state_construction():
    """Default-constructed state starts in BOOT; an explicit initial state is honored."""
    default_state = ProcessingState()
    assert default_state.has_value(State.BOOT)
    explicit_state = ProcessingState(State.STEADY)
    assert explicit_state.has_value(State.STEADY)
class TaskManager:
    """
    Task Manager

    Runs one channel: sources feed a shared data block, transforms derive
    products from it, logic engines evaluate facts/actions, and publishers
    act on the results.  Coordination happens through a shared
    ProcessingState plus per-worker threading events.
    """

    def __init__(self, name, generation_id, channel_dict, global_config):
        """
        :type name: :obj:`str`
        :arg name: Name of channel corresponding to this task manager
        :type generation_id: :obj:`int`
        :arg generation_id: Task Manager generation id provided by caller
        :type channel_dict: :obj:`dict`
        :arg channel_dict: channel configuration
        :type global_config: :obj:`dict`
        :arg global_config: global configuration
        """
        self.id = str(uuid.uuid4()).upper()
        self.dataspace = dataspace.DataSpace(global_config)
        self.data_block_t0 = datablock.DataBlock(
            self.dataspace,
            name,
            self.id,
            generation_id)  # my current data block
        self.name = name
        self.channel = Channel(channel_dict)
        self.state = ProcessingState()
        # Shared integer log level; access is guarded via loglevel.get_lock().
        self.loglevel = multiprocessing.Value('i', logging.WARNING)
        # Serializes duplication of data_block_t0 (see do_backup()).
        self.lock = threading.Lock()
        # The rest of this function will go away once the source-proxy
        # has been reimplemented.
        for src_worker in self.channel.sources.values():
            src_worker.worker.post_create(global_config)

    def wait_for_all(self, events_done):
        """
        Wait for all sources or transforms to finish

        Polls once per second so a stop request can interrupt the wait.
        All events are cleared before returning.

        :type events_done: :obj:`list`
        :arg events_done: list of events to wait for
        """
        logging.getLogger().info('Waiting for all tasks to run')
        try:
            while not all([e.is_set() for e in events_done]):
                time.sleep(1)
                if self.state.should_stop():
                    break
            for e in events_done:
                e.clear()
        except Exception:  # pragma: no cover
            logging.getLogger().exception("Unexpected error!")
            raise

    def wait_for_any(self, events_done):
        """
        Wait for any sources to finish

        Polls once per second; only the events that fired are cleared.

        :type events_done: :obj:`list`
        :arg events_done: list of events to wait for
        """
        try:
            while not any([e.is_set() for e in events_done]):
                time.sleep(1)
                if self.state.should_stop():
                    break
            for e in events_done:
                if e.is_set():
                    e.clear()
        except Exception:  # pragma: no cover
            logging.getLogger().exception("Unexpected error!")
            raise

    def run(self):
        """
        Task Manager main loop

        Boot phase: start all sources and wait until each has produced once.
        Steady phase: run decision cycles, waking whenever any source
        updates, until asked to stop or an error takes the manager offline.
        """
        logging.getLogger().setLevel(self.loglevel.value)
        logging.getLogger().info(f'Starting Task Manager {self.id}')
        done_events, source_threads = self.start_sources(self.data_block_t0)
        # This is a boot phase
        # Wait until all sources run at least one time
        self.wait_for_all(done_events)
        logging.getLogger().info('All sources finished')
        # A source failure during boot moves the state off BOOT (via
        # take_offline in run_source), so this detects a failed boot.
        if not self.state.has_value(State.BOOT):
            for thread in source_threads:
                thread.join()
            # NOTE(review): "occured" is a typo in this runtime log message;
            # left byte-identical here on purpose.
            logging.getLogger().error(
                f'Error occured during initial run of sources. Task Manager {self.name} exits'
            )
            return
        while not self.state.should_stop():
            try:
                self.decision_cycle()
                if not self.state.should_stop():
                    # the last decision_cycle completed without error
                    self.state.set(State.STEADY)
                    self.wait_for_any(done_events)
            except Exception:  # pragma: no cover
                logging.getLogger().exception(
                    "Exception in the task manager main loop")
                logging.getLogger().error(
                    'Error occured. Task Manager %s exits with state %s',
                    self.id, self.get_state_name())
                break
        else:
            # we did not use 'break' to exit the loop
            logging.getLogger().info(
                f'Task Manager {self.id} received stop signal and exits')
        # Signal every worker thread to wind down before joining sources.
        for source in self.channel.sources.values():
            source.stop_running.set()
        for transform in self.channel.transforms.values():
            transform.stop_running.set()
            # the run_transform function is performing our .join()
            # for these threads.
        for thread in source_threads:
            thread.join()

    def get_state_value(self):
        # Raw enum value of the current state.
        return self.state.get().value

    def get_state(self):
        # Current State enum member.
        return self.state.get()

    def get_state_name(self):
        # Name string of the current State enum member.
        return self.get_state().name

    def get_produces(self):
        # FIXME: What happens if a transform and source have the same name?
        produces = {}
        for name, mod in self.channel.sources.items():
            produces[name] = list(mod.worker._produces.keys())
        for name, mod in self.channel.transforms.items():
            produces[name] = list(mod.worker._produces.keys())
        return produces

    def get_consumes(self):
        # FIXME: What happens if a transform and publisher have the same name?
        consumes = {}
        for name, mod in self.channel.transforms.items():
            consumes[name] = list(mod.worker._consumes.keys())
        for name, mod in self.channel.publishers.items():
            consumes[name] = list(mod.worker._consumes.keys())
        return consumes

    def set_loglevel_value(self, log_level):
        """Assumes log_level is a string corresponding to the supported
        logging-module levels."""
        with self.loglevel.get_lock():
            # Convert from string to int form using technique
            # suggested by logging module
            self.loglevel.value = getattr(logging, log_level)

    def get_loglevel(self):
        # Read the shared log level under its own lock.
        with self.loglevel.get_lock():
            return self.loglevel.value

    def set_to_shutdown(self):
        # Announce SHUTTINGDOWN, give every publisher a chance to clean up,
        # then mark the manager fully SHUTDOWN.
        self.state.set(State.SHUTTINGDOWN)
        logging.getLogger("decision_engine").debug(
            'Shutting down. Will call '
            'shutdown on all publishers')
        for publisher_worker in self.channel.publishers.values():
            publisher_worker.worker.shutdown()
        self.state.set(State.SHUTDOWN)

    def take_offline(self, current_data_block):
        """
        offline and stop task manager

        :arg current_data_block: data block in play when the error occurred;
            currently unused (see note below).
        """
        self.state.set(State.OFFLINE)
        # invalidate data block
        # not implemented yet

    def data_block_put(self, data, header, data_block):
        """
        Put data into data block

        :type data: :obj:`dict`
        :arg data: key, value pairs
        :type header: :obj:`~datablock.Header`
        :arg header: data header
        :type data_block: :obj:`~datablock.DataBlock`
        :arg data_block: data block
        """
        if not isinstance(data, dict):
            logging.getLogger().error(
                f'data_block put expecting {dict} type, got {type(data)}')
            return
        logging.getLogger().debug(f'data_block_put {data}')
        # All products from one call share a single END_CYCLE metadata record.
        with data_block.lock:
            metadata = datablock.Metadata(
                data_block.taskmanager_id,
                state='END_CYCLE',
                generation_id=data_block.generation_id)
            for key, product in data.items():
                data_block.put(key, product, header, metadata=metadata)

    def do_backup(self):
        """
        Duplicate current data block and return its copy

        :rtype: :obj:`~datablock.DataBlock`
        """
        with self.lock:
            data_block = self.data_block_t0.duplicate()
            logging.getLogger().debug(f'Duplicated block {data_block}')
        return data_block

    def decision_cycle(self):
        """
        Decision cycle to be run periodically (by trigger)

        Works on a snapshot of the t0 data block: transforms, then logic
        engines, then publishers.  Failures in the latter two stages take
        the manager offline.
        """
        data_block_t1 = self.do_backup()
        try:
            self.run_transforms(data_block_t1)
        except Exception:  # pragma: no cover
            logging.getLogger().exception(
                "error in decision cycle(transforms) ")
            # We do not call 'take_offline' here because it has
            # already been called in the run_transform code on
            # operating on a separate thread.
        actions_facts = []
        try:
            actions_facts = self.run_logic_engine(data_block_t1)
            logging.getLogger().info('ran all logic engines')
        except Exception:  # pragma: no cover
            logging.getLogger().exception(
                "error in decision cycle(logic engine) ")
            self.take_offline(data_block_t1)
        for a_f in actions_facts:
            try:
                self.run_publishers(
                    a_f['actions'], a_f['newfacts'], data_block_t1)
            except Exception:  # pragma: no cover
                logging.getLogger().exception(
                    "error in decision cycle(publishers) ")
                self.take_offline(data_block_t1)

    def run_source(self, src):
        """
        Get the data from source and put it into the data block

        Runs in its own thread; loops on src.schedule (seconds between
        acquires) or runs exactly once when schedule <= 0.

        :type src: :obj:`~Worker`
        :arg src: source Worker
        """
        # If task manager is in offline state, do not keep executing sources.
        while not self.state.should_stop():
            try:
                logging.getLogger().info(f'Src {src.name} calling acquire')
                data = src.worker.acquire()
                Module.verify_products(src.worker, data)
                # NOTE(review): "retuned" is a typo ("returned") in these two
                # runtime log messages; left byte-identical on purpose.
                logging.getLogger().info(f'Src {src.name} acquire retuned')
                logging.getLogger().info(f'Src {src.name} filling header')
                if data:
                    t = time.time()
                    header = datablock.Header(
                        self.data_block_t0.taskmanager_id,
                        create_time=t, creator=src.module)
                    logging.getLogger().info(f'Src {src.name} header done')
                    self.data_block_put(data, header, self.data_block_t0)
                    logging.getLogger().info(
                        f'Src {src.name} data block put done')
                else:
                    logging.getLogger().warning(
                        f'Src {src.name} acquire retuned no data')
                src.run_counter += 1
                # Signals wait_for_all/wait_for_any that this source produced.
                src.data_updated.set()
                logging.getLogger().info(
                    f'Src {src.name} {src.module} finished cycle')
            except Exception:
                logging.getLogger().exception(
                    f'Exception running source {src.name} ')
                self.take_offline(self.data_block_t0)
            if src.schedule > 0:
                # Event.wait returns True if stop_running fired within the
                # schedule window; otherwise we loop for the next acquire.
                s = src.stop_running.wait(src.schedule)
                if s:
                    logging.getLogger().info(
                        f'received stop_running signal for {src.name}')
                    break
            else:
                logging.getLogger().info(f'source {src.name} runs only once')
                break
        logging.getLogger().info(f'stopped {src.name}')

    def start_sources(self, data_block=None):
        """
        Start sources, each in a separate thread

        :type data_block: :obj:`~datablock.DataBlock`
        :arg data_block: data block
        :returns: tuple of (list of data_updated events, list of threads)
        """
        event_list = []
        source_threads = []
        for key, source in self.channel.sources.items():
            logging.getLogger().info(f'starting loop for {key}')
            event_list.append(source.data_updated)
            thread = threading.Thread(target=self.run_source,
                                      name=source.name,
                                      args=(source, ))
            source_threads.append(thread)
            # Cannot catch exception from function called in separate thread
            thread.start()
        return (event_list, source_threads)

    def run_transforms(self, data_block=None):
        """
        Run transforms. So far in main process.

        Spawns one thread per transform, waits for all data_updated events,
        then joins the threads.

        :type data_block: :obj:`~datablock.DataBlock`
        :arg data_block: data block
        """
        logging.getLogger().info('run_transforms')
        logging.getLogger().debug(f'run_transforms: data block {data_block}')
        if not data_block:
            return
        event_list = []
        threads = []
        for key, transform in self.channel.transforms.items():
            logging.getLogger().info(f'starting transform {key}')
            event_list.append(transform.data_updated)
            thread = threading.Thread(target=self.run_transform,
                                      name=transform.name,
                                      args=(transform, data_block))
            threads.append(thread)
            # Cannot catch exception from function called in separate thread
            thread.start()
        self.wait_for_all(event_list)
        for thread in threads:
            thread.join()
        logging.getLogger().info('all transforms finished')

    def run_transform(self, transform, data_block):
        """
        Run a transform

        Waits (polling once per second, up to data_TO seconds) until every
        key the transform consumes is present in the data block, then runs
        it once and publishes its products back into the block.

        :type transform: :obj:`~Worker`
        :arg transform: source Worker
        :type data_block: :obj:`~datablock.DataBlock`
        :arg data_block: data block
        """
        data_to = self.channel.task_manager.get('data_TO', _TRANSFORMS_TO)
        consume_keys = list(transform.worker._consumes.keys())
        logging.getLogger().info(
            'transform: %s expected keys: %s provided keys: %s',
            transform.name, consume_keys, list(data_block.keys()))
        loop_counter = 0
        while not self.state.should_stop():
            # Check if data is ready
            if set(consume_keys) <= set(data_block.keys()):
                # data is ready - may run transform()
                logging.getLogger().info('run transform %s', transform.name)
                try:
                    with data_block.lock:
                        data = transform.worker.transform(data_block)
                    logging.getLogger().debug(f'transform returned {data}')
                    t = time.time()
                    header = datablock.Header(data_block.taskmanager_id,
                                              create_time=t,
                                              creator=transform.name)
                    self.data_block_put(data, header, data_block)
                    logging.getLogger().info('transform put data')
                except Exception:  # pragma: no cover
                    logging.getLogger().exception(
                        f'exception from transform {transform.name} ')
                    self.take_offline(data_block)
                break
            s = transform.stop_running.wait(1)
            if s:
                logging.getLogger().info(
                    f'received stop_running signal for {transform.name}')
                break
            loop_counter += 1
            if loop_counter == data_to:
                # NOTE(review): the two adjacent f-strings concatenate with
                # no space ("...data" + "in ..."); runtime string kept as-is.
                logging.getLogger().info(
                    f'transform {transform.name} did not get consumes data'
                    f'in {data_to} seconds. Exiting')
                break
        # Always signal completion so run_transforms' wait_for_all can return.
        transform.data_updated.set()

    def run_logic_engine(self, data_block=None):
        """
        Run Logic Engine.

        Evaluates every configured logic engine against the data block,
        stores the combined new facts under 'de_logicengine_facts', and
        returns the per-engine result list.

        :type data_block: :obj:`~datablock.DataBlock`
        :arg data_block: data block
        """
        le_list = []
        if not data_block:
            return
        try:
            for le in self.channel.le_s:
                logging.getLogger().info('run logic engine %s',
                                         self.channel.le_s[le].name)
                logging.getLogger().debug('run logic engine %s %s',
                                          self.channel.le_s[le].name,
                                          data_block)
                rc = self.channel.le_s[le].worker.evaluate(data_block)
                le_list.append(rc)
                logging.getLogger().info('run logic engine %s done',
                                         self.channel.le_s[le].name)
                logging.getLogger().info(
                    'logic engine %s generated newfacts: %s',
                    self.channel.le_s[le].name,
                    rc['newfacts'].to_dict(orient='records'))
                logging.getLogger().info(
                    'logic engine %s generated actions: %s',
                    self.channel.le_s[le].name, rc['actions'])
            # Add new facts to the datablock
            # Add empty dataframe if nothing is available
            if le_list:
                all_facts = pandas.concat([i['newfacts'] for i in le_list],
                                          ignore_index=True)
            else:
                logging.getLogger().info(
                    'Logic engine(s) did not return any new facts')
                all_facts = pandas.DataFrame()
            data = {'de_logicengine_facts': all_facts}
            t = time.time()
            header = datablock.Header(data_block.taskmanager_id,
                                      create_time=t,
                                      creator='logicengine')
            self.data_block_put(data, header, data_block)
        except Exception:  # pragma: no cover
            logging.getLogger().exception("Unexpected error!")
            raise
        else:
            return le_list

    def run_publishers(self, actions, facts, data_block=None):
        """
        Run Publishers in main process.

        :arg actions: mapping whose values are lists of publisher names
        :arg facts: new facts produced by the logic engines (unused here)
        :type data_block: :obj:`~datablock.DataBlock`
        :arg data_block: data block
        """
        if not data_block:
            return
        try:
            log = logging.getLogger()
            for action_list in actions.values():
                for action in action_list:
                    publisher = self.channel.publishers[action]
                    name = publisher.name
                    log.info(f'run publisher {name}')
                    log.debug(f'run publisher {name} {data_block}')
                    try:
                        publisher.worker.publish(data_block)
                    except KeyError as e:
                        # A missing product during shutdown is expected;
                        # only swallow the KeyError if we are stopping.
                        if self.state.should_stop():
                            log.warning(
                                f"TaskManager stopping, ignore exception {name} publish() call: {e}"
                            )
                            continue
                        else:
                            raise
        except Exception:  # pragma: no cover
            logging.getLogger().exception("Unexpected error!")
            raise
class Reaper:
    """
    Reaper provides functionality of periodic deletion
    of data older than retention_interval in days

    The class attributes indicate a rational set of defaults
    that shouldn't be altered by user configuration.
    """

    # Floors enforced by the property setters below.
    MIN_RETENTION_INTERVAL_DAYS = 7
    MIN_SECONDS_BETWEEN_RUNS = 2 * 60 * 59

    def __init__(self, config):
        """
        :type config: :obj:`dict`
        :arg config: Configuration dictionary

        :raises dataspace.DataSpaceConfigurationError: when the
            'dataspace' section is missing, not a dict, or incomplete.
        :raises ValueError: when the configured intervals violate the
            class minimums (via the property setters).
        """
        # Validate configuration
        self.logger = structlog.getLogger(LOGGERNAME)
        self.logger = self.logger.bind(module=__name__.split(".")[-1],
                                       channel=DELOGGER_CHANNEL_NAME)
        self.logger.debug("Initializing a reaper")
        # since we must validate this, have a private store space
        self.__retention_interval = self.MIN_RETENTION_INTERVAL_DAYS
        self.__seconds_between_runs = self.MIN_SECONDS_BETWEEN_RUNS
        if not config.get("dataspace"):
            self.logger.exception("Error in initializing Reaper!")
            raise dataspace.DataSpaceConfigurationError(
                "Invalid dataspace configuration: "
                "dataspace key not found in dictionary")
        elif not isinstance(config.get("dataspace"), dict):
            self.logger.exception("Error in initializing Reaper!")
            raise dataspace.DataSpaceConfigurationError(
                "Invalid dataspace configuration: "
                "dataspace key must correspond to a dictionary")
        try:
            db_driver_name = config["dataspace"]["datasource"]["name"]
            db_driver_module = config["dataspace"]["datasource"]["module"]
            db_driver_config = config["dataspace"]["datasource"]["config"]
            # These two assignments go through the validating setters.
            self.retention_interval = config["dataspace"][
                "retention_interval_in_days"]
            self.seconds_between_runs = config["dataspace"].get(
                "reaper_run_interval", 24 * 60 * 60)
        except KeyError:
            self.logger.exception("Error in initializing Reaper!")
            raise dataspace.DataSpaceConfigurationError(
                "Invalid dataspace configuration")
        self.datasource = dataspace.DataSourceLoader().create_datasource(
            db_driver_module, db_driver_name, db_driver_config)
        self.thread = None
        self.state = ProcessingState()

    @property
    def retention_interval(self):
        """We have data constraints, so use a property to track"""
        return self.__retention_interval

    @retention_interval.setter
    def retention_interval(self, value):
        if int(value) < self.MIN_RETENTION_INTERVAL_DAYS:
            self.logger.exception("Error in initializing Reaper!")
            raise ValueError(
                f"For safety the data retention interval has to be greater than {self.MIN_RETENTION_INTERVAL_DAYS} days"
            )
        self.logger.debug(f"Reaper setting retention_interval to {value}.")
        self.__retention_interval = int(value)

    @property
    def seconds_between_runs(self):
        """We have data constraints, so use a property to track"""
        return self.__seconds_between_runs

    @seconds_between_runs.setter
    def seconds_between_runs(self, value):
        if int(value) < self.MIN_SECONDS_BETWEEN_RUNS:
            self.logger.exception("Error in initializing Reaper!")
            raise ValueError(
                f"For performance the time between runs to be greater than {self.MIN_SECONDS_BETWEEN_RUNS} seconds"
            )
        self.logger.debug(f"Reaper setting seconds_between_runs to {value}.")
        self.__seconds_between_runs = int(value)

    def reap(self):
        """
        Actually spawn the query to delete the old records.
        Lock the state as this task doesn't have a cancel option.
        """
        with self.state.lock:
            if self.state.should_stop():
                return
            self.logger.info("Reaper.reap() started.")
            self.state.set(State.ACTIVE)
            self.datasource.delete_data_older_than(self.retention_interval)
            self.state.set(State.STEADY)
            self.logger.info("Reaper.reap() completed.")

    def _reaper_loop(self, delay):
        """
        The thread actually runs this.

        Waits `delay` seconds before the first reap, then
        `seconds_between_runs` between subsequent reaps, exiting as soon
        as the shared state enters a stopping condition.
        """
        self.logger.debug("Reaper thread started.")
        between_runs = delay
        while not self.state.should_stop():
            try:
                with self.state.lock:
                    if not self.state.should_stop():
                        # The start function will block until the state is changed from BOOT
                        # If we are signaled to stop, don't override that state
                        self.state.set(State.IDLE)
                self.logger.debug(
                    f"Reaper waiting {between_runs} seconds or for a stop.")
                self.state.wait_until(STOPPING_CONDITIONS,
                                      timeout=between_runs)
                # in stop state, end wait and get out of the loop ASAP
                if self.state.should_stop():
                    # NOTE(review): "recieved" is a typo in this runtime log
                    # message; left byte-identical on purpose.
                    self.logger.debug(
                        f"Reaper recieved a stop event: {self.state.get()}.")
                else:
                    self.reap()
                # after waiting the initial delay time
                # use the self.seconds_between_runs as the delay
                if between_runs != self.seconds_between_runs:
                    between_runs = self.seconds_between_runs
            except Exception:  # pragma: no cover
                self.state.set(State.ERROR)
                self.logger.exception(
                    f"Reaper.reap() failed: {self.state.get()}.")
                break
        else:
            # we did not use 'break' to exit the loop
            self.state.set(State.SHUTDOWN)
            self.logger.info("Reaper shutdown cleanly.")

    def start(self, delay=0):
        """
        Start thread with an optional delay to start the thread in X seconds

        :raises RuntimeError: if the reaper thread is already running.
        """
        if self.state.should_stop() and not self.state.inactive():
            self.logger.debug(
                f"Reaper asked to start during stop: {self.state.get()}.")
        if isinstance(self.thread, threading.Thread) and self.thread.is_alive():
            try:
                # threads need to end, else we orphan one and make a new thread
                raise RuntimeError(
                    "Reaper asked to start, but it is running already.")
            except RuntimeError as __e:
                self.logger.exception(__e)
                raise
        try:
            # each invocation must be a new thread
            with self.state.lock:
                self.state.set(State.BOOT)
                self.thread = threading.Thread(group=None,
                                               target=self._reaper_loop,
                                               args=(delay, ),
                                               name="Reaper_loop_thread")
                self.thread.start()
        except Exception:  # pragma: no cover
            self.logger.exception("Reaper loop thread not started")
        # Block until the loop thread leaves BOOT (sets IDLE or stops).
        self.state.wait_while(State.BOOT)

    def stop(self):
        """
        Try to stop the reaper, will block if
        the reaper cannot be interrupted.
        """
        if isinstance(self.thread, threading.Thread):
            if self.thread.is_alive():
                with self.state.lock:
                    if not self.state.should_stop():
                        self.logger.debug(
                            "Sending reaper State.SHUTTINGDOWN signal")
                        self.state.set(State.SHUTTINGDOWN)
                if self.state.has_value(State.SHUTTINGDOWN):
                    self.state.wait_while(State.SHUTTINGDOWN)
                self.logger.info(f"Reaper shutdown : {self.state.get()}.")
            else:
                self.logger.debug(
                    f"Reaper tried to stop but is stopped already: {self.state.get()}."
                )
        else:
            self.logger.debug("Reaper tried to stop but was never started")

    def __repr__(self):  # pragma: no cover
        # BUGFIX: the original wrote {self.retention_interval,} — the
        # trailing comma inside the replacement field built a 1-tuple, so
        # the repr rendered e.g. "(7,) days" instead of "7 days".
        return (
            f"Reaper, retention interval {self.retention_interval} days, "
            f"between runs wait {self.seconds_between_runs} seconds, state {self.state.get()}"
        )
def test_wrong_value_has_value_list():
    """A tuple containing a non-State member must be rejected by has_value."""
    ps = ProcessingState()
    bad_query = (State.STEADY, 'jkl')
    with pytest.raises(RuntimeError):
        ps.has_value(bad_query)
def test_wrong_value_has_value():
    """A plain string is not a State member and must be rejected by has_value."""
    ps = ProcessingState()
    with pytest.raises(RuntimeError):
        ps.has_value('asdf')