def test_dataspace_config_finds_bad():
    with pytest.raises(dataspace.DataSpaceConfigurationError) as e:
        dataspace.DataSpace({})
    assert e.match('missing dataspace information')

    with pytest.raises(dataspace.DataSpaceConfigurationError) as e:
        dataspace.DataSpace({'dataspace': 'asdf'})
    assert e.match('dataspace key must correspond to a dictionary')

    with pytest.raises(dataspace.DataSpaceConfigurationError) as e:
        dataspace.DataSpace({'dataspace': {'asdf': 'asdf'}})
    assert e.match('Invalid dataspace configuration')
def test_dataspace_config_finds_bad():
    with pytest.raises(ds.DataSpaceConfigurationError) as e:
        ds.DataSpace({})
    assert e.match("missing dataspace information")

    with pytest.raises(ds.DataSpaceConfigurationError) as e:
        ds.DataSpace({"dataspace": "asdf"})
    assert e.match("dataspace key must correspond to a dictionary")

    with pytest.raises(ds.DataSpaceConfigurationError) as e:
        ds.DataSpace({"dataspace": {"asdf": "asdf"}})
    assert e.match("Invalid dataspace configuration")
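# For reference, a hedged sketch (not taken verbatim from the sources) of the
# configuration shape the checks above expect: a 'dataspace' dictionary holding
# a 'datasource' entry with 'module', 'name', and 'config' keys.  The module
# and name values mirror the fixtures later in this listing; the sqlite URL is
# an illustrative assumption and may not be supported by every backend.
valid_config_sketch = {
    "dataspace": {
        "datasource": {
            "module": "decisionengine.framework.dataspace.datasources.sqlalchemy_ds",
            "name": "SQLAlchemyDS",
            "config": {"url": "sqlite:///:memory:"},  # assumed URL, for illustration only
        }
    }
}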
def main():
    """
    Call this as a test unit or use as the CLI of this module
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--configtemplate',
                        action='store_true',
                        help='prints the expected module configuration')
    parser.add_argument('--configinfo',
                        action='store_true',
                        help='prints config template along with produces and consumes info')
    args = parser.parse_args()

    if args.configtemplate:
        module_config_template()
    elif args.configinfo:
        module_config_info()
    else:
        config_manager = configmanager.ConfigManager()
        config_manager.load()
        global_config = config_manager.get_global_config()
        print("GLOBAL CONF", global_config)
        ds = dataspace.DataSpace(global_config)
def __init__(self, name, generation_id, channel_dict, global_config):
    """
    :type name: :obj:`str`
    :arg name: Name of channel corresponding to this task manager
    :type generation_id: :obj:`int`
    :arg generation_id: Task Manager generation id provided by caller
    :type channel_dict: :obj:`dict`
    :arg channel_dict: channel configuration
    :type global_config: :obj:`dict`
    :arg global_config: global configuration
    """
    self.id = str(uuid.uuid4()).upper()
    self.dataspace = dataspace.DataSpace(global_config)
    self.data_block_t0 = datablock.DataBlock(self.dataspace,
                                             name,
                                             self.id,
                                             generation_id)  # my current data block
    self.name = name
    self.channel = Channel(channel_dict)
    self.state = ProcessingState()
    self.loglevel = multiprocessing.Value('i', logging.WARNING)
    self.lock = threading.Lock()

    # The rest of this function will go away once the source-proxy
    # has been reimplemented.
    for src_worker in self.channel.sources.values():
        src_worker.worker.post_create(global_config)
def __init__(self, global_config, channel_config_loader, server_address):
    xmlrpc.server.SimpleXMLRPCServer.__init__(self,
                                              server_address,
                                              logRequests=False,
                                              requestHandler=RequestHandler)

    signal.signal(signal.SIGHUP, self.handle_sighup)
    self.channel_config_loader = channel_config_loader
    self.global_config = global_config
    self.dataspace = dataspace.DataSpace(self.global_config)
    self.reaper = Reaper(self.global_config)
    self.startup_complete = Event()
    self.logger = structlog.getLogger(LOGGERNAME)
    self.logger = self.logger.bind(module=__name__.split(".")[-1], channel=DELOGGER_CHANNEL_NAME)
    self.logger.debug(f"DecisionEngine starting on {server_address}")

    exchange_name = self.global_config.get("exchange_name", "hepcloud_topic_exchange")
    self.logger.debug(f"Creating topic exchange {exchange_name}")
    self.exchange = Exchange(exchange_name, "topic")
    self.broker_url = self.global_config.get("broker_url", "redis://localhost:6379/0")
    _verify_redis_server(self.broker_url)

    self.source_workers = SourceWorkers(self.exchange, self.broker_url, self.logger)
    self.channel_workers = ChannelWorkers()

    self.register_function(self.rpc_metrics, name="metrics")
    self.logger.info(f"DecisionEngine __init__ complete {server_address} with {self.broker_url}")
def dataspace(request):
    """
    This parameterized fixture will set up various datasources.

    Add datasource objects to DATASOURCES_TO_TEST once they've got our
    basic schema loaded, and adjust the `if` statements here until we are
    SQLAlchemy only.  Pytest should take it from there and automatically
    run it through all the tests using this fixture.
    """
    conn_fixture = request.getfixturevalue(request.param)

    db_info = {}
    try:
        # SQLAlchemy
        db_info["url"] = conn_fixture["url"]
        db_info["echo"] = True  # put into extra chatty mode for tests
    except TypeError:
        try:
            # psycopg2
            db_info["host"] = conn_fixture.info.host
            db_info["port"] = conn_fixture.info.port
            db_info["user"] = conn_fixture.info.user
            db_info["password"] = conn_fixture.info.password
            db_info["database"] = conn_fixture.info.dbname
        except AttributeError:
            # psycopg2cffi
            for element in conn_fixture.dsn.split():
                (key, value) = element.split("=")
                if value != "''" and value != '""':
                    db_info[key] = value

    config = {}
    config["dataspace"] = {}
    config["dataspace"]["datasource"] = {}
    config["dataspace"]["datasource"]["config"] = db_info

    if request.param == "PG_DE_DB_WITH_SCHEMA":
        config["dataspace"]["datasource"]["module"] = "decisionengine.framework.dataspace.datasources.postgresql"
        config["dataspace"]["datasource"]["name"] = "Postgresql"
    elif "SQLALCHEMY" in request.param:
        config["dataspace"]["datasource"]["module"] = "decisionengine.framework.dataspace.datasources.sqlalchemy_ds"
        config["dataspace"]["datasource"]["name"] = "SQLAlchemyDS"

    my_ds = ds.DataSpace(config)
    load_sample_data_into_datasource(my_ds)

    yield my_ds

    del my_ds
    gc.collect()
def start_channel(self, channel_name, channel_config):
    channel_config = copy.deepcopy(channel_config)
    with START_CHANNEL_HISTOGRAM.labels(channel_name).time():
        # NB: Possibly override channel name
        channel_name = channel_config.get("channel_name", channel_name)

        source_configs = channel_config.pop("sources")
        src_workers = self.source_workers.update(channel_name, source_configs)
        module_workers = validated_workflow(channel_name, src_workers, channel_config, self.logger)
        queue_info = [(worker.queue.name, worker.key) for worker in src_workers.values()]

        self.logger.debug(f"Building TaskManager for {channel_name}")
        task_manager = TaskManager.TaskManager(
            channel_name,
            module_workers,
            dataspace.DataSpace(self.global_config),
            source_products(src_workers),
            self.exchange,
            self.broker_url,
            queue_info,
        )
        self.logger.debug(f"Building Worker for {channel_name}")
        worker = ChannelWorker(task_manager, self.global_config["logger"])
        WORKERS_COUNT.inc()
        with self.channel_workers.access() as workers:
            workers[channel_name] = worker

        # The channel must be started first so it can listen for the messages from the sources.
        self.logger.debug(f"Trying to start {channel_name}")
        worker.start()
        self.logger.info(f"Channel {channel_name} started")

        worker.wait_while(ProcessingState.State.BOOT)

        # Start any sources that are not yet alive.
        for key, src_worker in src_workers.items():
            if src_worker.is_alive():
                continue
            if src_worker.exitcode == 0:  # pragma: no cover
                # This can happen if the source's acquire method runs only once (e.g. when testing)
                # and the first process completes before the next channel can use it.
                raise RuntimeError(f"The {key} source has already completed and cannot be used by channel {channel_name}.")
            src_worker.start()
            self.logger.debug(f"Started process {src_worker.pid} for source {key}")

        worker.wait_while(ProcessingState.State.ACTIVE)
def __init__(self, cfg, server_address, RequestHandlerClass):
    SimpleXMLRPCServer.SimpleXMLRPCServer.__init__(self,
                                                   server_address,
                                                   logRequests=False,
                                                   requestHandler=RequestHandlerClass)
    self.logger = logging.getLogger("decision_engine")
    signal.signal(signal.SIGHUP, self.handle_sighup)
    self.task_managers = {}
    self.config_manager = cfg
    self.dataspace = dataspace.DataSpace(self.config_manager.get_global_config())
def __init__(self, *args, **kwargs):
    if not set(must_have).issubset(set(args[0].keys())):
        raise RuntimeError('SourceProxy misconfigured. Must have %s defined' % (must_have,))
    self.source_channel = args[0]['channel_name']
    self.data_keys = args[0]['Dataproducts']
    self.retries = args[0].get('retries', RETRIES)
    self.retry_to = args[0].get('retry_timeout', RETRY_TO)
    self.logger = de_logger.get_logger()

    config_manager = configmanager.ConfigManager()
    config_manager.load()
    global_config = config_manager.get_global_config()
    self.dataspace = dataspace.DataSpace(global_config)
def __init__(self, global_config, channel_config_loader, server_address):
    xmlrpc.server.SimpleXMLRPCServer.__init__(self,
                                              server_address,
                                              logRequests=False,
                                              requestHandler=RequestHandler)
    self.logger = logging.getLogger("decision_engine")
    signal.signal(signal.SIGHUP, self.handle_sighup)
    self.workers = Workers()
    self.channel_config_loader = channel_config_loader
    self.global_config = global_config
    self.dataspace = dataspace.DataSpace(self.global_config)
    self.reaper = dataspace.Reaper(self.global_config)
    self.logger.info("DecisionEngine started on {}".format(server_address))
def main():
    """
    Call this as a test unit or use as the CLI of this module
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--configtemplate',
                        action='store_true',
                        help='prints the expected module configuration')
    parser.add_argument('--configinfo',
                        action='store_true',
                        help='prints config template along with produces and consumes info')
    args = parser.parse_args()

    if args.configtemplate:
        module_config_template()
    elif args.configinfo:
        module_config_info()
    else:
        config_manager = configmanager.ConfigManager()
        config_manager.load()
        global_config = config_manager.get_global_config()
        print("GLOBAL CONF", global_config)
        ds = dataspace.DataSpace(global_config)
        data_block = datablock.DataBlock(
            ds,
            # '5CC840DD-88B9-45CE-9DA2-FF531289AC66',
            'C56E0AAF-99D3-42A8-88A3-921E30C1879C',
            1)
        fm_info = AWSFOMPublisher({
            "publish_to_graphite": True,
            "graphite_host": "fifemondata.fnal.gov",
            "graphite_port": 2104,
            "graphite_context": "hepcloud.aws",
            "output_file": "%s/de_data/AWS_figure_of_merit.csv" % (os.environ.get('HOME'),)
        })
        rc = fm_info.publish(data_block)
def __init__(self, global_config, channel_config_loader, server_address):
    xmlrpc.server.SimpleXMLRPCServer.__init__(self,
                                              server_address,
                                              logRequests=False,
                                              requestHandler=RequestHandler)

    signal.signal(signal.SIGHUP, self.handle_sighup)
    self.workers = Workers()
    self.channel_config_loader = channel_config_loader
    self.global_config = global_config
    self.dataspace = dataspace.DataSpace(self.global_config)
    self.reaper = Reaper(self.global_config)
    self.startup_complete = Event()
    self.logger = structlog.getLogger(LOGGERNAME)
    self.logger = self.logger.bind(module=__name__.split(".")[-1], channel=DELOGGER_CHANNEL_NAME)
    self.logger.info(f"DecisionEngine started on {server_address}")
def __init__(self, global_config, channel_config_loader, server_address):
    xmlrpc.server.SimpleXMLRPCServer.__init__(self,
                                              server_address,
                                              logRequests=False,
                                              requestHandler=RequestHandler)

    signal.signal(signal.SIGHUP, self.handle_sighup)
    self.source_workers = {}
    self.channel_workers = Workers()
    self.channel_config_loader = channel_config_loader
    self.global_config = global_config
    self.dataspace = dataspace.DataSpace(self.global_config)
    self.reaper = Reaper(self.global_config)
    self.startup_complete = Event()
    self.logger = structlog.getLogger(LOGGERNAME)
    self.logger = self.logger.bind(module=__name__.split(".")[-1], channel=DELOGGER_CHANNEL_NAME)
    self.logger.info(f"DecisionEngine started on {server_address}")

    self.register_function(self.rpc_metrics, name="metrics")

    if not global_config.get("no_webserver"):
        self.start_webserver()

    self.broker_url = self.global_config.get("broker_url", "redis://localhost:6379/0")
    _verify_redis_server(self.broker_url)
def dataspace(request):
    """
    This parameterized fixture will set up various datasources.

    Add datasource objects to DATASOURCES_TO_TEST once they've got our
    basic schema loaded.  Pytest should take it from there and
    automatically run it through all the tests using this fixture.
    """
    conn_fixture = request.getfixturevalue(request.param)

    db_info = {}
    try:
        # SQLAlchemy
        db_info["url"] = conn_fixture["url"]
        db_info["echo"] = True  # put into extra chatty mode for tests
    except TypeError:
        # psycopg2
        db_info["host"] = conn_fixture.info.host
        db_info["port"] = conn_fixture.info.port
        db_info["user"] = conn_fixture.info.user
        db_info["password"] = conn_fixture.info.password
        db_info["database"] = conn_fixture.info.dbname

    config = {}
    config["dataspace"] = {}
    config["dataspace"]["datasource"] = {}
    config["dataspace"]["datasource"]["config"] = db_info
    config["dataspace"]["datasource"]["module"] = "decisionengine.framework.dataspace.datasources.sqlalchemy_ds"
    config["dataspace"]["datasource"]["name"] = "SQLAlchemyDS"

    my_ds = ds.DataSpace(config)
    load_sample_data_into_datasource(my_ds)

    yield my_ds

    del my_ds
    gc.collect()
def __init__(self, name, task_manager_id, generation_id, channel_dict, global_config):
    """
    :type name: :obj:`str`
    :arg name: Name of channel corresponding to this task manager
    :type task_manager_id: :obj:`int`
    :arg task_manager_id: Task Manager id provided by caller
    :type generation_id: :obj:`int`
    :arg generation_id: Task Manager generation id provided by caller
    :type channel_dict: :obj:`dict`
    :arg channel_dict: channel configuration
    :type global_config: :obj:`dict`
    :arg global_config: global configuration
    """
    self.dataspace = dataspace.DataSpace(global_config)
    self.data_block_t0 = datablock.DataBlock(self.dataspace,
                                             name,
                                             task_manager_id,
                                             generation_id)  # my current data block
    self.name = name
    self.id = task_manager_id
    self.channel = Channel(channel_dict)
    self.state = multiprocessing.Value('i', BOOT)
    self.decision_cycle_active = False
    self.lock = threading.Lock()
    self.logger = de_logger.get_logger()
    self.stop = False  # stop running all loops when this is True
def dspace(datasource):
    global_config = {
        'dataspace': {
            'reaper_start_delay_seconds': 1818,
            'retention_interval_in_days': 365,
            'datasource': {
                'module': 'decisionengine.framework.dataspace.datasources.postgresql',
                'name': 'Postgresql',
                'config': {
                    'user': datasource.info.dsn_parameters["user"],
                    'blocking': True,
                    'host': datasource.info.dsn_parameters["host"],
                    'port': datasource.info.dsn_parameters["port"],
                    'database': datasource.info.dsn_parameters["dbname"],
                    'maxconnections': 100,
                    'maxcached': 10,
                },
            },
        }
    }
    return dataspace.DataSpace(global_config)
def main():
    """
    Call this as a test unit or use as the CLI of this module
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--configtemplate',
                        action='store_true',
                        help='prints the expected module configuration')
    parser.add_argument('--configinfo',
                        action='store_true',
                        help='prints config template along with produces and consumes info')
    args = parser.parse_args()

    if args.configtemplate:
        module_config_template()
    elif args.configinfo:
        module_config_info()
    else:
        config_manager = configmanager.ConfigManager()
        config_manager.load()
        global_config = config_manager.get_global_config()
        print("GLOBAL CONF", global_config)
        ds = dataspace.DataSpace(global_config)
        # data_block = datablock.DataBlock(ds,
        #                                  '6D596F43-B4DB-4418-812A-79869001E72B',
        #                                  1)
        data_block = datablock.DataBlock(ds,
                                         "AWS_Calculations_with_source_proxy",
                                         "F70B4110-E66D-49CA-9333-4A983A679F37",
                                         1,
                                         109)
        fm_info = FigureOfMerit()
        rc = fm_info.transform(data_block)
        print("INFO")
        print(rc)
my_logger = logging.getLogger('decision_engine')
my_logger.info('Starting decision engine')

if len(sys.argv) > 1:
    channel_name = sys.argv[1]
    channel_conf = os.path.join(config_manager.channel_config_dir, channel_name)
    with open(os.path.abspath(channel_conf), 'r') as f:
        channels = {}
        channel_name = channel_name.split('.')[0]
        code = 'channels[channel_name]=' + ''.join(f.readlines())
        exec(code)
else:
    channels = config_manager.get_channels()

ds = dataspace.DataSpace(global_config)
taskmanager_id = str(uuid.uuid4()).upper()
generation_id = 1

task_managers = {}
data_space = {}

# create channels
for ch in channels:
    task_managers[ch] = TaskManager(ch, taskmanager_id, generation_id, channels[ch], global_config)

for key, value in task_managers.items():
    p = multiprocessing.Process(target=value.run, args=(),
def post_create(self, global_config):
    self.dataspace = dataspace.DataSpace(global_config)
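# Usage note: as seen in the TaskManager constructors elsewhere in this
# listing, post_create is invoked on each source worker once the global
# configuration is available, so the module can build its DataSpace handle:
#
#     for src_worker in self.channel.sources.values():
#         src_worker.worker.post_create(global_config)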
def __init__(self, name, channel_dict, global_config, de_source_workers=None):
    """
    :type name: :obj:`str`
    :arg name: Name of channel corresponding to this task manager
    :type channel_dict: :obj:`dict`
    :arg channel_dict: channel configuration
    :type global_config: :obj:`dict`
    :arg global_config: global configuration
    """
    super().__init__(channel_dict.get("channel_name", name))
    self.id = str(uuid.uuid4()).upper()
    self.dataspace = dataspace.DataSpace(global_config)
    self.data_block_t0 = datablock.DataBlock(self.dataspace, name, self.id, 1)  # my current data block
    self.logger = structlog.getLogger(CHANNELLOGGERNAME)
    self.logger = self.logger.bind(module=__name__.split(".")[-1], channel=self.name)

    self.broker_url = global_config.get("broker_url", "redis://localhost:6379/0")
    self.logger.debug(f"Using data-broker URL: {self.broker_url}")

    self.logger.debug("Creating channel sources")
    self.source_workers = _make_workers_for(channel_dict["sources"], Source, self.name)
    if de_source_workers is not None:
        # Decision engine owns the sources
        de_source_workers[self.name] = self.source_workers

    self.logger.debug("Creating channel publishers")
    self.publisher_workers = _make_workers_for(channel_dict["publishers"], Publisher, self.name)

    self.logger.debug("Creating channel logic engines")
    configured_le_s = channel_dict.get("logicengines")
    if configured_le_s is None:
        self.logger.debug(
            "No 'logicengines' configuration detected; will use default configuration, "
            "which unconditionally executes all configured publishers."
        )
        configured_le_s = passthrough_configuration(channel_dict["publishers"].keys())
    if len(configured_le_s) > 1:
        raise RuntimeError("Cannot support more than one logic engine per channel.")

    self.logic_engine = None
    if configured_le_s:
        key, config = configured_le_s.popitem()
        self.logic_engine = Worker(key, config, LogicEngine, self.name)

    self.logger.debug("Creating channel transforms")
    transform_workers = _make_workers_for(channel_dict["transforms"], Transform, self.name)
    self.transform_workers = ensure_no_circularities(self.source_workers,
                                                     transform_workers,
                                                     self.publisher_workers)

    exchange_name = global_config.get("exchange_name", "hepcloud_topic_exchange")
    self.logger.debug(f"Creating topic exchange {exchange_name} for channel {self.name}")
    self.exchange = Exchange(exchange_name, "topic")
    self.connection = Connection(self.broker_url)

    expected_source_products = set()
    queues = {}
    for worker in self.source_workers.values():
        # FIXME: Just keeping track of instance names will not
        #        work whenever we have multiple source instances
        #        of the same source type.
        expected_source_products.update(worker.module_instance._produces.keys())
        self.logger.debug(f"Creating queue {worker.full_key} with routing key {worker.full_key}")
        queues[worker.full_key] = Queue(
            worker.full_key,
            exchange=self.exchange,
            routing_key=worker.full_key,
            auto_delete=True,
        )
    self.expected_source_products = expected_source_products
    self.queues = queues

    # Caching to determine if all sources have run at least once.
    self.sources_have_run_once = False
    self.source_product_cache = {}

    # The rest of this function will go away once the source-proxy
    # has been reimplemented.
    for src_worker in self.source_workers.values():
        src_worker.module_instance.post_create(global_config)