Ejemplo n.º 1
0
    def __init__(self, name, config):
        """
        Configure the data-service from the DAS configuration.

        :param name: data-service name; it is used in the logger title,
            in thread names and to match entries of 'thread_weights'.
        :param config: DAS configuration mapping. The keys 'verbose',
            'dasmapping', 'dasanalytics', 'das' and 'mongodb' (with
            'dburi') are read unconditionally, 'write_cache', 'dbs' and
            'engine' are optional, 'rawcache' must be present and non-empty.
        :raises Exception: when the configuration cannot be parsed or
            when 'rawcache' is undefined.
        """
        self.name = name
        try:
            self.verbose      = config['verbose']
            # NOTE(review): the title reads 'DASAbstact...' (sic); it is
            # emitted at run time and therefore left untouched
            title             = 'DASAbstactService_%s' % self.name
            self.logger       = PrintManager(title, self.verbose)
            self.dasmapping   = config['dasmapping']
            self.analytics    = config['dasanalytics']
            self.write2cache  = config.get('write_cache', True)
            self.multitask    = config['das'].get('multitask', True)
            self.error_expire = config['das'].get('error_expire', 300)
            # membership test instead of dict.has_key, the latter does not
            # exist in Python 3
            if  'dbs' in config:
                self.dbs_global = config['dbs'].get('dbs_global_instance', None)
            else:
                self.dbs_global = None
            dburi             = config['mongodb']['dburi']
            engine            = config.get('engine', None)
            self.gfs          = db_gridfs(dburi)
        except Exception as exc:
            print_exc(exc)
            raise Exception('fail to parse DAS config')

        # read key/cert info, a failure here is not fatal
        try:
            self.ckey, self.cert = get_key_cert()
        except Exception as exc:
            print_exc(exc)
            self.ckey = None
            self.cert = None

        if  self.multitask:
            nworkers = config['das'].get('api_workers', 3)
            thr_weights = config['das'].get('thread_weights', [])
            # every weight is a 'system:weight' string, the number of
            # workers is multiplied by the weight given for this service
            for system_weight in thr_weights:
                system, weight = system_weight.split(':')
                if  system == self.name:
                    nworkers *= int(weight)
            if  engine:
                thr_name = 'DASAbstractService:%s:PluginTaskManager' % self.name
                self.taskmgr = PluginTaskManager(
                        engine, nworkers=nworkers, name=thr_name)
                self.taskmgr.subscribe()
            else:
                thr_name = 'DASAbstractService:%s:TaskManager' % self.name
                self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        self.map        = {}   # to be defined by data-service implementation
        self._keys      = None # to be defined at run-time in self.keys
        self._params    = None # to be defined at run-time in self.parameters
        self._notations = {}   # to be defined at run-time in self.notations

        self.logger.info('initialized')
        # define internal cache manager to put 'raw' results into cache
        if  'rawcache' in config and config['rawcache']:
            self.localcache   = config['rawcache']
        else:
            msg = 'Undefined rawcache, please check your configuration'
            raise Exception(msg)
Ejemplo n.º 2
0
    def __init__(self, name, config):
        """
        Configure the data-service from the DAS configuration.

        :param name: data-service name; it is used in the logger title,
            in the thread name and to match entries of 'thread_weights'.
        :param config: DAS configuration mapping. The keys 'verbose',
            'dasmapping', 'das' and 'mongodb' (with 'dburi') are read
            unconditionally, 'write_cache' is optional, 'rawcache' must
            be present and non-empty.
        :raises Exception: when the configuration cannot be parsed or
            when 'rawcache' is undefined.
        """
        self.name = name
        try:
            self.verbose = config['verbose']
            title = 'DASAbstactService_%s' % self.name
            self.logger = PrintManager(title, self.verbose)
            self.dasmapping = config['dasmapping']
            self.write2cache = config.get('write_cache', True)
            self.multitask = config['das'].get('multitask', True)
            self.error_expire = config['das'].get('error_expire', 300)
            self.dbs_global = None  # to be configured at run time
            self.dburi = config['mongodb']['dburi']
            self.gfs = db_gridfs(self.dburi)
        except Exception as exc:
            print_exc(exc)
            raise Exception('fail to parse DAS config')

        # read key/cert info, a failure here is not fatal
        try:
            self.ckey, self.cert = get_key_cert()
        except Exception as exc:
            print_exc(exc)
            self.ckey = None
            self.cert = None

        if self.multitask:
            nworkers = config['das'].get('api_workers', 3)
            thr_weights = config['das'].get('thread_weights', [])
            # every weight is a 'system:weight' string, the number of
            # workers is multiplied by the weight given for this service
            for system_weight in thr_weights:
                system, weight = system_weight.split(':')
                if system == self.name:
                    nworkers *= int(weight)
            # only the plain TaskManager is used here, 'engine' from the
            # configuration is not consulted
            thr_name = 'DASAbstractService:%s:TaskManager' % self.name
            self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        self.map = {}  # to be defined by data-service implementation
        self._keys = None  # to be defined at run-time in self.keys
        self._params = None  # to be defined at run-time in self.parameters
        self._notations = {}  # to be defined at run-time in self.notations

        self.logger.info('initialized')
        # define internal cache manager to put 'raw' results into cache
        if 'rawcache' in config and config['rawcache']:
            self.localcache = config['rawcache']
        else:
            msg = 'Undefined rawcache, please check your configuration'
            raise Exception(msg)
Ejemplo n.º 3
0
 def test_task_manager(self):
     """Run worker for every index via TaskManager and compare self.data"""
     expect = list(range(self.size))
     mypool = TaskManager()
     jobs = [mypool.spawn(worker, idx, self.data) for idx in expect]
     # wait for all spawned jobs before looking at the shared data
     mypool.joinall(jobs)
     self.assertEqual(list(self.data), expect)
Ejemplo n.º 4
0
 def test_task_manager(self):
     """Spawn worker once per index, join the jobs and check self.data"""
     wanted = list(range(self.size))
     manager = TaskManager()
     pending = []
     for number in wanted:
         job = manager.spawn(worker, number, self.data)
         pending.append(job)
     manager.joinall(pending)
     # content of self.data is compared only after all jobs are joined
     observed = list(self.data)
     self.assertEqual(observed, wanted)
Ejemplo n.º 5
0
    def __init__(self, name, config):
        """
        Set up the data-service ``name`` from the DAS ``config`` mapping.

        Any failure while reading the configuration is printed and
        re-raised as a generic Exception; a failure to obtain key/cert
        is printed and leaves both of them as None. A missing or empty
        'rawcache' entry raises an Exception at the very end.
        """
        self.name = name
        try:
            self.verbose = config["verbose"]
            self.logger = PrintManager("DASAbstactService_%s" % self.name, self.verbose)
            self.dasmapping = config["dasmapping"]
            self.write2cache = config.get("write_cache", True)
            das_section = config["das"]
            self.multitask = das_section.get("multitask", True)
            self.error_expire = das_section.get("error_expire", 300)
            self.dbs_global = None  # to be configured at run time
            self.dburi = config["mongodb"]["dburi"]
            engine = config.get("engine", None)
            self.gfs = db_gridfs(self.dburi)
        except Exception as exc:
            print_exc(exc)
            raise Exception("fail to parse DAS config")

        # key/cert pair is optional
        try:
            self.ckey, self.cert = get_key_cert()
        except Exception as exc:
            print_exc(exc)
            self.ckey = self.cert = None

        if not self.multitask:
            self.taskmgr = None
        else:
            nworkers = das_section.get("api_workers", 3)
            # entries look like 'system:weight'; the weight of this
            # service multiplies the number of workers
            for entry in das_section.get("thread_weights", []):
                system, weight = entry.split(":")
                if system == self.name:
                    nworkers *= int(weight)
            if engine:
                self.taskmgr = PluginTaskManager(
                    engine,
                    nworkers=nworkers,
                    name="DASAbstractService:%s:PluginTaskManager" % self.name,
                )
                self.taskmgr.subscribe()
            else:
                self.taskmgr = TaskManager(
                    nworkers=nworkers,
                    name="DASAbstractService:%s:TaskManager" % self.name,
                )

        # placeholders, filled by the data-service implementation or at run-time
        self.map = {}
        self._keys = None
        self._params = None
        self._notations = {}

        self.logger.info("initialized")
        # the cache manager for 'raw' results must come with the configuration
        if not ("rawcache" in config and config["rawcache"]):
            raise Exception("Undefined rawcache, please check your configuration")
        self.localcache = config["rawcache"]
Ejemplo n.º 6
0
    def __init__(self, dasconfig):
        """
        Build the web service from the DAS configuration.

        The 'web_server' and 'mongodb' sections of ``dasconfig`` are read,
        a TaskManager is created, then self.init() and self.dbs_init()
        are invoked and dascore_monitor is handed to start_new_thread.
        An Exception is raised when 'qtype' is neither 'Queue' nor
        'PriorityQueue'.
        """
        DASWebManager.__init__(self, dasconfig)
        web = dasconfig['web_server']
        self.pid_pat = re.compile(r'^[a-z0-9]{32}')
        # TODO: self.base shall be automatically included in all tmpls
        self.base = web['url_base']
        self.interval = web.get('status_update', 2500)
        self.engine = web.get('engine', None)
        self.check_clients = web.get('check_clients', False)
        nworkers = web['web_workers']
        self.hot_thr = web.get('hot_threshold', 3000)
        self.dasconfig = dasconfig
        mongo = self.dasconfig['mongodb']
        self.dburi = mongo['dburi']
        self.lifetime = mongo['lifetime']
        self.queue_limit = web.get('queue_limit', 50)
        qtype = web.get('qtype', 'Queue')
        qfreq = web.get('qfreq', 5)
        if qtype not in ('Queue', 'PriorityQueue'):
            raise Exception('Wrong queue type, qtype=%s' % qtype)
        # plain TaskManager is used regardless of the configured engine
        self.taskmgr = TaskManager(
            nworkers=nworkers, name='DASWebService:TaskManager',
            qtype=qtype, qfreq=qfreq)
        self.adjust = web.get('adjust_input', False)
        # attributes below are (re)defined at run-time via self.init()
        self.dasmgr = None
        self.reqmgr = None
        self.daskeys = []
        self.colors = {}
        self.dbs_url = None
        self.dbs_global = None
        self.dbs_instances = []
        self.kws = None
        self.q_rewriter = None
        self.dataset_daemon = None
        self.dbsmgr = {}  # dbs_urls vs dbs_daemons, defined at run-time
        self.daskeyslist = []  # list of DAS keys
        self.init()
        self.dbs_init(web)

        # Monitoring thread which performs auto-reconnection
        monitor_args = ({'das':self.dasmgr, 'uri':self.dburi}, self.init, 5)
        start_new_thread('dascore_monitor', dascore_monitor, monitor_args)
Ejemplo n.º 7
0
class Maintainer(object):
    "Maintainer keeps alive data records in DAS cache"
    def __init__(self, config):
        """
        :param config: mapping with optional keys 'sleep' (default 5),
            'query_pattern', 'nworkers' (default 10) and 'name'
            (default 'dataset_keeper').
        """
        self.sleep   = config.get('sleep', 5)
        pattern      = {'das.system':'dbs', 'das.primary_key': 'dataset.name'}
        self.pattern = config.get('query_pattern', pattern)
        nworkers     = int(config.get('nworkers', 10))
        name         = config.get('name', 'dataset_keeper')
        dasconfig    = das_readconfig()
        debug        = False
        self.dascore = DASCore(config=dasconfig, nores=True, debug=debug)
        self.taskmgr = TaskManager(nworkers=nworkers, name=name)
        self.conn    = db_connection(dasconfig['mongodb']['dburi'])

    def check_records(self):
        """
        Check and yield DAS records which require update.

        Yields (DASQuery, expire) pairs for records of das.cache whose
        expire time stamp is in the past or closer to now than self.sleep.
        """
        for row in self.conn['das']['merge'].find():
            if  'qhash' not in row:
                continue
            spec = {'qhash': row['qhash'], 'das.system':'das'}
            for rec in self.conn['das']['cache'].find(spec):
                if  'query' in rec:
                    expire = rec['das']['expire']
                    now = time.time()
                    if  expire < now or abs(expire - now) < self.sleep:
                        yield DASQuery(rec['query']), expire

    def update(self):
        """
        Update DAS cache:

            - get list of expired or near expire DAS records
            - store them into onhold set
            - loop over onhold set and invoke expired queries
            - sleep and repeat.

        The method loops forever and never returns.
        """
        add_to_analytics = False
        onhold = {}
        while True:
            jobs = []
            for query, expire in self.check_records():
                # keep the expire time stamp seen first for a query
                onhold.setdefault(query, expire)
            # loop over a snapshot since entries are deleted from onhold
            # within the loop (a live dict view would fail in Python 3)
            for query, expire in list(onhold.items()):
                if  expire < time.time():
                    # single-argument call works as statement and function
                    print("update %s at %s" % (query, time.time()))
                    jobs.append(self.taskmgr.spawn(
                            self.dascore.call, query, add_to_analytics))
                    del onhold[query]
            self.taskmgr.joinall(jobs)
            time.sleep(self.sleep)
Ejemplo n.º 8
0
 def test_assign_priority(self):
     """Check priorities assigned by TaskManager with PriorityQueue"""
     manager = TaskManager(qtype='PriorityQueue')
     uid1 = '1.1.1.1'
     uid2 = '2.2.2.2'
     manager._uids.add(uid1)
     manager._uids.add(uid1)
     # no tasks in a queue
     self.assertEqual(manager.assign_priority(uid1), 0)
     manager._tasks = TestQueue(empty=False)
     for _ in xrange(20):
         manager._uids.add(uid1)
     self.assertEqual(manager.assign_priority(uid1), 2)
     for _ in xrange(50):
         manager._uids.add(uid2)
     self.assertEqual(manager.assign_priority(uid2), 5)
Ejemplo n.º 9
0
 def test_priority_task_manager(self):
     """Spawn workers under two alternating uids, then check shared data"""
     data = list(xrange(0, 100))
     shared_data = Array('i', len(data))
     mypool = TaskManager(qtype='PriorityQueue')
     jobs = []
     for idx in data:
         # odd indices are spawned with uid=1, even ones with uid=2
         uid = 1 if idx % 2 else 2
         jobs.append(mypool.spawn(worker, idx, shared_data, uid=uid))
     mypool.joinall(jobs)
     self.assertEqual(list(shared_data), data)
Ejemplo n.º 10
0
    def __init__(self, dasconfig):
        """
        Build the web service from the DAS configuration.

        Reads the 'web_server' and 'mongodb' sections of ``dasconfig``;
        creates a PluginTaskManager (and subscribes it) when an engine is
        configured, a TaskManager otherwise; invokes self.init() and
        self.dbs_init() and hands dascore_monitor to start_new_thread.
        An Exception is raised for an unknown 'qtype'.
        """
        DASWebManager.__init__(self, dasconfig)
        websrv = dasconfig['web_server']
        self.pid_pat = re.compile(r'^[a-z0-9]{32}')
        # TODO: self.base shall be automatically included in all tmpls
        self.base = websrv['url_base']
        self.interval = websrv.get('status_update', 2500)
        self.engine = websrv.get('engine', None)
        self.check_clients = websrv.get('check_clients', False)
        nworkers = websrv['web_workers']
        self.hot_thr = websrv.get('hot_threshold', 3000)
        self.dasconfig = dasconfig
        self.dburi = self.dasconfig['mongodb']['dburi']
        self.lifetime = self.dasconfig['mongodb']['lifetime']
        self.queue_limit = websrv.get('queue_limit', 50)
        qtype = websrv.get('qtype', 'Queue')
        qfreq = websrv.get('qfreq', 5)
        if not (qtype == 'Queue' or qtype == 'PriorityQueue'):
            raise Exception('Wrong queue type, qtype=%s' % qtype)
        if not self.engine:
            self.taskmgr = TaskManager(
                nworkers=nworkers, name='DASWebService:TaskManager',
                qtype=qtype, qfreq=qfreq)
        else:
            self.taskmgr = PluginTaskManager(
                bus=self.engine, nworkers=nworkers,
                name='DASWebService:PluginTaskManager',
                qtype=qtype, qfreq=qfreq)
            self.taskmgr.subscribe()
        self.adjust = websrv.get('adjust_input', False)
        # everything down to q_rewriter is defined at run-time via self.init()
        self.dasmgr = None
        self.reqmgr = None
        self.daskeys = []
        self.colors = {}
        self.dbs_url = None
        self.dbs_global = None
        self.dbs_instances = []
        self.kws = None
        self.q_rewriter = None
        self.dataset_daemon = None
        self.dbsmgr = {}  # dbs_urls vs dbs_daemons, defined at run-time
        self.daskeyslist = []  # list of DAS keys
        self.init()
        self.dbs_init(websrv)

        # Monitoring thread which performs auto-reconnection
        start_new_thread(
            'dascore_monitor', dascore_monitor,
            ({'das':self.dasmgr, 'uri':self.dburi}, self.init, 5))
Ejemplo n.º 11
0
 def __init__(self, config):
     """
     Create DASCore and TaskManager instances.

     ``config`` may provide 'nworkers' (default 10) and 'name'
     (default 'dataset_populator') of the task manager.
     """
     workers = int(config.get('nworkers', 10))
     label = config.get('name', 'dataset_populator')
     self.dascore = DASCore(config=das_readconfig(), nores=True, debug=False)
     self.taskmgr = TaskManager(nworkers=workers, name=label)
Ejemplo n.º 12
0
    def __init__(self, dasconfig):
        """
        Build the web service from the DAS configuration.

        :param dasconfig: DAS configuration mapping; its 'web_server' and
            'mongodb' sections are mandatory, the 'dbs' section is optional.

        A PluginTaskManager is created (and subscribed) when an engine is
        configured, a TaskManager otherwise. After self.init() the
        dascore_monitor is started in a new thread and, if 'dbs_daemon'
        is set in the web server configuration, self.dbs_daemon is invoked.
        """
        DASWebManager.__init__(self, dasconfig)
        config = dasconfig['web_server']
        self.pid_pat     = re.compile(r'^[a-z0-9]{32}')
        self.base        = config['url_base']
        self.interval    = config.get('status_update', 2500)
        self.engine      = config.get('engine', None)
        nworkers         = config['number_of_workers']
        self.hot_thr     = config.get('hot_threshold', 3000)
        self.dasconfig   = dasconfig
        self.dburi       = self.dasconfig['mongodb']['dburi']
        self.lifetime    = self.dasconfig['mongodb']['lifetime']
        self.queue_limit = config.get('queue_limit', 50)
        if  self.engine:
            thr_name = 'DASWebService:PluginTaskManager'
            self.taskmgr = PluginTaskManager(
                        bus=self.engine, nworkers=nworkers, name=thr_name)
            self.taskmgr.subscribe()
        else:
            thr_name = 'DASWebService:TaskManager'
            self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        self.adjust      = config.get('adjust_input', False)

        self.init()

        # Monitoring thread which performs auto-reconnection
        thread.start_new_thread(dascore_monitor, \
                ({'das':self.dasmgr, 'uri':self.dburi}, self.init, 5))

        # Obtain DBS global instance or set it as None; membership test is
        # used instead of dict.has_key which does not exist in Python 3
        if  'dbs' in self.dasconfig:
            self.dbs_global = \
                self.dasconfig['dbs'].get('dbs_global_instance', None)
            self.dbs_instances = \
                self.dasconfig['dbs'].get('dbs_instances', [])
        else:
            self.dbs_global = None
            self.dbs_instances = []

        # Start DBS daemon
        self.dataset_daemon = config.get('dbs_daemon', False)
        if  self.dataset_daemon:
            self.dbs_daemon(config)
Ejemplo n.º 13
0
    def __init__(self, dasconfig):
        """
        Build the web service from the DAS configuration.

        Reads the "web_server" and "mongodb" sections of ``dasconfig``;
        creates a PluginTaskManager (and subscribes it) when an engine is
        configured, a TaskManager otherwise; invokes self.init() and
        hands dascore_monitor to start_new_thread. An Exception is
        raised for an unknown "qtype".
        """
        DASWebManager.__init__(self, dasconfig)
        web = dasconfig["web_server"]
        self.pid_pat = re.compile(r"^[a-z0-9]{32}")
        self.base = web["url_base"]
        self.interval = web.get("status_update", 2500)
        self.engine = web.get("engine", None)
        self.check_clients = web.get("check_clients", False)
        nworkers = web["web_workers"]
        self.hot_thr = web.get("hot_threshold", 3000)
        self.dasconfig = dasconfig
        mongo = self.dasconfig["mongodb"]
        self.dburi = mongo["dburi"]
        self.lifetime = mongo["lifetime"]
        self.queue_limit = web.get("queue_limit", 50)
        qtype = web.get("qtype", "Queue")
        if qtype not in ("Queue", "PriorityQueue"):
            raise Exception("Wrong queue type, qtype=%s" % qtype)
        if self.engine:
            self.taskmgr = PluginTaskManager(
                bus=self.engine,
                nworkers=nworkers,
                name="DASWebService:PluginTaskManager",
                qtype=qtype,
            )
            self.taskmgr.subscribe()
        else:
            self.taskmgr = TaskManager(
                nworkers=nworkers,
                name="DASWebService:TaskManager",
                qtype=qtype,
            )
        self.adjust = web.get("adjust_input", False)
        # attributes down to q_rewriter are defined at run-time via self.init()
        self.dasmgr = None
        self.reqmgr = None
        self.daskeys = []
        self.colors = {}
        self.dbs_url = None
        self.dbs_global = None
        self.kws = None
        self.q_rewriter = None
        self.dataset_daemon = web.get("dbs_daemon", False)
        self.dbsmgr = {}  # dbs_urls vs dbs_daemons, defined at run-time
        self.daskeyslist = []  # list of DAS keys
        self.init()

        # Monitoring thread which performs auto-reconnection
        monitor_args = ({"das": self.dasmgr, "uri": self.dburi}, self.init, 5)
        start_new_thread("dascore_monitor", dascore_monitor, monitor_args)
Ejemplo n.º 14
0
 def __init__(self, config):
     """
     Create DASCore, TaskManager and DB connection used by the class.

     ``config`` may provide 'sleep' (default 5), 'query_pattern',
     'nworkers' (default 10) and 'name' (default 'dataset_keeper').
     """
     self.sleep = config.get('sleep', 5)
     default_pattern = {'das.system':'dbs', 'das.primary_key': 'dataset.name'}
     self.pattern = config.get('query_pattern', default_pattern)
     workers = int(config.get('nworkers', 10))
     label = config.get('name', 'dataset_keeper')
     dasconfig = das_readconfig()
     self.dascore = DASCore(config=dasconfig, nores=True, debug=False)
     self.taskmgr = TaskManager(nworkers=workers, name=label)
     # DB URI is taken from the DAS configuration, not from ``config``
     self.conn = db_connection(dasconfig['mongodb']['dburi'])
Ejemplo n.º 15
0
class Populator(object):
    """
    Populate DAS cache with data: the run method takes a list of DAS
    queries and passes each of them to DASCore.call via the task manager.
    """
    def __init__(self, config):
        """
        ``config`` may provide 'nworkers' (default 10) and 'name'
        (default 'dataset_populator') of the task manager.
        """
        workers = int(config.get('nworkers', 10))
        label = config.get('name', 'dataset_populator')
        self.dascore = DASCore(config=das_readconfig(), nores=True, debug=False)
        self.taskmgr = TaskManager(nworkers=workers, name=label)

    def run(self, queries):
        "Spawn one DAS call per given query and wait for all of them"
        add_to_analytics = False
        jobs = [
            self.taskmgr.spawn(
                self.dascore.call, DASQuery(query), add_to_analytics)
            for query in queries
        ]
        self.taskmgr.joinall(jobs)
Ejemplo n.º 16
0
 def test_priority_task_manager(self):
     """Spawn workers under two alternating uids, then check shared data"""
     numbers = list(range(30))
     shared = Array('i', len(numbers))
     pool = TaskManager(qtype='PriorityQueue', qfreq=10)
     # odd numbers are spawned with uid=1, even ones with uid=2
     jobs = [
         pool.spawn(worker, num, shared, uid=1 if num % 2 else 2)
         for num in numbers
     ]
     pool.joinall(jobs)
     self.assertEqual(list(shared), numbers)
Ejemplo n.º 17
0
 def test_assign_priority(self):
     """Check priorities assigned by TaskManager with PriorityQueue"""
     manager = TaskManager(qtype='PriorityQueue', qfreq=10)
     uid1 = '1.1.1.1'
     uid2 = '2.2.2.2'
     manager._uids.add(uid1)
     manager._uids.add(uid1)
     # no tasks in a queue
     self.assertEqual(int(manager.assign_priority(uid1)), 0)
     manager._tasks = TestQueue(empty=False)
     for _ in range(20):
         manager._uids.add(uid1)
     self.assertEqual(int(manager.assign_priority(uid1)), 2)
     for _ in range(50):
         manager._uids.add(uid2)
     self.assertEqual(int(manager.assign_priority(uid2)), 5)
Ejemplo n.º 18
0
class DASWebService(DASWebManager):
    """
    DAS web service interface.
    """

    def __init__(self, dasconfig):
        """
        Initialize the web service from the DAS configuration.

        Settings come from the 'web_server' and 'mongodb' sections of
        ``dasconfig``. A TaskManager is created, self.init() and
        self.dbs_init() are called and dascore_monitor is passed to
        start_new_thread. Unknown 'qtype' value leads to an Exception.
        """
        DASWebManager.__init__(self, dasconfig)
        srv_config = dasconfig['web_server']
        self.pid_pat = re.compile(r'^[a-z0-9]{32}')
        # TODO: self.base shall be automatically included in all tmpls
        self.base = srv_config['url_base']
        self.interval = srv_config.get('status_update', 2500)
        self.engine = srv_config.get('engine', None)
        self.check_clients = srv_config.get('check_clients', False)
        nworkers = srv_config['web_workers']
        self.hot_thr = srv_config.get('hot_threshold', 3000)
        self.dasconfig = dasconfig
        self.dburi = self.dasconfig['mongodb']['dburi']
        self.lifetime = self.dasconfig['mongodb']['lifetime']
        self.queue_limit = srv_config.get('queue_limit', 50)
        qtype = srv_config.get('qtype', 'Queue')
        qfreq = srv_config.get('qfreq', 5)
        if not (qtype == 'Queue' or qtype == 'PriorityQueue'):
            raise Exception('Wrong queue type, qtype=%s' % qtype)
        # the configured engine is not used to choose the task manager
        taskmgr_opts = dict(
            nworkers=nworkers, name='DASWebService:TaskManager',
            qtype=qtype, qfreq=qfreq)
        self.taskmgr = TaskManager(**taskmgr_opts)
        self.adjust = srv_config.get('adjust_input', False)
        # the following attributes are defined at run-time via self.init()
        self.dasmgr = None
        self.reqmgr = None
        self.daskeys = []
        self.colors = {}
        self.dbs_url = None
        self.dbs_global = None
        self.dbs_instances = []
        self.kws = None
        self.q_rewriter = None
        self.dataset_daemon = None
        self.dbsmgr = {}  # dbs_urls vs dbs_daemons, defined at run-time
        self.daskeyslist = []  # list of DAS keys
        self.init()
        self.dbs_init(srv_config)

        # Monitoring thread which performs auto-reconnection
        start_new_thread(
            'dascore_monitor', dascore_monitor,
            ({'das':self.dasmgr, 'uri':self.dburi}, self.init, 5))

    def dbs_init(self, config):
        """
        Create one DBSDaemon per DBS instance and keep it in self.dbsmgr.

        The URL of an instance is derived from self.dbs_url by replacing
        the name of the global instance with the instance name.
        """
        print("### DBS URL:", self.dbs_url)
        print("### DBS global instance:", self.dbs_global)
        print("### DBS instances:", self.dbs_instances)
        base_url = self.dbs_url
        url_inst_pairs = [
            (base_url.replace(self.dbs_global, inst), inst)
            for inst in self.dbs_instances
        ]
        # the interval is read from the configuration but not used below
        interval = config.get('dbs_daemon_interval', 3600)
        daemon_config = {
            'expire': config.get('dbs_daemon_expire', 3600),
            'preserve_on_restart': config.get('preserve_on_restart', False),
        }
        for url, inst in url_inst_pairs:
            self.dbsmgr[(url, inst)] = DBSDaemon(url, self.dburi, daemon_config)

    def init(self):
        """
        Init DAS web server, connect to DAS Core.

        (Re)creates the request manager, DAS core, representation layer
        and GridFS handle and refreshes the attributes derived from the
        DAS mapping (DBS url/instances, colors, list of DAS keys).
        The method is also passed to dascore_monitor by __init__, so it
        may be invoked more than once.

        Failure handling:
            - ConnectionFailure: a message is printed and execution
              continues with the query rewriter set-up below;
            - any other exception: it is printed, the run-time
              attributes are reset and the method returns;
            - a failure of the query rewriter set-up is printed and
              leaves self.q_rewriter as None.
        """
        try:
            self.reqmgr     = RequestManager(lifetime=self.lifetime)
            self.dasmgr     = DASCore(engine=self.engine)
            self.repmgr     = CMSRepresentation(self.dasconfig, self.dasmgr)
            self.daskeys    = self.dasmgr.das_keys()
            self.gfs        = db_gridfs(self.dburi)
            self.daskeys.sort()
            self.dasmapping = self.dasmgr.mapping
            self.dbs_url    = self.dasmapping.dbs_url()
            self.dbs_global = self.dasmapping.dbs_global_instance()
            self.dbs_instances = self.dasmapping.dbs_instances()
            self.dasmapping.init_presentationcache()
            self.colors = {'das':gen_color('das')}
            for system in self.dasmgr.systems:
                self.colors[system] = gen_color(system)
            # the list of DAS keys is built only once, while it is empty
            if  not self.daskeyslist:
                keylist = [r for r in self.dasmapping.das_presentation_map()]
                keylist.sort(key=lambda r: r['das'])
                self.daskeyslist = keylist

        except ConnectionFailure as _err:
            # no return here: attributes assigned so far are kept and the
            # query rewriter set-up below is still attempted
            tstamp = dastimestamp('')
            mythr  = threading.current_thread()
            print("### MongoDB connection failure thread=%s, id=%s, time=%s" \
                    % (mythr.name, mythr.ident, tstamp))
        except Exception as exc:
            # reset run-time attributes; repmgr, gfs, dasmapping and
            # daskeyslist are not touched here
            print_exc(exc)
            self.dasmgr  = None
            self.reqmgr  = None
            self.dbs_url = None
            self.dbs_global = None
            self.dbs_instances = []
            self.daskeys = []
            self.colors  = {}
            self.q_rewriter = None
            return

        # KWS and Query Rewriting failures are not fatal
        try:
            # init query rewriter, if needed
            if self.dasconfig['query_rewrite']['pk_rewrite_on']:
                self.q_rewriter = CMSQueryRewrite(self.repmgr,
                                                  self.templatepage)
        except Exception as exc:
            print_exc(exc)
            self.q_rewriter = None

    @expose
    @checkargs(DAS_WEB_INPUTS)
    @tools.secmodv2()
    def redirect(self, **kwargs):
        """
        Render the 'das_redirect' template with the reason of redirection.

        The reason is taken from the 'reason' argument; when it is not
        given a permission-denied message is used instead.
        """
        default_reason = \
            'You do not have permission to access the resource requested.'
        reason = kwargs.get('reason', default_reason)
        # an empty reason is passed to the template as is
        msg = 'Reason: ' + reason if reason else reason
        return self.page(
            self.templatepage('das_redirect', msg=msg), response_div=False)

    @expose
    @checkargs(DAS_WEB_INPUTS)
    @tools.secmodv2()
    def dumpthreads(self, **kwargs):
        """
        Call dumpstacks and return a page with the (GMT) time of the call
        """
        dumpstacks('web call', 'web frame')
        stamp = time.strftime("%Y%m%d %H:%M:%S GMT", time.gmtime())
        return self.page('Thread dump performed: %s' % stamp,
                         response_div=False)

    def bottom(self, response_div=True):
        """
        Define footer for all DAS web pages

        :param response_div: value passed to the template as ``div``
        :return: rendered 'das_bottom' template
        """
        # the template gets the time module itself (time=time); a formatted
        # time stamp used to be computed here but was never used
        return self.templatepage('das_bottom', div=response_div, base=self.base,
                version=DAS.version, time=time)

    def page(self, content, ctime=None, response_div=True):
        """
        Assemble a DAS web page: top part, given content and the
        'das_bottom' template (which receives ctime and response_div)
        """
        upper = self.top() + content
        footer = self.templatepage(
            'das_bottom', ctime=ctime, base=self.base,
            version=DAS.version, div=response_div, time=time)
        return upper + footer

    @expose
    @checkargs(DAS_WEB_INPUTS + ['section', 'highlight'])
    @tools.secmodv2()
    def faq(self, **kwargs):
        """
        Render DAS FAQ page from the 'dbsql_vs_dasql', 'das_keys' and
        'das_faq' templates; 'section' and 'highlight' arguments are
        passed to the latter.
        """
        section = kwargs.get('section', None)
        highlight = kwargs.get('highlight', None)
        guide = self.templatepage(
            'dbsql_vs_dasql', operators=', '.join(das_operators()))
        daskeys = self.templatepage('das_keys', daskeys=self.daskeyslist)
        operators = ', '.join(das_operators())
        aggregators = ', '.join(das_aggregators())
        content = self.templatepage(
            'das_faq', guide=guide, daskeys=daskeys,
            section=section, highlight=highlight,
            operators=operators, aggregators=aggregators)
        return self.page(content, response_div=False)

    @expose
    @tools.secmodv2()
    def cli(self):
        """
        Return a page which points users to dasgoclient; no CLI file is
        served from here.
        """
        return self.page(
            'Please use dasgoclient which is available in any CMSSW releases')
#         dasroot = '/'.join(__file__.split('/')[:-3])
#         clifile = os.path.join(dasroot, 'DAS/tools/das_client.py')
#         return serve_file(clifile, content_type='text/plain')

    @expose
    @tools.secmodv2()
    def movetodas(self):
        "Return the static placeholder page used for DBS to DAS migration"
        style = \
            "width:600px;margin-left:auto;margin-right:auto;padding-top:20px"
        # the page is a single div; the pieces are joined without separators
        pieces = [
            """<div style="%s">""" % style,
            "Dear user,<br/>DBS Data Discovery page is depricated.<br/>",
            "Please migrate to Data Aggregation Service located at",
            "<p>https://cmsweb.cern.ch/das/</p>",
            "<em>CMS HTTP group.</em>",
            "</div>",
        ]
        return ''.join(pieces)

    @expose
    @tools.secmodv2()
    def opensearch(self):
        """
        Return the DAS opensearch description and set the matching
        Content-Type header on the response.

        ``self.base`` is used as base URL only when it contains 'http://';
        otherwise the cmsweb DAS URL is used.
        """
        base = 'http://cmsweb.cern.ch/das'
        if self.base and 'http://' in self.base:
            base = self.base
        desc = self.templatepage('das_opensearch', base=base)
        content_type = 'application/opensearchdescription+xml'
        cherrypy.response.headers['Content-Type'] = content_type
        return desc

    @expose
    @checkargs(DAS_WEB_INPUTS)
    @tools.secmodv2()
    def keys(self, **kwds):
        """
        Show DAS keys and their attributes.

        Rows from ``self.dasmgr.keylearning.attributes()`` are grouped as
        ``{das_key: {attribute: query patterns}}``; the ``member`` field of
        a row is split on its first dot into key and attribute. Rows
        without a usable ``member`` value are skipped.

        :param kwds: request parameters; ``view='json'`` returns the
            mapping as JSON text, anything else the HTML page
        :return: JSON text or the rendered page
        """
        adict = {}
        for row in self.dasmgr.keylearning.attributes():
            try:
                qpat = row.get('query_pat', [])
                key, attr = row['member'].split('.', 1)
            except (AttributeError, KeyError, TypeError, ValueError):
                # malformed row: not a dict, no member, or member w/o a dot
                continue
            attrs = adict.setdefault(key, {})
            if  attr in attrs:
                # build a new object instead of '+=' so that the pattern
                # list owned by an earlier row is not modified in place
                attrs[attr] = attrs[attr] + qpat
            else:
                attrs[attr] = qpat
        view = kwds.get('view', '')
        if  view == 'json':
            return json.dumps(adict)
        page = self.templatepage('das_keys_attrs', attrs=adict)
        return self.page(page, response_div=False)

    @expose
    @checkargs(DAS_WEB_INPUTS)
    @tools.secmodv2()
    def services(self):
        """
        Render the page describing DAS services.

        Walks the DAS maps cache in sorted (system, urn) order and builds,
        for every system listed in ``self.dasmgr.systems``, a dict with its
        API list (``apis``) and a mapping of DAS keys to record keys
        (``keys``). The result is passed to the ``das_services`` template.
        """
        mapcache = self.dasmgr.mapping.dasmapscache
        by_system = {}
        all_keys = set()
        for mapkey in sorted(mapcache.keys()):
            system, urn = mapkey
            if  system not in self.dasmgr.systems:
                continue
            # DAS key -> unique record keys provided by this API
            merged = {}
            for rec in mapcache[mapkey]['das_map']:
                das_key = rec['das_key']
                rec_key = rec['rec_key']
                all_keys.add(das_key)
                merged[das_key] = list(set(merged.get(das_key, []) + [rec_key]))
            known = by_system.get(system)
            if  known is None:
                apis = [urn]
            else:
                # fold in what previous APIs of the same system contributed
                apis = known['apis'] + [urn]
                for das_key, rec_keys in known['keys'].items():
                    merged[das_key] = \
                        list(set(merged.get(das_key, []) + rec_keys))
            by_system[system] = dict(keys=dict(merged), apis=apis)
        mapreduce = list(self.dasmgr.rawcache.get_map_reduce())
        content = self.templatepage('das_services', dasdict=by_system,
                                    dbses=self.dbs_instances,
                                    dbs_global=self.dbs_global,
                                    daskeys=list(all_keys),
                                    mapreduce=mapreduce, urllib=urllib)
        return self.page(content, response_div=False)

    @expose
    @checkargs(DAS_WEB_INPUTS)
    def nsystems(self):
        """
        Return a text line with the comma separated names of the systems
        reported by the DAS mapping.
        """
        names = ','.join(self.dasmgr.mapping.list_systems())
        return "DAS systems %s" % names

    @expose
    @checkargs(DAS_WEB_INPUTS)
    @tools.secmodv2()
    def api(self, system, name):
        """
        Render the DAS mapping record of the given API.

        :param system: name of the system the API belongs to
        :param name: API name
        """
        record = self.dasmgr.mapping.api_info(system, name)
        content = "<b>DAS mapping record</b>" + das_json_full(record)
        return self.page(content, response_div=False)

    @expose
    @checkargs(DAS_WEB_INPUTS)
    @tools.secmodv2()
    def default(self, *args, **kwargs):
        """
        Default method: delegate to ``index``.

        Positional and keyword arguments are unpacked so that ``index``
        receives the request parameters (e.g. ``input``) as keywords
        rather than as two positional objects.
        """
        return self.index(*args, **kwargs)

    def adjust_input(self, kwargs):
        """
        Rewrite ``kwargs['input']`` for common DAS keyword patterns, e.g.
        /Zee/*/* -> dataset=*Zee*, T1_US -> site=T1_US.

        More ambiguous input (such as Zee -> dataset=*Zee*) is left to the
        keyword search.

        Nothing is done unless ``self.adjust`` is set; otherwise the input
        is replaced in place by the result of
        ``identify_apparent_query_patterns``.
        """
        if self.adjust:
            query = kwargs.get('input', '')
            instance = kwargs.get('instance', self.dbs_global)
            kwargs['input'] = identify_apparent_query_patterns(query, instance)

    def _get_dbsmgr(self, inst):
        """
        Look up the manager registered in ``self.dbsmgr`` for the given DBS
        instance name.

        ``self.dbsmgr`` is keyed by (dbs_url, dbs_instance) pairs; the first
        entry whose instance equals ``inst`` is returned. None is returned
        when the dataset daemon is off or nothing matches.
        """
        if not self.dataset_daemon:
            return None
        matches = (self.dbsmgr[(dbs_url, dbs_inst)]
                   for dbs_url, dbs_inst in self.dbsmgr.keys()
                   if dbs_inst == inst)
        return next(matches, None)


    def _get_kws_host(self):
        """
        Get the host for keyword search from the configuration.

        Falls back to an empty string when the ``load_balance`` section or
        its ``kws_host`` entry is missing, the same way the
        autocompletion host is looked up.
        """
        conf = self.dasconfig.get('load_balance', {})
        return conf.get('kws_host', '')

    def _get_autocompl_host(self):
        """
        Get the autocompletion host from the ``load_balance`` section of
        the configuration; an empty string is returned when not configured.
        """
        return self.dasconfig.get('load_balance', {}) \
                             .get('autocompletion_host', '')

    def is_kws_enabled(self):
        """
        Tell whether the keyword search client (ajax request) is enabled,
        i.e. return the ``kws_on`` value of the ``keyword_search`` config.
        """
        kws_conf = self.dasconfig['keyword_search']
        return kws_conf['kws_on']

    def is_kws_service_enabled(self):
        """
        Tell whether the keyword search service (response to ajax call) is
        enabled, i.e. return the ``kws_service_on`` value of the
        ``keyword_search`` config.
        """
        kws_conf = self.dasconfig['keyword_search']
        return kws_conf['kws_service_on']

    def generate_dasquery(self, uinput, inst, html_mode=True, qcache=0):
        """
        Build a DASQuery from the user input and validate it.

        :param uinput: user's input (query string or a dict spec)
        :param inst: DBS instance
        :param html_mode: whether errors shall be output in html
        :param qcache: passed through to the DASQuery constructor
        :return: ``(0, DASQuery)`` on success or ``(1, error page)``
        """

        def error_msg(msg, show_kws=False, tmpl='das_ambiguous', **kwargs):
            """
            Render an error template (das_ambiguous unless ``tmpl`` says
            otherwise). Each template exists in an html flavour and a
            '_txt' flavour used when html_mode is off.

            The template receives msg, base, guide, the keyword search
            snippet and **kwargs.
            """
            # TODO: this shall be done by inheriting a parent template
            # TODO: no header/footer?
            guide = self.templatepage('dbsql_vs_dasql',
                                      operators=', '.join(das_operators()))
            if show_kws:
                # keyword search loader is rendered only on request
                kws = self.templatepage(
                    'kwdsearch_via_ajax',
                    uinput=uinput,
                    jsonize=jsonize,
                    url_extend_params_as_dict=url_extend_params_as_dict,
                    inst=inst or self.dbs_global,
                    kws_host=self._get_kws_host())
            else:
                kws = ''
            template = tmpl if html_mode else tmpl + '_txt'
            return self.templatepage(template, msg=msg, base=self.base,
                                     guide=guide, kws_enabled=show_kws,
                                     kws=kws, **kwargs)

        if not uinput:
            return 1, error_msg('No input query')

        # Any failure to construct the DASQuery is converted into an error
        # page for the upper layer (web interface)
        try:
            dasquery = DASQuery(uinput, instance=inst, qcache=qcache)
        except WildcardMultipleMatchesException as err:
            # TODO: hints could be shown here also, but it makes no sense, as
            # they are shown only when no matches are found
            extra = {'url_extend_params': url_extend_params}
            if isinstance(err.options.values, list) and err.options.values:
                extra['suggest'] = err.options.values
            return 1, error_msg(str(err), tmpl='das_wildcard_err', **extra)
        except WildcardMatchingException as err:
            hints = self.hint_datasets({'input': uinput, 'instance': inst})
            page = error_msg(str(err))
            for found in hints:
                page += self.templatepage('hint',
                                          url_extend_params=url_extend_params,
                                          hint=found, base=self.base,
                                          dbs=self.dbs_global)
            return 1, page
        except Exception as err:
            # NOTE(review): `response` is not defined inside this method;
            # presumably it is a module-level name - confirm.
            # A dataset matches message (1 keyword queries) takes precedence
            # over the text of the exception; the keyword search is offered
            # in both cases.
            if hasattr(response, 'dataset_matches_msg'):
                message = response.dataset_matches_msg
            else:
                message = str(err)
            return 1, error_msg(message, show_kws=self.is_kws_enabled())

        if dasquery.error:
            return 1, error_msg(dasquery.error)

        # DAS query validation applies to normal user queries only; dict
        # input (DASQuery w/ {'spec':{'_id:id}}) and input mentioning
        # 'queries' or 'records' is accepted as is
        normal_query = not isinstance(uinput, dict) \
            and uinput.find('queries') == -1 \
            and uinput.find('records') == -1
        if normal_query:
            try:
                service_map = dasquery.service_apis_map()
            except Exception as exc:
                msg = 'Fail to obtain service API map for this DASQuery'
                print(msg)
                print_exc(exc)
                return 1, error_msg(msg)
            if not service_map:
                return 1, error_msg('Unable to resolve the query over the '
                                    'available services: %s' % dasquery)
        return 0, dasquery

    @expose
    @checkargs(DAS_WEB_INPUTS)
#    @tools.secmodv2()
    def index(self, *args, **kwargs):
        """
        DAS web interface entry page: the search form (with help cards
        switched on) pre-filled with the ``input`` request parameter.
        """
        query = getarg(kwargs, 'input', '')
        search_form = self.form(uinput=query, cards=True)
        return self.page(search_form)


    def form(self, uinput='', instance=None, view='list', cards=False):
        """
        Render the DAS search form.

        :param uinput: query shown in the input field; single quotes are
            replaced by double quotes
        :param instance: DBS instance, ``self.dbs_global`` when not given
        :param view: view name passed to the template
        :param cards: passed to the ``das_cards`` template as ``show``
        :return: rendered ``das_searchform`` template
        """
        # TODO: rename into search_form()? (template is also called like this

        if  "'" in uinput: # e.g. file.creation_date>'20120101 12:01:01'
            uinput = uinput.replace("'", '"')
        instance = instance or self.dbs_global
        help_items = help_cards(self.base)
        card_width, card_height = 900, 220
        cards_html = self.templatepage(
            'das_cards', base=self.base, show=cards,
            width=card_width, height=card_height,
            max_width=len(help_items)*card_width,
            cards=help_items, enumerate=enumerate)
        keys_html = self.templatepage('das_keys', daskeys=self.daskeyslist)
        return self.templatepage(
            'das_searchform', input=uinput,
            init_dbses=list(self.dbs_instances), daskeys=keys_html,
            base=self.base, instance=instance, view=view, cards=cards_html,
            autocompl_host=json.dumps(self._get_autocompl_host()))

    @expose
    @tools.secmodv2()
    def error(self, msg, wrap=True):
        """
        Render an error message with the ``das_error`` template.

        :param msg: message, converted with str()
        :param wrap: when true the message is placed below an empty search
            form inside a full DAS page, otherwise only the rendered
            message is returned
        """
        content = self.templatepage('das_error', msg=str(msg))
        if  not wrap:
            return content
        return self.page(self.form() + content)

    @expose
    @checkargs(DAS_WEB_INPUTS)
    @tools.secmodv2()
    def gridfs(self, **kwargs):
        """
        Retrieve a record from GridFS.

        :param kwargs: request parameters; ``fid`` holds the id of the file
        :return: content read from the GridFS file
        :raises HTTPError: (500) when ``fid`` is missing or when reading the
            file fails for any reason
        """
        if  'fid' not in kwargs:
            code = web_code('No file id')
            raise HTTPError(500, 'DAS error, code=%s' % code)
        try:
            fds = self.gfs.get(ObjectId(kwargs['fid']))
            return fds.read()
        except Exception as exc:
            # details go to the log, the client only sees an error code
            print_exc(exc)
            code = web_code('Exception')
            raise HTTPError(500, 'DAS error, code=%s' % code)

    @expose
    @checkargs(DAS_WEB_INPUTS)
    @tools.secmodv2()
    def records(self, *args, **kwargs):
        """
        Retrieve records, or the listing of record ids, from the DAS cache.

        A record id may be given as first positional argument or via the
        ``_id`` keyword; without either all records are queried. Other
        keywords read here: ``idx``, ``limit``, ``collection`` (default
        'merge'), ``view`` and ``instance``.

        Any exception is printed and turned into the DAS error page.
        """
        try:
            recordid = None
            if  args:
                recordid = args[0]
                spec = {'_id':ObjectId(recordid)}
                fields = None
                query = dict(fields=fields, spec=spec)
            elif  kwargs and '_id' in kwargs:
                # NOTE(review): recordid stays None on this path, so the
                # listing branch below is used even though a single id was
                # requested - confirm this is intended
                spec = {'_id': ObjectId(kwargs['_id'])}
                fields = None
                query = dict(fields=fields, spec=spec)
            else: # return all ids
                query = dict(fields=None, spec={})

            res      = ''
            time0    = time.time()
            idx      = getarg(kwargs, 'idx', 0)
            limit    = getarg(kwargs, 'limit', 50)
            coll     = kwargs.get('collection', 'merge')
            view     = kwargs.get('view', '')
            if  view == 'json':
                # json view collects rows in a list instead of html text
                res  = []
            inst     = kwargs.get('instance', self.dbs_global)
            form     = self.form(uinput="")
            check, content = self.generate_dasquery(query, inst)
            if  check:
                # content is an error page in this case
                return self.page(form + content, ctime=time.time()-time0)
            dasquery = content # returned content is valid DAS query
            nresults = self.dasmgr.rawcache.nresults(dasquery, coll)
            gen      = self.dasmgr.rawcache.get_from_cache\
                (dasquery, idx=idx, limit=limit, collection=coll)
            if  recordid: # we got id
                for row in gen:
                    if  view == 'json':
                        res.append(row)
                    else:
                        res += das_json(dasquery, row)
            else:
                # NOTE(review): with view == 'json' res is a list here, so
                # '+=' extends it with the template output item by item and
                # 'page += res' below mixes str and list - verify
                for row in gen:
                    rid  = row['_id']
                    del row['_id']
                    res += self.templatepage('das_record', \
                            id=rid, collection=coll, daskeys=', '.join(row))
            if  recordid:
                page  = res
            else:
                url   = '/das/records?'
                if  nresults:
                    page = self.templatepage('das_pagination', \
                        nrows=nresults, idx=idx, limit=limit, url=url, \
                        cgi=cgi, str=str)
                else:
                    page = 'No results found, nresults=%s' % nresults
                page += res

            ctime   = (time.time()-time0)
            if  view == 'json':
                return json.dumps(res)
            page = self.page(form + page, ctime=ctime)
            return page
        except Exception as exc:
            # every failure above ends up as a generic error page
            print_exc(exc)
            return self.error(gen_error_msg(kwargs))

    @jsonstreamer
    def datastream(self, kwargs):
        """
        Prepare the (head, data) pair of a DAS response.

        ``head`` gets a ``mongo_query`` entry (taken from its ``dasquery``
        entry when present, an empty dict otherwise) unless it has one
        already; the ``dasquery`` and ``args`` entries are dropped. When
        ``self.check_clients`` is set the client information is added.
        """
        header = kwargs.get('head', dict(timestamp=time.time()))
        if  'mongo_query' not in header:
            if  'dasquery' in header:
                header['mongo_query'] = header['dasquery'].mongo_query
            else:
                header['mongo_query'] = {}
        for key in ('dasquery', 'args'):
            if  key in header:
                del header[key]
        records = kwargs.get('data', [])
        if  self.check_clients:
            # update client version
            cli, cli_msg = check_client_version()
            header.update({'client': cli, 'client_message': cli_msg})
        return header, records

    def hint_datasets(self, kwargs):
        "Run the hint functions for the input query and keep non-empty hints"
        query = kwargs.get('input', '').strip()
        dbsinst = kwargs.get('instance', self.dbs_global)
        hint_functions = (hint_dataset_case_insensitive,
                          hint_dataset_in_other_insts)
        found = []
        for func in hint_functions:
            hint = func(query, dbsinst)
            # only hints which carry results are of interest
            if  hint and hint.get('results'):
                found.append(hint)
        return found

    def get_data(self, kwargs):
        """
        Invoke DAS workflow and get data from the cache.

        :param kwargs: request parameters. Keys read here: ``input``,
            ``instance``, ``idx``, ``limit``, ``collection``, ``status``,
            ``error``, ``reason`` and optionally ``dasquery`` (anything the
            DASQuery constructor accepts).
        :return: ``(head, data)``; ``head`` is a dict describing the
            outcome (status, reason, ctime, ...), ``data`` a list of
            records. Failures are reported through ``head`` as well, the
            return value is always this pair.
        """
        head   = dict(timestamp=time.time())
        head['args'] = kwargs
        uinput = kwargs.get('input', '')
        inst   = kwargs.get('instance', self.dbs_global)
        idx    = getarg(kwargs, 'idx', 0)
        limit  = getarg(kwargs, 'limit', 0) # do not impose limit
        coll   = kwargs.get('collection', 'merge')
        status = kwargs.get('status')
        error  = kwargs.get('error')
        reason = kwargs.get('reason')
        dasquery = kwargs.get('dasquery', None)
        time0  = time.time()
        if  dasquery:
            dasquery = DASQuery(dasquery, instance=inst)
            if  dasquery.error:
                # callers unpack (head, data), therefore the failure is
                # reported via the header like any other query error
                head.update({'status': 'fail', 'reason': dasquery.error,
                             'ctime': time.time()-time0, 'input': uinput})
                return head, []
        else:
            check, content = \
                    self.generate_dasquery(uinput, inst, html_mode=False)
            if  check:
                head.update({'status': 'fail', 'reason': content,
                             'ctime': time.time()-time0, 'input': uinput})
                return head, []
            dasquery = content # returned content is valid DAS query
        try:
            nres = self.dasmgr.nresults(dasquery, coll)
            data = list(self.dasmgr.get_from_cache(dasquery, idx, limit))
            # check that we got what we expected
            if  nres and not data:
                # only one retry is made, after one second
                retry = 1
                msg = 'retry in %s sec' % retry
                dasprint(dastimestamp('DAS WARNING '), msg, dasquery)
                time.sleep(retry)
                data = list(self.dasmgr.get_from_cache(dasquery, idx, limit))
            if  nres and not data:
                msg = 'fail to get all data for %s, nres=%s, len(data)=%s' \
                        % (dasquery, nres, len(data))
                dasprint(dastimestamp('DAS WARNING '), msg)
                status = 'fail'
                reason = 'Fail to retrieve data from DAS cache, please retry'

            if  dasquery.aggregators:
                # aggregators split DAS record into sub-system and then
                # apply aggregator functions, therefore the number of
                # results is the number of records actually retrieved
                nres = len(data)
            if  error: # DAS record contains an error
                status = 'error'
            head.update({'status':status, 'nresults':nres,
                         'ctime': time.time()-time0, 'dasquery': dasquery})
        except Exception as exc:
            status = 'fail'
            reason = str(exc)
            print_exc(exc)
            head.update({'status': status,
                         'ctime': time.time()-time0, 'dasquery': dasquery})
            data = []
        head.update({'incache':self.dasmgr.incache(dasquery, coll='cache'),
                     'apilist':self.dasmgr.apilist(dasquery)})
        if  reason:
            head.update({'reason': reason})
        if  status != 'ok':
            head.update(self.info())

        # check if query had dataset input and returned no results
        # then run hint functions to find dataset in other DBS instances
        mquery = dasquery.mongo_query
        empty = False
        if  data:
            # fields may be None (no projection), treat it as empty
            dataset_query = 'dataset.name' in mquery['spec'] \
                and 'dataset' in (mquery['fields'] or ())
            if  dataset_query:
                empty = any('result' not in item and not item['dataset']
                            for item in data)
        if  empty: # if no results found add dataset from other DBS instances
            hints = self.hint_datasets(kwargs)
            for item in data:
                item.update({'hints': hints})

        return head, data

    def info(self):
        "Return status of DAS server as a dict with single das_server key"
        status = dict(nrequests=self.reqmgr.size(),
                      nworkers=self.taskmgr.nworkers(),
                      dasweb=self.reqmgr.status())
        if  self.dasmgr and self.dasmgr.taskmgr:
            # DAS core task manager is reported only when available
            status['dascore'] = self.dasmgr.taskmgr.status()
        return {'das_server': status}

    def busy(self):
        """
        Check server load and report busy status if
        nrequests - nworkers > queue limit

        :return: True (after printing a message with the numbers involved)
            when the server is busy, False otherwise
        """
        nrequests = self.reqmgr.size()
        nworkers = self.taskmgr.nworkers()
        if  (nrequests - nworkers) > self.queue_limit:
            # values are listed in the order of the labels of the message
            msg = '#request=%s, queue_limit=%s, #workers=%s' \
                    % (nrequests, self.queue_limit, nworkers)
            dasprint(dastimestamp('DAS WEB SERVER IS BUSY '), msg)
            return True
        return False

    def busy_page(self, uinput=None):
        """Page shown when DAS server is busy: search form plus a notice"""
        notice = "<h3>DAS server is busy, please try later</h3>"
        return self.page(self.form(uinput) + notice)

    def _is_web_request(self, view):
        """
        Return True when the request shall be answered in web (html) mode
        and False otherwise.

        The explicit output type (view) is checked first: json, xml and
        plain are never web requests. Otherwise the Accept header decides,
        e.g. das client only provides accept header: if the header is
        present and lists nothing but non html content types the request
        is not a web request.
        """
        if view in ('json', 'xml', 'plain'):
            return False

        accepts = cherrypy.request.headers.elements('Accept')
        non_html_accepts = ('application/json',)
        html_capable = [elem for elem in accepts
                        if elem.value not in non_html_accepts]
        return bool(html_capable) or not accepts

    def empty_return(self, dasquery, status='busy', reason=None):
        """
        Return header/data stream without data, e.g. when DAS server is
        busy. When no reason is given a message with the current number of
        requests, workers and the queue size is used.
        """
        if  not reason:
            load = (self.reqmgr.size(), self.taskmgr.nworkers(),
                    self.queue_limit)
            reason = 'DAS server is busy' + \
                (', #requests=%s, #workers=%s, queue size=%s' % load)
        head = {'timestamp': time.time(), 'status': status,
                'reason': reason, 'ctime': 0}
        dasprint(dastimestamp('DAS INFO '), dasquery,
                 'server status=%s'%status, reason)
        return self.datastream(dict(head=head, data=[]))

    @expose
    @checkargs(DAS_WEB_INPUTS)
    @tools.secmodv2()
    def cache(self, **kwargs):
        """
        DAS web cache interface.

        For a new query a task is spawned through the task manager and its
        pid is returned; the client is in charge to keep track of the pid
        and to call again. While the task is alive the pid is returned
        again. Once the query status is 'ok' (or the task is done) the data
        are read via get_data and returned as a data stream. Failures (no
        input, query which can not be interpreted, invalid pid, expired
        request, busy server) are reported as a data stream whose head
        carries the status and the reason.
        """
        # do not allow caching
        set_no_cache_flags()

        # if busy return right away
        if  self.busy():
            # NOTE: request parameters are passed where empty_return
            # expects a dasquery
            return self.empty_return(kwargs)

        uinput = kwargs.get('input', '').strip()
        check_query(uinput)
        if  not uinput:
            head = {'status': 'fail', 'reason': 'No input found',
                    'args': kwargs, 'ctime': 0, 'input': uinput}
            data = []
            return self.datastream(dict(head=head, data=data))
        self.adjust_input(kwargs)
        pid    = kwargs.get('pid', '')
        inst   = kwargs.get('instance', self.dbs_global)
        # input is read again since adjust_input may have replaced it
        uinput = kwargs.get('input', '')
        view   = kwargs.get('view', 'list')
        qcache = kwargs.get('qcache', 0)
        data   = []

        # textual views need text only error messages...
        check, content = self.generate_dasquery(uinput, inst,
                              html_mode=self._is_web_request(view),
                              qcache=qcache)
        if  check:
            head = dict(timestamp=time.time())
            head.update({'status': 'fail',
                         'reason': 'Can not interpret the query'+ \
                                   ' (while creating DASQuery)',
                         'ctime': 0})
            if not self._is_web_request(view):
                # non web clients get the rendered error text as well
                head['error_details'] = content
                head['reason'] = head['reason'] + '\n\n' + content
            return self.datastream(dict(head=head, data=data))

        dasquery = content # returned content is valid DAS query
        status, error, reason = self.dasmgr.get_status(dasquery)
        kwargs.update({'status':status, 'error':error, 'reason':reason})
        if  not pid:
            # without a client supplied pid the query hash is used
            pid = dasquery.qhash
        if  status == None and not self.reqmgr.has_pid(pid): # submit new request
            uid = cherrypy.request.headers.get('Remote-Addr')
            if  hasattr(cherrypy.request, 'user'):
                uid = cherrypy.request.user.get('dn', None)
            _evt, pid = self.taskmgr.spawn(\
                self.dasmgr.call, dasquery, uid=uid, pid=dasquery.qhash)
            self.reqmgr.add(pid, kwargs)
            return pid
        if  status == 'ok':
            # query is ready: forget the request and return its data
            self.reqmgr.remove(pid)
            self.taskmgr.remove(pid)
            kwargs['dasquery'] = dasquery
            head, data = self.get_data(kwargs)
            return self.datastream(dict(head=head, data=data))
        kwargs['dasquery'] = dasquery.storage_query
        if  not self.pid_pat.match(str(pid)) or len(str(pid)) != 32:
            # pid must match pid_pat and be 32 characters long
            self.reqmgr.remove(pid)
            self.taskmgr.remove(pid)
            return self.empty_return(dasquery, 'fail', 'Invalid pid')
        elif self.taskmgr.is_alive(pid):
            # task is still running, the client shall ask again
            return pid
        elif status == None:
            # DAS was busy and query expired since status==None
            if  not self.taskmgr.is_alive(pid) and self.reqmgr.has_pid(pid):
                self.reqmgr.remove(pid)
                self.taskmgr.remove(pid)
                return self.empty_return(dasquery, 'fail', 'request expired')
            return pid
        else: # process is done, get data
            self.reqmgr.remove(pid)
            self.taskmgr.remove(pid)
            head, data = self.get_data(kwargs)
            return self.datastream(dict(head=head, data=data))

    def get_page_content(self, kwargs, complete_msg=True):
        """
        Retrieve page content for provided set of parameters.

        :param kwargs: request parameters; ``view`` selects the
            ``<view>view`` method used for rendering (default 'list'), the
            ``limit`` entry is removed for the plain view
        :param complete_msg: when true, the json, xml and plain views only
            get a 'request completed' message instead of the data
        :return: page content; the das_error template output when anything
            but an HTTPError goes wrong
        """
        page = ''
        try:
            view = kwargs.get('view', 'list')
            if  view == 'plain':
                kwargs.pop('limit', None)
            if  view in ('json', 'xml', 'plain') and complete_msg:
                page = 'Request completed. Reload the page ...'
            else:
                # reject unknown views before the cache is queried
                if view not in ('list', 'table', 'plain', 'xml', 'json'):
                    raise ValueError('Unsupported view: %s' % view)
                head, data = self.get_data(kwargs)
                func = getattr(self, view + "view")
                page = func(head, data)
        except HTTPError as _err:
            raise
        except Exception as exc:
            print_exc(exc)
            msg  = gen_error_msg(kwargs)
            page = self.templatepage('das_error', msg=msg)
        return page

    @expose
    @tools.secmodv2()
    def download(self, lfn):
        "Render the filemover template for given LFN within a DAS page"
        content = self.templatepage('filemover', lfn=lfn)
        return self.page(content, response_div=False)

    @expose
    @tools.secmodv2()
    def makepy(self, dataset, instance):
        """
        Create CMSSW py snippet with the files of a given dataset.

        :param dataset: dataset name, must start with three '/' separated
            parts
        :param instance: DBS instance used in the DAS query
        :return: plain text rendered from the das_files_py template, or an
            error page if the name is invalid or the query fails
        """
        if  not re.match('/.*/.*/.*', dataset):
            return self.error('Invalid dataset name')
        # NOTE(review): dataset and instance are placed into the query
        # text as given - confirm they are validated upstream
        query = "file dataset=%s instance=%s | grep file.name" \
                % (dataset, instance)
        try:
            data   = self.dasmgr.result(query, idx=0, limit=0)
        except Exception as exc:
            print_exc(exc)
            msg = ('Exception: %s\n' % str(exc)) + \
                ('Unable to retrieve data for query=%s' % query)
            return self.error(msg)
        # unique file names in the order of their first appearance
        lfns = []
        for record in data:
            name = DotDict(record).get('file.name')
            if  name in lfns:
                continue
            lfns.append(name)
        page = self.templatepage('das_files_py', lfnList=lfns, pfnList=[],
                                 isinstance=isinstance, list=list)
        cherrypy.response.headers['Content-Type'] = "text/plain"
        return page

    @expose
    @checkargs(DAS_WEB_INPUTS)
    @tools.secmodv2()
    def request(self, **kwargs):
        """
        Request data from DAS cache.

        Recognized keyword arguments (all optional):
            input    -- user query string; an empty value triggers a redirect
            view     -- output representation, default 'list'
            qcache   -- forwarded to generate_dasquery, default 0
            instance -- DBS instance, default self.dbs_global

        Returns either a rendered page (wrapped with self.page for the
        'list' and 'table' views), the raw content for other views, or a
        polling page built from the 'das_check_pid' template while the
        query is still being processed.
        """
        # do not allow caching
        set_no_cache_flags()

        uinput  = kwargs.get('input', '').strip()
        # NOTE: the query is validated before the empty-input test below
        check_query(uinput)
        if  not uinput:
            kwargs['reason'] = 'No input found'
            return self.redirect(**kwargs)

        # if busy return right away
        if  self.busy():
            return self.busy_page(uinput)

        time0   = time.time()
        self.adjust_input(kwargs)
        view    = kwargs.get('view', 'list')
        qcache  = kwargs.get('qcache', 0)
        # any query text containing the substring 'instance' is rejected;
        # the instance must come from the separate 'instance' argument
        if  'instance' in uinput:
            form     = self.form(uinput=uinput, view=view)
            content  = 'On DAS web UI please use drop-down menu to specify DBS'
            content += ' instance to avoid ambiguity. '
            content += 'To proceed please clear your input query.'
            return self.page(form + '<div class="box_red">%s</div>' % content)
        else:
            inst = kwargs.get('instance', self.dbs_global)
        # re-read the input since adjust_input(kwargs) was called in between
        uinput  = kwargs.get('input', '')
        form    = self.form(uinput=uinput, instance=inst, view=view)
        # a truthy check means content is a ready-to-show page rather than
        # a DAS query object
        check, content = self.generate_dasquery(uinput, inst, qcache=qcache)
        if  check:
            if  view == 'list' or view == 'table':
                return self.page(form + content, ctime=time.time()-time0)
            else:
                return content
        dasquery = content # returned content is valid DAS query
        status, error, reason = self.dasmgr.get_status(dasquery)
        kwargs.update({'status':status, 'error':error, 'reason':reason})
        pid = dasquery.qhash
        if  status is None: # process new request
            kwargs['dasquery'] = dasquery.storage_query
            # identify the requester: remote address, overridden by the
            # user 'dn' when cherrypy.request carries a user attribute
            uid = cherrypy.request.headers.get('Remote-Addr')
            if  hasattr(cherrypy.request, 'user'):
                uid = cherrypy.request.user.get('dn', None)
            _evt, pid = self.taskmgr.spawn(self.dasmgr.call, dasquery,
                    uid=uid, pid=dasquery.qhash)
            self.reqmgr.add(pid, kwargs)
        elif status == 'ok' or status == 'fail':
            # query already processed: drop bookkeeping and render results
            self.reqmgr.remove(pid)
            self.taskmgr.remove(pid)

            # check if query can be rewritten via nested PK query
            rew_msg = self.q_rewriter and self.q_rewriter.check_fields(dasquery)
            if rew_msg:
                content =  self.templatepage('das_error', msg=rew_msg)
                return self.page(form + content, ctime=time.time()-time0)

            kwargs['dasquery'] = dasquery
            page = self.get_page_content(kwargs, complete_msg=False)
            ctime = (time.time()-time0)
            if  view == 'list' or view == 'table':
                return self.page(form + page, ctime=ctime)

            return page
        # reached for a freshly spawned request or any other status value
        if  self.taskmgr.is_alive(pid):
            # task is still running: return the page which polls check_pid
            page = self.templatepage('das_check_pid', method='check_pid',
                    uinput=uinput, view=view, urllib=urllib,
                    base=self.base, pid=pid, interval=self.interval)
        elif status == None:
            # DAS was busy and query expired since status==None
            if  not self.taskmgr.is_alive(pid) and self.reqmgr.has_pid(pid):
                self.reqmgr.remove(pid)
                self.taskmgr.remove(pid)
                return self.empty_return(dasquery, 'fail', 'request expired')
            page = self.templatepage('das_check_pid', method='check_pid',
                    uinput=uinput, view=view, urllib=urllib,
                    base=self.base, pid=pid, interval=self.interval)
        else:
            # task is not alive and a status is known: show what we have
            self.reqmgr.remove(pid)
            self.taskmgr.remove(pid)
            page = self.get_page_content(kwargs)
        ctime = (time.time()-time0)
        return self.page(form + page, ctime=ctime)

    @expose
    @tools.secmodv2()
    def status(self):
        """
        Render the DAS status page: the requests currently held by the
        request manager followed by per-service task manager statistics.
        """
        pending = list(self.reqmgr.items())
        page = self.templatepage('das_status', requests=pending, time=time)

        # collect core statistics and add the web layer task manager to them
        sdict = self.dasmgr.status()
        sdict['web'] = self.taskmgr.status()
        dasprint(dastimestamp('DAS INFO '), "web TaskManager", sdict['web'])
        for system, info in sdict.items():
            dasprint(dastimestamp('DAS INFO '), "%s TaskManager %s" % (system, info))

        def render(stats):
            "Format a statistics dict as key/value pairs sorted by key"
            pairs = ('<em>%s:</em> %s' % (name, stats[name]) for name in sorted(stats))
            return ', '.join(pairs)

        page += '<h3>Services</h3>'
        for system, info in sdict.items():
            summary = ', '.join(render(entry) for entry in info.values())
            page += '<div>' + '<b>%s</b>: %s' % (system, summary) + '</div>'
        return self.page(page)

    @expose
    @checkargs(['pid'])
    @tools.secmodv2()
    def check_pid(self, pid):
        """
        Report the processing state of the given pid. It is the server
        side callback of ajaxCheckPid, see js/ajax_utils.js

        Returns an HTML snippet while the task is alive or just completed,
        the outcome of self.request when the request had to be re-initiated
        from the Referer header, an empty string when no Referer is
        available, or an error page if anything fails along the way.
        """
        # do not allow caching
        set_no_cache_flags()

        spinner = '<img src="%s/images/loading.gif" alt="loading"/>' % self.base
        page = ''
        try:
            if  self.taskmgr.is_alive(pid):
                page = spinner + " processing PID=%s" % pid
            elif self.reqmgr.has_pid(pid):
                # the task is gone but the request manager still knows the
                # pid, i.e. the request was processed on this host
                self.reqmgr.remove(pid)
                self.taskmgr.remove(pid)
                page = ('Request PID=%s is completed' % pid) \
                        + ', please wait for results to load'
            else:
                # this server holds no such request, re-initiate it using
                # the query string of the referring page
                referer = cherrypy.request.headers.get('Referer', None)
                if  referer:
                    params = dict(parse_qsl(urlparse(referer).query))
                    return self.request(**params)
                msg = 'No referer in cherrypy.request.headers' \
                        + '\nHeaders: %s' % cherrypy.request.headers
                dasprint(dastimestamp('DAS WEB ERROR '), msg)
        except Exception as err:
            msg = 'check_pid fails for pid=%s' % pid
            dasprint(dastimestamp('DAS WEB ERROR '), msg)
            print_exc(err)
            # drop bookkeeping of the failed pid before reporting the error
            self.reqmgr.remove(pid)
            self.taskmgr.remove(pid)
            return self.error(gen_error_msg({'pid':pid}), wrap=False)
        return page

    def listview(self, head, data):
        """Delegate list view rendering of head/data to the representation manager"""
        render = self.repmgr.listview
        return render(head, data)

    def tableview(self, head, data):
        """Delegate tabular view rendering of head/data to the representation manager"""
        render = self.repmgr.tableview
        return render(head, data)

    def plainview(self, head, data):
        """Delegate plain view rendering of head/data to the representation manager"""
        render = self.repmgr.plainview
        return render(head, data)

    def xmlview(self, head, data):
        """Delegate XML rendering of head/data to the representation manager"""
        render = self.repmgr.xmlview
        return render(head, data)

    def jsonview(self, head, data):
        """Delegate JSON rendering of head/data to the representation manager"""
        render = self.repmgr.jsonview
        return render(head, data)

    @exposedasjson
    @enable_cross_origin
    @checkargs(['query', 'dbs_instance'])
    @tools.secmodv2()
    def autocomplete(self, **kwargs):
        """
        Provides autocomplete functionality for DAS web UI.

        Recognized keyword arguments:
            query        -- text typed by the user (stripped before use)
            dbs_instance -- DBS instance, default self.dbs_global

        Returns the list of suggestions produced by autocomplete_helper.
        When the dataset daemon is enabled and at least one suggestion
        contains 'dataset=', the list is extended with dataset names found
        by the DBS manager for the last token of the query.
        """
        query = kwargs.get("query", "").strip()
        result = autocomplete_helper(query, self.dasmgr, self.daskeys)
        has_dataset = any('dataset=' in r['value'] for r in result)
        dbsinst = kwargs.get('dbs_instance', self.dbs_global)
        if  self.dataset_daemon and has_dataset:
            dbsmgr = self._get_dbsmgr(dbsinst)
            # we shall autocomplete the last token so queries like
            # file dataset=/ZMM/.. are autocompleted
            prefix = ''
            if ' ' in query:
                tokens = query.split()
                # NOTE(review): leading tokens are joined with two spaces,
                # kept as is since clients may rely on it -- confirm intent
                prefix = '  '.join(tokens[:-1]) + ' '
                query = tokens[-1]
            # replace is a no-op when the token carries no 'dataset=' part
            query = query.replace('dataset=', '')
            for row in dbsmgr.find(query):
                result.append({'css': 'ac-info',
                               'value': prefix + 'dataset=%s' % row,
                               'info': 'dataset'})
        return result
Ejemplo n.º 19
0
Archivo: das_core.py Proyecto: dmwm/DAS
class DASCore(object):
    """
    DAS core class.
    """

    def __init__(self, config=None, debug=0, nores=False, logger=None, engine=None, multitask=True):
        """
        Set up DAS core: configuration, task manager, mapping, key learning,
        raw cache and one service object per system known to the mapping.

        Parameters:
            config    -- DAS configuration dict; when falsy it is read via
                         das_readconfig(). The dict is modified in place.
            debug     -- stored as self.stdout; when it is an int it also
                         overrides the configured verbosity level
            nores     -- when truthy, self.noresults is set and
                         'write_cache' is forced to True in the config
            logger    -- logger object; a PrintManager is created if omitted
            engine    -- when given, a PluginTaskManager bound to it is used
                         instead of a plain TaskManager
            multitask -- pass False to switch multitasking off explicitly

        Raises Exception when a data-service plugin cannot be loaded.
        """
        if config:
            dasconfig = config
        else:
            dasconfig = das_readconfig()
        verbose = dasconfig["verbose"]
        self.stdout = debug
        if isinstance(debug, int):
            # integer debug level takes precedence over configured verbosity
            self.verbose = debug
            dasconfig["verbose"] = debug
        else:
            self.verbose = verbose
        das_timer("DASCore::init", self.verbose)
        self.operators = das_operators()

        # set noresults option
        self.noresults = False
        if nores:
            dasconfig["write_cache"] = True
            self.noresults = nores

        self.multitask = dasconfig["das"].get("multitask", True)
        if debug or self.verbose:
            self.multitask = False  # in verbose mode do not use multitask
            dasconfig["das"]["multitask"] = False
        if not multitask:  # explicitly call DASCore ctor
            self.multitask = False
            dasconfig["das"]["multitask"] = False
        # the engine is published via the config for the objects built below
        dasconfig["engine"] = engine
        if self.multitask:
            nworkers = dasconfig["das"].get("core_workers", 5)
            if engine:
                thr_name = "DASCore:PluginTaskManager"
                self.taskmgr = PluginTaskManager(engine, nworkers=nworkers, name=thr_name)
                self.taskmgr.subscribe()
            else:
                thr_name = "DASCore:TaskManager"
                self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        if logger:
            self.logger = logger
        else:
            self.logger = PrintManager("DASCore", self.verbose)

        # define Mapping/Analytics/Parser in this order since Parser depends
        # on first two
        dasmapping = DASMapping(dasconfig)
        dasconfig["dasmapping"] = dasmapping
        self.mapping = dasmapping

        self.keylearning = DASKeyLearning(dasconfig)
        dasconfig["keylearning"] = self.keylearning

        # init DAS cache
        self.rawcache = DASMongocache(dasconfig)
        dasconfig["rawcache"] = self.rawcache

        # plug-in architecture: loop over registered data-services in
        # dasconfig; load appropriate module/class; register data
        # service with DASCore.
        self.systems = dasmapping.list_systems()
        # pointer to the DAS top level directory
        dasroot = "/".join(__file__.split("/")[:-3])
        for name in self.systems:
            try:
                klass = "DAS/services/%s/%s_service.py" % (name, name)
                srvfile = os.path.join(dasroot, klass)
                # scan the service source file for the class derived from
                # DASAbstractService; klass is rebound from the file path
                # to that class name
                with open(srvfile) as srvclass:
                    for line in srvclass:
                        if line.find("(DASAbstractService)") != -1:
                            klass = line.split("(DASAbstractService)")[0]
                            klass = klass.split("class ")[-1]
                            break
                mname = "DAS.services.%s.%s_service" % (name, name)
                module = __import__(mname, fromlist=[klass])
                obj = getattr(module, klass)(dasconfig)
                # the service object becomes an attribute named after the system
                setattr(self, name, obj)
            except IOError as err:
                # NOTE(review): debug is not guaranteed to be an int (see the
                # isinstance check above), this comparison may fail for other
                # types -- confirm
                if debug > 1:
                    # we have virtual services, so IOError can be correct
                    print_exc(err)
                # no dedicated service file, fall back to the generic service
                try:
                    mname = "DAS.services.generic_service"
                    module = __import__(mname, fromlist=["GenericService"])
                    obj = module.GenericService(name, dasconfig)
                    setattr(self, name, obj)
                except Exception as exc:
                    print_exc(exc)
                    msg = "Unable to load %s data-service plugin" % name
                    raise Exception(msg)
            except Exception as exc:
                print_exc(exc)
                msg = "Unable to load %s data-service plugin" % name
                raise Exception(msg)

        # loop over systems and get system keys, add mapping keys to final list
        self.service_keys = {}
        self.service_parameters = {}
        for name in self.systems:
            # entries are stored under the name reported by the service object
            skeys = list(getattr(self, name).keys())
            self.service_keys[getattr(self, name).name] = skeys
            sparams = getattr(self, name).parameters()
            self.service_parameters[getattr(self, name).name] = sparams

        self.service_keys["special"] = das_special_keys()
        self.dasconfig = dasconfig
        das_timer("DASCore::init", self.verbose)

    def keys(self):
        """
        Return map of data service keys
        """
        return self.service_keys

    def das_keys(self):
        """
        Return map of data service keys
        """
        _keys = ["records"]
        for values in self.service_keys.values():
            for key in values:
                if key not in _keys:
                    _keys.append(key)
        return _keys

    def result(self, query, idx=0, limit=None):
        """
        Process the given query and return its results from the cache.

        When no service API covers the query it is decomposed into one
        query per selection field and each of them is processed separately.
        An empty list is returned if the noresults option is set.
        """
        self.logger.info("input query=%s" % query)
        dasquery = DASQuery(query)
        mongo_query = dasquery.mongo_query
        if dasquery.service_apis_map():
            self.call(dasquery)  # process query
        else:
            # no service covers the whole query, process it field by field
            self.logger.info("no APIs found to answer input query, will decompose it")
            for key in mongo_query["fields"] or []:
                subquery = DASQuery(dict(fields=[key], spec=mongo_query["spec"]))
                self.call(subquery)  # process query

        if self.noresults:
            return []
        # lookup provided query in a cache
        return self.get_from_cache(dasquery, idx, limit)

    def remove_from_cache(self, dasquery):
        """
        Delete in cache entries about input query
        """
        self.rawcache.remove_from_cache(dasquery)

    def get_status(self, dasquery):
        """
        Look-up status of provided query in a cache.
        Return status of the query request and its hash.
        """
        status = None
        error = None
        reason = None
        if dasquery and "fields" in dasquery.mongo_query:
            fields = dasquery.mongo_query["fields"]
            if fields and isinstance(fields, list) and "queries" in fields:
                return "ok", error, reason
        record = self.rawcache.find(dasquery)
        error, reason = self.rawcache.is_error_in_records(dasquery)
        try:
            if record and "das" in record and "status" in record["das"]:
                status = record["das"]["status"]
                if not error:
                    error = record["das"].get("error", error)
                if not reason:
                    reason = record["das"].get("reason", reason)
                return status, error, reason
        except Exception as exc:
            print_exc(exc)
            status = error = reason = None
            self.rawcache.remove_from_cache(dasquery)
        return status, error, reason

    def worker(self, srv, dasquery):
        """Run the call method of the named data-service, wrapped by das_timer calls"""
        self.logger.info("##### %s ######\n" % srv)
        das_timer(srv, self.verbose)
        # service objects are stored as attributes named after the system
        service = getattr(self, srv)
        service.call(dasquery)
        das_timer(srv, self.verbose)

    def insert_query_records(self, dasquery):
        """
        Insert DAS query records into DAS cache and return list of services
        which will answer this query.

        One query record is inserted for every entry yielded by the apimap
        of each candidate service, followed by the "das" record itself.
        The returned list holds the services which provided at least one
        API; when there is none, the candidate services are returned
        unchanged (an empty list if the query has no services at all).
        """
        services = dasquery.services
        self.logger.info("Potential services = %s" % services)
        if not services:
            msg = "No data-services for query %s" % dasquery
            msg += ", mongo_query: %s" % dasquery.mongo_query
            msg += ", params: %s" % dasquery.params()
            print(dastimestamp("DAS WARNING "), msg)
            if services is None:
                # keep the loop below and the return value well defined
                services = []

        # get list of URI which can answer this query
        ack_services = []
        for srv in services:
            # collect all API entries before inserting any record
            entries = list(getattr(self, srv).apimap(dasquery))
            for url, api, _args, _iformat, expire in entries:
                header = dasheader(srv, dasquery, expire, api, url, ctime=0)
                self.rawcache.insert_query_record(dasquery, header)
                if srv not in ack_services:
                    ack_services.append(srv)
        if not ack_services:
            ack_services = services
        # create das record with initial expire tstamp 2 min in a future
        # it should be sufficient for processing data-srv records
        expire = time.time() + 2 * 60
        header = dasheader("das", dasquery, expire, api="das_core", services=dict(das=ack_services))
        header["lookup_keys"] = []
        self.rawcache.insert_query_record(dasquery, header)
        das_timer("das_record", self.verbose)
        return ack_services

    def call(self, query, **kwds):
        """
        Top level DAS api which execute a given query using underlying
        data-services. It follows the following steps:

            - parse input query
            - identify data-services based on selection keys
              and where clause conditions
            - construct DAS workflow and execute data-service
              API calls. At this step individual
              data-services store results into DAS cache.

        Return a status string which can be used by workers on cache
        server:

            - "in cache" for queries which only look up records in cache
            - the cached status when the query is already in the cache
            - "fail" when a data-service worker raises an exception or no
              data records end up in the cache
            - "ok" otherwise

        kwds is provided for compatibility with web layer, e.g. it
        may invoke this method with additional pid parameter.
        """

        def update_das_query(dasquery, status, reason=None):
            "Update DAS query record with given status and reason"
            self.rawcache.update_query_record(dasquery, status, reason=reason)
            self.rawcache.add_to_record(dasquery, {"das.timer": get_das_timer()}, system="das")
            # make sure that das record is updated, we use up to 7 iterations
            # to cover default syncdelay value of mongo server (in a future
            # it would be better to find programmatically this syncdelay
            # value, but it seems pymongo driver does not provide any API
            # for it).
            # NOTE(review): the sleeps below are 0, 1, 4, ... 36 seconds,
            # i.e. up to 91 seconds in total rather than one minute
            for idx in range(0, 7):
                spec = {"qhash": dasquery.qhash, "das.system": ["das"]}
                res = self.rawcache.col.find_one(spec)
                if res:
                    dbstatus = res.get("das", {}).get("status", None)
                    if dbstatus == status:
                        break
                    msg = "qhash %s, das.status=%s, status=%s, wait for update" % (dasquery.qhash, dbstatus, status)
                    print(dastimestamp("DAS WARNING"), msg)
                # record is missing or carries another status: update again
                self.rawcache.update_query_record(dasquery, status, reason=reason)
                time.sleep(idx * idx)

        self.logger.info("input query=%s" % query)
        das_timer("DASCore::call", self.verbose)
        # accept a ready DASQuery object (recognized by its class name) or
        # anything DASQuery can be constructed from
        if isinstance(query, object) and hasattr(query, "__class__") and query.__class__.__name__ == "DASQuery":
            dasquery = query
        else:
            dasquery = DASQuery(query)
        for col in ["merge", "cache"]:
            self.rawcache.remove_expired(dasquery, col)
        query = dasquery.mongo_query
        spec = query.get("spec")
        fields = query.get("fields")
        # queries about cache records themselves need no data-service calls
        if fields == ["records"]:
            msg = "look-up all records in cache"
            self.logger.info(msg)
            return "in cache"
        if spec == dict(records="*"):
            self.logger.info("look-up everything in cache")
            return "in cache"
        # the query is already known to the cache: report the status of the
        # first matching record
        for record in self.rawcache.find_specs(dasquery):
            status = record["das"]["status"]
            msg = "found query %s in cache, status=%s\n" % (record["query"], status)
            self.logger.info(msg)
            print(dastimestamp("DAS INFO"), msg)
            return status

        self.logger.info(dasquery)
        das_timer("das_record", self.verbose)
        services = self.insert_query_records(dasquery)
        if not services:
            msg = "unable to locate data-services to fulfill this request"
            msg += ", will iterate over all registered services"
            print(dastimestamp("DAS WARNING "), dasquery, msg)
            services = dasquery.services if dasquery.services else self.systems
        try:
            if self.multitask:
                # run one worker per service via task manager and wait for all
                jobs = []
                for srv in sorted(services):
                    jobs.append(self.taskmgr.spawn(self.worker, srv, dasquery))
                self.taskmgr.joinall(jobs)
            else:
                for srv in services:
                    self.worker(srv, dasquery)
        except Exception as exc:
            # NOTE(review): the query record is not updated on this path,
            # only the return value reports the failure -- confirm intent
            print_exc(exc)
            return "fail"
        self.logger.info("\n##### merging ######\n")
        update_das_query(dasquery, "merging")
        das_timer("merge", self.verbose)
        for attempt in range(0, 4):  # try couple of times to avoid DB problems
            time.sleep(attempt)
            status = self.rawcache.merge_records(dasquery, attempt)
            if status == "ok":
                break
        das_timer("merge", self.verbose)
        # check if we have service records and properly setup status
        self.logger.info("\n##### check services ######\n")
        das_services = self.rawcache.check_services(dasquery)
        reason = ""
        # the merge status is not used any further, the final status depends
        # on the presence of service records only
        status = "ok"
        if not das_services:
            if "records" in dasquery.query:
                status = "ok"  # keep status ok for 'records' queries
            else:
                reason = "no data records found in DAS cache"
                status = "fail"
                print(dastimestamp("DAS ERROR "), dasquery, reason)
        update_das_query(dasquery, status, reason)
        das_timer("DASCore::call", self.verbose)
        return status

    def processing_time(self, dasquery):
        "Look-up and return DAS query processing time"
        query_record = self.rawcache.find(dasquery)
        if query_record:
            das = query_record.get("das", None)
            if isinstance(das, dict):
                ctime = das.get("ctime", [])
                if ctime:
                    return ctime[-1] - ctime[0]
        return None

    def nresults(self, dasquery, coll="merge"):
        """
        Return total number of results (count) for provided query
        Code should match body of get_from_cache method.
        """
        fields = dasquery.mongo_query.get("fields", None)
        if dasquery.mapreduce:
            result = self.rawcache.map_reduce(dasquery.mapreduce, dasquery)
            return len([1 for _ in result])
        elif dasquery.aggregators:
            return len(dasquery.aggregators)
        return self.rawcache.nresults(dasquery, coll)

    def apilist(self, dasquery):
        "Return list of APIs answer given das query"
        return self.rawcache.apilist(dasquery)

    def incache(self, dasquery, coll="merge"):
        """
        Answer the question if given query in DAS cache or not
        """
        return self.rawcache.incache(dasquery, collection=coll)

    def get_from_cache(self, dasquery, idx=0, limit=0, collection="merge"):
        """
        Look-up results from the merge cache and yield them for
        further processing.

        This is a generator. Depending on the query it yields:
            - map/reduce results when the query has a mapreduce part
            - one record per aggregator function, service and API when the
              query has aggregators (or a single "N/A" record for an
              aggregator which produced nothing)
            - plain cache records for the idx/limit window otherwise

        The first yielded row is also passed to the key learning DB and
        fix_times is applied to every row.
        """
        das_timer("DASCore::get_from_cache", self.verbose)
        msg = "col=%s, query=%s, idx=%s, limit=%s" % (collection, dasquery, idx, limit)
        self.logger.info(msg)

        if dasquery.mapreduce:
            res = self.rawcache.map_reduce(dasquery.mapreduce, dasquery)
        elif dasquery.aggregators:
            # extract das information from rawcache
            rows = self.rawcache.get_from_cache(dasquery, collection=collection)
            # use a default here: StopIteration leaking from a bare next()
            # is converted into RuntimeError inside a generator (PEP 479),
            # which would break aggregator queries over an empty cache
            first = next(rows, None)
            sinfo = das_sinfo(first) if first is not None else {}
            # to perform aggregation we need:
            # - loop over all aggregator functions
            # - loop over all data-services
            # - loop over all APIs within a data-services
            # the code below does that, it applies aggregator
            # to selected (based on key/srv/api) records
            res = []
            _id = 0
            time0 = time.time()
            expire = 300  # min expire
            for func, key in dasquery.aggregators:
                afunc = getattr(das_aggregator, "das_%s" % func)
                found = False
                for srv, apis in sinfo.items():
                    for api in apis:
                        # a fresh look-up is required for every pass since
                        # the rows are consumed by the aggregator function
                        rows = self.rawcache.get_from_cache(dasquery, collection=collection)
                        gen = api_rows(rows, api)
                        data = afunc(key, gen)
                        ctime = time.time() - time0
                        das = dasheader(srv, dasquery, expire, api=api, ctime=ctime)
                        if isinstance(data, dict) and data["value"] != "N/A":
                            aggr = {"_id": _id, "function": func, "key": key, "result": data}
                            aggr.update(das)
                            res.append(aggr)
                            _id += 1
                            found = True
                if not found:  # when we got nothing add empty result record
                    empty = {"value": "N/A"}
                    ctime = time.time() - time0
                    das = dasheader("das", dasquery, expire, api="das_core", ctime=ctime)
                    rec = {"_id": 0, "function": func, "key": key, "result": empty}
                    rec.update(das)
                    res.append(rec)
        else:
            res = self.rawcache.get_from_cache(dasquery, idx, limit, collection=collection)
        # we assume that all records from single query will have
        # identical structure, therefore it will be sufficient to update
        # keylearning DB only with first record
        count = 0
        for row in res:
            if not count:
                self.keylearning.add_record(dasquery, row)
            fix_times(row)
            yield row
            count += 1
        das_timer("DASCore::get_from_cache", self.verbose)
Ejemplo n.º 20
0
class DASAbstractService(object):
    """
    Abstract class describing DAS service. It initialized with a name which
    is used to identify service parameters from DAS configuration file.
    Those parameters are keys, verbosity level, URL of the data-service.
    """
    def __init__(self, name, config):
        """
        Set up the service from the DAS configuration.

        :param name: service name; it is used in the logger and thread
            names and to select the matching ``thread_weights`` entry.
        :param config: DAS configuration dictionary. The entries read here
            are ``verbose``, ``dasmapping``, ``write_cache``, ``das``,
            ``mongodb`` and ``rawcache``.
        :raises Exception: when the mandatory configuration entries cannot
            be read, or when no ``rawcache`` object is supplied.
        """
        self.name = name
        try:
            self.verbose = config['verbose']
            title = 'DASAbstactService_%s' % self.name
            self.logger = PrintManager(title, self.verbose)
            self.dasmapping = config['dasmapping']
            self.write2cache = config.get('write_cache', True)
            self.multitask = config['das'].get('multitask', True)
            self.error_expire = config['das'].get('error_expire', 300)
            self.dbs_global = None  # to be configured at run time
            self.dburi = config['mongodb']['dburi']
            self.gfs = db_gridfs(self.dburi)
        except Exception as exc:
            print_exc(exc)
            raise Exception('fail to parse DAS config')

        # read key/cert info; a failure is reported but not fatal, the
        # service then runs without a key/certificate pair
        try:
            self.ckey, self.cert = get_key_cert()
        except Exception as exc:
            print_exc(exc)
            self.ckey = None
            self.cert = None

        if self.multitask:
            nworkers = config['das'].get('api_workers', 3)
            # thread_weights entries have the form "<system>:<weight>"; the
            # weight of this service multiplies its number of workers
            thr_weights = config['das'].get('thread_weights', [])
            for system_weight in thr_weights:
                system, weight = system_weight.split(':')
                if system == self.name:
                    nworkers *= int(weight)
            thr_name = 'DASAbstractService:%s:TaskManager' % self.name
            self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        self.map = {}  # to be defined by data-service implementation
        self._keys = None  # to be defined at run-time in self.keys
        self._params = None  # to be defined at run-time in self.parameters
        self._notations = {}  # to be defined at run-time in self.notations

        self.logger.info('initialized')
        # define internal cache manager to put 'raw' results into cache
        if 'rawcache' in config and config['rawcache']:
            self.localcache = config['rawcache']
        else:
            msg = 'Undefined rawcache, please check your configuration'
            raise Exception(msg)

    def status(self):
        "Return status of the service"
        return self.taskmgr.status()

    def services(self):
        """
        Return sub-subsystems used to retrieve data records. It is used
        in dasheader call to setup das.services field. This method can be
        overwritten in sub-classes, otherwise returns dict of service name
        and CMS systems used to retrieve data records.
        """
        return {self.name: [self.name]}

    def version(self):
        """Return data-services version, should be implemented in sub-classes"""
        return ''

    def keys(self):
        """
        Return service keys
        """
        if self._keys:
            return self._keys
        srv_keys = []
        for _api, params in self.map.items():
            for key in params['keys']:
                if not key in srv_keys:
                    srv_keys.append(key)
        self._keys = srv_keys
        return srv_keys

    def parameters(self):
        """
        Return mapped service parameters
        """
        if self._params:
            return self._params
        srv_params = []
        for _api, params in self.map.items():
            for key in params['params']:
                param_list = self.dasmapping.api2das(self.name, key)
                for par in param_list:
                    if not par in srv_params:
                        srv_params.append(par)
        self._params = srv_params
        return srv_params

    def notations(self):
        """
        Return a map of system notations.
        """
        if self._notations:
            return self._notations
        for _, rows in self.dasmapping.notations(self.name).items():
            for row in rows:
                api = row['api']
                nmap = row['rec_key']
                notation = row['api_output']
                if api in self._notations:
                    self._notations[api].update({notation: nmap})
                else:
                    self._notations[api] = {notation: nmap}
        return self._notations

    def getdata(self, url, params, expire, headers=None, post=None):
        """
        Forward the request to the module-level getdata helper. The
        service key/certificate pair is appended to the positional
        arguments only when the URL contains 'https:'.
        """
        call_args = (url, params, headers, expire, post,
                     self.error_expire, self.verbose)
        if url.find('https:') != -1:
            call_args += (self.ckey, self.cert)
        return getdata(*call_args, system=self.name)

    def call(self, dasquery):
        """
        Invoke service API to execute given query.
        Return results as a collect list set.
        """
        self.logger.info(dasquery)
        # check the cache for records with given query/system
        res = self.localcache.incache(dasquery,
                                      collection='cache',
                                      system=self.name)
        if res:
            msg = "found records in local cache"
            self.logger.info(msg)
            return
        # ask data-service api to get results, they'll be store them in
        # cache, so return at the end what we have in cache.
        self.api(dasquery)

    def write_to_cache(self, dasquery, expire, url, api, args, gen, ctime):
        """
        Write provided result set into DAS cache.
        """
        if not self.write2cache:
            return

        # before going to cache we should check/set possible misses, e.g.
        # primary key when error is thrown
        result = self.set_misses(dasquery, api, gen)

        # update the cache
        header = dasheader(self.name,
                           dasquery,
                           expire,
                           api,
                           url,
                           services=self.services())
        header['lookup_keys'] = self.lookup_keys(api)
        header['prim_key'] = self.dasmapping.primary_mapkey(self.name, api)
        header['ctime'] = ctime
        system = self.name
        self.localcache.update_cache(dasquery, result, header, system, api)

        msg = 'cache has been updated,\n'
        self.logger.debug(msg)

    def adjust_params(self, api, kwds, instance=None):
        """
        Data-service specific parser to adjust parameters according to
        its specifications. For example, DQ service accepts a string
        of parameters, rather parameter set, while DBS2 can reuse
        some parameters for different API, e.g. I can use dataset path
        to pass to listPrimaryDatasets as primary_dataset pattern.
        """
        pass

    def lookup_keys(self, api):
        """
        Return look-up keys of data output for given data-service API.
        """
        lkeys = self.dasmapping.lookup_keys(self.name, api)
        return [{api: lkeys}]

    def inspect_params(self, api, args):
        """
        Perform API parameter inspection. Check if API accept a range
        of parameters, etc.
        """
        for key, value in args.items():
            if isinstance(value, dict):
                minval = None
                maxval = None
                for oper, val in value.items():
                    if oper == '$in':
                        minval = int(val[0])
                        maxval = int(val[-1])
                        args[key] = range(minval, maxval)
                    elif oper == '$lt':
                        maxval = int(val)
                        args[key] = maxval
                    elif oper == '$lte':
                        maxval = int(val)
                        args[key] = maxval
                    elif oper == '$gt':
                        minval = int(val)
                        args[key] = minval
                    elif oper == '$gte':
                        minval = int(val)
                        args[key] = minval
                    else:
                        msg = '%s does not support operator %s' % (api, oper)
                        raise Exception(msg)
        return args

    def get_notations(self, api):
        """Return notations used for given API"""
        notationmap = self.notations()
        if not notationmap:
            return {}
        notations = {}
        if '' in notationmap:
            notations = dict(notationmap[''])  # notations applied to all APIs
            if api in notationmap:  # overwrite the one for provided API
                notations.update(notationmap[api])
        return notations

    def parser(self, dasquery, dformat, data, api):
        """
        DAS data parser. Input parameters:

        - *query* input DAS query
        - *dformat* is a data format, e.g. XML, JSON
        - *data* is a data source, either file-like object or
          actual data
        - *api* is API name
        """
        prim_key = self.dasmapping.primary_key(self.name, api)
        counter = 0
        if dformat.lower() == 'xml':
            tags = self.dasmapping.api2daskey(self.name, api)
            gen = xml_parser(data, prim_key, tags)
            for row in gen:
                counter += 1
                yield row
        elif dformat.lower() == 'json' or dformat.lower() == 'dasjson':
            gen = json_parser(data, self.logger)
            das_dict = {}
            for row in gen:
                if dformat.lower() == 'dasjson':
                    for key, val in row.items():
                        if key != 'results':
                            das_dict[key] = val
                    row = row['results']
                if isinstance(row, list):
                    for item in row:
                        if item:
                            if prim_key in item:
                                counter += 1
                                yield item
                            else:
                                counter += 1
                                yield {prim_key: item}
                else:
                    if prim_key in row:
                        counter += 1
                        yield row
                    else:
                        counter += 1
                        yield {prim_key: row}
        else:
            msg = 'Unsupported data format="%s", API="%s"' % (dformat, api)
            raise Exception(msg)
        msg = "api=%s, format=%s " % (api, dformat)
        msg += "prim_key=%s yield %s rows" % (prim_key, counter)
        self.logger.info(msg)

    def translator(self, api, genrows):
        """
        Convert raw results into DAS records. 
        """
        prim_key = self.dasmapping.primary_key(self.name, api)
        count = 0
        for row in genrows:
            row2das(self.dasmapping.notation2das, self.name, api, row)
            count += 1
            # check for primary key existance, since it can be overriden
            # by row2das. For example DBS3 uses flat namespace, so we
            # override dataset=>name, while dataset still is a primary key
            if isinstance(row, list):
                yield {prim_key: row}
            elif prim_key in row:
                if prim_key in row[prim_key]:
                    yield row[prim_key]  # remapping may create nested dict
                else:
                    yield row
            else:
                yield {prim_key: row}
        msg = "yield %s rows" % count
        self.logger.debug(msg)

    def set_misses(self, dasquery, api, genrows):
        """
        Check and adjust DAS records wrt input query. If some of the DAS
        keys are missing, add it with its value to the DAS record.

        This is a generator. The rows are first passed through
        parse2gridfs. The first row is compared with the query spec to
        find the keys whose value differs (keys2adjust); when there are
        none the rows are yielded untouched, otherwise every row is
        wrapped into a DotDict and the differing keys are set on it.
        An empty input yields nothing.
        """
        # look-up primary key
        prim_key = self.dasmapping.primary_key(self.name, api)

        # Scan all docs and store those whose size above MongoDB limit into
        # GridFS
        map_key = self.dasmapping.primary_mapkey(self.name, api)
        genrows = parse2gridfs(self.gfs, map_key, genrows, self.logger)

        spec = dasquery.mongo_query['spec']
        try:
            row = next(genrows)
        except StopIteration:
            # no rows at all; letting StopIteration escape from a generator
            # body is turned into RuntimeError by PEP 479
            return
        ddict = DotDict(row)
        keys2adjust = []
        for key in spec.keys():
            val = ddict.get(key)
            if spec[key] != val and key not in keys2adjust:
                keys2adjust.append(key)
        msg = "adjust keys %s" % keys2adjust
        self.logger.debug(msg)
        count = 0
        if keys2adjust:
            # adjust of the rows
            for row in yield_rows(row, genrows):
                ddict = DotDict(row)
                pval = ddict.get(map_key)
                if isinstance(pval, dict) and 'error' in pval:
                    # error record: blank the mapped key and keep the
                    # error payload under the primary key
                    ddict[map_key] = ''
                    ddict.update({prim_key: pval})
                for key in keys2adjust:
                    value = spec[key]
                    existing_value = ddict.get(key)
                    # the way to deal with proximity/patern/condition results
                    if  (isinstance(value, str) or isinstance(value, unicode))\
                        and value.find('*') != -1: # we got pattern
                        if existing_value:
                            value = existing_value
                    elif isinstance(value, dict) or \
                        isinstance(value, list): # we got condition
                        if existing_value:
                            value = existing_value
                        elif isinstance(value, dict) and \
                        '$in' in value: # we got a range {'$in': []}
                            value = value['$in']
                        elif isinstance(value, dict) and \
                        '$lte' in value and '$gte' in value:
                            # we got a between range
                            value = [value['$gte'], value['$lte']]
                        else:
                            value = json.dumps(value)
                    elif existing_value and value != existing_value:
                        # we got proximity results
                        if 'proximity' in ddict:
                            proximity = DotDict({key: existing_value})
                            ddict['proximity'].update(proximity)
                        else:
                            proximity = DotDict({})
                            proximity[key] = existing_value
                            ddict['proximity'] = proximity
                    else:
                        if existing_value:
                            value = existing_value
                    ddict[key] = value
                yield ddict
                count += 1
        else:
            # nothing to adjust, pass the rows through (first row included
            # in the count)
            yield row
            count += 1
            for row in genrows:
                yield row
                count += 1
        msg = "yield %s rows" % count
        self.logger.debug(msg)

    def api(self, dasquery):
        """
        Data service api method, can be defined by data-service class.
        It parse input query and invoke appropriate data-service API
        call. All results are stored into the DAS cache along with
        api call inserted into Analytics DB.
        """
        self.logger.info(dasquery)
        genrows = self.apimap(dasquery)
        if not genrows:
            return
        jobs = []
        for url, api, args, dformat, expire in genrows:
            # insert DAS query record for given API
            header = dasheader(self.name, dasquery, expire, api, url)
            self.localcache.insert_query_record(dasquery, header)
            # fetch DAS data records
            if self.multitask:
                jobs.append(self.taskmgr.spawn(self.apicall, \
                            dasquery, url, api, args, dformat, expire))
            else:
                self.apicall(dasquery, url, api, args, dformat, expire)
        if self.multitask:
            self.taskmgr.joinall(jobs)

    def apicall(self, dasquery, url, api, args, dformat, expire):
        """
        Data service api method, can be defined by data-service class.
        It inspects the API arguments, fetches the data from the
        data-service, parses and translates them into DAS records and
        writes those into the DAS cache.

        Failures are reported (message and traceback are printed) but
        not propagated to the caller.

        We invoke explicitly close call for our datastream instead
        of using context manager since this method as well as
        getdata/parser can be overwritten by child classes. The close
        call lives in a finally clause, so the stream is released on
        every exit path, including errors that are not sub-classes of
        Exception.
        """
        datastream = None
        try:
            args = self.inspect_params(api, args)
            time0 = time.time()
            headers = make_headers(dformat)
            datastream, expire = self.getdata(url, args, expire, headers)
            self.logger.info("%s expire %s" % (api, expire))
            rawrows = self.parser(dasquery, dformat, datastream, api)
            dasrows = self.translator(api, rawrows)
            # parser/translator are generators, i.e. they have not run at
            # this point yet
            ctime = time.time() - time0
            self.write_to_cache(dasquery, expire, url, api, args, dasrows,
                                ctime)
        except Exception as exc:
            msg  = 'Fail to process: url=%s, api=%s, args=%s' \
                    % (url, api, args)
            print(msg)
            print_exc(exc)
        finally:
            close(datastream)

    def url_instance(self, url, _instance):
        """
        Virtual method to adjust URL for a given instance,
        must be implemented in service classes
        """
        return url

    def adjust_url(self, url, instance):
        """
        Adjust data-service URL wrt provided instance, e.g.
        DBS carry several instances
        """
        if instance:
            url = self.url_instance(url, instance)
        return url

    def apimap(self, dasquery):
        """
        Generator which inspects the input query and, for every API of
        self.map able to answer it, yields the tuple
        (url, api, args, format, expire) for further processing.
        APIs which do not fit the query are skipped; the reason is
        written to the logger.
        """
        srv = self.name  # get local copy to avoid threading issues
        cond = getarg(dasquery.mongo_query, 'spec', {})
        instance = dasquery.mongo_query.get('instance', self.dbs_global)
        skeys = getarg(dasquery.mongo_query, 'fields', [])
        if not skeys:
            skeys = []

        def reject(reason, api_args=None):
            "Log why an API is skipped, optionally with its arguments"
            self.logger.info(reason)
            if api_args is not None:
                self.logger.debug('args=%s' % api_args)

        self.logger.info("\n")
        for api, value in self.map.items():
            expire = value['expire']
            iformat = value['format']
            url = self.adjust_url(value['url'], instance)
            if not url:
                reject('--- rejects API %s, no URL' % api)
                continue
            args = dict(value['params'])  # make new copy, since we'll adjust
            wild = value.get('wild_card', '*')
            # check if input parameters are covered by API
            if not self.dasmapping.check_api_match(srv, api, cond):
                reject('--- rejects API %s, does not cover input condition keys'
                       % api)
                continue
            # once we know that API covers input set of parameters we
            # convert every condition key (which is daskeys.map) into the
            # input api parameter(s) and count the accepted ones
            found = 0
            for key, val in cond.items():
                for apiparam in self.dasmapping.das2api(srv, api, key, val):
                    if apiparam in args:
                        args[apiparam] = val
                        found += 1
            # NOTE (VK 20160708): an earlier shortcut, which accepted an
            # API when the only condition key was the look-up key, has
            # been disabled since it passed the datasets API for
            # queries like dataset in [path1, path2].
            #
            # check if number of keys on cond and args are the same
            if len(cond.keys()) != found:
                reject("--- reject API %s, not all condition keys are covered"
                       % api, args)
                continue
            if not found:
                reject("--- rejects API %s, parameters don't match" % api,
                       args)
                continue
            self.adjust_params(api, args, instance)
            # delete args keys whose value is optional
            delete_keys(args, 'optional')
            # check that there is no "required" parameter left in args,
            # since such api will not work
            if 'required' in args.values():
                reject('--- rejects API %s, parameter is required' % api,
                       args)
                continue
            # adjust pattern symbols in arguments
            if wild != '*':
                for key, val in args.items():
                    if isinstance(val, str) or isinstance(val, unicode):
                        args[key] = val.replace('*', wild)

            # compare query selection keys with API look-up keys
            api_lkeys = self.dasmapping.api_lkeys(srv, api)
            if set(api_lkeys) != set(skeys):
                reject("--- rejects API %s, api_lkeys(%s)!=skeys(%s)"
                       % (api, api_lkeys, skeys))
                continue

            self.logger.info('+++ %s passes API %s' % (srv, api))
            self.logger.debug('args=%s' % args)

            summary = "yield system ***%s***, url=%s, api=%s, args=%s, " \
                "format=%s, expire=%s, wild_card=%s" \
                % (srv, url, api, args, iformat, expire, wild)
            self.logger.debug(summary)

            yield url, api, args, iformat, expire
Ejemplo n.º 21
0
Archivo: das_core.py Proyecto: ktf/DAS
class DASCore(object):
    """
    DAS core class.
    """
    def __init__(self, config=None, debug=0,
                nores=False, logger=None, engine=None, multitask=True):
        """
        Build the DAS core object: read the configuration, set up the
        task manager and logger, create the mapping/analytics/
        keylearning/cache objects and load one service object per
        system known to the mapping.

        - *config* DAS configuration; read via das_readconfig when empty
        - *debug* verbosity; an integer value (the default 0 included)
          overrides the 'verbose' value of the configuration
        - *nores* stored as self.noresults, result() then skips the
          cache look-up
        - *logger* logger object, a PrintManager is created when omitted
        - *engine* when given (and multitask is on) a PluginTaskManager
          is used instead of TaskManager
        - *multitask* set to False to switch multitasking off

        The configuration dictionary is modified in place and handed
        over to the objects created here. An Exception is raised when
        a data-service plugin cannot be loaded.
        """
        if  config:
            dasconfig = config
        else:
            dasconfig = das_readconfig()
        verbose       = dasconfig['verbose']
        self.stdout   = debug
        # integer debug value takes precedence over configured verbosity
        if  isinstance(debug, int):
            self.verbose = debug
            dasconfig['verbose'] = debug
        else:
            self.verbose = verbose
        das_timer('DASCore::init', self.verbose)
        self.operators = das_operators()

        # set noresults option
        self.noresults = False
        if  nores:
            dasconfig['write_cache'] = True
            self.noresults = nores

        self.multitask = dasconfig['das'].get('multitask', True)
        if  debug or self.verbose:
            self.multitask = False # in verbose mode do not use multitask
            dasconfig['das']['multitask'] = False
        if  not multitask: # explicitly call DASCore ctor, e.g. in analytics
            self.multitask = False
            dasconfig['das']['multitask'] = False
        dasconfig['engine'] = engine
        if  self.multitask:
            nworkers = dasconfig['das'].get('core_workers', 5)
            if  engine:
                thr_name = 'DASCore:PluginTaskManager'
                self.taskmgr = PluginTaskManager(\
                        engine, nworkers=nworkers, name=thr_name)
                self.taskmgr.subscribe()
            else:
                thr_name = 'DASCore:TaskManager'
                self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        if  logger:
            self.logger = logger
        else:
            self.logger = PrintManager('DASCore', self.verbose)

        # define Mapping/Analytics/Parser in this order since Parser depends
        # on first two
        dasmapping = DASMapping(dasconfig)
        dasconfig['dasmapping'] = dasmapping
        self.mapping = dasmapping

        self.analytics = DASAnalytics(dasconfig)
        dasconfig['dasanalytics'] = self.analytics

        self.keylearning = DASKeyLearning(dasconfig)
        dasconfig['keylearning'] = self.keylearning

        # init DAS cache
        self.rawcache = DASMongocache(dasconfig)
        dasconfig['rawcache'] = self.rawcache

        # plug-in architecture: loop over registered data-services in
        # dasconfig; load appropriate module/class; register data
        # service with DASCore.
        self.systems = dasmapping.list_systems()
        # pointer to the DAS top level directory
        dasroot = '/'.join(__file__.split('/')[:-3])
        for name in self.systems:
            try:
                klass  = 'DAS/services/%s/%s_service.py' \
                    % (name, name)
                srvfile = os.path.join(dasroot, klass)
                # scan the service file for the class derived from
                # DASAbstractService and take its name from that line;
                # klass keeps the file path if no such line is found
                with file(srvfile) as srvclass:
                    for line in srvclass:
                        if  line.find('(DASAbstractService)') != -1:
                            klass = line.split('(DASAbstractService)')[0]
                            klass = klass.split('class ')[-1] 
                            break
                mname  = 'DAS.services.%s.%s_service' % (name, name)
                module = __import__(mname, fromlist=[klass])
                obj = getattr(module, klass)(dasconfig)
                # the service object becomes an attribute named after the
                # system and is registered in SERVICES
                setattr(self, name, obj)
                SERVICES[name] = obj
            except IOError as err:
                if  debug > 1:
                    # we have virtual services, so IOError can be correct
                    print_exc(err)
                # no service file: fall back to the generic service
                # NOTE(review): unlike the branch above, this one does not
                # add the object to SERVICES -- confirm this is intended
                try:
                    mname  = 'DAS.services.generic_service'
                    module = __import__(mname, fromlist=['GenericService'])
                    obj    = module.GenericService(name, dasconfig)
                    setattr(self, name, obj)
                except Exception as exc:
                    print_exc(exc)
                    msg = "Unable to load %s data-service plugin" % name
                    raise Exception(msg)
            except Exception as exc:
                print_exc(exc)
                msg = "Unable to load %s data-service plugin" % name
                raise Exception(msg)

        # loop over systems and get system keys, add mapping keys to final list
        self.service_keys = {}
        self.service_parameters = {}
        for name in self.systems: 
            skeys = getattr(self, name).keys()
            self.service_keys[getattr(self, name).name] = skeys
            sparams = getattr(self, name).parameters()
            self.service_parameters[getattr(self, name).name] = sparams

        self.service_keys['special'] = das_special_keys()
        self.dasconfig = dasconfig
        # second das_timer call with the same tag as at the top of the ctor
        das_timer('DASCore::init', self.verbose)

    def keys(self):
        """
        Return map of data service keys
        """
        return self.service_keys

    def das_keys(self):
        """
        Return map of data service keys
        """
        _keys = ['records']
        for values in self.service_keys.values():
            for key in values:
                if  key not in _keys:
                    _keys.append(key)
        return _keys

    def result(self, query, idx=0, limit=None):
        """
        Process the query and return its results from the cache.
        When no API covers the whole query it is decomposed into one
        query per selection key. An empty list is returned when the
        noresults option is set.
        """
        self.logger.info('input query=%s' % query)
        dasquery = DASQuery(query)
        dasquery.add_to_analytics()
        query    = dasquery.mongo_query
        # check if we have any service which cover the query
        # otherwise decompose it into list of queries
        if  dasquery.service_apis_map():
            self.call(dasquery) # process query
        else:
            self.logger.info(\
                'no APIs found to answer input query, will decompose it')
            for key in query['fields'] or []:
                subquery = DASQuery(dict(fields=[key], spec=query['spec']))
                self.call(subquery) # process query

        # lookup provided query in a cache
        if  self.noresults:
            return []
        return self.get_from_cache(dasquery, idx, limit)

    def remove_from_cache(self, dasquery):
        """
        Delete in cache entries about input query
        """
        self.rawcache.remove_from_cache(dasquery)

    def get_status(self, dasquery):
        """
        Look-up status of provided query in a cache.
        Return status of the query request and its hash.
        """
        status = None
        error  = None
        reason = None
        for col in ['merge', 'cache']:
            self.rawcache.remove_expired(dasquery, col)
        if  dasquery and 'fields' in dasquery.mongo_query:
            fields = dasquery.mongo_query['fields']
            if  fields and isinstance(fields, list) and 'queries' in fields:
                return 'ok', error, reason
        record = self.rawcache.find(dasquery)
        error, reason = self.rawcache.is_error_in_records(dasquery)
        try:
            if  record and 'das' in record and 'status' in record['das']:
                status = record['das']['status']
                if  not error:
                    error = record['das'].get('error', error)
                if  not reason:
                    reason = record['das'].get('reason', reason)
                return status, error, reason
        except Exception as exc:
            print_exc(exc)
            status = error = reason = None
            self.rawcache.remove_from_cache(dasquery)
        return status, error, reason

    def worker(self, srv, dasquery):
        """
        Run the call method of the data-service registered under the
        name srv for the given query; das_timer is invoked before and
        after the call.
        """
        self.logger.info('##### %s ######\n' % srv)
        das_timer(srv, self.verbose)
        service = getattr(self, srv)
        service.call(dasquery)
        das_timer(srv, self.verbose)

    def insert_query_records(self, dasquery):
        """
        Insert DAS query records into DAS cache and return list of services
        which will answer this query.

        One query record is inserted per (service, API) pair produced by
        the apimap method of every potential service, followed by one
        record for DAS itself. The returned list holds the services
        which produced at least one API call; when there is none, the
        potential services of the query are returned instead.
        """
        services = dasquery.services
        self.logger.info('Potential services = %s' % services)
        if  not services:
            msg  = 'No data-services for query %s' % dasquery
            msg += ' mongo_query: %s' % dasquery.mongo_query
            msg += ' params: %s' % dasquery.params()
            # single pre-formatted argument prints the same text under the
            # print statement and the print function
            print('%s %s' % (dastimestamp('DAS WARNING '), msg))

        # get list of URI which can answer this query
        ack_services = []
        for srv in services:
            # exhaust apimap before inserting any record for this service
            api_calls = list(getattr(self, srv).apimap(dasquery))
            for url, api, _args, _iformat, expire in api_calls:
                header = dasheader(srv, dasquery, expire, api, url, ctime=0)
                self.rawcache.insert_query_record(dasquery, header)
                if  srv not in ack_services:
                    ack_services.append(srv)
        if  not ack_services:
            ack_services = services
        expire = 2*60 # 2 minutes, it should be overwriten by data-srv
        header = dasheader("das", dasquery, expire, api='das_core',
                services=dict(das=ack_services))
        header['lookup_keys'] = []
        self.rawcache.insert_query_record(dasquery, header)
        das_timer('das_record', self.verbose)
        return ack_services

    def call(self, query, add_to_analytics=True, **kwds):
        """
        Top level DAS api which execute a given query using underlying
        data-services. It follows the following steps:

            - parse input query
            - identify data-services based on selection keys
              and where clause conditions
            - construct DAS workflow and execute data-service
              API calls. At this step individual
              data-services store results into DAS cache.

        Returns a status string:

            - 'in cache' for queries which only look up records
              (fields == ['records'] or spec == {'records': '*'});
            - the status stored in the first matching query record when
              rawcache.find_specs() yields one;
            - 'fail' when a data-service call raised, or when
              check_services() reports nothing for a query without
              'records' in it;
            - 'ok' otherwise.

        :param query: a DASQuery instance, or input from which a DASQuery
            is constructed
        :param add_to_analytics: when true, dasquery.add_to_analytics()
            is called
        :param kwds: accepted for compatibility with web layer, e.g. it
            may invoke this method with additional pid parameter; it is
            not used in this method.
        """
        def update_das_query(dasquery, status, reason=None):
            "Update DAS query record with given status and reason"
            self.rawcache.update_query_record(dasquery, status, reason=reason)
            self.rawcache.add_to_record(\
                    dasquery, {'das.timer': get_das_timer()}, system='das')
            # Make sure that das record is updated: poll the stored record
            # up to 6 times (idx = 1..6) and stop as soon as the stored
            # status matches. After every unsuccessful round we sleep
            # idx*idx seconds (1+4+9+16+25+36) and re-issue the update.
            # The waiting is meant to cover default syncdelay value of
            # mongo server (in a future it would be better to find
            # programatically this syncdelay value, but it seems pymongo
            # driver does not provide any API for it).
            for idx in xrange(1, 7):
                spec = {'qhash':dasquery.qhash, 'das.system':['das']}
                res = self.rawcache.col.find_one(spec)
                if  res:
                    dbstatus = res.get('das', {}).get('status', None)
                    if  dbstatus == status:
                        break
                    msg = 'qhash %s, das.status=%s, status=%s, wait for update' \
                            % (dasquery.qhash, dbstatus, status)
                    print dastimestamp('DAS WARNING'), msg
                # reached both when no record was found and when the
                # stored status differs from the requested one
                time.sleep(idx*idx)
                self.rawcache.update_query_record(dasquery, status, reason=reason)

        self.logger.info('input query=%s' % query)
        das_timer('DASCore::call', self.verbose)
        # accept a ready-made DASQuery (recognised by class name),
        # otherwise build one from the input
        if  isinstance(query, object) and hasattr(query, '__class__')\
            and query.__class__.__name__ == 'DASQuery':
            dasquery = query
        else:
            dasquery = DASQuery(query)
        # drop expired records before consulting the cache
        for col in ['merge', 'cache']:
            self.rawcache.remove_expired(dasquery, col)
        if  add_to_analytics:
            dasquery.add_to_analytics()
        # from here on `query` refers to the mongo representation
        query  = dasquery.mongo_query
        spec   = query.get('spec')
        fields = query.get('fields')
        if  fields == ['records']:
            msg = 'look-up all records in cache'
            self.logger.info(msg)
            return 'in cache'
        if  spec == dict(records='*'):
            self.logger.info("look-up everything in cache")
            return 'in cache'
        # only the first record yielded by find_specs is inspected,
        # its status is returned right away
        for record in self.rawcache.find_specs(dasquery):
            status = record['das']['status']
            msg = 'found query %s in cache, status=%s\n' \
                        % (record['query'], status)
            self.logger.info(msg)
            print dastimestamp('DAS INFO'), msg
            return status

        self.logger.info(dasquery)
        das_timer('das_record', self.verbose)
        services = self.insert_query_records(dasquery)
        if  not services:
            msg = 'unable to locate data-services to fulfill this request'
            msg += ', will iterate over all registered services'
            print dastimestamp('DAS WARNING '), dasquery, msg
            services = dasquery.services if dasquery.services else self.systems
        # run every service, either through the task manager or serially;
        # any exception raised here turns the whole call into 'fail'
        try:
            if  self.multitask:
                jobs = []
                for srv in sorted(services):
                    jobs.append(self.taskmgr.spawn(self.worker, srv, dasquery))
                self.taskmgr.joinall(jobs)
            else:
                for srv in services:
                    self.worker(srv, dasquery)
        except Exception as exc:
            print_exc(exc)
            return 'fail'
        self.logger.info('\n##### merging ######\n')
        update_das_query(dasquery, 'merging')
        das_timer('merge', self.verbose)
        self.rawcache.merge_records(dasquery)
        das_timer('merge', self.verbose)
        # check if we have service records and properly setup status
        self.logger.info('\n##### check services ######\n')
        das_services = self.rawcache.check_services(dasquery)
        reason = ''
        status = 'ok'
        if  not das_services:
            if  'records' in dasquery.query:
                status = 'ok' # keep status ok for 'records' queries
            else:
                reason = 'no data records found in DAS cache'
                status = 'fail'
                print dastimestamp('DAS ERROR '), dasquery, reason
        update_das_query(dasquery, status, reason)
        das_timer('DASCore::call', self.verbose)
        return status

    def processing_time(self, dasquery):
        """
        Look-up and return DAS query processing time.

        The value is the difference between the last and the first entry
        of the ``ctime`` list stored in the ``das`` part of the query
        record. None is returned when the record, its ``das`` dictionary
        or the ``ctime`` entries are missing.
        """
        record = self.rawcache.find(dasquery)
        if  not record:
            return None
        das_part = record.get('das', None)
        if  not isinstance(das_part, dict):
            return None
        stamps = das_part.get('ctime', [])
        if  not stamps:
            return None
        return stamps[-1] - stamps[0]

    def nresults(self, dasquery, coll='merge'):
        """
        Return total number of results (count) for provided query.

        The branches mirror those of get_from_cache: map/reduce output and
        the "queries" listing are counted by iterating over them, an
        aggregator query counts its aggregators, anything else is
        delegated to the raw cache for collection ``coll``.
        """
        fields = dasquery.mongo_query.get('fields', None)
        if  dasquery.mapreduce:
            rows = self.rawcache.map_reduce(dasquery.mapreduce, dasquery)
            return sum(1 for _ in rows)
        if  dasquery.aggregators:
            return len(dasquery.aggregators)
        if  isinstance(fields, list) and 'queries' in fields:
            return sum(1 for _ in self.get_queries(dasquery))
        return self.rawcache.nresults(dasquery, coll)

    def apilist(self, dasquery):
        """Return what the raw cache reports as API list for given das query"""
        apis = self.rawcache.apilist(dasquery)
        return apis

    def incache(self, dasquery, coll='merge'):
        """
        Tell whether the given query is present in DAS cache.

        The question is forwarded to the raw cache for collection ``coll``
        and its answer is returned unchanged.
        """
        cache = self.rawcache
        return cache.incache(dasquery, collection=coll)

    def get_from_cache(self, dasquery, idx=0, limit=0, collection='merge'):
        """
        Look-up results from the merge cache and yield them for
        further processing.

        Depending on the query one of four sources is used: map/reduce
        output, aggregator results computed per (service, api) pair, the
        analytics "queries" listing, or plain cache records.

        :param dasquery: DAS query object
        :param idx: index of the first row to return
        :param limit: maximum number of rows; for the "queries" listing
            0 (the default) means no limit, for plain cache look-ups the
            value is handed to the raw cache as is
        :param collection: name of the cache collection to read
        :return: generator over rows, each passed through fix_times
        """
        das_timer('DASCore::get_from_cache', self.verbose)
        msg = 'col=%s, query=%s, idx=%s, limit=%s'\
                % (collection, dasquery, idx, limit)
        self.logger.info(msg)

        fields  = dasquery.mongo_query.get('fields', None)

        if  dasquery.mapreduce:
            res = self.rawcache.map_reduce(dasquery.mapreduce, dasquery)
        elif dasquery.aggregators:
            # extract das information from rawcache
            rows  = self.rawcache.get_from_cache(\
                    dasquery, collection=collection)
            # an empty cache must not leak StopIteration out of this
            # generator; with no service info every aggregator falls
            # through to its N/A record below
            first = next(rows, None)
            sinfo = das_sinfo(first) if first is not None else {}
            # to perform aggregation we need:
            # - loop over all aggregator functions
            # - loop over all data-services
            # - loop over all APIs within a data-services
            # the code below does that, it applies aggregator
            # to selected (based on key/srv/api) records
            res = []
            _id = 0
            time0  = time.time()
            expire = 300 # min expire
            for func, key in dasquery.aggregators:
                afunc = getattr(das_aggregator, 'das_%s' % func)
                found = False
                for srv, apis, in sinfo.items():
                    for api in apis:
                        # a fresh cursor is needed for every pass
                        rows  = self.rawcache.get_from_cache(\
                                dasquery, collection=collection)
                        gen   = api_rows(rows, api)
                        data  = afunc(key, gen)
                        ctime = time.time() - time0
                        das   = dasheader(srv, dasquery, expire, api=api,
                                ctime=ctime)
                        if  isinstance(data, dict) and data['value'] != 'N/A':
                            aggr = {'_id':_id, 'function': func,
                                    'key': key, 'result': data}
                            aggr.update(das)
                            res.append(aggr)
                            _id += 1
                            found = True
                if  not found: # when we got nothing add empty result record
                    empty = {'value':'N/A'}
                    ctime = time.time() - time0
                    das = dasheader('das', dasquery, expire, api='das_core',
                            ctime=ctime)
                    rec = {'_id':0, 'function':func, 'key':key, 'result':empty}
                    rec.update(das)
                    res.append(rec)
        elif isinstance(fields, list) and 'queries' in fields:
            # limit=0 means "no limit"; islice(it, idx, idx) would
            # otherwise yield nothing at all
            stop = idx + limit if limit else None
            res = itertools.islice(self.get_queries(dasquery), idx, stop)
        else:
            res = self.rawcache.get_from_cache(dasquery, idx, limit, \
                    collection=collection)
        for row in res:
            fix_times(row)
            yield row
        das_timer('DASCore::get_from_cache', self.verbose)

    def get_queries(self, dasquery):
        """
        Look-up (popular) queries in DAS analytics/logging db.

        When 'popular' is among the query fields the analytics popular
        queries are used. Otherwise queries are listed, optionally bounded
        by the 'date' condition of the spec: a dict with '$in' gives the
        (after, before) pair, an int gives the lower bound only. Any other
        non-empty date value raises an Exception. Each yielded item is a
        dict holding the row under 'das_query' and its '_id'.
        """
        das_timer('DASCore::get_queries', self.verbose)
        fields = dasquery.mongo_query.get('fields')
        spec   = dasquery.mongo_query.get('spec')
        if  'popular' in fields:
            rows = self.analytics.get_popular_queries(spec)
        else:
            datestamp = spec.get('date')
            if  isinstance(datestamp, dict):
                bounds = datestamp.get('$in')
                window = dict(after=bounds[0], before=bounds[1])
            elif isinstance(datestamp, int):
                window = dict(after=datestamp)
            elif not datestamp:
                window = {}
            else:
                raise Exception('Unsupported date value: %s' % datestamp)
            rows = self.analytics.list_queries(**window)
        for entry in rows:
            record_id = entry.pop('_id')
            yield {'das_query': entry, '_id': record_id}
        das_timer('DASCore::get_queries', self.verbose)
Ejemplo n.º 22
0
class DASWebService(DASWebManager):
    """
    DAS web service interface.
    """

    def __init__(self, dasconfig):
        """
        Set up the web service from DAS configuration.

        Reads the "web_server" and "mongodb" sections of ``dasconfig``,
        creates the task manager (plugin based when an engine is
        configured), initialises run-time attributes to neutral values,
        calls self.init() and finally starts the monitoring thread which
        re-invokes self.init().

        :param dasconfig: DAS configuration mapping
        :raises Exception: when the configured queue type is neither
            "Queue" nor "PriorityQueue"
        """
        DASWebManager.__init__(self, dasconfig)
        config = dasconfig["web_server"]
        self.pid_pat = re.compile(r"^[a-z0-9]{32}")
        self.base = config["url_base"]
        self.interval = config.get("status_update", 2500)
        self.engine = config.get("engine", None)
        self.check_clients = config.get("check_clients", False)
        nworkers = config["web_workers"]
        self.hot_thr = config.get("hot_threshold", 3000)
        self.dasconfig = dasconfig
        self.dburi = self.dasconfig["mongodb"]["dburi"]
        self.lifetime = self.dasconfig["mongodb"]["lifetime"]
        self.queue_limit = config.get("queue_limit", 50)
        qtype = config.get("qtype", "Queue")
        if qtype not in ["Queue", "PriorityQueue"]:
            msg = "Wrong queue type, qtype=%s" % qtype
            raise Exception(msg)
        if self.engine:
            thr_name = "DASWebService:PluginTaskManager"
            self.taskmgr = PluginTaskManager(bus=self.engine, nworkers=nworkers, name=thr_name, qtype=qtype)
            self.taskmgr.subscribe()
        else:
            thr_name = "DASWebService:TaskManager"
            self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name, qtype=qtype)
        self.adjust = config.get("adjust_input", False)
        self.dasmgr = None  # defined at run-time via self.init()
        self.reqmgr = None  # defined at run-time via self.init()
        self.daskeys = []  # defined at run-time via self.init()
        self.colors = {}  # defined at run-time via self.init()
        self.dbs_url = None  # defined at run-time via self.init()
        self.dbs_global = None  # defined at run-time via self.init()
        # form() and dbs_daemon() read this attribute; give it a value even
        # when the first self.init() stops early on a connection failure
        self.dbs_instances = []  # defined at run-time via self.init()
        self.kws = None  # defined at run-time via self.init()
        self.q_rewriter = None  # defined at run-time via self.init()
        self.dataset_daemon = config.get("dbs_daemon", False)
        self.dbsmgr = {}  # dbs_urls vs dbs_daemons, defined at run-time
        self.daskeyslist = []  # list of DAS keys
        self.init()

        # Monitoring thread which performs auto-reconnection
        thname = "dascore_monitor"
        start_new_thread(thname, dascore_monitor, ({"das": self.dasmgr, "uri": self.dburi}, self.init, 5))

    def dbs_daemon(self, config):
        """Start DBS daemon if it is requested via DAS configuration"""
        try:
            main_dbs_url = self.dbs_url
            dbs_urls = []
            print "### DBS URL:", self.dbs_url
            print "### DBS instances:", self.dbs_instances
            if not self.dbs_url or not self.dbs_instances:
                return  # just quit
            for inst in self.dbs_instances:
                dbs_urls.append((main_dbs_url.replace(self.dbs_global, inst), inst))
            interval = config.get("dbs_daemon_interval", 3600)
            dbsexpire = config.get("dbs_daemon_expire", 3600)
            preserve_dbs_col = config.get("preserve_on_restart", False)
            dbs_config = {"expire": dbsexpire, "preserve_on_restart": preserve_dbs_col}
            if self.dataset_daemon:
                for dbs_url, inst in dbs_urls:
                    dbsmgr = DBSDaemon(dbs_url, self.dburi, dbs_config)
                    self.dbsmgr[(dbs_url, inst)] = dbsmgr

                    def dbs_updater(_dbsmgr, interval):
                        """DBS updater daemon"""
                        while True:
                            try:
                                _dbsmgr.update()
                            except:
                                pass
                            time.sleep(interval)

                    print "### Start DBSDaemon for %s" % dbs_url
                    thname = "dbs_updater:%s" % dbs_url
                    start_new_thread(thname, dbs_updater, (dbsmgr, interval))
        except Exception as exc:
            print_exc(exc)

    def init(self):
        """
        Init DAS web server, connect to DAS Core.

        Creates the request manager, DAS core, representation layer and
        GridFS handle, caches DAS keys, DBS settings and per-system
        colors, starts the DBS daemon when requested and builds the list
        of DAS keys once.

        Error handling:
            - on ConnectionFailure only a message is printed; attributes
              assigned before the failure keep their values and execution
              continues with the query rewriter set-up below;
            - on any other exception the run-time attributes are reset
              and the method returns immediately.

        A failure of the query rewriter set-up is printed and leaves
        self.q_rewriter set to None.
        """
        try:
            self.reqmgr = RequestManager(lifetime=self.lifetime)
            self.dasmgr = DASCore(engine=self.engine)
            self.repmgr = CMSRepresentation(self.dasconfig, self.dasmgr)
            self.daskeys = self.dasmgr.das_keys()
            self.gfs = db_gridfs(self.dburi)
            self.daskeys.sort()
            self.dasmapping = self.dasmgr.mapping
            self.dbs_url = self.dasmapping.dbs_url()
            self.dbs_global = self.dasmapping.dbs_global_instance()
            self.dbs_instances = self.dasmapping.dbs_instances()
            self.dasmapping.init_presentationcache()
            # one color for DAS itself plus one per registered system
            self.colors = {"das": gen_color("das")}
            for system in self.dasmgr.systems:
                self.colors[system] = gen_color(system)
            # get SiteDB from global scope
            self.sitedbmgr = SERVICES.get("sitedb2", None)
            # Start DBS daemon
            if self.dataset_daemon:
                self.dbs_daemon(self.dasconfig["web_server"])
            # the key list is built only once, later calls keep it
            if not self.daskeyslist:
                keylist = [r for r in self.dasmapping.das_presentation_map()]
                keylist.sort(key=lambda r: r["das"])
                self.daskeyslist = keylist

        except ConnectionFailure as _err:
            # report only; no attribute is reset and there is no return
            tstamp = dastimestamp("")
            mythr = threading.current_thread()
            print "### MongoDB connection failure thread=%s, id=%s, time=%s" % (mythr.name, mythr.ident, tstamp)
        except Exception as exc:
            print_exc(exc)
            self.dasmgr = None
            self.reqmgr = None
            self.dbs_url = None
            self.dbs_global = None
            self.dbs_instances = []
            self.daskeys = []
            self.colors = {}
            self.q_rewriter = None
            return

        # KWS and Query Rewriting failures are not fatal
        try:
            # init query rewriter, if needed
            if self.dasconfig["query_rewrite"]["pk_rewrite_on"]:
                self.q_rewriter = CMSQueryRewrite(self.repmgr, self.templatepage)
        except Exception as exc:
            print_exc(exc)
            self.q_rewriter = None

    @expose
    @checkargs(DAS_WEB_INPUTS)
    def redirect(self, **kwargs):
        """
        Render the DAS redirect page.

        The "reason" request argument (or a default permission message when
        it is absent) is shown prefixed with "Reason: "; an empty reason is
        passed to the template as is.
        """
        fallback = "You do not have permission to access the resource requested."
        reason = kwargs.get("reason", fallback)
        msg = "Reason: " + reason if reason else reason
        content = self.templatepage("das_redirect", msg=msg)
        return self.page(content, response_div=False)

    def bottom(self, response_div=True):
        """
        Render the footer template shared by DAS web pages.
        """
        footer_args = dict(div=response_div, base=self.base, version=DAS.version)
        return self.templatepage("das_bottom", **footer_args)

    def page(self, content, ctime=None, response_div=True):
        """
        Assemble a complete DAS web page: the output of self.top(),
        followed by ``content`` and the rendered "das_bottom" footer.
        """
        body = self.top()
        body += content
        footer = self.templatepage(
            "das_bottom", ctime=ctime, base=self.base, version=DAS.version, div=response_div
        )
        return body + footer

    @expose
    @checkargs(DAS_WEB_INPUTS + ["section", "highlight"])
    def faq(self, **kwargs):
        """
        Render the DAS FAQ page.

        The optional "section" and "highlight" request arguments are handed
        to the FAQ template together with the rendered query guide, the DAS
        keys listing and the comma separated operators and aggregators.
        """
        section = kwargs.get("section", None)
        highlight = kwargs.get("highlight", None)
        guide = self.templatepage("dbsql_vs_dasql", operators=", ".join(das_operators()))
        daskeys = self.templatepage("das_keys", daskeys=self.daskeyslist)
        operators = ", ".join(das_operators())
        aggregators = ", ".join(das_aggregators())
        content = self.templatepage(
            "das_faq",
            guide=guide,
            daskeys=daskeys,
            section=section,
            highlight=highlight,
            operators=operators,
            aggregators=aggregators,
        )
        return self.page(content, response_div=False)

    @expose
    def cli(self):
        """
        Serve the DAS command line client as a plain text download.

        The client is looked up at DAS/tools/das_client.py below the
        directory three path components above this module's file.
        """
        path_parts = __file__.split("/")
        dasroot = "/".join(path_parts[:-3])
        return serve_file(os.path.join(dasroot, "DAS/tools/das_client.py"), content_type="text/plain")

    @expose
    def movetodas(self):
        "Return the static placeholder page for DBS to DAS migration"
        css = "width:600px;margin-left:auto;margin-right:auto;padding-top:20px"
        pieces = [
            """<div style="%s">""" % css,
            "Dear user,<br/>DBS Data Discovery page is depricated.<br/>",
            "Please migrate to Data Aggregation Service located at",
            "<p>https://cmsweb.cern.ch/das/</p>",
            "<em>CMS HTTP group.</em>",
            "</div>",
        ]
        return "".join(pieces)

    @expose
    def opensearch(self):
        """
        Serve the DAS opensearch description.

        The configured base URL is used when it contains "http://",
        otherwise the cmsweb DAS address is used. The response content
        type is set to the opensearch description XML type.
        """
        base = self.base
        if not (base and "http://" in base):
            base = "http://cmsweb.cern.ch/das"
        description = self.templatepage("das_opensearch", base=base)
        cherrypy.response.headers["Content-Type"] = "application/opensearchdescription+xml"
        return description

    @expose
    @checkargs(DAS_WEB_INPUTS)
    def services(self):
        """
        Render the overview of DAS services.

        The DAS map cache is walked in sorted (system, urn) order. For each
        system registered in DAS core the urns are collected under "apis"
        and the record keys per DAS key under "keys" (duplicates removed).
        The result is rendered with the "das_services" template together
        with all DAS keys seen and the map/reduce records of the raw cache.
        """
        dasdict = {}
        daskeys = set()
        for key in sorted(self.dasmgr.mapping.dasmapscache.keys()):
            srv, urn = key
            if srv not in self.dasmgr.systems:
                continue
            keymap = {}
            for item in self.dasmgr.mapping.dasmapscache[key]["das_map"]:
                dkey, rkey = item["das_key"], item["rec_key"]
                daskeys.add(dkey)
                keymap[dkey] = list(set(keymap.get(dkey, []) + [rkey]))
            if srv in dasdict:
                # merge with what earlier urns of this system contributed
                known = dasdict[srv]
                apis = known["apis"] + [urn]
                for okey, ovals in known["keys"].items():
                    keymap[okey] = list(set(keymap.get(okey, []) + ovals))
            else:
                apis = [urn]
            dasdict[srv] = dict(keys=dict(keymap), apis=apis)
        mapreduce = list(self.dasmgr.rawcache.get_map_reduce())
        content = self.templatepage("das_services", dasdict=dasdict, daskeys=list(daskeys), mapreduce=mapreduce)
        return self.page(content, response_div=False)

    @expose
    @checkargs(DAS_WEB_INPUTS)
    def api(self, system, name):
        """
        Render the DAS mapping record of the API ``name`` of ``system``.
        """
        record = self.dasmgr.mapping.api_info(system, name)
        content = "<b>DAS mapping record</b>" + das_json_full(record)
        return self.page(content, response_div=False)

    @expose
    @checkargs(DAS_WEB_INPUTS)
    def default(self, *args, **kwargs):
        """
        Default method: forward the request to index().

        Positional and keyword arguments are unpacked again so that
        index() receives them exactly as they were given here (passing
        the tuple and dict themselves would hide e.g. the "input"
        argument from index()).
        """
        return self.index(*args, **kwargs)

    def adjust_input(self, kwargs):
        """
        Adjust user input wrt common DAS keyword patterns, e.g.
        /Zee/*/* -> dataset=*Zee*, T1_US -> site=T1_US.

        More ambiguous input (such as Zee -> dataset=*Zee*) is however left
        to be handled by the keyword search.

        The "input" entry of ``kwargs`` is replaced in place; nothing is
        done unless adjust_input is set in DAS server configuration.
        """
        if self.adjust:
            raw_input = kwargs.get("input", "")
            instance = kwargs.get("instance", self.dbs_global)
            kwargs["input"] = identify_apparent_query_patterns(raw_input, instance)

    def _get_dbsmgr(self, inst):
        """
        Return the DBS manager registered for DBS instance ``inst``.

        Managers are stored under (dbs_url, dbs_instance) keys; the first
        one whose instance equals ``inst`` is returned. None is returned
        when the dataset daemon is switched off or nothing matches.
        """
        # instance selection shall be more clean
        if self.dataset_daemon:
            for (_dbs_url, dbs_inst), manager in self.dbsmgr.items():
                if dbs_inst == inst:
                    return manager
        return None

    def _get_kws_host(self):
        """
        gets the host for keyword search from config. default is same server

        An empty string (i.e. the same server) is returned when either the
        "load_balance" section or its "kws_host" entry is missing, in line
        with _get_autocompl_host.
        """
        conf = self.dasconfig.get("load_balance", {})
        return conf.get("kws_host", "")

    def _get_autocompl_host(self):
        """
        Return the autocompletion host from the "load_balance" section of
        the configuration; an empty string when it is not configured.
        """
        return self.dasconfig.get("load_balance", {}).get("autocompletion_host", "")

    def is_kws_enabled(self):
        """
        Return the "kws_on" setting of the "keyword_search" configuration
        section (keyword search client, i.e. the ajax request).
        """
        kws_conf = self.dasconfig["keyword_search"]
        return kws_conf["kws_on"]

    def is_kws_service_enabled(self):
        """
        Return the "kws_service_on" setting of the "keyword_search"
        configuration section (keyword search service, i.e. the response
        to the ajax call).
        """
        kws_conf = self.dasconfig["keyword_search"]
        return kws_conf["kws_service_on"]

    def generate_dasquery(self, uinput, inst, html_mode=True):
        """
        Check provided input as valid DAS input query.
        Returns status and content (either error message or valid DASQuery)

        :param uinput: user's input; a query string, or a dict holding
            a ready spec (see the validation step below)
        :param inst: DBS instance
        :param html_mode: whether errors shall be output in html
        :return: tuple (1, rendered error page) on failure,
            tuple (0, DASQuery object) on success
        """

        def error_msg(msg, show_kws=False, tmpl="das_ambiguous", **kwargs):
            """
            Helper function which renders an error template, default is
            das_ambiguous, but can be overriden via tmpl param.
            Template has two versions: html and text for CLI; the text
            one is selected by appending "_txt" to the template name
            when html_mode is off.

            The template is passed with msg, base, guide, kws_enabled,
            kws and **kwargs.
            """
            guide = self.templatepage("dbsql_vs_dasql", operators=", ".join(das_operators()))
            # render keyword search loader, if needed
            kws = ""
            if show_kws:
                kws = self.templatepage(
                    "kwdsearch_via_ajax", uinput=uinput, inst=inst or self.dbs_global, kws_host=self._get_kws_host()
                )
            # render the appropriate template (html vs text mode)
            page = self.templatepage(
                tmpl + ("_txt" if not html_mode else ""),
                msg=msg,
                base=self.base,
                guide=guide,
                kws_enabled=show_kws,
                kws=kws,
                **kwargs
            )
            return page

        if not uinput:
            return 1, error_msg("No input query")

        # Generate a DASQuery object, if it fails we catch the exception and
        # wrap it for upper layer (web interface)
        try:
            dasquery = DASQuery(uinput, instance=inst)
        except WildcardMultipleMatchesException as err:
            # several matches: show them as suggestions
            das_parser_error(uinput, str(err).replace("\n", ""))
            return 1, error_msg(str(err), tmpl="das_wildcard_err", suggest=err.options.values)
        except WildcardMatchingException as err:
            das_parser_error(uinput, str(type(err)) + " " + str(err))
            return 1, error_msg(str(err))
        except Exception as err:
            das_parser_error(uinput, str(type(err)) + " " + str(err))

            # show multiple dataset matches for 1 keyword queries
            # NOTE(review): `response` is not defined in this method;
            # presumably a module-level name (e.g. the cherrypy response
            # object) -- confirm against the file's imports.
            if hasattr(response, "dataset_matches_msg"):
                return 1, error_msg(response.dataset_matches_msg, show_kws=self.is_kws_enabled())

            # for non Wildcard parsing errors, show the Keyword Search
            return 1, error_msg(str(err), show_kws=self.is_kws_enabled())

        # DAS query validation; dict input and inputs mentioning "queries"
        # or "records" are accepted without a service look-up
        if isinstance(uinput, dict):  # DASQuery w/ {'spec':{'_id:id}}
            pass
        elif uinput.find("queries") != -1:
            pass
        elif uinput.find("records") != -1:
            pass
        else:  # normal user DAS query
            try:
                service_map = dasquery.service_apis_map()
            except Exception as exc:
                msg = "Fail to obtain service API map for this DASQuery"
                print msg
                print_exc(exc)
                return 1, error_msg(msg)
            if not service_map:
                return 1, error_msg("Unable to resolve the query over the " "available services: %s" % dasquery)
        return 0, dasquery

    @expose
    @checkargs(DAS_WEB_INPUTS)
    def index(self, *args, **kwargs):
        """
        Render the DAS front page: the search form, pre-filled with the
        "input" request argument and with help cards switched on, wrapped
        into the common page layout.
        """
        query_text = getarg(kwargs, "input", "")
        search_form = self.form(uinput=query_text, cards=True)
        return self.page(search_form)

    def form(self, uinput="", instance=None, view="list", cards=False):
        """
        Render the DAS search form.

        Single quotes in ``uinput`` are replaced by double quotes, a
        missing ``instance`` falls back to the global DBS instance, and
        ``cards`` tells whether the help cards are shown.
        """
        # TODO: rename into search_form()? (template is also called like this

        if "'" in uinput:  # e.g. file.creation_date>'20120101 12:01:01'
            uinput = uinput.replace("'", '"')
        instance = instance or self.dbs_global
        cards_html = self.templatepage(
            "das_cards", base=self.base, show=cards, width=900, height=220, cards=help_cards(self.base)
        )
        keys_html = self.templatepage("das_keys", daskeys=self.daskeyslist)
        form_args = dict(
            input=uinput,
            init_dbses=list(self.dbs_instances),
            daskeys=keys_html,
            base=self.base,
            instance=instance,
            view=view,
            cards=cards_html,
        )
        form_args["autocompl_host"] = json.dumps(self._get_autocompl_host())
        return self.templatepage("das_searchform", **form_args)

    @expose
    def error(self, msg, wrap=True):
        """
        Render an error message.

        With ``wrap`` the message is placed after an empty search form and
        wrapped into the common page layout, otherwise only the rendered
        error template is returned.
        """
        rendered = self.templatepage("das_error", msg=str(msg))
        if not wrap:
            return rendered
        return self.page(self.form() + rendered)

    @expose
    @checkargs(DAS_WEB_INPUTS)
    def gridfs(self, **kwargs):
        """
        Retrieve a record from GridFS.

        The file id is taken from the "fid" request argument and the
        content of the matching GridFS file is returned.

        :raises HTTPError: with code 500 when "fid" is missing or when
            reading the file fails for any reason (the failure is printed)
        """
        if "fid" not in kwargs:
            code = web_code("No file id")
            raise HTTPError(500, "DAS error, code=%s" % code)
        fid = kwargs.get("fid")
        try:
            fds = self.gfs.get(ObjectId(fid))
            return fds.read()
        except Exception as exc:
            print_exc(exc)
            code = web_code("Exception")
            raise HTTPError(500, "DAS error, code=%s" % code)

    @expose
    @checkargs(DAS_WEB_INPUTS)
    def records(self, *args, **kwargs):
        """
        Retrieve records from the cache by id, or list all record ids.

        The record id may be given as first positional argument or as
        "_id" request argument; without either all records are listed.
        Further request arguments read here: idx, limit, collection
        (default "merge"), view ("json" switches to JSON output) and
        instance.

        Any exception is printed and turned into the DAS error page.
        """
        try:
            recordid = None
            if args:
                recordid = args[0]
                spec = {"_id": ObjectId(recordid)}
                fields = None
                query = dict(fields=fields, spec=spec)
            elif kwargs and "_id" in kwargs:
                # note: recordid stays None here, so the listing branch
                # below is used for ids given as keyword argument
                spec = {"_id": ObjectId(kwargs["_id"])}
                fields = None
                query = dict(fields=fields, spec=spec)
            else:  # return all ids
                query = dict(fields=None, spec={})

            # res accumulates HTML (string), or rows (list) in json view
            res = ""
            time0 = time.time()
            idx = getarg(kwargs, "idx", 0)
            limit = getarg(kwargs, "limit", 10)
            coll = kwargs.get("collection", "merge")
            view = kwargs.get("view", "")
            if view == "json":
                res = []
            inst = kwargs.get("instance", self.dbs_global)
            form = self.form(uinput="")
            check, content = self.generate_dasquery(query, inst)
            if check:
                # content is a rendered error page
                return self.page(form + content, ctime=time.time() - time0)
            dasquery = content  # returned content is valid DAS query
            nresults = self.dasmgr.rawcache.nresults(dasquery, coll)
            gen = self.dasmgr.rawcache.get_from_cache(dasquery, idx=idx, limit=limit, collection=coll)
            if recordid:  # we got id
                for row in gen:
                    if view == "json":
                        res.append(row)
                    else:
                        res += das_json(dasquery, row)
            else:
                # NOTE(review): in json view res is a list here; assuming
                # templatepage returns a string, += extends the list by
                # single characters and `page += res` below then raises
                # TypeError (caught by the outer except) -- confirm
                # whether a JSON listing is meant to be supported.
                for row in gen:
                    rid = row["_id"]
                    del row["_id"]
                    res += self.templatepage("das_record", id=rid, collection=coll, daskeys=", ".join(row))
            if recordid:
                page = res
            else:
                url = "/das/records?"
                if nresults:
                    page = self.templatepage("das_pagination", nrows=nresults, idx=idx, limit=limit, url=url)
                else:
                    page = "No results found, nresults=%s" % nresults
                page += res

            ctime = time.time() - time0
            if view == "json":
                return json.dumps(res)
            page = self.page(form + page, ctime=ctime)
            return page
        except Exception as exc:
            print_exc(exc)
            return self.error(gen_error_msg(kwargs))

    @jsonstreamer
    def datastream(self, kwargs):
        """
        Prepare the (head, data) pair which is streamed out as JSON.

        The head taken from kwargs (or a fresh one holding only a timestamp)
        receives a ``mongo_query`` entry when it has none, loses its
        ``dasquery`` and ``args`` entries and, when ``self.check_clients``
        is set, is extended with the client version information.
        """
        head = kwargs.get("head", dict(timestamp=time.time()))
        if "mongo_query" not in head:
            if "dasquery" in head:
                head["mongo_query"] = head["dasquery"].mongo_query
            else:
                head["mongo_query"] = {}
        # these entries are removed from the head before it is returned
        for key in ("dasquery", "args"):
            if key in head:
                del head[key]
        data = kwargs.get("data", [])
        if not self.check_clients:
            return head, data
        # record client version details in the head
        cli, cli_msg = check_client_version()
        head.update({"client": cli, "client_message": cli_msg})
        if cli_msg:
            # a non-empty client message is reported as a warning
            head.update({"status": "warning", "reason": cli_msg})
        return head, data

    def get_data(self, kwargs):
        """
        Invoke DAS workflow and get data from the cache.

        kwargs may carry ``input``, ``instance``, ``idx``, ``limit``,
        ``collection``, ``status``, ``error``, ``reason`` and ``dasquery``.
        When ``dasquery`` is absent the query is built from ``input``.

        Returns a ``(head, data)`` tuple: ``head`` is a dict describing the
        outcome (status, nresults, ctime, ...), ``data`` is the list of
        records read from the cache (empty on failure).
        """
        head = dict(timestamp=time.time())
        head["args"] = kwargs
        uinput = kwargs.get("input", "")
        inst = kwargs.get("instance", self.dbs_global)
        idx = getarg(kwargs, "idx", 0)
        limit = getarg(kwargs, "limit", 0)  # do not impose limit
        coll = kwargs.get("collection", "merge")
        status = kwargs.get("status")
        error = kwargs.get("error")
        reason = kwargs.get("reason")
        dasquery = kwargs.get("dasquery", None)
        time0 = time.time()
        if dasquery:
            dasquery = DASQuery(dasquery, instance=inst)
        else:
            check, content = self.generate_dasquery(uinput, inst, html_mode=False)
            if check:
                head.update({"status": "fail", "reason": content, "ctime": time.time() - time0, "input": uinput})
                data = []
                return head, data
            dasquery = content  # returned content is valid DAS query
        try:
            nres = self.dasmgr.nresults(dasquery, coll)
            # resolve the cache generator so its length can be checked
            data = list(self.dasmgr.get_from_cache(dasquery, idx, limit))
            if nres and not data:
                # The cache reports results but returned none: retry with a
                # growing delay. The original ``xrange(1, 3, 5)`` produced the
                # single value 1 (start=1, stop=3, step=5) instead of the
                # three delays 1, 3 and 5.
                for retry in (1, 3, 5):
                    msg = "retry in %s sec" % retry
                    print("%s %s %s" % (dastimestamp("DAS WARNING "), msg, dasquery))
                    time.sleep(retry)
                    data = list(self.dasmgr.get_from_cache(dasquery, idx, limit))
                    if data:
                        break
            if nres and not data:
                msg = "fail to get all data for %s, nres=%s, len(data)=%s" % (dasquery, nres, len(data))
                print("%s %s" % (dastimestamp("DAS WARNING "), msg))
                status = "fail"
                reason = "Fail to retrieve data from DAS cache, please retry"

            if dasquery.aggregators:
                # aggregators split DAS record into sub-system and then
                # apply aggregator functions, therefore we need to correctly
                # account for nresults: data is already a list, take its
                # length as nresults value.
                nres = len(data)
            if error:  # DAS record contains an error
                status = "error"
            head.update({"status": status, "nresults": nres, "ctime": time.time() - time0, "dasquery": dasquery})
        except Exception as exc:
            status = "fail"
            reason = str(exc)
            print_exc(exc)
            head.update({"status": status, "ctime": time.time() - time0, "dasquery": dasquery})
            data = []
        head.update({"incache": self.dasmgr.incache(dasquery, coll="cache"), "apilist": self.dasmgr.apilist(dasquery)})
        if reason:
            head.update({"reason": reason})
        if status != "ok":
            # attach server load information to every non-ok answer
            head.update(self.info())
        return head, data

    def info(self):
        "Collect load figures of the DAS server into a dict keyed by 'das_server'"
        server = dict(
            nrequests=self.reqmgr.size(),
            nworkers=self.taskmgr.nworkers(),
            dasweb=self.reqmgr.status(),
        )
        core = self.dasmgr
        # the core task manager status is reported only when both exist
        if core and core.taskmgr:
            server["dascore"] = core.taskmgr.status()
        return {"das_server": server}

    def busy(self):
        """
        Check server load and report busy status if
        nrequests - nworkers > queue limit

        Returns True (after printing a diagnostic line) when the server is
        overloaded, False otherwise.
        """
        nrequests = self.reqmgr.size()
        nworkers = self.taskmgr.nworkers()
        if (nrequests - nworkers) > self.queue_limit:
            # values are listed in the same order as the labels; the original
            # swapped the queue limit and the number of workers
            msg = "#request=%s, queue_limit=%s, #workers=%s" % (nrequests, self.queue_limit, nworkers)
            print("%s %s" % (dastimestamp("DAS WEB SERVER IS BUSY "), msg))
            return True
        return False

    def busy_page(self, uinput=None):
        """Compose the page shown when the DAS server is busy"""
        notice = "<h3>DAS server is busy, please try later</h3>"
        # the search form built from the user input precedes the notice
        return self.page(self.form(uinput) + notice)

    def _is_web_request(self, view):
        """
        Tell whether the current request should be answered in web (HTML) mode.

        Returns False when ``view`` is one of "json", "xml" or "plain", or
        when an Accept header is present and lists nothing but
        "application/json"; returns True in every other case.
        """
        # an explicit output type (view) settles the question first
        if view in ["json", "xml", "plain"]:
            return False

        # otherwise inspect the Accept header - e.g. das client only
        # provides accept header
        accepts = cherrypy.request.headers.elements("Accept")
        non_html_accepts = ["application/json"]
        html_capable = [item for item in accepts if item.value not in non_html_accepts]

        # web mode unless every accepted content type is a non html one
        return bool(html_capable) or not accepts

    @expose
    @checkargs(DAS_WEB_INPUTS)
    def cache(self, **kwargs):
        """
        DAS web cache interface. Fire up new process for new requests and
        record its pid. The client is in charge to keep track of pid.
        The new process uses DAS core call to request the data into cache.
        Since query are cached the repeated call with the same query
        has no cost to DAS core.

        Returns either the pid of a submitted/still running request or the
        result of ``self.datastream`` holding the head and data of a
        finished, failed or rejected request.
        """
        # do not allow caching
        set_no_cache_flags()

        # if busy return right away
        if self.busy():
            reason = "DAS server is busy"
            # the original called the non-existent ``nworkds`` here, which
            # raised AttributeError instead of reporting the busy status
            reason += ", #requests=%s, #workers=%s, queue size=%s" % (
                self.reqmgr.size(),
                self.taskmgr.nworkers(),
                self.queue_limit,
            )
            head = dict(timestamp=time.time())
            head.update({"status": "busy", "reason": reason, "ctime": 0})
            data = []
            return self.datastream(dict(head=head, data=data))

        uinput = kwargs.get("input", "").strip()
        if not uinput:
            head = {"status": "fail", "reason": "No input found", "args": kwargs, "ctime": 0, "input": uinput}
            data = []
            return self.datastream(dict(head=head, data=data))
        self.adjust_input(kwargs)
        pid = kwargs.get("pid", "")
        inst = kwargs.get("instance", self.dbs_global)
        uinput = kwargs.get("input", "")
        view = kwargs.get("view", "list")
        data = []

        # textual views need text only error messages...
        html_mode = self._is_web_request(view)
        check, content = self.generate_dasquery(uinput, inst, html_mode=html_mode)
        if check:
            head = dict(timestamp=time.time())
            head.update(
                {"status": "fail", "reason": "Can not interpret the query" + " (while creating DASQuery)", "ctime": 0}
            )
            if not html_mode:
                head["error_details"] = content
                head["reason"] = head["reason"] + "\n\n" + content
            return self.datastream(dict(head=head, data=data))

        dasquery = content  # returned content is valid DAS query
        status, error, reason = self.dasmgr.get_status(dasquery)
        kwargs.update({"status": status, "error": error, "reason": reason})
        if not pid:
            pid = dasquery.qhash
        if status is None and not self.reqmgr.has_pid(pid):  # submit new request
            addr = cherrypy.request.headers.get("Remote-Addr")
            _evt, pid = self.taskmgr.spawn(self.dasmgr.call, dasquery, uid=addr, pid=dasquery.qhash)
            self.reqmgr.add(pid, kwargs)
            return pid
        if status == "ok":
            self.reqmgr.remove(pid)
            kwargs["dasquery"] = dasquery
            head, data = self.get_data(kwargs)
            return self.datastream(dict(head=head, data=data))
        kwargs["dasquery"] = dasquery.storage_query
        # a pid must match the pid pattern and be 32 characters long
        if not self.pid_pat.match(str(pid)) or len(str(pid)) != 32:
            self.reqmgr.remove(pid)
            head = {"status": "fail", "reason": "Invalid pid", "args": kwargs, "ctime": 0, "input": uinput}
            data = []
            return self.datastream(dict(head=head, data=data))
        elif self.taskmgr.is_alive(pid):
            return pid
        else:  # process is done, get data
            self.reqmgr.remove(pid)
            head, data = self.get_data(kwargs)
            return self.datastream(dict(head=head, data=data))

    def get_page_content(self, kwargs, complete_msg=True):
        """
        Retrieve page content for provided set of parameters.

        For the "json", "xml" and "plain" views with ``complete_msg`` set a
        short completion notice is returned. Otherwise the data are fetched
        via ``self.get_data`` and rendered by the ``<view>view`` method.
        An unsupported view, like any other failure except HTTPError (which
        is re-raised), yields the "das_error" template page.
        """
        page = ""
        try:
            view = kwargs.get("view", "list")
            if view == "plain":
                if "limit" in kwargs:
                    del kwargs["limit"]
            if view in ["json", "xml", "plain"] and complete_msg:
                page = "Request completed. Reload the page ..."
            else:
                # Validate the view before fetching any data and raise a real
                # exception: the original bare ``raise`` had no active
                # exception to re-raise and ran only after get_data.
                allowed_views = ["list", "table", "plain", "xml", "json"]
                if view not in allowed_views:
                    raise ValueError("Unsupported view: %s" % (view,))

                head, data = self.get_data(kwargs)
                func = getattr(self, view + "view")
                page = func(head, data)
        except HTTPError as _err:
            raise
        except Exception as exc:
            print_exc(exc)
            msg = gen_error_msg(kwargs)
            page = self.templatepage("das_error", msg=msg)
        return page

    @expose
    def download(self, lfn):
        "Render the 'filemover' template for the given LFN and wrap it into a page"
        return self.page(self.templatepage("filemover", lfn=lfn), response_div=False)

    @expose
    def makepy(self, dataset, instance):
        """
        Produce the CMSSW py snippet listing the files of a given dataset.

        Returns an error page when the dataset name does not match the
        expected pattern or when the file look-up fails; otherwise the
        rendered "das_files_py" template served as text/plain.
        """
        if not re.compile("/.*/.*/.*").match(dataset):
            return self.error("Invalid dataset name")
        query = "file dataset=%s instance=%s | grep file.name" % (dataset, instance)
        try:
            data = self.dasmgr.result(query, idx=0, limit=0)
        except Exception as exc:
            print_exc(exc)
            failure = "Exception: %s\n" % str(exc)
            failure += "Unable to retrieve data for query=%s" % query
            return self.error(failure)
        # collect file names in order of appearance, dropping duplicates
        lfns = []
        for rec in data:
            filename = DotDict(rec).get("file.name")
            if filename in lfns:
                continue
            lfns.append(filename)
        page = self.templatepage("das_files_py", lfnList=lfns, pfnList=[])
        cherrypy.response.headers["Content-Type"] = "text/plain"
        return page

    @expose
    @checkargs(DAS_WEB_INPUTS)
    def request(self, **kwargs):
        """
        Serve a DAS query coming from the web interface.

        A query with unknown status is handed to the task manager and a
        polling page is returned; a query whose status is "ok" or "fail"
        is rendered right away; for any other status the polling page is
        shown while the task is alive and the page content otherwise.
        """
        # do not allow caching
        set_no_cache_flags()

        uinput = kwargs.get("input", "").strip()
        if not uinput:
            kwargs["reason"] = "No input found"
            return self.redirect(**kwargs)

        # a busy server answers right away
        if self.busy():
            return self.busy_page(uinput)

        started = time.time()
        self.adjust_input(kwargs)
        view = kwargs.get("view", "list")
        inst = kwargs.get("instance", self.dbs_global)
        uinput = kwargs.get("input", "")
        html_view = view in ("list", "table")
        form = self.form(uinput=uinput, instance=inst, view=view)
        check, content = self.generate_dasquery(uinput, inst)
        if check:
            # content carries the message explaining why no query was built
            if html_view:
                return self.page(form + content, ctime=time.time() - started)
            return content
        dasquery = content  # content is a valid DAS query from here on
        status, error, reason = self.dasmgr.get_status(dasquery)
        kwargs.update({"status": status, "error": error, "reason": reason})
        pid = dasquery.qhash
        if status is None:
            # new request: spawn the DAS core call and register it
            kwargs["dasquery"] = dasquery.storage_query
            addr = cherrypy.request.headers.get("Remote-Addr")
            _evt, pid = self.taskmgr.spawn(self.dasmgr.call, dasquery, uid=addr, pid=dasquery.qhash)
            self.reqmgr.add(pid, kwargs)
        elif status in ("ok", "fail"):
            self.reqmgr.remove(pid)

            # check if query can be rewritten via nested PK query
            rew_msg = self.q_rewriter and self.q_rewriter.check_fields(dasquery)
            if rew_msg:
                content = self.templatepage("das_error", msg=rew_msg)
                return self.page(form + content, ctime=time.time() - started)

            kwargs["dasquery"] = dasquery
            page = self.get_page_content(kwargs, complete_msg=False)
            ctime = time.time() - started
            if html_view:
                return self.page(form + page, ctime=ctime)
            return page
        if self.taskmgr.is_alive(pid):
            # task still running: show the page which polls check_pid
            page = self.templatepage(
                "das_check_pid",
                method="check_pid",
                uinput=uinput,
                view=view,
                base=self.base,
                pid=pid,
                interval=self.interval,
            )
        else:
            self.reqmgr.remove(pid)
            page = self.get_page_content(kwargs)
        ctime = time.time() - started
        return self.page(form + page, ctime=ctime)

    @expose
    def status(self):
        """Show the requests currently held by the request manager"""
        pending = list(self.reqmgr.items())
        return self.page(self.templatepage("das_status", requests=pending))

    @expose
    @checkargs(["pid"])
    def check_pid(self, pid):
        """
        Report the state of the request identified by pid. This is a server
        callback function for ajaxCheckPid, see js/ajax_utils.js

        Returns a short HTML fragment, the output of ``self.request`` when
        the request has to be re-initiated from the Referer header, or an
        error fragment when the check itself fails.
        """
        # do not allow caching
        set_no_cache_flags()

        img = '<img src="%s/images/loading.gif" alt="loading"/>' % self.base
        page = ""
        try:
            if self.taskmgr.is_alive(pid):
                page = img + " processing PID=%s" % pid
            elif self.reqmgr.has_pid(pid):
                # the task is gone but the request manager still holds the
                # pid: the request was processed by this server
                self.reqmgr.remove(pid)
                page = "Request PID=%s is completed" % pid
                page += ", please wait for results to load"
            else:
                # the pid is unknown to this server, re-initiate the request
                # from the parameters found in the Referer URL
                ref = cherrypy.request.headers.get("Referer", None)
                if ref:
                    params = dict(parse_qsl(urlparse(ref).query))
                    return self.request(**params)
                msg = "No referer in cherrypy.request.headers"
                msg += "\nHeaders: %s" % cherrypy.request.headers
                print dastimestamp("DAS WEB ERROR "), msg
        except Exception as err:
            msg = "check_pid fails for pid=%s" % pid
            print dastimestamp("DAS WEB ERROR "), msg
            print_exc(err)
            # drop the pid from both managers before reporting the error
            self.reqmgr.remove(pid)
            self.taskmgr.remove(pid)
            return self.error(gen_error_msg({"pid": pid}), wrap=False)
        return page

    def listview(self, head, data):
        """Render head/data as a list view via self.repmgr.listview"""
        render = self.repmgr.listview
        return render(head, data)

    def tableview(self, head, data):
        """Render head/data as a tabular view via self.repmgr.tableview"""
        render = self.repmgr.tableview
        return render(head, data)

    def plainview(self, head, data):
        """Render head/data as a plain view via self.repmgr.plainview"""
        render = self.repmgr.plainview
        return render(head, data)

    def xmlview(self, head, data):
        """Render head/data as XML via self.repmgr.xmlview"""
        render = self.repmgr.xmlview
        return render(head, data)

    def jsonview(self, head, data):
        """Render head/data as JSON via self.repmgr.jsonview"""
        render = self.repmgr.jsonview
        return render(head, data)

    @exposedasjson
    @enable_cross_origin
    @checkargs(["query", "dbs_instance"])
    def autocomplete(self, **kwargs):
        """
        Autocomplete back-end of the DAS web UI.

        Returns the suggestions produced by ``autocomplete_helper``; when
        the dataset daemon is enabled and some suggestion mentions
        ``dataset=``, dataset names found by the DBS manager are appended.
        """
        query = kwargs.get("query", "").strip()
        result = autocomplete_helper(query, self.dasmgr, self.daskeys)
        dataset_hits = [row for row in result if row["value"].find("dataset=") != -1]
        dbsinst = kwargs.get("dbs_instance", self.dbs_global)
        if not (self.dataset_daemon and len(dataset_hits)):
            return result
        dbsmgr = self._get_dbsmgr(dbsinst)
        # look the datasets up by the bare pattern, without the key prefix
        if query.find("dataset=") != -1:
            query = query.replace("dataset=", "")
        for name in dbsmgr.find(query):
            suggestion = {"css": "ac-info", "value": "dataset=%s" % name, "info": "dataset"}
            result.append(suggestion)
        return result
Ejemplo n.º 23
0
class DASCore(object):
    """
    DAS core class.
    """
    def __init__(self,
                 config=None,
                 debug=0,
                 nores=False,
                 logger=None,
                 engine=None,
                 multitask=True):
        """
        Set up DAS core: configuration, task manager, mapping, key learning,
        raw cache and one service object per registered data-service.

        config    -- DAS configuration dict; read via das_readconfig() when
                     not provided. The dict is updated in place.
        debug     -- a non-zero int overrides the configured verbosity and
                     switches multitasking off.
        nores     -- when true, forces config 'write_cache' to True and is
                     stored as self.noresults (result() then skips the
                     cache look-up).
        logger    -- logger object; a PrintManager is created when omitted.
        engine    -- stored into the configuration under 'engine'.
        multitask -- pass False to explicitly disable multitasking.

        Raises Exception when a data-service plugin can not be loaded.
        """
        if config:
            dasconfig = config
        else:
            dasconfig = das_readconfig()
        verbose = dasconfig['verbose']
        self.stdout = debug
        # a non-zero integer debug level takes precedence over config verbosity
        if isinstance(debug, int) and debug:
            self.verbose = debug
            dasconfig['verbose'] = debug
        else:
            self.verbose = verbose
        # das_timer is called again with the same label at the end of the
        # constructor -- presumably a start/stop pair; confirm in das_timer
        das_timer('DASCore::init', self.verbose)
        self.operators = das_operators()
        self.collect_wait_time = dasconfig['das'].get('collect_wait_time', 120)

        # set noresults option
        self.noresults = False
        if nores:
            dasconfig['write_cache'] = True
            self.noresults = nores

        self.init_expire = dasconfig['das'].get('init_expire', 5 * 60)
        self.multitask = dasconfig['das'].get('multitask', True)
        if debug or self.verbose:
            self.multitask = False  # in verbose mode do not use multitask
            dasconfig['das']['multitask'] = False
        if not multitask:  # explicitly call DASCore ctor
            self.multitask = False
            dasconfig['das']['multitask'] = False
        dasconfig['engine'] = engine
        if self.multitask:
            nworkers = dasconfig['das'].get('core_workers', 5)
            # the engine-based PluginTaskManager variant below is disabled,
            # a plain TaskManager is always used
            #             if  engine:
            #                 thr_name = 'DASCore:PluginTaskManager'
            #                 self.taskmgr = PluginTaskManager(\
            #                         engine, nworkers=nworkers, name=thr_name)
            #                 self.taskmgr.subscribe()
            #             else:
            #                 thr_name = 'DASCore:TaskManager'
            #                 self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
            thr_name = 'DASCore:TaskManager'
            self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            # no task manager without multitasking; users of self.taskmgr
            # must be prepared to find None
            self.taskmgr = None

        if logger:
            self.logger = logger
        else:
            self.logger = PrintManager('DASCore', self.verbose)

        # define Mapping/Analytics/Parser in this order since Parser depends
        # on first two
        dasmapping = DASMapping(dasconfig)
        dasconfig['dasmapping'] = dasmapping
        self.mapping = dasmapping

        self.keylearning = DASKeyLearning(dasconfig)
        dasconfig['keylearning'] = self.keylearning

        # init DAS cache
        self.rawcache = DASMongocache(dasconfig)
        dasconfig['rawcache'] = self.rawcache

        # plug-in architecture: loop over registered data-services in
        # dasconfig; load appropriate module/class; register data
        # service with DASCore.
        self.systems = dasmapping.list_systems()
        # pointer to the DAS top level directory
        dasroot = '/'.join(__file__.split('/')[:-3])
        for name in self.systems:
            try:
                # klass first holds the relative path of the service file and
                # is then replaced by the class name found inside that file
                klass  = 'DAS/services/%s/%s_service.py' \
                    % (name, name)
                srvfile = os.path.join(dasroot, klass)
                with open(srvfile) as srvclass:
                    for line in srvclass:
                        # first class deriving from DASAbstractService wins
                        if line.find('(DASAbstractService)') != -1:
                            klass = line.split('(DASAbstractService)')[0]
                            klass = klass.split('class ')[-1]
                            break
                mname = 'DAS.services.%s.%s_service' % (name, name)
                module = __import__(mname, fromlist=[klass])
                obj = getattr(module, klass)(dasconfig)
                # the service object becomes an attribute named after the system
                setattr(self, name, obj)
            except IOError as err:
                if debug > 1:
                    # we have virtual services, so IOError can be correct
                    print_exc(err)
                # no service file: fall back to the generic service
                try:
                    mname = 'DAS.services.generic_service'
                    module = __import__(mname, fromlist=['GenericService'])
                    obj = module.GenericService(name, dasconfig)
                    setattr(self, name, obj)
                except Exception as exc:
                    print_exc(exc)
                    msg = "Unable to load %s data-service plugin" % name
                    raise Exception(msg)
            except Exception as exc:
                print_exc(exc)
                msg = "Unable to load %s data-service plugin" % name
                raise Exception(msg)

        # loop over systems and get system keys, add mapping keys to final list
        self.service_keys = {}
        self.service_parameters = {}
        for name in self.systems:
            skeys = list(getattr(self, name).keys())
            self.service_keys[getattr(self, name).name] = skeys
            sparams = getattr(self, name).parameters()
            self.service_parameters[getattr(self, name).name] = sparams

        self.service_keys['special'] = das_special_keys()
        self.dasconfig = dasconfig
        das_timer('DASCore::init', self.verbose)

    def keys(self):
        """
        Return map of data service keys
        """
        return self.service_keys

    def das_keys(self):
        """
        Return map of data service keys
        """
        _keys = ['records']
        for values in self.service_keys.values():
            for key in values:
                if key not in _keys:
                    _keys.append(key)
        return _keys

    def result(self, query, idx=0, limit=None):
        """
        Process the query and hand back its results from the cache.

        When no API covers the whole query it is decomposed into one query
        per selection field. An empty list is returned when the noresults
        option is set.
        """
        self.logger.info('input query=%s' % query)
        results = []
        dasquery = DASQuery(query)
        mongo_query = dasquery.mongo_query
        if dasquery.service_apis_map():
            self.call(dasquery)  # process query
        else:
            # no service covers the query, process every field on its own
            self.logger.info('no APIs found to answer input query, will decompose it')
            for key in (mongo_query['fields'] or []):
                self.call(DASQuery(dict(fields=[key], spec=mongo_query['spec'])))

        # lookup provided query in a cache
        if not self.noresults:
            results = self.get_from_cache(dasquery, idx, limit)
        return results

    def remove_from_cache(self, dasquery):
        """
        Delete in cache entries about input query
        """
        self.rawcache.remove_from_cache(dasquery)

    def get_status(self, dasquery):
        """
        Look-up status of provided query in a cache.
        Return status of the query request and its hash.
        """
        status = None
        error = None
        reason = None
        if dasquery and 'fields' in dasquery.mongo_query:
            fields = dasquery.mongo_query['fields']
            if fields and isinstance(fields, list) and 'queries' in fields:
                return 'ok', error, reason
        record = self.rawcache.find(dasquery)
        error, reason = self.rawcache.is_error_in_records(dasquery)
        try:
            if record and 'das' in record and 'status' in record['das']:
                status = record['das']['status']
                if not error:
                    error = record['das'].get('error', error)
                if not reason:
                    reason = record['das'].get('reason', reason)
                return status, error, reason
        except Exception as exc:
            print_exc(exc)
            status = error = reason = None
            self.rawcache.remove_from_cache(dasquery)
        return status, error, reason

    def status(self):
        "Return status of given service"
        sdict = {'das': self.taskmgr.status()}
        for srv in sorted(self.systems):
            sdict[srv] = getattr(getattr(self, srv), 'status')()
        return sdict

    def worker(self, srv, dasquery):
        """Invoke the call method of data-service srv for the given query, between two timer calls"""
        self.logger.info('##### %s ######\n' % srv)
        das_timer(srv, self.verbose)
        service = getattr(self, srv)
        service.call(dasquery)
        das_timer(srv, self.verbose)

    def insert_query_records(self, dasquery):
        """
        Insert DAS query records into DAS cache and return list of services
        which will answer this query.

        One query record is inserted per API reported by the apimap of each
        candidate service, followed by the DAS record itself with an
        initial expire timestamp. The returned list holds the services
        which reported at least one API, or all candidate services when
        none did.
        """
        services = dasquery.services
        self.logger.info('Potential services = %s' % services)
        if not services:
            # separators keep the three parts of the warning readable; the
            # original glued them together
            msg = 'No data-services for query %s' % dasquery
            msg += ', mongo_query: %s' % dasquery.mongo_query
            msg += ', params: %s' % dasquery.params()
            print(dastimestamp('DAS WARNING '), msg)

        # get list of URI which can answer this query
        ack_services = []
        for srv in services:
            # resolve the API map before any record of this service is inserted
            apis = list(getattr(self, srv).apimap(dasquery))
            for url, api, _args, _iformat, expire in apis:
                header = dasheader(srv, dasquery, expire, api, url, ctime=0)
                self.rawcache.insert_query_record(dasquery, header)
                if srv not in ack_services:
                    ack_services.append(srv)
        if not ack_services:
            ack_services = services
        # create das record with initial expire tstamp
        expire = time.time() + self.init_expire
        header = dasheader("das",
                           dasquery,
                           expire,
                           api='das_core',
                           services=dict(das=ack_services))
        header['lookup_keys'] = []
        self.rawcache.insert_query_record(dasquery, header)
        das_timer('das_record', self.verbose)
        return ack_services

    def call(self, query, **kwds):
        """
        Top level DAS api which execute a given query using underlying
        data-services. It follows the following steps:

            - parse input query
            - identify data-services based on selection keys
              and where clause conditions
            - construct DAS workflow and execute data-service
              API calls. At this step individual
              data-services store results into DAS cache.

        Return a status string: 'in cache' for look-ups of records, the
        stored status when the query is already found in the cache, 'fail'
        when a data-service call raises, otherwise 'ok' or 'fail' depending
        on whether service records are present after the merge step.

        kwds is provided for compatibility with web layer, e.g. it
        may invoke this method with additional pid parameter.
        """
        def update_das_query(dasquery, status, reason=None):
            "Update DAS query record with given status and reason"
            self.rawcache.update_query_record(dasquery, status, reason=reason)
            self.rawcache.add_to_record(\
                    dasquery, {'das.timer': get_das_timer()}, system='das')

        self.logger.info('input query=%s' % query)
        das_timer('DASCore::call', self.verbose)
        # accept a ready DASQuery object (recognised by its class name) or
        # anything DASQuery can be constructed from
        if  isinstance(query, object) and hasattr(query, '__class__')\
            and query.__class__.__name__ == 'DASQuery':
            dasquery = query
        else:
            dasquery = DASQuery(query)
        # clean up expired records in both collections before the look-up
        for col in ['merge', 'cache']:
            self.rawcache.remove_expired(dasquery, col)
        query = dasquery.mongo_query
        spec = query.get('spec')
        fields = query.get('fields')
        if fields == ['records']:
            msg = 'look-up all records in cache'
            self.logger.info(msg)
            return 'in cache'
        if spec == dict(records='*'):
            self.logger.info("look-up everything in cache")
            return 'in cache'
        # the first matching record found in the cache settles the status
        for record in self.rawcache.find_specs(dasquery):
            status = record['das']['status']
            msg = 'found query %s in cache, status=%s\n' \
                        % (record['query'], status)
            self.logger.info(msg)
            print(dastimestamp('DAS INFO'), msg)
            return status

        self.logger.info(dasquery)
        das_timer('das_record', self.verbose)
        services = self.insert_query_records(dasquery)
        if not services:
            msg = 'unable to locate data-services to fulfill this request'
            msg += ', will iterate over all registered services'
            print(dastimestamp('DAS WARNING '), dasquery, msg)
            services = dasquery.services if dasquery.services else self.systems
        try:
            if self.multitask:
                # one job per service, then wait for all of them
                jobs = []
                for srv in sorted(services):
                    jobs.append(self.taskmgr.spawn(self.worker, srv, dasquery))
                self.taskmgr.joinall(jobs)
            else:
                for srv in services:
                    self.worker(srv, dasquery)
        except Exception as exc:
            # NOTE(review): the query record is not updated on this path,
            # only 'fail' is returned -- confirm this is intended
            print_exc(exc)
            return 'fail'
        self.logger.info('\n##### merging ######\n')
        update_das_query(dasquery, 'merging')
        das_timer('merge', self.verbose)

        # check that all query record statuses are ok, i.e. we did insert records
        # this status is set by self.rawcache.update_cache
        # the loop polls once per second, at most collect_wait_time times
        for idx in range(self.collect_wait_time):
            records = self.rawcache.find_query_record(dasquery)
            statuses = []
            for row in records:
                system = row['das']['system']
                status = row['das']['status']
                self.logger.info("### query record status %s %s %s" %
                                 (dasquery.qhash, system, status))
                statuses.append(status)
            all_statuses = sorted(list(set(statuses)))
            # at this point we're looking that all services will have 'ok' and das status will be 'merging'
            if len(all_statuses) == 2 and all_statuses == ['merging', 'ok']:
                break
            time.sleep(1)

        # now we can merge records
        # NOTE(review): the value returned by merge_records is overwritten
        # with 'ok' a few lines below and never inspected -- confirm
        status = self.rawcache.merge_records(dasquery)
        das_timer('merge', self.verbose)
        # check if we have service records and properly setup status
        self.logger.info('\n##### check services ######\n')
        das_services = self.rawcache.check_services(dasquery)
        reason = ''
        status = 'ok'
        if not das_services:
            if 'records' in dasquery.query:
                status = 'ok'  # keep status ok for 'records' queries
            else:
                reason = 'no data records found in DAS cache'
                status = 'fail'
                print(dastimestamp('DAS ERROR '), dasquery, reason)
        update_das_query(dasquery, status, reason)
        das_timer('DASCore::call', self.verbose)
        return status

    def processing_time(self, dasquery):
        """
        Look-up and return DAS query processing time.

        The time is the difference between the last and the first entry
        of the ``ctime`` list stored in the ``das`` part of the query
        record found in the raw cache. None is returned when the record,
        its ``das`` dictionary or its ``ctime`` values are not available.
        """
        record = self.rawcache.find(dasquery)
        if not record:
            return None
        header = record.get('das', None)
        if not isinstance(header, dict):
            return None
        timestamps = header.get('ctime', [])
        if not timestamps:
            return None
        return timestamps[-1] - timestamps[0]

    def nresults(self, dasquery, coll='merge'):
        """
        Return total number of results (count) for provided query.
        Code should match body of get_from_cache method.

        - for map/reduce queries the records returned by the raw cache
          map_reduce call are counted
        - for queries with aggregators the number of aggregators is returned
        - otherwise the count is delegated to the raw cache for the
          requested collection *coll*
        """
        if dasquery.mapreduce:
            result = self.rawcache.map_reduce(dasquery.mapreduce, dasquery)
            # count lazily, there is no need to keep the records around
            return sum(1 for _ in result)
        if dasquery.aggregators:
            return len(dasquery.aggregators)
        return self.rawcache.nresults(dasquery, coll)

    def apilist(self, dasquery):
        "Return the list of APIs reported by the raw cache for given DAS query"
        cache = self.rawcache
        return cache.apilist(dasquery)

    def incache(self, dasquery, coll='merge'):
        """
        Tell whether the given query is present in DAS cache. The
        question is passed to the raw cache for the collection *coll*.
        """
        lookup = self.rawcache.incache
        return lookup(dasquery, collection=coll)

    def get_from_cache(self, dasquery, idx=0, limit=0, collection='merge'):
        """
        Look-up results from the merge cache and yield them for
        further processing.

        The records come from one of three sources:

        - map/reduce queries: the raw cache map_reduce call
        - queries with aggregators: every aggregator function is applied
          to the cached records of each data-service/API pair and one
          result record is produced per non-empty outcome; an 'N/A'
          record is added when a function produced nothing
        - all other queries: plain cache look-up

        *idx* and *limit* are passed only to the plain cache look-up.
        The first yielded record is given to the keylearning DB and
        fix_times is applied to every record before it is yielded.
        """
        das_timer('DASCore::get_from_cache', self.verbose)
        msg = 'col=%s, query=%s, idx=%s, limit=%s'\
                % (collection, dasquery, idx, limit)
        self.logger.info(msg)

        if dasquery.mapreduce:
            res = self.rawcache.map_reduce(dasquery.mapreduce, dasquery)
        elif dasquery.aggregators:
            # extract das information from rawcache
            rows = self.rawcache.get_from_cache(\
                    dasquery, collection=collection)
            # do not let StopIteration escape when cache has no records:
            # inside a generator it is converted into RuntimeError (PEP 479).
            # Without records there are no services to loop over, so every
            # aggregator ends up with the empty result record below.
            first = next(rows, None)
            sinfo = das_sinfo(first) if first is not None else {}
            # to perform aggregation we need:
            # - loop over all aggregator functions
            # - loop over all data-services
            # - loop over all APIs within a data-services
            # the code below does that, it applies aggregator
            # to selected (based on key/srv/api) records
            res = []
            _id = 0
            time0 = time.time()
            expire = 300  # min expire
            for func, key in dasquery.aggregators:
                afunc = getattr(das_aggregator, 'das_%s' % func)
                found = False
                for srv, apis in sinfo.items():
                    for api in apis:
                        # fresh look-up, the previous one is consumed
                        # by the aggregator function
                        rows = self.rawcache.get_from_cache(\
                                dasquery, collection=collection)
                        gen = api_rows(rows, api)
                        data = afunc(key, gen)
                        ctime = time.time() - time0
                        das = dasheader(srv,
                                        dasquery,
                                        expire,
                                        api=api,
                                        ctime=ctime)
                        if isinstance(data, dict) and data['value'] != 'N/A':
                            aggr = {
                                '_id': _id,
                                'function': func,
                                'key': key,
                                'result': data
                            }
                            aggr.update(das)
                            res.append(aggr)
                            _id += 1
                            found = True
                if not found:  # when we got nothing add empty result record
                    empty = {'value': 'N/A'}
                    ctime = time.time() - time0
                    das = dasheader('das',
                                    dasquery,
                                    expire,
                                    api='das_core',
                                    ctime=ctime)
                    rec = {
                        '_id': 0,
                        'function': func,
                        'key': key,
                        'result': empty
                    }
                    rec.update(das)
                    res.append(rec)
        else:
            res = self.rawcache.get_from_cache(dasquery, idx, limit, \
                    collection=collection)
        # we assume that all records from single query will have
        # identical structure, therefore it will be sufficient to update
        # keylearning DB only with first record
        count = 0
        for row in res:
            if not count:
                self.keylearning.add_record(dasquery, row)
            fix_times(row)
            yield row
            count += 1
        das_timer('DASCore::get_from_cache', self.verbose)
Ejemplo n.º 24
0
    def __init__(self,
                 config=None,
                 debug=0,
                 nores=False,
                 logger=None,
                 engine=None,
                 multitask=True):
        """
        Initialize DAS core: read configuration, set up task manager,
        logger, mapping, keylearning and cache objects and load plugins
        of all data-services registered in DAS mapping.

        - *config* is DAS configuration, if it is not provided the
          configuration is read via das_readconfig
        - *debug* when it is non-zero integer it overrides the verbose
          level from configuration; any true value turns multitask off
        - *nores* is kept as self.noresults, when set it also forces
          write_cache=True in configuration
        - *logger* is an optional logger, by default PrintManager is used
        - *engine* is stored in configuration under 'engine' key
        - *multitask* set it to False to turn multitask off explicitly

        The configuration object is modified in place and is passed to
        mapping, keylearning, cache and data-service objects.
        Raise Exception if data-service plugin cannot be loaded.
        """
        if config:
            dasconfig = config
        else:
            dasconfig = das_readconfig()
        verbose = dasconfig['verbose']
        self.stdout = debug
        # non-zero integer debug takes precedence over configured verbosity
        if isinstance(debug, int) and debug:
            self.verbose = debug
            dasconfig['verbose'] = debug
        else:
            self.verbose = verbose
        das_timer('DASCore::init', self.verbose)
        self.operators = das_operators()
        self.collect_wait_time = dasconfig['das'].get('collect_wait_time', 120)

        # set noresults option
        self.noresults = False
        if nores:
            dasconfig['write_cache'] = True
            self.noresults = nores

        self.init_expire = dasconfig['das'].get('init_expire', 5 * 60)
        self.multitask = dasconfig['das'].get('multitask', True)
        if debug or self.verbose:
            self.multitask = False  # in verbose mode do not use multitask
            dasconfig['das']['multitask'] = False
        if not multitask:  # explicitly call DASCore ctor
            self.multitask = False
            dasconfig['das']['multitask'] = False
        # engine is only recorded in configuration here, the task manager
        # below is created without it (see commented out block)
        dasconfig['engine'] = engine
        if self.multitask:
            nworkers = dasconfig['das'].get('core_workers', 5)
            #             if  engine:
            #                 thr_name = 'DASCore:PluginTaskManager'
            #                 self.taskmgr = PluginTaskManager(\
            #                         engine, nworkers=nworkers, name=thr_name)
            #                 self.taskmgr.subscribe()
            #             else:
            #                 thr_name = 'DASCore:TaskManager'
            #                 self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
            thr_name = 'DASCore:TaskManager'
            self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        if logger:
            self.logger = logger
        else:
            self.logger = PrintManager('DASCore', self.verbose)

        # define Mapping/Analytics/Parser in this order since Parser depends
        # on first two
        dasmapping = DASMapping(dasconfig)
        dasconfig['dasmapping'] = dasmapping
        self.mapping = dasmapping

        self.keylearning = DASKeyLearning(dasconfig)
        dasconfig['keylearning'] = self.keylearning

        # init DAS cache
        self.rawcache = DASMongocache(dasconfig)
        dasconfig['rawcache'] = self.rawcache

        # plug-in architecture: loop over registered data-services in
        # dasconfig; load appropriate module/class; register data
        # service with DASCore.
        self.systems = dasmapping.list_systems()
        # pointer to the DAS top level directory, i.e. path of this file
        # without its last three components
        dasroot = '/'.join(__file__.split('/')[:-3])
        for name in self.systems:
            try:
                # klass holds the relative path of the service file first and
                # is replaced by the class name once it is found in that file
                klass  = 'DAS/services/%s/%s_service.py' \
                    % (name, name)
                srvfile = os.path.join(dasroot, klass)
                with open(srvfile) as srvclass:
                    # find the name of the class derived from
                    # DASAbstractService by scanning the source code
                    for line in srvclass:
                        if line.find('(DASAbstractService)') != -1:
                            klass = line.split('(DASAbstractService)')[0]
                            klass = klass.split('class ')[-1]
                            break
                mname = 'DAS.services.%s.%s_service' % (name, name)
                module = __import__(mname, fromlist=[klass])
                obj = getattr(module, klass)(dasconfig)
                # data-service object becomes an attribute of DASCore
                setattr(self, name, obj)
            except IOError as err:
                if debug > 1:
                    # we have virtual services, so IOError can be correct
                    print_exc(err)
                # there is no service file, fall back to generic service
                try:
                    mname = 'DAS.services.generic_service'
                    module = __import__(mname, fromlist=['GenericService'])
                    obj = module.GenericService(name, dasconfig)
                    setattr(self, name, obj)
                except Exception as exc:
                    print_exc(exc)
                    msg = "Unable to load %s data-service plugin" % name
                    raise Exception(msg)
            except Exception as exc:
                print_exc(exc)
                msg = "Unable to load %s data-service plugin" % name
                raise Exception(msg)

        # loop over systems and get system keys, add mapping keys to final list
        self.service_keys = {}
        self.service_parameters = {}
        for name in self.systems:
            # both maps are keyed by the name reported by the service object
            skeys = list(getattr(self, name).keys())
            self.service_keys[getattr(self, name).name] = skeys
            sparams = getattr(self, name).parameters()
            self.service_parameters[getattr(self, name).name] = sparams

        self.service_keys['special'] = das_special_keys()
        self.dasconfig = dasconfig
        das_timer('DASCore::init', self.verbose)
Ejemplo n.º 25
0
class DASAbstractService(object):
    """
    Abstract class describing DAS service. It initialized with a name which
    is used to identify service parameters from DAS configuration file.
    Those parameters are keys, verbosity level, URL of the data-service.
    """
    def __init__(self, name, config):
        """
        Set up data-service *name* from DAS configuration *config*.

        The configuration should provide 'verbose', 'dasmapping', 'das'
        and 'mongodb' entries as well as non-empty 'rawcache' one;
        'write_cache' and 'engine' entries are optional.
        Raise Exception when configuration cannot be parsed or when
        rawcache is not defined.
        """
        self.name = name
        try:
            self.verbose = config['verbose']
            self.logger = PrintManager(
                'DASAbstactService_%s' % self.name, self.verbose)
            self.dasmapping = config['dasmapping']
            self.write2cache = config.get('write_cache', True)
            das_section = config['das']
            self.multitask = das_section.get('multitask', True)
            self.error_expire = das_section.get('error_expire', 300)
            self.dbs_global = None # to be configured at run time
            self.dburi = config['mongodb']['dburi']
            engine = config.get('engine', None)
            self.gfs = db_gridfs(self.dburi)
        except Exception as exc:
            print_exc(exc)
            raise Exception('fail to parse DAS config')

        # read key/cert info, work without them if they are not available
        try:
            self.ckey, self.cert = get_key_cert()
        except Exception as exc:
            print_exc(exc)
            self.ckey = self.cert = None

        if  not self.multitask:
            self.taskmgr = None
        else:
            nworkers = config['das'].get('api_workers', 3)
            # weights are given as 'system:weight' strings, the number of
            # workers is multiplied by the weight of this data-service
            for entry in config['das'].get('thread_weights', []):
                system, weight = entry.split(':')
                if  system == self.name:
                    nworkers *= int(weight)
            if  engine:
                self.taskmgr = PluginTaskManager(
                    engine, nworkers=nworkers,
                    name='DASAbstractService:%s:PluginTaskManager' % self.name)
                self.taskmgr.subscribe()
            else:
                self.taskmgr = TaskManager(
                    nworkers=nworkers,
                    name='DASAbstractService:%s:TaskManager' % self.name)

        self.map = {} # to be defined by data-service implementation
        self._keys = None # to be defined at run-time in self.keys
        self._params = None # to be defined at run-time in self.parameters
        self._notations = {} # to be defined at run-time in self.notations

        self.logger.info('initialized')
        # define internal cache manager to put 'raw' results into cache
        rawcache = config.get('rawcache', None)
        if  not rawcache:
            raise Exception(
                'Undefined rawcache, please check your configuration')
        self.localcache = rawcache

    def services(self):
        """
        Return sub-subsystems used to retrieve data records. It is used
        in dasheader call to setup das.services field. This method can be
        overwritten in sub-classes, otherwise returns dict of service name
        and CMS systems used to retrieve data records.
        """
        return {self.name:[self.name]}

    def version(self):
        """Return data-services version, should be implemented in sub-classes"""
        return ''

    def keys(self):
        """
        Return service keys
        """
        if  self._keys:
            return self._keys
        srv_keys = []
        for _api, params in self.map.items():
            for key in params['keys']:
                if  not key in srv_keys:
                    srv_keys.append(key)
        self._keys = srv_keys
        return srv_keys

    def parameters(self):
        """
        Return mapped service parameters
        """
        if  self._params:
            return self._params
        srv_params = []
        for _api, params in self.map.items():
            for key in params['params']:
                param_list = self.dasmapping.api2das(self.name, key)
                for par in param_list:
                    if  not par in srv_params:
                        srv_params.append(par)
        self._params = srv_params
        return srv_params

    def notations(self):
        """
        Return a map of system notations.
        """
        if  self._notations:
            return self._notations
        for _, rows in self.dasmapping.notations(self.name).items():
            for row in rows:
                api  = row['api']
                nmap = row['rec_key']
                notation = row['api_output']
                if  api in self._notations:
                    self._notations[api].update({notation:nmap})
                else:
                    self._notations[api] = {notation:nmap}
        return self._notations

    def getdata(self, url, params, expire, headers=None, post=None):
        """
        URL call wrapper around module level getdata function.
        The key and certificate of the service are passed along
        only for URLs which contain 'https:'.
        """
        call_args = [url, params, headers, expire, post,
                     self.error_expire, self.verbose]
        if  url.find('https:') != -1:
            call_args += [self.ckey, self.cert]
        return getdata(*call_args, system=self.name)

    def call(self, dasquery):
        """
        Invoke service API to execute given query.
        Return results as a collect list set.
        """
        self.logger.info(dasquery)
        # check the cache for records with given query/system
        res = self.localcache.incache(dasquery,
                                      collection='cache',
                                      system=self.name)
        if  res:
            msg  = "found records in local cache"
            self.logger.info(msg)
            return
        # ask data-service api to get results, they'll be store them in
        # cache, so return at the end what we have in cache.
        self.api(dasquery)

    def write_to_cache(self, dasquery, expire, url, api, args, gen, ctime):
        """
        Write provided result set into DAS cache.
        """
        if  not self.write2cache:
            return

        # before going to cache we should check/set possible misses, e.g.
        # primary key when error is thrown
        result = self.set_misses(dasquery, api, gen)

        # update the cache
        header = dasheader(self.name, dasquery, expire, api, url,
                services=self.services())
        header['lookup_keys'] = self.lookup_keys(api)
        header['prim_key'] = self.dasmapping.primary_mapkey(self.name, api)
        header['ctime'] = ctime
        self.localcache.update_cache(dasquery, result, header)

        msg  = 'cache has been updated,\n'
        self.logger.debug(msg)

    def adjust_params(self, api, kwds, instance=None):
        """
        Data-service specific parser to adjust parameters according to
        its specifications. For example, DQ service accepts a string
        of parameters, rather parameter set, while DBS2 can reuse
        some parameters for different API, e.g. I can use dataset path
        to pass to listPrimaryDatasets as primary_dataset pattern.
        """
        pass

    def lookup_keys(self, api):
        """
        Return look-up keys of data output for given data-service API.
        """
        lkeys = self.dasmapping.lookup_keys(self.name, api)
        return [{api:lkeys}]

    def inspect_params(self, api, args):
        """
        Perform API parameter inspection. Check if API accept a range
        of parameters, etc.
        """
        for key, value in args.items():
            if  isinstance(value, dict):
                minval = None
                maxval = None
                for oper, val in value.items():
                    if  oper == '$in':
                        minval = int(val[0])
                        maxval = int(val[-1])
                        args[key] = range(minval, maxval)
                    elif oper == '$lt':
                        maxval = int(val)
                        args[key] = maxval
                    elif oper == '$lte':
                        maxval = int(val)
                        args[key] = maxval
                    elif oper == '$gt':
                        minval = int(val)
                        args[key] = minval
                    elif oper == '$gte':
                        minval = int(val)
                        args[key] = minval
                    else:
                        msg = '%s does not support operator %s' % (api, oper)
                        raise Exception(msg)
        return args

    def get_notations(self, api):
        """Return notations used for given API"""
        notationmap = self.notations()
        if  not notationmap:
            return {}
        notations = {}
        if  '' in notationmap:
            notations = dict(notationmap['']) # notations applied to all APIs
            if  api in notationmap: # overwrite the one for provided API
                notations.update(notationmap[api])
        return notations

    def parser(self, dasquery, dformat, data, api):
        """
        DAS data parser. Input parameters:

        - *query* input DAS query
        - *dformat* is a data format, e.g. XML, JSON
        - *data* is a data source, either file-like object or
          actual data
        - *api* is API name
        """
        prim_key  = self.dasmapping.primary_key(self.name, api)
        counter   = 0
        if  dformat.lower() == 'xml':
            tags = self.dasmapping.api2daskey(self.name, api)
            gen  = xml_parser(data, prim_key, tags)
            for row in gen:
                counter += 1
                yield row
        elif dformat.lower() == 'json' or dformat.lower() == 'dasjson':
            gen  = json_parser(data, self.logger)
            das_dict = {}
            for row in gen:
                if  dformat.lower() == 'dasjson':
                    for key, val in row.items():
                        if  key != 'results':
                            das_dict[key] = val
                    row = row['results']
                if  isinstance(row, list):
                    for item in row:
                        if  item:
                            if  prim_key in item:
                                counter += 1
                                yield item
                            else:
                                counter += 1
                                yield {prim_key:item}
                else:
                    if  prim_key in row:
                        counter += 1
                        yield row
                    else:
                        counter += 1
                        yield {prim_key:row}
        else:
            msg = 'Unsupported data format="%s", API="%s"' % (dformat, api)
            raise Exception(msg)
        msg  = "api=%s, format=%s " % (api, dformat)
        msg += "prim_key=%s yield %s rows" % (prim_key, counter)
        self.logger.info(msg)

    def translator(self, api, genrows):
        """
        Convert raw results into DAS records. 
        """
        prim_key  = self.dasmapping.primary_key(self.name, api)
        count = 0
        for row in genrows:
            row2das(self.dasmapping.notation2das, self.name, api, row)
            count += 1
            # check for primary key existance, since it can be overriden
            # by row2das. For example DBS3 uses flat namespace, so we
            # override dataset=>name, while dataset still is a primary key
            if  isinstance(row, list):
                yield {prim_key:row}
            elif  prim_key in row:
                if  prim_key in row[prim_key]:
                    yield row[prim_key] # remapping may create nested dict
                else:
                    yield row
            else:
                yield {prim_key:row}
        msg = "yield %s rows" % count
        self.logger.debug(msg)

    def set_misses(self, dasquery, api, genrows):
        """
        Check and adjust DAS records wrt input query. If some of the DAS
        keys are missing, add it with its value to the DAS record.

        It is a generator. The keys to adjust are determined from the
        first record only: these are the keys of the query spec whose
        value differs from the one found in that record. Nothing is
        yielded when *genrows* provides no records.
        """
        # look-up primary key
        prim_key = self.dasmapping.primary_key(self.name, api)

        # Scan all docs and store those whose size above MongoDB limit into
        # GridFS
        map_key = self.dasmapping.primary_mapkey(self.name, api)
        genrows = parse2gridfs(self.gfs, map_key, genrows, self.logger)

        spec = dasquery.mongo_query['spec']
        try:
            row = next(genrows)
        except StopIteration:
            # no records; finish quietly since StopIteration which leaves
            # a generator body is converted into RuntimeError (PEP 479)
            return
        ddict = DotDict(row)
        keys2adjust = []
        for key in spec.keys():
            val = ddict.get(key)
            if  spec[key] != val and key not in keys2adjust:
                keys2adjust.append(key)
        msg = "adjust keys %s" % keys2adjust
        self.logger.debug(msg)
        count = 0
        if  keys2adjust:
            # adjust of the rows
            for row in yield_rows(row, genrows):
                ddict = DotDict(row)
                pval = ddict.get(map_key)
                if  isinstance(pval, dict) and 'error' in pval:
                    # error record, keep the error under primary key
                    ddict[map_key] = ''
                    ddict.update({prim_key: pval})
                for key in keys2adjust:
                    value = spec[key]
                    existing_value = ddict.get(key)
                    # the way to deal with proximity/pattern/condition results
                    if  (isinstance(value, str) or isinstance(value, unicode))\
                        and value.find('*') != -1: # we got pattern
                        if  existing_value:
                            value = existing_value
                    elif isinstance(value, dict) or \
                        isinstance(value, list): # we got condition
                        if  existing_value:
                            value = existing_value
                        elif isinstance(value, dict) and \
                        '$in' in value: # we got a range {'$in': []}
                            value = value['$in']
                        elif isinstance(value, dict) and \
                        '$lte' in value and '$gte' in value:
                            # we got a between range
                            value = [value['$gte'], value['$lte']]
                        else:
                            value = json.dumps(value)
                    elif existing_value and value != existing_value:
                        # we got proximity results, keep the value found in
                        # the record under 'proximity' and put the requested
                        # one into the record
                        if  'proximity' in ddict:
                            proximity = DotDict({key:existing_value})
                            ddict['proximity'].update(proximity)
                        else:
                            proximity = DotDict({})
                            proximity[key] = existing_value
                            ddict['proximity'] = proximity
                    else:
                        if  existing_value:
                            value = existing_value
                    ddict[key] = value
                yield ddict
                count += 1
        else:
            # nothing to adjust, pass records through; the first record
            # was already taken from the stream and it is counted too
            yield row
            count += 1
            for row in genrows:
                yield row
                count += 1
        msg = "yield %s rows" % count
        self.logger.debug(msg)
            
    def api(self, dasquery):
        """
        Data service api method, can be defined by data-service class.
        It parse input query and invoke appropriate data-service API
        call. All results are stored into the DAS cache along with
        api call inserted into Analytics DB.
        """
        self.logger.info(dasquery)
        genrows = self.apimap(dasquery)
        if  not genrows:
            return
        jobs = []
        for url, api, args, dformat, expire in genrows:
            # insert DAS query record for given API
            header = dasheader(self.name, dasquery, expire, api, url)
            self.localcache.insert_query_record(dasquery, header)
            # fetch DAS data records
            if  self.multitask:
                jobs.append(self.taskmgr.spawn(self.apicall, \
                            dasquery, url, api, args, dformat, expire))
            else:
                self.apicall(dasquery, url, api, args, dformat, expire)
        if  self.multitask:
            self.taskmgr.joinall(jobs)

    def apicall(self, dasquery, url, api, args, dformat, expire):
        """
        Invoke single data-service API call and store its results into
        DAS cache: inspect parameters, fetch data from *url*, parse and
        translate them into DAS records and write them to cache.
        Can be defined by data-service class.

        - *dasquery* input DAS query
        - *url*, *api*, *args* URL, name and parameters of the API
        - *dformat* data format passed to make_headers and parser
        - *expire* expire value, it is replaced by the one returned
          from getdata call

        Failures are printed out rather than propagated to the caller.

        We invoke explicitly close call for our datastream instead
        of using context manager since this method as well as
        getdata/parser can be overwritten by child classes.
        """
        datastream = None
        try:
            args = self.inspect_params(api, args)
            time0 = time.time()
            headers = make_headers(dformat)
            datastream, expire = self.getdata(url, args, expire, headers)
            self.logger.info("%s expire %s" % (api, expire))
            rawrows = self.parser(dasquery, dformat, datastream, api)
            dasrows = self.translator(api, rawrows)
            # NOTE(review): parser/translator of this class are generators,
            # so ctime is taken before the rows are consumed by
            # write_to_cache -- confirm that this is intended
            ctime = time.time() - time0
            self.write_to_cache(dasquery, expire, url, api, args,
                    dasrows, ctime)
        except Exception as exc:
            msg = 'Fail to process: url=%s, api=%s, args=%s' \
                    % (url, api, args)
            print(msg)
            print_exc(exc)
        finally:
            # release the stream on any exit, including the exceptions
            # which are not caught above
            close(datastream)

    def url_instance(self, url, _instance):
        """
        Virtual method to adjust URL for a given instance,
        must be implemented in service classes
        """
        return url

    def adjust_url(self, url, instance):
        """
        Adjust data-service URL wrt provided instance, e.g.
        DBS carry several instances
        """
        if  instance:
            url = self.url_instance(url, instance)
        return url

    def apimap(self, dasquery):
        """
        Analyze input query and yield url, api, args, format, expire
        for further processing.

        It is a generator which loops over all APIs from self.map and
        yields a tuple for every API which passes all checks below,
        applied in this order:

        - API has URL (after adjustment wrt query instance)
        - dasmapping.check_api_match accepts API for query conditions
        - every condition key sets at least one API parameter and there
          is at least one condition
        - there is no 'required' parameter value left after service
          specific adjustment and removal of 'optional' ones
        - API look-up keys are equal to selection keys of the query

        The yielded args is a copy of API parameters updated with
        condition values, where '*' is replaced by API wild-card.
        """
        srv   = self.name # get local copy to avoid threading issues
        cond  = getarg(dasquery.mongo_query, 'spec', {})
        # instance from the query, otherwise the one configured for service
        instance = dasquery.mongo_query.get('instance', self.dbs_global)
        skeys = getarg(dasquery.mongo_query, 'fields', [])
        if  not skeys:
            skeys = []
        self.logger.info("\n")
        for api, value in self.map.items():
            expire = value['expire']
            iformat = value['format']
            url    = self.adjust_url(value['url'], instance)
            if  not url:
                msg = '--- rejects API %s, no URL' % api
                self.logger.info(msg)
                continue
            args   = dict(value['params']) # make new copy, since we'll adjust
            wild   = value.get('wild_card', '*')
            found  = 0 # number of API parameters set from query conditions
            # check if input parameters are covered by API
            if  not self.dasmapping.check_api_match(srv, api, cond):
                msg = '--- rejects API %s, does not cover input condition keys' \
                        % api
                self.logger.info(msg)
                continue
            # once we know that API covers input set of parameters we check
            # every input parameter for pattern matching
            for key, val in cond.items():
                # check if keys from conditions are accepted by API
                # need to convert key (which is daskeys.map) into
                # input api parameter
                for apiparam in self.dasmapping.das2api(srv, api, key, val):
                    if  apiparam in args:
                        args[apiparam] = val
                        found += 1
            # VK 20160708, wrong statement, it caused to pass
            # datasets API for query dataset in [path1, path2]
            # I'll leave block here until I test and verify that
            # commented out block will not cause other issues
            #
            # check the case when we only have single condition key
            # and it is the key we look-up
#             if  not found and skeys == [k.split('.')[0] for k in cond.keys()]:
#                 found = 1
            # check if number of keys on cond and args are the same
            if  len(cond.keys()) != found:
                msg = "--- reject API %s, not all condition keys are covered" \
                        % api
                self.logger.info(msg)
                msg = 'args=%s' % args
                self.logger.debug(msg)
                continue
            # at this point found equals to number of conditions, therefore
            # the check below rejects API when query has no conditions
            if  not found:
                msg = "--- rejects API %s, parameters don't match" % api
                self.logger.info(msg)
                msg = 'args=%s' % args
                self.logger.debug(msg)
                continue
            # let data-service implementation to adjust API parameters
            self.adjust_params(api, args, instance)
            # delete args keys whose value is optional
            delete_keys(args, 'optional')
            # check that there is no "required" parameter left in args,
            # since such api will not work
            if 'required' in args.values():
                msg = '--- rejects API %s, parameter is required' % api
                self.logger.info(msg)
                msg = 'args=%s' % args
                self.logger.debug(msg)
                continue
            # adjust pattern symbols in arguments
            if  wild != '*':
                for key, val in args.items():
                    # `unicode` is Python 2 text type
                    if  isinstance(val, str) or isinstance(val, unicode):
                        val   = val.replace('*', wild)
                    args[key] = val

            # compare query selection keys with API look-up keys
            api_lkeys = self.dasmapping.api_lkeys(srv, api)
            if  set(api_lkeys) != set(skeys):
                msg = "--- rejects API %s, api_lkeys(%s)!=skeys(%s)"\
                        % (api, api_lkeys, skeys)
                self.logger.info(msg)
                continue

            msg = '+++ %s passes API %s' % (srv, api)
            self.logger.info(msg)
            msg = 'args=%s' % args
            self.logger.debug(msg)

            msg  = "yield "
            msg += "system ***%s***, url=%s, api=%s, args=%s, format=%s, " \
                % (srv, url, api, args, iformat)
            msg += "expire=%s, wild_card=%s" \
                % (expire, wild)
            self.logger.debug(msg)

            yield url, api, args, iformat, expire
Ejemplo n.º 26
0
class DASCore(object):
    """
    DAS core class.
    """
    def __init__(self, config=None, debug=None,
                nores=False, logger=None, engine=None, multitask=True):
        """
        Initialize DAS core: read configuration, set up task manager and
        logger, create mapping, analytics, query parser, key-learning and
        cache objects and load a plugin for every system returned by
        DAS mapping list_systems().

        Input parameters:
            - config, DAS configuration dictionary; when it is not provided
              the configuration is obtained via das_readconfig()
            - debug, verbosity level; an integer value overrides the
              'verbose' option of the configuration
            - nores, if set the result method does not read results
              back from cache
            - logger, logger object; PrintManager is created when omitted
            - engine, if set PluginTaskManager is used, otherwise TaskManager
            - multitask, set it to False to turn off usage of task manager

        The configuration dictionary is updated in place with the created
        objects (dasmapping, dasanalytics, mongoparser, keylearning,
        rawcache) before it is passed to data-service plugins.

        Raise Exception if data-service plugin can not be loaded.
        """
        if  config:
            dasconfig = config
        else:
            dasconfig = das_readconfig()
        verbose       = dasconfig['verbose']
        self.stdout   = debug
        if  isinstance(debug, int):
            self.verbose = debug
            dasconfig['verbose'] = debug
        else:
            self.verbose = verbose
        das_timer('DASCore::init', self.verbose)
        self.operators = das_operators()

        # set noresults option
        self.noresults = False
        if  nores:
            dasconfig['write_cache'] = True
            self.noresults = nores

        self.multitask = dasconfig['das'].get('multitask', True)
        if  debug or self.verbose:
            self.multitask = False # in verbose mode do not use multitask
            dasconfig['das']['multitask'] = False
        if  not multitask: # explicitly call DASCore ctor, e.g. in analytics
            self.multitask = False
            dasconfig['das']['multitask'] = False
        dasconfig['engine'] = engine
        if  self.multitask:
            nworkers = dasconfig['das'].get('core_workers', 5)
            if  engine:
                thr_name = 'DASCore:PluginTaskManager'
                self.taskmgr = PluginTaskManager(\
                        engine, nworkers=nworkers, name=thr_name)
                self.taskmgr.subscribe()
            else:
                thr_name = 'DASCore:TaskManager'
                self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        if  logger:
            self.logger = logger
        else:
            self.logger = PrintManager('DASCore', self.verbose)

        # define Mapping/Analytics/Parser in this order since Parser depends
        # on first two
        dasmapping = DASMapping(dasconfig)
        dasconfig['dasmapping'] = dasmapping
        self.mapping = dasmapping

        self.analytics = DASAnalytics(dasconfig)
        dasconfig['dasanalytics'] = self.analytics

        self.mongoparser = ql_manager(dasconfig)
        dasconfig['mongoparser'] = self.mongoparser

        self.keylearning = DASKeyLearning(dasconfig)
        dasconfig['keylearning'] = self.keylearning

        # init DAS cache
        self.rawcache = DASMongocache(dasconfig)
        dasconfig['rawcache'] = self.rawcache

        # plug-in architecture: loop over registered data-services in
        # dasconfig; load appropriate module/class; register data
        # service with DASCore.
        self.systems = dasmapping.list_systems()
        # pointer to the DAS top level directory
        dasroot = '/'.join(__file__.split('/')[:-3])
        for name in self.systems:
            try:
                klass  = 'DAS/services/%s/%s_service.py' \
                    % (name, name)
                srvfile = os.path.join(dasroot, klass)
                # scan service file to find out the name of the class which
                # inherits from DASAbstractService; open() is used instead
                # of file() since the latter exists only in Python 2
                with open(srvfile) as srvclass:
                    for line in srvclass:
                        if  line.find('(DASAbstractService)') != -1:
                            klass = line.split('(DASAbstractService)')[0]
                            klass = klass.split('class ')[-1]
                            break
                mname  = 'DAS.services.%s.%s_service' % (name, name)
                module = __import__(mname, fromlist=[klass])
                obj = getattr(module, klass)(dasconfig)
                setattr(self, name, obj)
                SERVICES[name] = obj
            except IOError as err:
                # debug may be None (default value), guard the comparison
                if  debug and debug > 1:
                    # we have virtual services, so IOError can be correct
                    print_exc(err)
                # there is no dedicated service file, use generic service
                # NOTE(review): generic services are not added to SERVICES,
                # confirm that this is intended
                try:
                    mname  = 'DAS.services.generic_service'
                    module = __import__(mname, fromlist=['GenericService'])
                    obj    = module.GenericService(name, dasconfig)
                    setattr(self, name, obj)
                except Exception as exc:
                    print_exc(exc)
                    msg = "Unable to load %s data-service plugin" % name
                    raise Exception(msg)
            except Exception as exc:
                print_exc(exc)
                msg = "Unable to load %s data-service plugin" % name
                raise Exception(msg)

        # loop over systems and get system keys, add mapping keys to final list
        self.service_keys = {}
        self.service_parameters = {}
        for name in self.systems:
            skeys = getattr(self, name).keys()
            self.service_keys[getattr(self, name).name] = skeys
            sparams = getattr(self, name).parameters()
            self.service_parameters[getattr(self, name).name] = sparams

        self.service_keys['special'] = das_special_keys()
        self.dasconfig = dasconfig
        das_timer('DASCore::init', self.verbose)

    def keys(self):
        """
        Return map of data service keys
        """
        return self.service_keys

    def das_keys(self):
        """
        Return map of data service keys
        """
        _keys = ['records']
        for values in self.service_keys.values():
            for key in values:
                if  key not in _keys:
                    _keys.append(key)
        return _keys

    def result(self, query, idx=0, limit=None):
        """
        Process given query via call method and return its results from
        cache (see get_from_cache). If no API covers the whole query it is
        decomposed into a set of queries, one per selection key, which
        share the spec of the original one. An empty list is returned when
        noresults option is set.
        """
        self.logger.info('input query=%s' % query)
        dasquery = DASQuery(query, mongoparser=self.mongoparser)
        dasquery.add_to_analytics()
        mongo_query = dasquery.mongo_query
        # check if we have any service which cover the query
        # otherwise decompose it into list of queries
        if  dasquery.service_apis_map():
            # NOTE(review): call() is invoked with its default
            # add_to_analytics=True, so add_to_analytics() looks to be
            # called twice for this query, confirm that it is intended
            self.call(dasquery) # process query
        else:
            self.logger.info(\
                'no APIs found to answer input query, will decompose it')
            for key in mongo_query['fields'] or []:
                subquery = DASQuery(\
                        dict(fields=[key], spec=mongo_query['spec']),
                        mongoparser=self.mongoparser)
                self.call(subquery) # process query

        # lookup provided query in a cache
        if  self.noresults:
            return []
        return self.get_from_cache(dasquery, idx, limit)

    def remove_from_cache(self, dasquery):
        """
        Delete in cache entries about input query
        """
        self.rawcache.remove_from_cache(dasquery)

    def get_status(self, dasquery):
        """
        Look-up status of provided query in a cache.
        Return status of the query request and its hash.
        """
        if  dasquery and dasquery.mongo_query.has_key('fields'):
            fields = dasquery.mongo_query['fields']
            if  fields and isinstance(fields, list) and 'queries' in fields:
                return 'ok', dasquery.qhash
        status = 0
        record = self.rawcache.find(dasquery)
        try:
            if  record and record.has_key('das') and \
                record['das'].has_key('status'):
                status = record['das']['status']
                return status, record['qhash']
        except:
            pass

        similar_dasquery = self.rawcache.similar_queries(dasquery)
        if  similar_dasquery:
            record = self.rawcache.find(similar_dasquery)
            if  record and record.has_key('das') and \
                record['das'].has_key('status'):
                similar_query_status = record['das']['status']
                return similar_query_status, record['qhash']
        return status, 0

    def worker(self, srv, dasquery):
        """
        Invoke call method of the data-service registered under srv name
        and pass it the given query; das_timer is called with the service
        name before and after the call.
        """
        self.logger.info('##### %s ######\n' % srv)
        das_timer(srv, self.verbose)
        service = getattr(self, srv)
        service.call(dasquery)
        das_timer(srv, self.verbose)

    def call(self, query, add_to_analytics=True, **kwds):
        """
        Top level DAS api which execute a given query using underlying
        data-services. It follows the following steps:

            - parse input query
            - identify data-services based on selection keys
              and where clause conditions
            - construct DAS workflow and execute data-service
              API calls. At this step individual
              data-services store results into DAS cache.

        Return status string of the request, can be used by workers on
        cache server:

            - 'in cache' for queries which only look-up cache content
            - status stored in cache record of the same or similar query
              ('N/A' if the record of similar query has no status)
            - 'fail' if data-service call raised an exception
            - 'ok' when data-services were called and results were merged

        query is either DASQuery object or an input for DASQuery
        constructor; add_to_analytics controls if the query is registered
        via its add_to_analytics method.

        kwds is provided for compatibility with web layer, e.g. it
        may invoke this method with additional pid parameter.
        """
        self.logger.info('input query=%s' % query)
        das_timer('DASCore::call', self.verbose)
        services = []
        if  isinstance(query, object) and hasattr(query, '__class__')\
            and query.__class__.__name__ == 'DASQuery':
            dasquery = query
        else:
            dasquery = DASQuery(query, mongoparser=self.mongoparser)
        if  add_to_analytics:
            dasquery.add_to_analytics()
        query = dasquery.mongo_query
        # membership test is used instead of deprecated dict.has_key
        if  'system' in query:
            system = query['system']
            if  isinstance(system, (str, unicode)):
                services = [system]
            elif isinstance(system, list):
                services = system
            else:
                msg = 'Unsupported system=%s type=%s in DAS query' \
                        % (system, type(system))
                raise Exception(msg)
        spec   = query.get('spec')
        fields = query.get('fields')
        if  fields == ['records']:
            msg = 'look-up all records in cache'
            self.logger.info(msg)
            return 'in cache'
        if  spec == dict(records='*'):
            self.logger.info("look-up everything in cache")
            return 'in cache'
        # the first record found for this query defines the status
        for record in self.rawcache.find_specs(dasquery):
            status = record['das']['status']
            msg = 'found query %s in cache, status=%s\n' \
                        % (record['query'], status)
            self.logger.info(msg)
            return status
        similar_dasquery = self.rawcache.similar_queries(dasquery)
        if  similar_dasquery:
            for record in self.rawcache.find_specs(similar_dasquery):
                if  not record:
                    # nothing to report for empty record; previously such
                    # record led to usage of unbound status variable
                    continue
                try:
                    status = record['das']['status']
                except Exception:
                    status = 'N/A'
                    msg = 'Fail to look-up das.status, record=%s' % record
                    self.logger.info(msg)
                msg  = 'found SIMILAR query in cache,'
                msg += 'query=%s, status=%s\n' % (record['query'], status)
                self.logger.info(msg)
                return status

        self.logger.info(dasquery)
        params = dasquery.params()
        if  not services:
            services = params['services']
        self.logger.info('services = %s' % services)
        das_timer('das_record', self.verbose)
        # initial expire tstamp 1 day (long enough to be overwriten by data-srv)
        expire = expire_timestamp(time.time()+1*24*60*60)
        header = dasheader("das", dasquery, expire)
        header['lookup_keys'] = []
        self.rawcache.insert_query_record(dasquery, header)
        das_timer('das_record', self.verbose)
        try:
            if  self.multitask:
                # spawn one job per data-service and wait for all of them
                jobs = []
                for srv in services:
                    jobs.append(self.taskmgr.spawn(self.worker, srv, dasquery))
                self.taskmgr.joinall(jobs)
            else:
                for srv in services:
                    self.worker(srv, dasquery)
        except Exception as exc:
            print_exc(exc)
            return 'fail'
        self.logger.info('\n##### merging ######\n')
        self.rawcache.update_query_record(dasquery, 'merging')
        das_timer('merge', self.verbose)
        self.rawcache.merge_records(dasquery)
        das_timer('merge', self.verbose)
        self.rawcache.update_query_record(dasquery, 'ok')
        self.rawcache.add_to_record(\
                dasquery, {'das.timer': get_das_timer()}, system='das')
        das_timer('DASCore::call', self.verbose)
        return 'ok'

    def nresults(self, dasquery, coll='merge'):
        """
        Return total number of results (count) for provided query
        Code should match body of get_from_cache method.
        """
        fields = dasquery.mongo_query.get('fields', None)
        if  dasquery.mapreduce:
            result = self.rawcache.map_reduce(dasquery.mapreduce, dasquery)
            return len([1 for _ in result])
        elif dasquery.aggregators:
            return len(dasquery.aggregators)
        elif isinstance(fields, list) and 'queries' in fields:
            return len([1 for _ in self.get_queries(dasquery)])
        return self.rawcache.nresults(dasquery, coll)

    def incache(self, dasquery, coll='merge'):
        """
        Answer the question if given query in DAS cache or not
        """
        return self.rawcache.incache(dasquery, collection=coll)

    def get_from_cache(self, dasquery, idx=0, limit=0, collection='merge'):
        """
        Look-up results of the given query and yield them for
        further processing. The source of results depends on the query:

            - map-reduce query, results of raw cache map_reduce call
            - query with aggregators, one record per aggregator with
              the outcome of corresponding das_aggregator function
            - query for 'queries' field, records from get_queries method
            - otherwise records of raw cache collection

        idx and limit define the slice of results for the last two cases.
        For 'queries' look-up a limit which evaluates to False (default 0
        or None, as passed by result method) means no limit; the raw cache
        gets idx and limit values as is. Times of every row are adjusted
        via fix_times before the row is yielded.
        """
        das_timer('DASCore::get_from_cache', self.verbose)
        msg = 'col=%s, query=%s, idx=%s, limit=%s'\
                % (collection, dasquery, idx, limit)
        self.logger.info(msg)

        fields  = dasquery.mongo_query.get('fields', None)

        if  dasquery.mapreduce:
            res = self.rawcache.map_reduce(dasquery.mapreduce, dasquery)
        elif dasquery.aggregators:
            res = []
            for _id, (func, key) in enumerate(dasquery.aggregators):
                # rows are requested for every aggregator since the
                # aggregator function consumes them
                rows = self.rawcache.get_from_cache(\
                        dasquery, collection=collection)
                data = getattr(das_aggregator, 'das_%s' % func)(key, rows)
                res.append(\
                {'_id':_id, 'function': func, 'key': key, 'result': data})
        elif isinstance(fields, list) and 'queries' in fields:
            # islice(gen, idx, idx+0) yields nothing and idx+None fails,
            # therefore pass stop=None (no upper bound) when limit is unset
            stop = idx + limit if limit else None
            res = itertools.islice(self.get_queries(dasquery), idx, stop)
        else:
            res = self.rawcache.get_from_cache(dasquery, idx, limit, \
                    collection=collection)
        for row in res:
            fix_times(row)
            yield row
        das_timer('DASCore::get_from_cache', self.verbose)

    def get_queries(self, dasquery):
        """
        Yield queries known to DAS analytics as dictionaries with
        das_query and _id keys. Popular queries are looked-up when
        'popular' is among query fields, otherwise the list of queries is
        requested for the date condition of the query spec: a dictionary
        with '$in' pair of values (after, before), an integer (after)
        or no condition at all. Any other date value raises an Exception.
        """
        das_timer('DASCore::get_queries', self.verbose)
        mongo_query = dasquery.mongo_query
        fields = mongo_query.get('fields')
        spec   = mongo_query.get('spec')
        if  'popular' in fields:
            rows = self.analytics.get_popular_queries(spec)
        else:
            datestamp = spec.get('date')
            if  isinstance(datestamp, dict):
                date_range = datestamp.get('$in')
                rows = self.analytics.list_queries(\
                        after=date_range[0], before=date_range[1])
            elif isinstance(datestamp, int):
                rows = self.analytics.list_queries(after=datestamp)
            elif not datestamp:
                rows = self.analytics.list_queries()
            else:
                raise Exception('Unsupported date value: %s' % datestamp)
        for row in rows:
            # _id is moved out of the query record
            rid = row.pop('_id')
            yield {'das_query': row, '_id': rid}
        das_timer('DASCore::get_queries', self.verbose)
Ejemplo n.º 27
0
Archivo: das_core.py Proyecto: ktf/DAS
    def __init__(self, config=None, debug=0,
                nores=False, logger=None, engine=None, multitask=True):
        """
        Initialize DAS core: read configuration, set up task manager and
        logger, create mapping, analytics, key-learning and cache objects
        and load a plugin for every system returned by DAS mapping
        list_systems().

        Input parameters:
            - config, DAS configuration dictionary; when it is not provided
              the configuration is obtained via das_readconfig()
            - debug, verbosity level; an integer value overrides the
              'verbose' option of the configuration
            - nores, stored as noresults attribute when set
            - logger, logger object; PrintManager is created when omitted
            - engine, if set PluginTaskManager is used, otherwise TaskManager
            - multitask, set it to False to turn off usage of task manager

        The configuration dictionary is updated in place with the created
        objects (dasmapping, dasanalytics, keylearning, rawcache) before
        it is passed to data-service plugins.

        Raise Exception if data-service plugin can not be loaded.
        """
        if  config:
            dasconfig = config
        else:
            dasconfig = das_readconfig()
        verbose       = dasconfig['verbose']
        self.stdout   = debug
        if  isinstance(debug, int):
            self.verbose = debug
            dasconfig['verbose'] = debug
        else:
            self.verbose = verbose
        das_timer('DASCore::init', self.verbose)
        self.operators = das_operators()

        # set noresults option
        self.noresults = False
        if  nores:
            dasconfig['write_cache'] = True
            self.noresults = nores

        self.multitask = dasconfig['das'].get('multitask', True)
        if  debug or self.verbose:
            self.multitask = False # in verbose mode do not use multitask
            dasconfig['das']['multitask'] = False
        if  not multitask: # explicitly call DASCore ctor, e.g. in analytics
            self.multitask = False
            dasconfig['das']['multitask'] = False
        dasconfig['engine'] = engine
        if  self.multitask:
            nworkers = dasconfig['das'].get('core_workers', 5)
            if  engine:
                thr_name = 'DASCore:PluginTaskManager'
                self.taskmgr = PluginTaskManager(\
                        engine, nworkers=nworkers, name=thr_name)
                self.taskmgr.subscribe()
            else:
                thr_name = 'DASCore:TaskManager'
                self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        if  logger:
            self.logger = logger
        else:
            self.logger = PrintManager('DASCore', self.verbose)

        # define Mapping/Analytics/Parser in this order since Parser depends
        # on first two
        dasmapping = DASMapping(dasconfig)
        dasconfig['dasmapping'] = dasmapping
        self.mapping = dasmapping

        self.analytics = DASAnalytics(dasconfig)
        dasconfig['dasanalytics'] = self.analytics

        self.keylearning = DASKeyLearning(dasconfig)
        dasconfig['keylearning'] = self.keylearning

        # init DAS cache
        self.rawcache = DASMongocache(dasconfig)
        dasconfig['rawcache'] = self.rawcache

        # plug-in architecture: loop over registered data-services in
        # dasconfig; load appropriate module/class; register data
        # service with DASCore.
        self.systems = dasmapping.list_systems()
        # pointer to the DAS top level directory
        dasroot = '/'.join(__file__.split('/')[:-3])
        for name in self.systems:
            try:
                klass  = 'DAS/services/%s/%s_service.py' \
                    % (name, name)
                srvfile = os.path.join(dasroot, klass)
                # find the name of the class derived from DASAbstractService
                # in service file; open() replaces file() builtin which is
                # available only in Python 2
                with open(srvfile) as srvclass:
                    for line in srvclass:
                        if  line.find('(DASAbstractService)') != -1:
                            klass = line.split('(DASAbstractService)')[0]
                            klass = klass.split('class ')[-1]
                            break
                mname  = 'DAS.services.%s.%s_service' % (name, name)
                module = __import__(mname, fromlist=[klass])
                obj = getattr(module, klass)(dasconfig)
                setattr(self, name, obj)
                SERVICES[name] = obj
            except IOError as err:
                # do not compare None with a number if debug=None is passed
                if  debug and debug > 1:
                    # we have virtual services, so IOError can be correct
                    print_exc(err)
                # service file is not available, use generic service
                try:
                    mname  = 'DAS.services.generic_service'
                    module = __import__(mname, fromlist=['GenericService'])
                    obj    = module.GenericService(name, dasconfig)
                    setattr(self, name, obj)
                except Exception as exc:
                    print_exc(exc)
                    msg = "Unable to load %s data-service plugin" % name
                    raise Exception(msg)
            except Exception as exc:
                print_exc(exc)
                msg = "Unable to load %s data-service plugin" % name
                raise Exception(msg)

        # loop over systems and get system keys, add mapping keys to final list
        self.service_keys = {}
        self.service_parameters = {}
        for name in self.systems:
            skeys = getattr(self, name).keys()
            self.service_keys[getattr(self, name).name] = skeys
            sparams = getattr(self, name).parameters()
            self.service_parameters[getattr(self, name).name] = sparams

        self.service_keys['special'] = das_special_keys()
        self.dasconfig = dasconfig
        das_timer('DASCore::init', self.verbose)
Ejemplo n.º 28
0
class DASWebService(DASWebManager):
    """
    DAS web service interface.
    """
    def __init__(self, dasconfig):
        """
        Set up DAS web service: read 'web_server' and 'mongodb' parts of
        DAS configuration, create task manager, initialize connection to
        DAS core (see init method), start the monitoring thread and,
        if 'dbs_daemon' option is set, DBS daemons.

        dasconfig is DAS configuration dictionary.
        """
        DASWebManager.__init__(self, dasconfig)
        config = dasconfig['web_server']
        self.pid_pat     = re.compile(r'^[a-z0-9]{32}')
        self.base        = config['url_base']
        self.interval    = config.get('status_update', 2500)
        self.engine      = config.get('engine', None)
        nworkers         = config['number_of_workers']
        self.hot_thr     = config.get('hot_threshold', 3000)
        self.dasconfig   = dasconfig
        self.dburi       = self.dasconfig['mongodb']['dburi']
        self.lifetime    = self.dasconfig['mongodb']['lifetime']
        self.queue_limit = config.get('queue_limit', 50)
        if  self.engine:
            thr_name = 'DASWebService:PluginTaskManager'
            self.taskmgr = PluginTaskManager(\
                        bus=self.engine, nworkers=nworkers, name=thr_name)
            self.taskmgr.subscribe()
        else:
            thr_name = 'DASWebService:TaskManager'
            self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        self.adjust      = config.get('adjust_input', False)

        # init method sets self.dasmgr which is used below
        self.init()

        # Monitoring thread which performs auto-reconnection
        thread.start_new_thread(dascore_monitor, \
                ({'das':self.dasmgr, 'uri':self.dburi}, self.init, 5))

        # Obtain DBS global instance or set it as None;
        # membership test is used instead of deprecated dict.has_key
        if  'dbs' in self.dasconfig:
            dbsconfig = self.dasconfig['dbs']
            self.dbs_global = dbsconfig.get('dbs_global_instance', None)
            self.dbs_instances = dbsconfig.get('dbs_instances', [])
        else:
            self.dbs_global = None
            self.dbs_instances = []

        # Start DBS daemon
        self.dataset_daemon = config.get('dbs_daemon', False)
        if  self.dataset_daemon:
            self.dbs_daemon(config)

    def process_requests_onhold(self):
        "Start onhold_worker thread with limit of half of queue_limit"
        try:
            half_queue  = self.queue_limit/2
            worker_args = \
                (self.dasmgr, self.taskmgr, self.reqmgr, half_queue)
            thread.start_new_thread(onhold_worker, worker_args)
        except Exception as exc:
            print_exc(exc)

    def dbs_daemon(self, config):
        """
        Start DBS daemon if it is requested via DAS configuration.

        DBS URL of every instance from self.dbs_instances is made out of
        'dbs_global_url' by replacing the name of DBS global instance.
        For every URL a DBSDaemon is created, stored in self.dbsmgr
        dictionary and its update method is called in a separate thread
        in an endless loop.

        config is web server configuration, its 'dbs_daemon_interval'
        option (default 3600) defines the sleep time in seconds between
        updates and 'dbs_daemon_expire' (default 3600) is passed to
        DBSDaemon as expire value. Errors are printed, not raised.
        """
        def dbs_updater(_dbsmgr, interval):
            """DBS updater daemon"""
            while True:
                try:
                    _dbsmgr.update()
                except Exception as exc:
                    # keep daemon running but do not hide the failure
                    print_exc(exc)
                time.sleep(interval)
        try:
            main_dbs_url = self.dasconfig['dbs']['dbs_global_url']
            self.dbs_urls = []
            for inst in self.dbs_instances:
                self.dbs_urls.append(\
                        main_dbs_url.replace(self.dbs_global, inst))
            interval  = config.get('dbs_daemon_interval', 3600)
            dbsexpire = config.get('dbs_daemon_expire', 3600)
            self.dbsmgr = {} # dbs_urls vs dbs_daemons
            if  self.dataset_daemon:
                for dbs_url in self.dbs_urls:
                    dbsmgr = DBSDaemon(dbs_url, self.dburi, expire=dbsexpire)
                    self.dbsmgr[dbs_url] = dbsmgr
                    # single argument form works as statement and function
                    print("Start DBSDaemon for %s" % dbs_url)
                    thread.start_new_thread(dbs_updater, (dbsmgr, interval, ))
        except Exception as exc:
            print_exc(exc)

    def init(self):
        """
        Init DAS web server, connect to DAS Core.

        Create logging DB, request manager, DAS core, representation
        manager, GridFS and SiteDB objects, collect sorted list of DAS keys
        and assign a color to every DAS system. If any of these steps
        fails the error is printed, dasmgr is set to None, daskeys and
        colors are reset and the method returns. On success the on-hold
        requests daemon is started when 'onhold_daemon' option of
        'web_server' configuration is set.
        """
        try:
            self.logcol     = DASLogdb(self.dasconfig)
            self.reqmgr     = RequestManager(self.dburi, lifetime=self.lifetime)
            self.dasmgr     = DASCore(engine=self.engine)
            self.repmgr     = CMSRepresentation(self.dasconfig, self.dasmgr)
            self.daskeys    = self.dasmgr.das_keys()
            self.gfs        = db_gridfs(self.dburi)
            self.daskeys.sort()
            self.dasmapping = self.dasmgr.mapping
            self.dasmapping.init_presentationcache()
            # map of DAS system names and their colors
            self.colors = {}
            for system in self.dasmgr.systems:
                self.colors[system] = gen_color(system)
            self.sitedbmgr   = SiteDBService(self.dasconfig)
        except Exception as exc:
            print_exc(exc)
            # dasmgr=None marks that DAS core is not available
            self.dasmgr = None
            self.daskeys = []
            self.colors = {}
            return
        # Start Onhold_request daemon
        if  self.dasconfig['web_server'].get('onhold_daemon', False):
            self.process_requests_onhold()

    def logdb(self, query):
        """
        Insert a document about current cherrypy request into Logging DB.
        The document holds hash of the query, today's date as YYYYMMDD
        integer, request headers, method, path, arguments with their hash
        and ip, hostname, port of the remote side.
        """
        request = cherrypy.request
        qhash   = genkey(query)
        args    = request.params
        today   = str(date.fromtimestamp(time.time()))
        doc = {
            'qhash': qhash,
            'date': int(today.replace('-', '')),
            'headers': request.headers,
            'method': request.method,
            'path': request.path_info,
            'args': args,
            'ahash': genkey(args),
            'ip': request.remote.ip,
            'hostname': request.remote.name,
            'port': request.remote.port,
        }
        self.logcol.insert('web', doc)

    def get_nhits(self):
        "Count today's /cache requests w/o pid made from client's ip"
        midnight = time.mktime(date.today().timetuple())
        spec = {
            'ip': cherrypy.request.remote.ip,
            'ts': {'$gte': midnight},
            'args.pid': {'$exists': False}, # do not count pid requests
            'path': '/cache', # requests from das_client calls
        }
        return self.logcol.find(spec, count=True)

    @expose
    @checkargs(DAS_WEB_INPUTS)
    def redirect(self, **kwargs):
        """
        Return DAS redirect page which shows the reason of redirection.
        The reason is taken from 'reason' argument, by default it
        states that access to requested resource is not permitted.
        """
        default_reason = \
            'You do not have permission to access the resource requested.'
        reason = kwargs.get('reason', default_reason)
        # empty reason is passed to the template as is
        msg = 'Reason: ' + reason if reason else reason
        return self.page(self.templatepage('das_redirect', msg=msg),
                response_div=False)

    def bottom(self, response_div=True):
        """
        Return footer of DAS web pages, i.e. 'das_bottom' template
        rendered with DAS version and response_div flag
        """
        params = dict(div=response_div, version=DAS.version)
        return self.templatepage('das_bottom', **params)

    def page(self, content, ctime=None, response_div=True):
        """
        Compose DAS web page: output of top method, followed by
        provided content and 'das_bottom' template rendered with
        ctime, DAS version and response_div flag
        """
        upper_part = self.top() + content
        footer = self.templatepage('das_bottom', ctime=ctime,
                                   version=DAS.version, div=response_div)
        return upper_part + footer

    @expose
    @checkargs(DAS_WEB_INPUTS + ['section', 'highlight'])
    def faq(self, *args, **kwargs):
        """
        Return DAS FAQ page. Optional 'section' and 'highlight'
        arguments are passed to 'das_faq' template together with
        DBS-QL vs DAS-QL guide, DAS operators and aggregators.
        """
        operators   = ', '.join(das_operators())
        aggregators = ', '.join(das_aggregators())
        guide = self.templatepage('dbsql_vs_dasql', operators=operators)
        faq_page = self.templatepage('das_faq', guide=guide,
                section=kwargs.get('section', None),
                highlight=kwargs.get('highlight', None),
                operators=operators, aggregators=aggregators)
        return self.page(faq_page, response_div=False)

    @expose
    def cli(self):
        """
        Serve DAS/tools/das_client.py file as plain text. The file is
        looked-up relative to the directory three levels above this module.
        """
        path_parts = __file__.split('/')
        dasroot = '/'.join(path_parts[:-3])
        return serve_file(os.path.join(dasroot, 'DAS/tools/das_client.py'),
                content_type='text/plain')

    @expose
    def movetodas(self):
        "Return HTML snippet which asks DBS discovery users to move to DAS"
        style = "width:600px;margin-left:auto;margin-right:auto;padding-top:20px"
        chunks = [
            """<div style="%s">""" % style,
            "Dear user,<br/>DBS Data Discovery page is depricated.<br/>",
            "Please migrate to Data Aggregation Service located at",
            "<p>https://cmsweb.cern.ch/das/</p>",
            "<em>CMS HTTP group.</em>",
            "</div>",
        ]
        return ''.join(chunks)

    @expose
    def opensearch(self):
        """
        Return DAS opensearch description and set Content-Type of the
        response accordingly. The description is rendered with base URL
        of the server if it contains 'http://', otherwise with
        http://cmsweb.cern.ch/das
        """
        base = self.base
        if  not base or base.find('http://') == -1:
            base = 'http://cmsweb.cern.ch/das'
        description = self.templatepage('das_opensearch', base=base)
        content_type = 'application/opensearchdescription+xml'
        cherrypy.response.headers['Content-Type'] = content_type
        return description

    @expose
    @checkargs(DAS_WEB_INPUTS)
    def services(self, *args, **kwargs):
        """
        Return page about DAS services. For every system known to
        DAS core it lists DAS keys with their look-up keys and APIs
        of the system; the page also gets the list of all DAS keys
        and map-reduce records of DAS raw cache.
        """
        mapping = self.dasmgr.mapping
        dasdict = {}
        daskeys = []
        for system, keys in mapping.daskeys().iteritems():
            if  system in self.dasmgr.systems:
                lookup = {}
                for key in keys:
                    lookup[key] = mapping.lookup_keys(system, key)
                    if  key not in daskeys:
                        daskeys.append(key)
                dasdict[system] = \
                    {'keys': lookup, 'apis': mapping.list_apis(system)}
        mapreduce = list(self.dasmgr.rawcache.get_map_reduce())
        content = self.templatepage('das_services', dasdict=dasdict,
                        daskeys=daskeys, mapreduce=mapreduce)
        return self.page(content, response_div=False)

    @expose
    @checkargs(DAS_WEB_INPUTS)
    def api(self, name, **kwargs):
        """
        Return page with DAS mapping record of the API with given name;
        the record is represented via das_json.
        """
        record  = self.dasmgr.mapping.api_info(name)
        content = "<b>DAS mapping record</b>" + das_json(record)
        return self.page(content, response_div=False)

    @expose
    @checkargs(DAS_WEB_INPUTS)
    def default(self, *args, **kwargs):
        """
        Default method, it delegates the request to index method.
        """
        # NOTE(review): args tuple and kwargs dictionary are passed as two
        # positional arguments, they are not unpacked; confirm against
        # signature of index method
        page = self.index(args, kwargs)
        return page

    def adjust_input(self, kwargs):
        """
        Adjust user input wrt common DAS keyword patterns, e.g.
        Zee -> dataset=*Zee*, T1_US -> site=T1_US*. The adjusted value
        replaces 'input' entry of provided kwargs dictionary. Nothing is
        done if self.adjust is not set or if the part of the input before
        the first pipe is 'queries' or 'records'.

        The input is processed by free_text_parser function; if it leaves
        the input intact a select key is chosen via choose_select_key
        ('dataset' is passed as the third argument) and placed in front of
        the input unless the input already starts with it.
        """
        if  not self.adjust:
            return
        uinput = kwargs.get('input', '')
        if  uinput.split('|')[0] in ('queries', 'records'):
            return
        adjusted = free_text_parser(uinput, self.daskeys)
        if  uinput and adjusted == uinput:
            selkey = choose_select_key(uinput, self.daskeys, 'dataset')
            if  selkey and len(adjusted) > len(selkey) \
                and not adjusted.startswith(selkey):
                adjusted = '%s %s' % (selkey, adjusted)
        kwargs['input'] = adjusted

    def generate_dasquery(self, uinput, inst, html_error=True):
        """
        Check provided input as valid DAS input query.

        :param uinput: user input, either a query string or a dictionary
            (e.g. a query of the form {'spec': {'_id': id}})
        :param inst: value passed to DASQuery as its instance argument
        :param html_error: when true, error messages are rendered through
            the 'das_ambiguous' template; otherwise the bare message is
            returned
        :return: (status, content) pair. On success status is 0 and
            content is the DASQuery object; on failure status is 1 and
            content is the error message.
        """
        def helper(msg, html_error=None):
            """Helper function which provides the error template"""
            if not html_error:
                return msg
            guide = self.templatepage('dbsql_vs_dasql',
                        operators=', '.join(das_operators()))
            return self.templatepage('das_ambiguous', msg=msg,
                        base=self.base, guide=guide)

        if not uinput:
            # honour html_error here too, like every other error path
            return 1, helper('No input query', html_error)
        # Generate DASQuery object, if it fails we catch the exception and
        # wrap it for upper layer (web interface)
        try:
            dasquery = DASQuery(uinput, instance=inst)
        except Exception as err:
            return 1, helper(das_parser_error(uinput, str(err)), html_error)
        # 'fields' may be stored as None, treat that as an empty list
        fields = dasquery.mongo_query.get('fields', []) or []
        spec = dasquery.mongo_query.get('spec', {})
        # every selected field and every spec key must either be a DAS DB
        # keyword or contain one of the known DAS keys
        for word in list(fields) + list(spec.keys()):
            if word in DAS_DB_KEYWORDS:
                continue
            if any(key in word for key in self.daskeys):
                continue
            msg = 'Provided input does not contain a valid DAS key'
            return 1, helper(msg, html_error)
        # dictionary input and queries/records look-ups are not matched
        # against the service API map
        skip_api_lookup = isinstance(uinput, dict) \
            or uinput.find('queries') != -1 \
            or uinput.find('records') != -1
        if not skip_api_lookup: # normal user DAS query
            try:
                service_map = dasquery.service_apis_map()
            except Exception as exc:
                msg = 'Fail to lookup DASQuery service API map'
                print(msg)
                print_exc(exc)
                return 1, helper(msg, html_error)
            if not service_map:
                msg = "None of the API's registered in DAS "
                msg += "can resolve this query"
                return 1, helper(msg, html_error)
        return 0, dasquery

    @expose
    @checkargs(DAS_WEB_INPUTS)
    def index(self, *args, **kwargs):
        """
        Represents DAS web interface.
        The search form is built by self.form, filled with the 'input'
        request argument (empty by default) and with cards switched on;
        the form is then wrapped by self.page.
        """
        search_form = self.form(uinput=getarg(kwargs, 'input', ''), cards=True)
        return self.page(search_form)

    def form(self, uinput='', instance=None, view='list', cards=False):
        """
        Provide input DAS search form.

        `uinput` is the text placed into the form, `instance` falls back
        to self.dbs_global when it is not given, `view` is handed to the
        template as is, and `cards` is passed as the `show` value of the
        'das_cards' template. Returns the rendered 'das_searchform'
        template.
        """
        chosen_instance = instance if instance else self.dbs_global
        cards_html = self.templatepage('das_cards', base=self.base,
                show=cards, width=900, height=220,
                cards=help_cards(self.base))
        return self.templatepage('das_searchform', input=uinput,
                init_dbses=list(self.dbs_instances), base=self.base,
                instance=chosen_instance, view=view, cards=cards_html)

    @expose
    def error(self, msg, wrap=True):
        """
        Show error message.

        The message is converted to a string and rendered with the
        'das_error' template. With `wrap` set (default) the search form
        is put in front of it and both are wrapped by self.page;
        otherwise the bare template output is returned.
        """
        content = self.templatepage('das_error', msg=str(msg))
        if not wrap:
            return content
        return self.page(self.form() + content)

    @expose
    @checkargs(DAS_WEB_INPUTS)
    def gridfs(self, *args, **kwargs):
        """
        Retrieve records from GridFS.

        The file id is taken from the 'fid' keyword argument, turned into
        an ObjectId and looked up via self.gfs; the content read from the
        returned object is the response.

        Raises HTTPError with status 500 when 'fid' is missing, or when
        the look-up/read fails for any reason (the original exception is
        reported through print_exc).
        """
        if 'fid' not in kwargs:
            code = web_code('No file id')
            raise HTTPError(500, 'DAS error, code=%s' % code)
        try:
            fds = self.gfs.get(ObjectId(kwargs['fid']))
            return fds.read()
        except Exception as exc:
            print_exc(exc)
            code = web_code('Exception')
            raise HTTPError(500, 'DAS error, code=%s' % code)

    @expose
    @checkargs(DAS_WEB_INPUTS)
    def records(self, *args, **kwargs):
        """
        Retrieve DAS records or the list of all record ids.

        The look-up query is built from the first positional argument
        (record id), else from the '_id' keyword, else it matches all
        records. Paging is controlled by 'idx' and 'limit' (defaults 0
        and 10), the collection by 'collection' (default 'merge') and
        the instance by 'instance' (default self.dbs_global).

        With a positional record id every row is dumped via das_json.
        Otherwise each row is rendered with the 'das_record' template
        and preceded by the pagination template (or a "No results"
        message). Any exception is reported through print_exc and turned
        into an error page.
        """
        try:
            recordid = None
            if args:
                recordid = args[0]
                query = {'fields': None,
                         'spec': {'_id': ObjectId(recordid)}}
            elif '_id' in kwargs:
                query = {'fields': None,
                         'spec': {'_id': ObjectId(kwargs['_id'])}}
            else: # no id at all, return all ids
                query = {'fields': None, 'spec': {}}

            time0 = time.time()
            idx = getarg(kwargs, 'idx', 0)
            limit = getarg(kwargs, 'limit', 10)
            coll = kwargs.get('collection', 'merge')
            inst = kwargs.get('instance', self.dbs_global)
            form = self.form(uinput="")
            failed, content = self.generate_dasquery(query, inst)
            if failed:
                return self.page(form + content, ctime=time.time()-time0)
            dasquery = content # returned content is valid DAS query
            rawcache = self.dasmgr.rawcache
            nresults = rawcache.nresults(dasquery, coll)
            rows = rawcache.get_from_cache(
                dasquery, idx=idx, limit=limit, collection=coll)
            if recordid: # we got id, dump the matching record(s)
                page = ''.join(das_json(row) for row in rows)
            else:
                snippets = []
                for row in rows:
                    rid = row.pop('_id')
                    snippets.append(self.templatepage('das_record',
                        id=rid, collection=coll, daskeys=', '.join(row)))
                if nresults:
                    page = self.templatepage('das_pagination',
                        nrows=nresults, idx=idx, limit=limit,
                        url='/das/records?')
                else:
                    page = 'No results found, nresults=%s' % nresults
                page += ''.join(snippets)

            return self.page(form + page, ctime=time.time()-time0)
        except Exception as exc:
            print_exc(exc)
            return self.error(gen_error_msg(kwargs))

    @jsonstreamer
    def datastream(self, kwargs):
        """
        Stream DAS data into JSON format.

        Prepares the (head, data) pair handed to the jsonstreamer
        decorator. `kwargs` may carry 'head' (a dict, a fresh one with a
        timestamp is used by default) and 'data' (default: empty list).

        The head dictionary is modified in place: a missing
        'mongo_query' entry is filled from the 'dasquery' entry when
        present (otherwise with an empty dict), and the 'dasquery' and
        'args' entries are removed.
        """
        head = kwargs.get('head', dict(timestamp=time.time()))
        if 'mongo_query' not in head:
            head['mongo_query'] = \
                head['dasquery'].mongo_query if 'dasquery' in head else {}
        # these two entries are dropped from the head before it is returned
        head.pop('dasquery', None)
        head.pop('args', None)
        data = kwargs.get('data', [])
        return head, data

    def get_data(self, kwargs):
        """
        Invoke DAS workflow and get data from the cache.

        Reads 'input', 'instance', 'idx', 'limit', 'collection' and
        'dasquery' from `kwargs`. When 'dasquery' is present a DASQuery
        is built from it, otherwise the input goes through
        generate_dasquery (plain-text errors).

        Returns a (head, data) pair. head always carries 'timestamp' and
        'args'; on success it gets status 'ok', 'nresults', 'ctime',
        'dasquery' and 'incache', on failure status 'fail' with a
        'reason', and data is then an empty list.
        """
        head = {'timestamp': time.time(), 'args': kwargs}
        uinput = kwargs.get('input', '')
        inst = kwargs.get('instance', self.dbs_global)
        idx = getarg(kwargs, 'idx', 0)
        limit = getarg(kwargs, 'limit', 0) # do not impose limit
        coll = kwargs.get('collection', 'merge')
        stored_query = kwargs.get('dasquery', None)
        time0 = time.time()
        if stored_query:
            dasquery = DASQuery(stored_query, instance=inst)
        else:
            failed, content = \
                self.generate_dasquery(uinput, inst, html_error=False)
            if failed:
                head.update(status='fail', reason=content,
                            ctime=time.time()-time0, input=uinput)
                return head, []
            dasquery = content # returned content is valid DAS query
        try:
            nres = self.dasmgr.nresults(dasquery, coll)
            data = self.dasmgr.get_from_cache(dasquery, idx, limit)
            head.update(status='ok', nresults=nres,
                        ctime=time.time()-time0, dasquery=dasquery)
        except Exception as exc:
            print_exc(exc)
            head.update(status='fail', reason=str(exc),
                        ctime=time.time()-time0, dasquery=dasquery)
            data = []
        head['incache'] = self.dasmgr.incache(dasquery, coll='cache')
        return head, data

    def busy(self):
        """
        Report whether the server is busy: True when the size of the
        request queue minus the number of task manager workers exceeds
        self.queue_limit, False otherwise.
        """
        backlog = self.reqmgr.size() - self.taskmgr.nworkers()
        return backlog > self.queue_limit

    def busy_page(self, uinput=None):
        """
        DAS server busy page layout: the search form built for `uinput`
        followed by a "server is busy" headline, wrapped by self.page.
        """
        notice = "<h3>DAS server is busy, please try later</h3>"
        return self.page(self.form(uinput) + notice)

    @expose
    @checkargs(DAS_WEB_INPUTS)
    def cache(self, **kwargs):
        """
        DAS web cache interface.

        For a query whose status is already 'ok' the data are fetched
        via get_data and streamed back. Otherwise a new task is spawned
        through the task manager and its pid is returned; the client is
        in charge to keep track of the pid and to call again with it.
        A call with a 'pid' either returns the pid again (task still
        alive) or streams the data (task done).

        The response is the pid, or the output of self.datastream whose
        head carries a 'status' of 'fail', 'busy', 'onhold' or whatever
        get_data reports.
        """
        # remove expired records from merge collection
        self.dasmgr.rawcache.remove_expired('merge')

        # do not allow caching
        headers = cherrypy.response.headers
        headers['Cache-Control'] = 'no-cache'
        headers['Pragma'] = 'no-cache'

        uinput = kwargs.get('input', '').strip()
        if not uinput:
            head = {'status': 'fail', 'reason': 'No input found',
                    'args': kwargs, 'ctime': 0, 'input': uinput}
            return self.datastream({'head': head, 'data': []})

        self.adjust_input(kwargs)
        pid = kwargs.get('pid', '')
        inst = kwargs.get('instance', self.dbs_global)
        uinput = kwargs.get('input', '')
        failed, content = self.generate_dasquery(uinput, inst)
        if failed:
            head = {'timestamp': time.time(), 'status': 'fail',
                    'reason': 'Fail to create DASQuery object', 'ctime': 0}
            return self.datastream({'head': head, 'data': []})

        dasquery = content # returned content is valid DAS query
        status, _qhash = self.dasmgr.get_status(dasquery)
        if status == 'ok':
            self.reqmgr.remove(dasquery.qhash)
            head, data = self.get_data(kwargs)
            return self.datastream({'head': head, 'data': data})

        kwargs['dasquery'] = dasquery.storage_query
        if pid:
            # client polls for a request it has placed before
            pid_str = str(pid)
            if not self.pid_pat.match(pid_str) or len(pid_str) != 32:
                head = {'status': 'fail', 'reason': 'Invalid pid',
                        'args': kwargs, 'ctime': 0, 'input': uinput}
                return self.datastream({'head': head, 'data': []})
            if self.taskmgr.is_alive(pid):
                return pid
            # process is done, get data
            self.reqmgr.remove(pid)
            head, data = self.get_data(kwargs)
            return self.datastream({'head': head, 'data': data})

        # brand new request (no pid was given)
        if self.busy():
            head = {'timestamp': time.time(), 'status': 'busy',
                    'reason': 'DAS server is busy', 'ctime': 0}
            return self.datastream({'head': head, 'data': []})
        config = self.dasconfig.get('cacherequests', {})
        thr = threshold(self.sitedbmgr, self.hot_thr, config)
        nhits = self.get_nhits()
        # exceed threshold and server is busy: put request onhold
        if nhits > thr and self.busy():
            tstamp = time.time() + 60*(nhits/thr) + (nhits%thr)
            pid = dasquery.qhash
            self.reqmgr.add_onhold(
                pid, uinput, cherrypy.request.remote.ip, tstamp)
            head = {'status': 'onhold',
                    'mongo_query': dasquery.mongo_query,
                    'pid': pid, 'nresults': 0, 'ctime': 0,
                    'timestamp': time.time()}
            return self.datastream({'head': head, 'data': []})
        addr = cherrypy.request.headers.get('Remote-Addr')
        _evt, pid = self.taskmgr.spawn(
            self.dasmgr.call, dasquery, addr, pid=dasquery.qhash)
        self.logdb(uinput) # put entry in log DB once we place a request
        self.reqmgr.add(pid, kwargs)
        return pid

    def get_page_content(self, kwargs, complete_msg=True):
        """
        Retrieve page content for provided set of parameters.

        The 'view' entry of `kwargs` (default 'list') selects the
        "<view>view" method of this object used to render the output of
        get_data. For the 'plain' view the 'limit' entry is removed from
        `kwargs`. For json, xml and plain views a short completion
        message is returned instead of the data when `complete_msg` is
        set. HTTPError is passed through, any other exception is printed
        and replaced by the 'das_error' template.
        """
        try:
            view = kwargs.get('view', 'list')
            if view == 'plain':
                kwargs.pop('limit', None)
            if complete_msg and view in ('json', 'xml', 'plain'):
                return 'Request comlpeted. Reload the page ...'
            head, data = self.get_data(kwargs)
            render = getattr(self, view + "view")
            return render(head, data)
        except HTTPError:
            raise
        except Exception as exc:
            print_exc(exc)
            return self.templatepage('das_error', msg=gen_error_msg(kwargs))

    @expose
    def makepy(self, dataset, instance):
        """
        Request to create CMSSW py snippet for a given dataset.

        The dataset name has to match the '/.*/.*/.*' pattern, otherwise
        an error page is returned. The file names of the dataset are
        requested via self.dasmgr.result, duplicates are dropped
        (first occurrence wins) and the list is rendered with the
        'das_files_py' template as text/plain. A failing query yields an
        error page as well.
        """
        if not re.match('/.*/.*/.*', dataset):
            return self.error('Invalid dataset name')
        query = "file dataset=%s instance=%s | grep file.name" \
                % (dataset, instance)
        try:
            data = self.dasmgr.result(query, idx=0, limit=0)
        except Exception as exc:
            print_exc(exc)
            msg = 'Exception: %s\nUnable to retrieve data for query=%s' \
                    % (str(exc), query)
            return self.error(msg)
        lfns = []
        for rec in data:
            filename = DotDict(rec).get('file.name')
            if filename in lfns:
                continue
            lfns.append(filename)
        page = self.templatepage('das_files_py', lfnList=lfns, pfnList=[])
        cherrypy.response.headers['Content-Type'] = "text/plain"
        return page

    @expose
    @checkargs(DAS_WEB_INPUTS)
    def request(self, **kwargs):
        """
        Request data from DAS cache.

        An empty 'input' leads to a redirect with a 'reason'; a busy
        server yields the busy page. For a query whose status is 'ok'
        the page content is produced right away. Otherwise a task is
        spawned through the task manager and either the 'das_check_pid'
        template (task still alive) or the page content (task already
        done) is shown. The list and table views are wrapped by
        self.page together with the search form; error or ready content
        of any other view is returned as is.
        """
        # remove expired records from merge collection
        self.dasmgr.rawcache.remove_expired('merge')

        # do not allow caching
        headers = cherrypy.response.headers
        headers['Cache-Control'] = 'no-cache'
        headers['Pragma'] = 'no-cache'

        uinput = kwargs.get('input', '').strip()
        if not uinput:
            kwargs['reason'] = 'No input found'
            return self.redirect(**kwargs)

        time0 = time.time()
        self.adjust_input(kwargs)
        view = kwargs.get('view', 'list')
        inst = kwargs.get('instance', self.dbs_global)
        uinput = kwargs.get('input', '')
        if self.busy():
            return self.busy_page(uinput)
        ahash = genkey(cherrypy.request.params)
        self.logdb(uinput)
        form = self.form(uinput=uinput, instance=inst, view=view)
        html_view = view in ('list', 'table')
        failed, content = self.generate_dasquery(uinput, inst)
        if failed:
            if html_view:
                return self.page(form + content, ctime=time.time()-time0)
            return content

        dasquery = content # returned content is valid DAS query
        status, _qhash = self.dasmgr.get_status(dasquery)
        if status == 'ok':
            page = self.get_page_content(kwargs, complete_msg=False)
            ctime = time.time() - time0
            if html_view:
                return self.page(form + page, ctime=ctime)
            return page

        # data are not ready yet, place a new request
        kwargs['dasquery'] = dasquery.storage_query
        addr = cherrypy.request.headers.get('Remote-Addr')
        _evt, pid = self.taskmgr.spawn(self.dasmgr.call, dasquery, addr,
                            pid=dasquery.qhash)
        self.reqmgr.add(pid, kwargs)
        if self.taskmgr.is_alive(pid):
            page = self.templatepage('das_check_pid', method='check_pid',
                    uinput=uinput, view=view, ahash=ahash,
                    base=self.base, pid=pid, interval=self.interval)
        else:
            page = self.get_page_content(kwargs)
            self.reqmgr.remove(pid)
        return self.page(form + page, ctime=time.time()-time0)

    @expose
    def requests(self):
        """
        Return list of all current requests in DAS queue.

        Every row of self.reqmgr.items() becomes one list item showing
        its '_id', 'timestamp' and 'kwds' values, followed by the total
        number of requests; an empty queue gives a short notice instead.
        The values are HTML-escaped before they are placed into the
        page, since 'kwds' holds keywords of user requests.
        """
        # local import, the name is only needed here
        from xml.sax.saxutils import escape
        items = ['<li>%s placed at %s<br/>%s</li>' % (
                    escape('%s' % row['_id']),
                    escape('%s' % row['timestamp']),
                    escape('%s' % row['kwds']))
                 for row in self.reqmgr.items()]
        if not items:
            return self.page("The request queue is empty")
        page = "<ul>%s</ul>" % ''.join(items)
        page += '<div>Total: %s requests</div>' % len(items)
        return self.page(page)

    @expose
    @checkargs(['pid', 'ahash'])
    def check_pid(self, pid, ahash):
        """
        Check status of given pid and return appropriate page content.
        This is a server callback function for ajaxCheckPid, see
        js/ajax_utils.js

        While the task is alive a "processing" snippet is returned. Once
        it is done the request keywords are taken from the request
        manager (or, when they are gone, from the most recent logging DB
        entry with the given `ahash`), the request is removed and the
        page content is produced. On any failure the pid is removed from
        both request and task managers and an unwrapped error page is
        returned.
        """
        headers = cherrypy.response.headers
        headers['Cache-Control'] = 'no-cache'
        headers['Pragma'] = 'no-cache'
        img = '<img src="%s/images/loading.gif" alt="loading"/>' % self.base
        try:
            if self.taskmgr.is_alive(pid):
                return img + " processing PID=%s" % pid
            kwargs = self.reqmgr.get(pid)
            if kwargs and 'dasquery' in kwargs:
                del kwargs['dasquery']
            # if no kwargs (another request delete it)
            # use logging DB to look-up user request via ahash
            if not kwargs:
                cursor = self.logcol.find({'ahash': ahash})
                newest_first = list(cursor.sort([('ts', DESCENDING)]))
                kwargs = newest_first[0]['args']
                self.adjust_input(kwargs)
            self.reqmgr.remove(pid)
            return self.get_page_content(kwargs)
        except Exception as err:
            msg = 'check_pid fails for pid=%s' % pid
            print dastimestamp('DAS WEB ERROR '), msg
            print_exc(err)
            self.reqmgr.remove(pid)
            self.taskmgr.remove(pid)
            return self.error(gen_error_msg({'pid':pid}), wrap=False)

    def listview(self, head, data):
        """
        DAS listview data representation; `head` and `data` are handed
        over unchanged to self.repmgr.listview and its result is returned.
        """
        render = self.repmgr.listview
        return render(head, data)

    def tableview(self, head, data):
        """
        DAS tabular view data representation; `head` and `data` are handed
        over unchanged to self.repmgr.tableview and its result is returned.
        """
        render = self.repmgr.tableview
        return render(head, data)

    def plainview(self, head, data):
        """
        DAS plain view data representation; `head` and `data` are handed
        over unchanged to self.repmgr.plainview and its result is returned.
        """
        render = self.repmgr.plainview
        return render(head, data)

    def xmlview(self, head, data):
        """
        DAS XML data representation; `head` and `data` are handed over
        unchanged to self.repmgr.xmlview and its result is returned.
        """
        render = self.repmgr.xmlview
        return render(head, data)

    def jsonview(self, head, data):
        """
        DAS JSON data representation; `head` and `data` are handed over
        unchanged to self.repmgr.jsonview and its result is returned.
        """
        render = self.repmgr.jsonview
        return render(head, data)

    @exposedasjson
    @checkargs(['query', 'dbs_instance'])
    def autocomplete(self, **kwargs):
        """
        Provides autocomplete functionality for DAS web UI.

        The stripped 'query' argument is passed to autocomplete_helper.
        When self.dataset_daemon is set and at least one suggestion
        contains 'dataset=', and exactly one key of self.dbsmgr contains
        the requested 'dbs_instance' (default self.dbs_global), the
        entries found by that manager for the query (with 'dataset='
        removed) are appended as additional dataset suggestions.
        Returns the list of suggestions.
        """
        query = kwargs.get("query", "").strip()
        result = autocomplete_helper(query, self.dasmgr, self.daskeys)
        dataset_hits = [r for r in result if 'dataset=' in r['value']]
        dbsinst = kwargs.get('dbs_instance', self.dbs_global)
        if not (self.dataset_daemon and dataset_hits):
            return result
        dbs_urls = [url for url in self.dbsmgr.keys()
                    if url.find(dbsinst) != -1]
        if len(dbs_urls) != 1:
            return result
        dbsmgr = self.dbsmgr[dbs_urls[0]]
        pattern = query.replace('dataset=', '')
        for name in dbsmgr.find(pattern):
            result.append({'css': 'ac-info',
                           'value': 'dataset=%s' % name,
                           'info': 'dataset'})
        return result