Example #1
    def __init__(self, name, config):
        self.name = name
        try:
            self.verbose = config['verbose']
            title = 'DASAbstactService_%s' % self.name
            self.logger = PrintManager(title, self.verbose)
            self.dasmapping = config['dasmapping']
            self.write2cache = config.get('write_cache', True)
            self.multitask = config['das'].get('multitask', True)
            self.error_expire = config['das'].get('error_expire', 300)
            self.dbs_global = None  # to be configured at run time
            self.dburi = config['mongodb']['dburi']
            engine = config.get('engine', None)
            self.gfs = db_gridfs(self.dburi)
        except Exception as exc:
            print_exc(exc)
            raise Exception('fail to parse DAS config')

        # read key/cert info
        try:
            self.ckey, self.cert = get_key_cert()
        except Exception as exc:
            print_exc(exc)
            self.ckey = None
            self.cert = None

        if self.multitask:
            nworkers = config['das'].get('api_workers', 3)
            thr_weights = config['das'].get('thread_weights', [])
            for system_weight in thr_weights:
                system, weight = system_weight.split(':')
                if system == self.name:
                    nworkers *= int(weight)
#             if  engine:
#                 thr_name = 'DASAbstractService:%s:PluginTaskManager' % self.name
#                 self.taskmgr = PluginTaskManager(\
#                         engine, nworkers=nworkers, name=thr_name)
#                 self.taskmgr.subscribe()
#             else:
#                 thr_name = 'DASAbstractService:%s:TaskManager' % self.name
#                 self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
            thr_name = 'DASAbstractService:%s:TaskManager' % self.name
            self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        self.map = {}  # to be defined by data-service implementation
        self._keys = None  # to be defined at run-time in self.keys
        self._params = None  # to be defined at run-time in self.parameters
        self._notations = {}  # to be defined at run-time in self.notations

        self.logger.info('initialized')
        # define internal cache manager to put 'raw' results into cache
        if 'rawcache' in config and config['rawcache']:
            self.localcache = config['rawcache']
        else:
            msg = 'Undefined rawcache, please check your configuration'
            raise Exception(msg)
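
For reference, a minimal sketch of the configuration this constructor expects; the key names follow the lookups above, while all values are illustrative placeholders:

    config = {
        'verbose': 0,
        'dasmapping': None,          # a DASMapping instance in real use
        'write_cache': True,
        'das': {'multitask': True, 'error_expire': 300,
                'api_workers': 3, 'thread_weights': []},
        'mongodb': {'dburi': 'mongodb://localhost:8230'},  # placeholder URI
        'rawcache': None,            # raw-cache manager; must be non-empty
    }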
Example #2
 def test_error(self):
     "Test logger error method"
     old_stdout = sys.stdout
     logger = PrintManager(self.name)  # verbose is irrelevant
     sys.stdout = StringIO.StringIO()
     logger.error('test')
     result = sys.stdout.getvalue()
     expect = 'ERROR %s:%s test\n' % (self.name, funcname())
     self.assertEqual(expect, result)
     sys.stdout = old_stdout
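
The same stdout-capture idea in a self-contained Python 3 form, using io.StringIO plus contextlib.redirect_stdout instead of swapping sys.stdout by hand; the printed line only stands in for the PrintManager output:

    import io
    from contextlib import redirect_stdout

    buf = io.StringIO()
    with redirect_stdout(buf):
        print('ERROR name:func test')   # stand-in for logger.error('test')
    assert buf.getvalue() == 'ERROR name:func test\n'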
Example #3
File: logger_t.py Project: ktf/DAS
 def test_info(self):
     "Test logger info method"
     old_stdout = sys.stdout
     logger = PrintManager(self.name, verbose=1)
     sys.stdout = StringIO()
     logger.info('test')
     result = sys.stdout.getvalue()
     expect = 'INFO %s:%s test\n' % (self.name, funcname())
     self.assertEqual(expect, result)
     sys.stdout = old_stdout
Example #4
File: logger_t.py Project: ktf/DAS
 def test_debug(self):
     "Test logger debug method"
     old_stdout = sys.stdout
     logger = PrintManager(self.name, verbose=2)
     sys.stdout = StringIO()
     logger.debug('test')
     result = sys.stdout.getvalue()
     expect = 'DEBUG %s:%s test\n' % (self.name, funcname())
     self.assertEqual(expect, result)
     sys.stdout = old_stdout
Example #5
File: logger_t.py Project: ktf/DAS
 def test_error(self):
     "Test logger error method"
     old_stdout = sys.stdout
     logger = PrintManager(self.name) # verbose is irrelevant
     sys.stdout = StringIO()
     logger.error('test')
     result = sys.stdout.getvalue()
     expect = 'ERROR %s:%s test\n' % (self.name, funcname())
     self.assertEqual(expect, result)
     sys.stdout = old_stdout
Example #6
 def test_info(self):
     "Test logger info method"
     old_stdout = sys.stdout
     logger = PrintManager(self.name, verbose=1)
     sys.stdout = StringIO.StringIO()
     logger.info('test')
     result = sys.stdout.getvalue()
     expect = 'INFO %s:%s test\n' % (self.name, funcname())
     self.assertEqual(expect, result)
     sys.stdout = old_stdout
Example #7
 def test_debug(self):
     "Test logger debug method"
     old_stdout = sys.stdout
     logger = PrintManager(self.name, verbose=2)
     sys.stdout = StringIO.StringIO()
     logger.debug('test')
     result = sys.stdout.getvalue()
     expect = 'DEBUG %s:%s test\n' % (self.name, funcname())
     self.assertEqual(expect, result)
     sys.stdout = old_stdout
Example #8
File: logger_t.py Project: ktf/DAS
 def test_warning(self):
     "Test logger warning method"
     old_stdout = sys.stdout
     logger = PrintManager(self.name) # verbose is irrelevant
     sys.stdout = StringIO()
     logger.warning('test')
     result = sys.stdout.getvalue()
     expect = 'WARNING %s:%s test\n' % (self.name, funcname())
     self.assertEqual(expect, result)
     sys.stdout = old_stdout
Example #9
 def test_warning(self):
     "Test logger warning method"
     old_stdout = sys.stdout
     logger = PrintManager(self.name)  # verbose is irrelevant
     sys.stdout = StringIO.StringIO()
     logger.warning('test')
     result = sys.stdout.getvalue()
     expect = 'WARNING %s:%s test\n' % (self.name, funcname())
     self.assertEqual(expect, result)
     sys.stdout = old_stdout
Example #10
 def __init__(self, config):
     self.verbose = config['verbose']
     self.logger = PrintManager('DASParserDB', self.verbose)
     self.dburi = config['mongodb']['dburi']
     self.dbname = config['parserdb']['dbname']
     self.sizecap = config['parserdb'].get('sizecap', 5 * 1024 * 1024)
     self.colname = config['parserdb']['collname']
     msg = "DASParserCache::__init__ %s@%s" % (self.dburi, self.dbname)
     self.logger.info(msg)
     self.create_db()
Example #11
class QueryRunner(object):
    "Replaces das_robot"
    task_options = [{'name':'query', 'type':'string', 'default':None,
                   'help':'Query to issue using das_core::call'}]
    def __init__(self, **kwargs):
        self.logger = PrintManager('QueryRunner', kwargs.get('verbose', 0))
        self.das = kwargs['DAS']
        self.dasquery = DASQuery(kwargs['dasquery'])
    def __call__(self):
        "__call__ implementation"
        self.logger.info("Issuing query %s" % self.dasquery)
        result = self.das.call(self.dasquery, add_to_analytics=False)
        return {'result':result}
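
A hypothetical usage sketch of the class above; das_core stands for an already initialized DAS core object providing the call() method assumed by QueryRunner, and the query string is illustrative:

    runner = QueryRunner(DAS=das_core,               # hypothetical DAS core instance
                         dasquery='dataset=/A/B/C',  # hypothetical query string
                         verbose=1)
    output = runner()      # issues the query via das_core.call(...)
    print(output['result'])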
Example #12
    def __init__(self, name, config):
        self.name = name
        try:
            self.verbose      = config['verbose']
            title             = 'DASAbstactService_%s' % self.name
            self.logger       = PrintManager(title, self.verbose)
            self.dasmapping   = config['dasmapping']
            self.analytics    = config['dasanalytics']
            self.write2cache  = config.get('write_cache', True)
            self.multitask    = config['das'].get('multitask', True)
            self.error_expire = config['das'].get('error_expire', 300) 
            if  config.has_key('dbs'):
                self.dbs_global = config['dbs'].get('dbs_global_instance', None)
            else:
                self.dbs_global = None
            dburi             = config['mongodb']['dburi']
            engine            = config.get('engine', None)
            self.gfs          = db_gridfs(dburi)
        except Exception as exc:
            print_exc(exc)
            raise Exception('fail to parse DAS config')

        # read key/cert info
        try:
            self.ckey, self.cert = get_key_cert()
        except Exception as exc:
            print_exc(exc)
            self.ckey = None
            self.cert = None

        if  self.multitask:
            nworkers = config['das'].get('api_workers', 3)
            thr_weights = config['das'].get('thread_weights', [])
            for system_weight in thr_weights:
                system, weight = system_weight.split(':')
                if  system == self.name:
                    nworkers *= int(weight)
            if  engine:
                thr_name = 'DASAbstractService:%s:PluginTaskManager' % self.name
                self.taskmgr = PluginTaskManager(\
                        engine, nworkers=nworkers, name=thr_name)
                self.taskmgr.subscribe()
            else:
                thr_name = 'DASAbstractService:%s:TaskManager' % self.name
                self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        self.map        = {}   # to be defined by data-service implementation
        self._keys      = None # to be defined at run-time in self.keys
        self._params    = None # to be defined at run-time in self.parameters
        self._notations = {}   # to be defined at run-time in self.notations

        self.logger.info('initialized')
        # define internal cache manager to put 'raw' results into cache
        if  config.has_key('rawcache') and config['rawcache']:
            self.localcache   = config['rawcache']
        else:
            msg = 'Undefined rawcache, please check your configuration'
            raise Exception(msg)
Example #13
    def __init__(self, config=None):
        if  not config:
            config = das_readconfig()
        if  not config.has_key('dasmapping'):
            config['dasmapping'] = DASMapping(config)
        if  not config.has_key('dasanalytics'):
            config['dasanalytics'] = DASAnalytics(config)
        if  not config['dasmapping'].check_maps():
            msg = "No DAS maps found in MappingDB"
            raise Exception(msg)
        self.map         = config['dasmapping']
        self.analytics   = config['dasanalytics']
        self.dasservices = config['services']
        self.daskeysmap  = self.map.daskeys()
        self.operators   = list(das_operators())
        self.daskeys     = list(das_special_keys())
        self.verbose     = config['verbose']
        self.logger      = PrintManager('QLManger', self.verbose)
        for val in self.daskeysmap.values():
            for item in val:
                self.daskeys.append(item)
        parserdir   = config['das']['parserdir']
        self.dasply = DASPLY(parserdir, self.daskeys, self.dasservices, 
                verbose=self.verbose)

        self.enabledb = config['parserdb']['enable']
        if  self.enabledb:
            self.parserdb = DASParserDB(config)
Example #14
    def __init__(self, **kwargs):
        self.key = kwargs['key']
        self.logger = PrintManager('ValueHotspot', kwargs.get('verbose', 0))
        self.allow_wildcarding = kwargs.get('allow_wildcarding', False)
        self.find_supersets = kwargs.get('find_supersets', False)
        self.preempt = int(kwargs.get('preempt', 60))
        self.fields = kwargs.get('fields', None)
        self.instance = kwargs.get('instance', 'cms_dbs_prod_global')
        
        HotspotBase.__init__(self,
                             identifier="valuehotspot-%s" % \
                             (self.key.replace('.','-')),
                             **kwargs)
        
        # set fields if look-up key is present
        if  not self.fields and self.key:
            self.fields = [self.key.split('.')[0]]

        # finally if fields is not yet set, look-up all DAS keys allowed
        # for given query
        if  not self.fields:
            try:
                self.fields = set()
                self.das.mapping.init_presentationcache()
                plist = self.das.mapping.presentation(self.key.split('.', 1)[0])
                for item in plist:
                    if 'link' in item:
                        for link in item['link']:
                            if len(link['query'].split(' ')) == 2:
                                self.fields.add(link['query'].split(' ')[0])
                self.fields.add(self.key.split('.', 1)[0])
                self.fields = list(self.fields)
            except:
                self.fields = []
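
In the simplest branch above the look-up fields reduce to the first component of the key; a one-line sketch with a hypothetical key value:

    key = 'dataset.name'
    fields = [key.split('.')[0]]    # -> ['dataset']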
Example #15
    def __new__(cls, config):
        """
        Create a new instance of the class and cache it, or return the existing
        instance if one exists (only when the params match).

        Only the last instance is cached, but this simplifies the implementation
        since the param 'config' might be a complex, unhashable object.
        """
        # check if we can reuse an existing instance
        if cls.__cached_inst and cls.__cached_params == config:
            if  config['verbose']:
                print("DASMapping::__new__: returning a cached instance")
            return cls.__cached_inst

        # otherwise create and initialize a new instance
        if  config['verbose']:
            print("DASMapping::__new__: creating a new instance")
        self = object.__new__(cls)

        self.verbose  = config['verbose']
        self.logger   = PrintManager('DASMapping', self.verbose)
        self.services = config['services']
        self.dburi    = config['mongodb']['dburi']
        self.dbname   = config['mappingdb']['dbname']
        self.colname  = config['mappingdb']['collname']
        self.map_test = config.get('map_test', True)
        self.main_dbs = config['das'].get('main_dbs', 'dbs3')
        self.dbsinsts = config['das'].get('dbs_instances', [])

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        self.das_son_manipulator = DAS_SONManipulator()
        index = [('type', DESCENDING),\
                 ('system', DESCENDING),\
                 ('urn', DESCENDING),\
                 ('das_map.das_key', DESCENDING),\
                 ('das_map.rec_key', DESCENDING),\
                 ('das_map.api_arg', DESCENDING),\
                 ]
        create_indexes(self.col, index)

        self.daskeyscache = {}         # to be filled at run time
        self.systems = []              # to be filled at run time
        self.dasmapscache = {}         # to be filled at run time
        self.keymap = {}               # to be filled at run time
        self.presentationcache = {}    # to be filled at run time
        self.reverse_presentation = {} # to be filled at run time
        self.notationcache = {}        # to be filled at run time
        self.diffkeycache = {}         # to be filled at run time
        self.apicache = {}             # to be filled at run time
        self.dbs_global_url = None     # to be determined at run time
        self.dbs_inst_names = None     # to be determined at run time
        self.load_maps()

        # cache the instance and return it
        DASMapping.__cached_inst = self
        DASMapping.__cached_params = config
        return self
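
A stripped-down, runnable sketch of the same cache-the-last-instance pattern (not the DAS implementation itself); the config comparison uses equality because the dict may be unhashable:

    class CachedByConfig(object):
        "Cache the most recently created instance, keyed on its config."
        _cached_inst = None
        _cached_params = None

        def __new__(cls, config):
            if cls._cached_inst is not None and cls._cached_params == config:
                return cls._cached_inst      # reuse the previous instance
            self = object.__new__(cls)
            self.config = config
            cls._cached_inst = self
            cls._cached_params = config
            return self

    assert CachedByConfig({'verbose': 0}) is CachedByConfig({'verbose': 0})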
Example #16
    def __init__(self, config):
        self.verbose = config['verbose']
        self.logger = PrintManager('DASKeyLearning', self.verbose)
        self.services = config['services']
        self.dburi = config['mongodb']['dburi']
        self.dbname = config['keylearningdb']['dbname']
        self.colname = config['keylearningdb']['collname']

        self.mapping = config['dasmapping']

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        self.das_son_manipulator = DAS_SONManipulator()
        index_list = [('system', ASCENDING), ('urn', ASCENDING), \
                ('members', ASCENDING), ('stems', ASCENDING)]
        create_indexes(self.col, index_list)
Example #17
 def setUp(self):
     """
     set up DAS core module
     """
     debug = 0
     config = deepcopy(das_readconfig())
     logger = PrintManager('TestDASCache', verbose=debug)
     config['logger'] = logger
     config['verbose'] = debug
Example #18
 def __init__(self, **kwargs):
     self.logger = PrintManager('HotspotBase', kwargs.get('verbose', 0))
     self.das = kwargs['DAS']
     self.fraction = float(kwargs.get('fraction', 0.15))
     self.mode = kwargs.get('mode','calls').lower()
     self.period = int(kwargs.get('period', 86400*30))
     self.interval = kwargs['interval']
     self.allowed_gap = int(kwargs.get('allowed_gap', 3600))
     self.identifier = kwargs['identifier']
Example #19
 def __init__(self, config):
     self.verbose = config['verbose']
     self.logger  = PrintManager('DASAnalytics', self.verbose)
     self.dburi   = config['mongodb']['dburi']
     self.dbname  = config['analyticsdb']['dbname']        
     self.colname = config['analyticsdb']['collname']
     self.history = config['analyticsdb']['history']
     msg = "%s@%s" % (self.dburi, self.dbname)
     self.logger.info(msg)
     self.create_db()
Example #20
 def __init__(self, config):
     self.verbose  = config['verbose']
     self.logger   = PrintManager('DASParserDB', self.verbose)
     self.dburi    = config['mongodb']['dburi']
     self.dbname   = config['parserdb']['dbname']
     self.sizecap  = config['parserdb'].get('sizecap', 5*1024*1024)
     self.colname  = config['parserdb']['collname']
     msg = "DASParserCache::__init__ %s@%s" % (self.dburi, self.dbname)
     self.logger.info(msg)
     self.create_db()
Example #21
    def __init__(self, name, config):
        self.name = name
        try:
            self.verbose = config["verbose"]
            title = "DASAbstactService_%s" % self.name
            self.logger = PrintManager(title, self.verbose)
            self.dasmapping = config["dasmapping"]
            self.write2cache = config.get("write_cache", True)
            self.multitask = config["das"].get("multitask", True)
            self.error_expire = config["das"].get("error_expire", 300)
            self.dbs_global = None  # to be configured at run time
            self.dburi = config["mongodb"]["dburi"]
            engine = config.get("engine", None)
            self.gfs = db_gridfs(self.dburi)
        except Exception as exc:
            print_exc(exc)
            raise Exception("fail to parse DAS config")

        # read key/cert info
        try:
            self.ckey, self.cert = get_key_cert()
        except Exception as exc:
            print_exc(exc)
            self.ckey = None
            self.cert = None

        if self.multitask:
            nworkers = config["das"].get("api_workers", 3)
            thr_weights = config["das"].get("thread_weights", [])
            for system_weight in thr_weights:
                system, weight = system_weight.split(":")
                if system == self.name:
                    nworkers *= int(weight)
            if engine:
                thr_name = "DASAbstractService:%s:PluginTaskManager" % self.name
                self.taskmgr = PluginTaskManager(engine, nworkers=nworkers, name=thr_name)
                self.taskmgr.subscribe()
            else:
                thr_name = "DASAbstractService:%s:TaskManager" % self.name
                self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        self.map = {}  # to be defined by data-service implementation
        self._keys = None  # to be defined at run-time in self.keys
        self._params = None  # to be defined at run-time in self.parameters
        self._notations = {}  # to be defined at run-time in self.notations

        self.logger.info("initialized")
        # define internal cache manager to put 'raw' results into cache
        if "rawcache" in config and config["rawcache"]:
            self.localcache = config["rawcache"]
        else:
            msg = "Undefined rawcache, please check your configuration"
            raise Exception(msg)
Example #22
    def __init__(self, config):
        self.verbose  = config['verbose']
        self.logger   = PrintManager('DASKeyLearning', self.verbose)
        self.services = config['services']
        self.dburi    = config['mongodb']['dburi']
        self.dbname   = config['keylearningdb']['dbname']
        self.colname  = config['keylearningdb']['collname']
        
        self.mapping  = config['dasmapping']

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)
        
        self.col = None
        self.create_db()
Example #23
    def setUp(self):
        """
        set up DAS core module
        """
        debug = 0
        config = deepcopy(das_readconfig())
        logger = PrintManager('TestDASMongocache', verbose=debug)
        config['logger'] = logger
        config['verbose'] = debug
        dburi = config['mongodb']['dburi']

        connection = MongoClient(dburi)
        connection.drop_database('das')
        dasmapping = DASMapping(config)
        config['dasmapping'] = dasmapping
        self.dasmongocache = DASMongocache(config)
Example #24
 def __init__(self, config=None):
     if not config:
         config = das_readconfig()
     self.dasmapping = DASMapping(config)
     if not self.dasmapping.check_maps():
         msg = "No DAS maps found in MappingDB"
         raise Exception(msg)
     self.dasservices = config['services']
     self.daskeysmap = self.dasmapping.daskeys()
     self.operators = list(das_operators())
     self.daskeys = list(das_special_keys())
     self.verbose = config['verbose']
     self.logger = PrintManager('QLManger', self.verbose)
     for val in self.daskeysmap.values():
         for item in val:
             self.daskeys.append(item)
Example #25
    def __init__(self, config):
        self.verbose  = config['verbose']
        self.logger   = PrintManager('DASKeyLearning', self.verbose)
        self.services = config['services']
        self.dburi    = config['mongodb']['dburi']
        self.dbname   = config['keylearningdb']['dbname']
        self.colname  = config['keylearningdb']['collname']

        self.mapping  = config['dasmapping']

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        self.das_son_manipulator = DAS_SONManipulator()
        index_list = [('system', ASCENDING), ('urn', ASCENDING), \
                ('members', ASCENDING), ('stems', ASCENDING)]
        create_indexes(self.col, index_list)
Example #26
    def __init__(self, config):
        self.verbose = config["verbose"]
        self.logger = PrintManager("DASMapping", self.verbose)
        self.services = config["services"]
        self.dburi = config["mongodb"]["dburi"]
        self.dbname = config["mappingdb"]["dbname"]
        self.colname = config["mappingdb"]["collname"]
        self.map_test = config.get("map_test", True)
        self.main_dbs = config["das"].get("main_dbs", "dbs")
        self.dbsinsts = config["das"].get("dbs_instances", [])

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        self.init()
        self.on_reload = Event()

        # Monitoring thread which performs auto-reconnection to MongoDB
        thname = "mappingdb_monitor"
        sleep = 5
        reload_time = config["mappingdb"].get("reload_time", 86400)
        reload_time_bad_maps = config["mappingdb"].get("reload_time_bad_maps", 120)
        start_new_thread(
            thname,
            db_monitor,
            (self.dburi, self.init, sleep, self.load_maps, reload_time, self.check_maps, reload_time_bad_maps),
        )

        self.daskeyscache = {}  # to be filled at run time
        self.systems = []  # to be filled at run time
        self.dasmapscache = {}  # to be filled at run time
        self.keymap = {}  # to be filled at run time
        self.presentationcache = {}  # to be filled at run time
        self.reverse_presentation = {}  # to be filled at run time
        self.notationcache = {}  # to be filled at run time
        self.diffkeycache = {}  # to be filled at run time
        self.apicache = {}  # to be filled at run time
        self.dbs_global_url = None  # to be determined at run time
        self.dbs_inst_names = None  # to be determined at run time
        self.load_maps(notify=False)
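
A generic sketch of the periodic-monitor idea behind the db_monitor thread started above; names and intervals are illustrative, this is not the DAS helper itself:

    import threading
    import time

    def start_monitor(check, interval=5, name='mappingdb_monitor'):
        "Run check() every `interval` seconds in a daemon thread."
        def loop():
            while True:
                check()
                time.sleep(interval)
        thr = threading.Thread(target=loop, name=name, daemon=True)
        thr.start()
        return thr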
Example #27
    def __init__(self, config):
        self.verbose = config["verbose"]
        self.logger = PrintManager("DASMapping", self.verbose)
        self.services = config["services"]
        self.dburi = config["mongodb"]["dburi"]
        self.dbname = config["mappingdb"]["dbname"]
        self.colname = config["mappingdb"]["collname"]

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        self.create_db()

        self.keymap = {}  # to be filled at run time
        self.presentationcache = {}  # to be filled at run time
        self.reverse_presentation = {}  # to be filled at run time
        self.notationcache = {}  # to be filled at run time
        self.diffkeycache = {}  # to be filled at run time
        self.apicache = {}  # to be filled at run time
        self.apiinfocache = {}  # to be filled at run time
        self.init_notationcache()
        self.init_presentationcache()
Example #28
    def __init__(self, config):
        self.emptyset_expire = expire_timestamp(\
            config['das'].get('emptyset_expire', 5))
        self.dburi   = config['mongodb']['dburi']
        self.cache_size = config['mongodb']['bulkupdate_size']
        self.dbname  = config['dasdb']['dbname']
        self.verbose = config['verbose']
        self.logger  = PrintManager('DASMongocache', self.verbose)
        self.mapping = config['dasmapping']

        self.conn    = db_connection(self.dburi)
        self.mdb     = self.conn[self.dbname]
        self.col     = self.mdb[config['dasdb']['cachecollection']]
        self.mrcol   = self.mdb[config['dasdb']['mrcollection']]
        self.merge   = self.mdb[config['dasdb']['mergecollection']]
        self.gfs     = db_gridfs(self.dburi)

        self.logdb   = DASLogdb(config)

        self.das_internal_keys = ['das_id', 'das', 'cache_id', 'qhash']

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        self.add_manipulator()

        # ensure that we have the following indexes
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('das.system', ASCENDING),
                      ('qhash', DESCENDING),
                      ('das.empty_record', ASCENDING)]
        create_indexes(self.col, index_list)
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('qhash', DESCENDING),
                      ('das.empty_record', ASCENDING), ('das.ts', ASCENDING)]
        create_indexes(self.merge, index_list)
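
The create_indexes calls above go through a DAS helper; with plain pymongo an equivalent compound index could be created roughly like this (connection URI, database and collection names are placeholders):

    from pymongo import MongoClient, ASCENDING, DESCENDING

    coll = MongoClient('mongodb://localhost:8230')['das']['cache']
    coll.create_index([('das.expire', ASCENDING), ('das_id', ASCENDING),
                       ('qhash', DESCENDING)])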
Example #29
class DASAbstractService(object):
    """
    Abstract class describing a DAS service. It is initialized with a name
    which is used to identify service parameters in the DAS configuration
    file. Those parameters are keys, the verbosity level and the URL of the
    data-service.
    """
    def __init__(self, name, config):
        self.name = name
        try:
            self.verbose = config['verbose']
            title = 'DASAbstactService_%s' % self.name
            self.logger = PrintManager(title, self.verbose)
            self.dasmapping = config['dasmapping']
            self.write2cache = config.get('write_cache', True)
            self.multitask = config['das'].get('multitask', True)
            self.error_expire = config['das'].get('error_expire', 300)
            self.dbs_global = None  # to be configured at run time
            self.dburi = config['mongodb']['dburi']
            engine = config.get('engine', None)
            self.gfs = db_gridfs(self.dburi)
        except Exception as exc:
            print_exc(exc)
            raise Exception('fail to parse DAS config')

        # read key/cert info
        try:
            self.ckey, self.cert = get_key_cert()
        except Exception as exc:
            print_exc(exc)
            self.ckey = None
            self.cert = None

        if self.multitask:
            nworkers = config['das'].get('api_workers', 3)
            thr_weights = config['das'].get('thread_weights', [])
            for system_weight in thr_weights:
                system, weight = system_weight.split(':')
                if system == self.name:
                    nworkers *= int(weight)
#             if  engine:
#                 thr_name = 'DASAbstractService:%s:PluginTaskManager' % self.name
#                 self.taskmgr = PluginTaskManager(\
#                         engine, nworkers=nworkers, name=thr_name)
#                 self.taskmgr.subscribe()
#             else:
#                 thr_name = 'DASAbstractService:%s:TaskManager' % self.name
#                 self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
            thr_name = 'DASAbstractService:%s:TaskManager' % self.name
            self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        self.map = {}  # to be defined by data-service implementation
        self._keys = None  # to be defined at run-time in self.keys
        self._params = None  # to be defined at run-time in self.parameters
        self._notations = {}  # to be defined at run-time in self.notations

        self.logger.info('initialized')
        # define internal cache manager to put 'raw' results into cache
        if 'rawcache' in config and config['rawcache']:
            self.localcache = config['rawcache']
        else:
            msg = 'Undefined rawcache, please check your configuration'
            raise Exception(msg)

    def status(self):
        "Return status of the service"
        return self.taskmgr.status()

    def services(self):
        """
        Return sub-systems used to retrieve data records. It is used in the
        dasheader call to set up the das.services field. This method can be
        overridden in sub-classes; otherwise it returns a dict of the service
        name and the CMS systems used to retrieve data records.
        """
        return {self.name: [self.name]}

    def version(self):
        """Return data-services version, should be implemented in sub-classes"""
        return ''

    def keys(self):
        """
        Return service keys
        """
        if self._keys:
            return self._keys
        srv_keys = []
        for _api, params in self.map.items():
            for key in params['keys']:
                if not key in srv_keys:
                    srv_keys.append(key)
        self._keys = srv_keys
        return srv_keys

    def parameters(self):
        """
        Return mapped service parameters
        """
        if self._params:
            return self._params
        srv_params = []
        for _api, params in self.map.items():
            for key in params['params']:
                param_list = self.dasmapping.api2das(self.name, key)
                for par in param_list:
                    if not par in srv_params:
                        srv_params.append(par)
        self._params = srv_params
        return srv_params

    def notations(self):
        """
        Return a map of system notations.
        """
        if self._notations:
            return self._notations
        for _, rows in self.dasmapping.notations(self.name).items():
            for row in rows:
                api = row['api']
                nmap = row['rec_key']
                notation = row['api_output']
                if api in self._notations:
                    self._notations[api].update({notation: nmap})
                else:
                    self._notations[api] = {notation: nmap}
        return self._notations

    def getdata(self, url, params, expire, headers=None, post=None):
        """URL call wrapper"""
        if url.find('https:') != -1:
            return getdata(url,
                           params,
                           headers,
                           expire,
                           post,
                           self.error_expire,
                           self.verbose,
                           self.ckey,
                           self.cert,
                           system=self.name)
        else:
            return getdata(url,
                           params,
                           headers,
                           expire,
                           post,
                           self.error_expire,
                           self.verbose,
                           system=self.name)

    def call(self, dasquery):
        """
        Invoke service API to execute given query.
        Return results as a collect list set.
        """
        self.logger.info(dasquery)
        # check the cache for records with given query/system
        res = self.localcache.incache(dasquery,
                                      collection='cache',
                                      system=self.name)
        if res:
            msg = "found records in local cache"
            self.logger.info(msg)
            return
        # ask data-service api to get results; they'll be stored in
        # cache, so at the end we return what we have in cache.
        self.api(dasquery)

    def write_to_cache(self, dasquery, expire, url, api, args, gen, ctime):
        """
        Write provided result set into DAS cache.
        """
        if not self.write2cache:
            return

        # before going to cache we should check/set possible misses, e.g.
        # primary key when error is thrown
        result = self.set_misses(dasquery, api, gen)

        # update the cache
        header = dasheader(self.name,
                           dasquery,
                           expire,
                           api,
                           url,
                           services=self.services())
        header['lookup_keys'] = self.lookup_keys(api)
        header['prim_key'] = self.dasmapping.primary_mapkey(self.name, api)
        header['ctime'] = ctime
        system = self.name
        self.localcache.update_cache(dasquery, result, header, system, api)

        msg = 'cache has been updated,\n'
        self.logger.debug(msg)

    def adjust_params(self, api, kwds, instance=None):
        """
        Data-service specific parser to adjust parameters according to
        its specifications. For example, the DQ service accepts a string
        of parameters rather than a parameter set, while DBS2 can reuse
        some parameters for different APIs, e.g. a dataset path can be
        passed to listPrimaryDatasets as a primary_dataset pattern.
        """
        pass

    def lookup_keys(self, api):
        """
        Return look-up keys of data output for given data-service API.
        """
        lkeys = self.dasmapping.lookup_keys(self.name, api)
        return [{api: lkeys}]

    def inspect_params(self, api, args):
        """
        Perform API parameter inspection. Check if the API accepts a range
        of parameters, etc.
        """
        for key, value in args.items():
            if isinstance(value, dict):
                minval = None
                maxval = None
                for oper, val in value.items():
                    if oper == '$in':
                        minval = int(val[0])
                        maxval = int(val[-1])
                        args[key] = range(minval, maxval)
                    elif oper == '$lt':
                        maxval = int(val)
                        args[key] = maxval
                    elif oper == '$lte':
                        maxval = int(val)
                        args[key] = maxval
                    elif oper == '$gt':
                        minval = int(val)
                        args[key] = minval
                    elif oper == '$gte':
                        minval = int(val)
                        args[key] = minval
                    else:
                        msg = '%s does not support operator %s' % (api, oper)
                        raise Exception(msg)
        return args

    def get_notations(self, api):
        """Return notations used for given API"""
        notationmap = self.notations()
        if not notationmap:
            return {}
        notations = {}
        if '' in notationmap:
            notations = dict(notationmap[''])  # notations applied to all APIs
            if api in notationmap:  # overwrite the one for provided API
                notations.update(notationmap[api])
        return notations

    def parser(self, dasquery, dformat, data, api):
        """
        DAS data parser. Input parameters:

        - *query* input DAS query
        - *dformat* is a data format, e.g. XML, JSON
        - *data* is a data source, either file-like object or
          actual data
        - *api* is API name
        """
        prim_key = self.dasmapping.primary_key(self.name, api)
        counter = 0
        if dformat.lower() == 'xml':
            tags = self.dasmapping.api2daskey(self.name, api)
            gen = xml_parser(data, prim_key, tags)
            for row in gen:
                counter += 1
                yield row
        elif dformat.lower() == 'json' or dformat.lower() == 'dasjson':
            gen = json_parser(data, self.logger)
            das_dict = {}
            for row in gen:
                if dformat.lower() == 'dasjson':
                    for key, val in row.items():
                        if key != 'results':
                            das_dict[key] = val
                    row = row['results']
                if isinstance(row, list):
                    for item in row:
                        if item:
                            if prim_key in item:
                                counter += 1
                                yield item
                            else:
                                counter += 1
                                yield {prim_key: item}
                else:
                    if prim_key in row:
                        counter += 1
                        yield row
                    else:
                        counter += 1
                        yield {prim_key: row}
        else:
            msg = 'Unsupported data format="%s", API="%s"' % (dformat, api)
            raise Exception(msg)
        msg = "api=%s, format=%s " % (api, dformat)
        msg += "prim_key=%s yield %s rows" % (prim_key, counter)
        self.logger.info(msg)

    def translator(self, api, genrows):
        """
        Convert raw results into DAS records. 
        """
        prim_key = self.dasmapping.primary_key(self.name, api)
        count = 0
        for row in genrows:
            row2das(self.dasmapping.notation2das, self.name, api, row)
            count += 1
            # check for primary key existence, since it can be overridden
            # by row2das. For example, DBS3 uses a flat namespace, so we
            # override dataset=>name, while dataset is still the primary key
            if isinstance(row, list):
                yield {prim_key: row}
            elif prim_key in row:
                if prim_key in row[prim_key]:
                    yield row[prim_key]  # remapping may create nested dict
                else:
                    yield row
            else:
                yield {prim_key: row}
        msg = "yield %s rows" % count
        self.logger.debug(msg)

    def set_misses(self, dasquery, api, genrows):
        """
        Check and adjust DAS records wrt the input query. If some of the DAS
        keys are missing, add them with their values to the DAS record.
        """
        # look-up primary key
        prim_key = self.dasmapping.primary_key(self.name, api)

        # Scan all docs and store those whose size is above the MongoDB limit
        # into GridFS
        map_key = self.dasmapping.primary_mapkey(self.name, api)
        genrows = parse2gridfs(self.gfs, map_key, genrows, self.logger)

        spec = dasquery.mongo_query['spec']
        row = next(genrows)
        ddict = DotDict(row)
        keys2adjust = []
        for key in spec.keys():
            val = ddict.get(key)
            if spec[key] != val and key not in keys2adjust:
                keys2adjust.append(key)
        msg = "adjust keys %s" % keys2adjust
        self.logger.debug(msg)
        count = 0
        if keys2adjust:
            # adjust the rows
            for row in yield_rows(row, genrows):
                ddict = DotDict(row)
                pval = ddict.get(map_key)
                if isinstance(pval, dict) and 'error' in pval:
                    ddict[map_key] = ''
                    ddict.update({prim_key: pval})
                for key in keys2adjust:
                    value = spec[key]
                    existing_value = ddict.get(key)
                    # the way to deal with proximity/pattern/condition results
                    if  (isinstance(value, str) or isinstance(value, unicode))\
                        and value.find('*') != -1: # we got pattern
                        if existing_value:
                            value = existing_value
                    elif isinstance(value, dict) or \
                        isinstance(value, list): # we got condition
                        if existing_value:
                            value = existing_value
                        elif isinstance(value, dict) and \
                        '$in' in value: # we got a range {'$in': []}
                            value = value['$in']
                        elif isinstance(value, dict) and \
                        '$lte' in value and '$gte' in value:
                            # we got a between range
                            value = [value['$gte'], value['$lte']]
                        else:
                            value = json.dumps(value)
                    elif existing_value and value != existing_value:
                        # we got proximity results
                        if 'proximity' in ddict:
                            proximity = DotDict({key: existing_value})
                            ddict['proximity'].update(proximity)
                        else:
                            proximity = DotDict({})
                            proximity[key] = existing_value
                            ddict['proximity'] = proximity
                    else:
                        if existing_value:
                            value = existing_value
                    ddict[key] = value
                yield ddict
                count += 1
        else:
            yield row
            for row in genrows:
                yield row
                count += 1
        msg = "yield %s rows" % count
        self.logger.debug(msg)

    def api(self, dasquery):
        """
        Data-service api method, which can be redefined by a data-service
        class. It parses the input query and invokes the appropriate
        data-service API calls. All results are stored in the DAS cache,
        along with the api call record inserted into the Analytics DB.
        """
        self.logger.info(dasquery)
        genrows = self.apimap(dasquery)
        if not genrows:
            return
        jobs = []
        for url, api, args, dformat, expire in genrows:
            # insert DAS query record for given API
            header = dasheader(self.name, dasquery, expire, api, url)
            self.localcache.insert_query_record(dasquery, header)
            # fetch DAS data records
            if self.multitask:
                jobs.append(self.taskmgr.spawn(self.apicall, \
                            dasquery, url, api, args, dformat, expire))
            else:
                self.apicall(dasquery, url, api, args, dformat, expire)
        if self.multitask:
            self.taskmgr.joinall(jobs)

    def apicall(self, dasquery, url, api, args, dformat, expire):
        """
        Data-service api method, which can be redefined by a data-service
        class. It parses the input query and invokes the appropriate
        data-service API call. All results are stored in the DAS cache,
        along with the api call record inserted into the Analytics DB.

        We explicitly invoke the close call for our datastream instead
        of using a context manager, since this method as well as
        getdata/parser can be overridden by child classes.
        """
        datastream = None
        try:
            args = self.inspect_params(api, args)
            time0 = time.time()
            headers = make_headers(dformat)
            datastream, expire = self.getdata(url, args, expire, headers)
            self.logger.info("%s expire %s" % (api, expire))
            rawrows = self.parser(dasquery, dformat, datastream, api)
            dasrows = self.translator(api, rawrows)
            ctime = time.time() - time0
            self.write_to_cache(dasquery, expire, url, api, args, dasrows,
                                ctime)
        except Exception as exc:
            msg  = 'Fail to process: url=%s, api=%s, args=%s' \
                    % (url, api, args)
            print(msg)
            print_exc(exc)
        close(datastream)

    def url_instance(self, url, _instance):
        """
        Virtual method to adjust URL for a given instance,
        must be implemented in service classes
        """
        return url

    def adjust_url(self, url, instance):
        """
        Adjust data-service URL wrt provided instance, e.g.
        DBS carry several instances
        """
        if instance:
            url = self.url_instance(url, instance)
        return url

    def apimap(self, dasquery):
        """
        Analyze input query and yield url, api, args, format, expire
        for further processing.
        """
        srv = self.name  # get local copy to avoid threading issues
        cond = getarg(dasquery.mongo_query, 'spec', {})
        instance = dasquery.mongo_query.get('instance', self.dbs_global)
        skeys = getarg(dasquery.mongo_query, 'fields', [])
        if not skeys:
            skeys = []
        self.logger.info("\n")
        for api, value in self.map.items():
            expire = value['expire']
            iformat = value['format']
            url = self.adjust_url(value['url'], instance)
            if not url:
                msg = '--- rejects API %s, no URL' % api
                self.logger.info(msg)
                continue
            args = dict(value['params'])  # make new copy, since we'll adjust
            wild = value.get('wild_card', '*')
            found = 0
            # check if input parameters are covered by API
            if not self.dasmapping.check_api_match(srv, api, cond):
                msg = '--- rejects API %s, does not cover input condition keys' \
                        % api
                self.logger.info(msg)
                continue
            # once we know that the API covers the input set of parameters we
            # check every input parameter for pattern matching
            for key, val in cond.items():
                # check if keys from conditions are accepted by API
                # need to convert key (which is daskeys.map) into
                # input api parameter
                for apiparam in self.dasmapping.das2api(srv, api, key, val):
                    if apiparam in args:
                        args[apiparam] = val
                        found += 1
            # VK 20160708: wrong statement, it caused the datasets API to pass
            # for a query like dataset in [path1, path2].
            # I'll leave this block here until I test and verify that the
            # commented-out block will not cause other issues.
            #
            # check the case when we only have single condition key
            # and it is the key we look-up
#             if  not found and skeys == [k.split('.')[0] for k in cond.keys()]:
#                 found = 1
            # check if the number of keys in cond and args is the same
            if len(cond.keys()) != found:
                msg = "--- reject API %s, not all condition keys are covered" \
                        % api
                self.logger.info(msg)
                msg = 'args=%s' % args
                self.logger.debug(msg)
                continue
            if not found:
                msg = "--- rejects API %s, parameters don't match" % api
                self.logger.info(msg)
                msg = 'args=%s' % args
                self.logger.debug(msg)
                continue
            self.adjust_params(api, args, instance)
            # delete args keys whose value is optional
            delete_keys(args, 'optional')
            # check that there is no "required" parameter left in args,
            # since such api will not work
            if 'required' in args.values():
                msg = '--- rejects API %s, parameter is required' % api
                self.logger.info(msg)
                msg = 'args=%s' % args
                self.logger.debug(msg)
                continue
            # adjust pattern symbols in arguments
            if wild != '*':
                for key, val in args.items():
                    if isinstance(val, str) or isinstance(val, unicode):
                        val = val.replace('*', wild)
                    args[key] = val

            # compare query selection keys with API look-up keys
            api_lkeys = self.dasmapping.api_lkeys(srv, api)
            if set(api_lkeys) != set(skeys):
                msg = "--- rejects API %s, api_lkeys(%s)!=skeys(%s)"\
                        % (api, api_lkeys, skeys)
                self.logger.info(msg)
                continue

            msg = '+++ %s passes API %s' % (srv, api)
            self.logger.info(msg)
            msg = 'args=%s' % args
            self.logger.debug(msg)

            msg = "yield "
            msg += "system ***%s***, url=%s, api=%s, args=%s, format=%s, " \
                % (srv, url, api, args, iformat)
            msg += "expire=%s, wild_card=%s" \
                % (expire, wild)
            self.logger.debug(msg)

            yield url, api, args, iformat, expire
Example #30
    def __init__(self, config):
        self.config = config
        self.emptyset_expire = \
                expire_timestamp(config['das'].get('emptyset_expire', 5))
        self.dburi = config['mongodb']['dburi']
        self.cache_size = config['mongodb']['bulkupdate_size']
        self.dbname = config['dasdb']['dbname']
        self.verbose = config['verbose']
        self.logger = PrintManager('DASMongocache', self.verbose)
        self.mapping = config['dasmapping']
        self.logging = config['dasdb'].get('logging', False)
        self.rec_ttl = config['dasdb'].get('record_ttl', 24 * 60 * 60)
        self.del_ttl = config['dasdb'].get('delta_ttl', 60)
        self.cleanup_del_ttl = config['dasdb'].get('cleanup_delta_ttl', 3600)
        self.retry = config['dasdb'].get('retry', 3)
        self.das_son_manipulator = DAS_SONManipulator()

        # Initialize MongoDB connection
        self.col_ = self.config['dasdb']['cachecollection']
        self.mrcol_ = self.config['dasdb']['mrcollection']
        self.merge_ = self.config['dasdb']['mergecollection']
        self.gfs = db_gridfs(self.dburi)

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        # ensure that we have the following indexes
        common_idx = [
            ('file.name', DESCENDING),
            ('dataset.name', DESCENDING),
            ('block.name', DESCENDING),
            ('run.run_number', DESCENDING),
        ]
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('das.system', ASCENDING), ('qhash', DESCENDING),
                      ('das.record', ASCENDING)]
        create_indexes(self.col, index_list + common_idx)
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('qhash', DESCENDING), ('das.record', ASCENDING),
                      ('das.ts', ASCENDING)]
        create_indexes(self.merge, index_list)
        # NOTE: I found that creating an index on the merge collection leads
        # to a MongoDB error when records contain multiple arrays on indexed
        # keys. For example, when we query file,run,lumi both file and run
        # are arrays in MongoDB. In this case the final sort in MongoDB
        # fails with the following message:
        #   cannot sort with keys that are parallel arrays
        # It looks like there is no fix for that yet, see
        # http://stackoverflow.com/questions/6516725/how-do-i-index-two-arrays-in-mongodb
        # Therefore I temporarily disabled the create_indexes call on the
        # merge collection, which was used to ease the final sort, especially
        # when a lot of records correspond to the initial query, e.g. file
        # records.
        # On the other hand, the most common use case where the sort fails is
        # getting file records, and I can add one compound key to ease the
        # sort, but I can't add another compound key on an array field, e.g.
        # run.
        common_idx = [[('qhash', DESCENDING), ('file.name', DESCENDING)]]
        create_indexes(self.merge, index_list + common_idx)

        # thread which clean-up DAS collections
        thname = 'mongocache_cleanup'
        cols = [
            config['dasdb']['cachecollection'],
            config['dasdb']['mrcollection'], config['dasdb']['mergecollection']
        ]
Example #31
 def __init__(self, **kwargs):
     self.logger = PrintManager('QueryRunner', kwargs.get('verbose', 0))
     self.das = kwargs['DAS']
     self.dasquery = DASQuery(kwargs['dasquery'])
Example #32
class DASMongocache(object):
    """
    DAS cache based MongoDB.
    """
    def __init__(self, config):
        self.config = config
        self.emptyset_expire = \
                expire_timestamp(config['das'].get('emptyset_expire', 5))
        self.dburi = config['mongodb']['dburi']
        self.cache_size = config['mongodb']['bulkupdate_size']
        self.dbname = config['dasdb']['dbname']
        self.verbose = config['verbose']
        self.logger = PrintManager('DASMongocache', self.verbose)
        self.mapping = config['dasmapping']
        self.logging = config['dasdb'].get('logging', False)
        self.rec_ttl = config['dasdb'].get('record_ttl', 24 * 60 * 60)
        self.del_ttl = config['dasdb'].get('delta_ttl', 60)
        self.cleanup_del_ttl = config['dasdb'].get('cleanup_delta_ttl', 3600)
        self.retry = config['dasdb'].get('retry', 3)
        self.das_son_manipulator = DAS_SONManipulator()

        # Initialize MongoDB connection
        self.col_ = self.config['dasdb']['cachecollection']
        self.mrcol_ = self.config['dasdb']['mrcollection']
        self.merge_ = self.config['dasdb']['mergecollection']
        self.gfs = db_gridfs(self.dburi)

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        # ensure that we have the following indexes
        common_idx = [
            ('file.name', DESCENDING),
            ('dataset.name', DESCENDING),
            ('block.name', DESCENDING),
            ('run.run_number', DESCENDING),
        ]
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('das.system', ASCENDING), ('qhash', DESCENDING),
                      ('das.record', ASCENDING)]
        create_indexes(self.col, index_list + common_idx)
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('qhash', DESCENDING), ('das.record', ASCENDING),
                      ('das.ts', ASCENDING)]
        create_indexes(self.merge, index_list)
        # NOTE: I found that creating an index on the merge collection leads
        # to a MongoDB error when records contain multiple arrays on indexed
        # keys. For example, when we query file,run,lumi both file and run
        # are arrays in MongoDB. In this case the final sort in MongoDB
        # fails with the following message:
        #   cannot sort with keys that are parallel arrays
        # It looks like there is no fix for that yet, see
        # http://stackoverflow.com/questions/6516725/how-do-i-index-two-arrays-in-mongodb
        # Therefore I temporarily disabled the create_indexes call on the
        # merge collection, which was used to ease the final sort, especially
        # when a lot of records correspond to the initial query, e.g. file
        # records.
        # On the other hand, the most common use case where the sort fails is
        # getting file records, and I can add one compound key to ease the
        # sort, but I can't add another compound key on an array field, e.g.
        # run.
        common_idx = [[('qhash', DESCENDING), ('file.name', DESCENDING)]]
        create_indexes(self.merge, index_list + common_idx)

        # thread which cleans up DAS collections
        thname = 'mongocache_cleanup'
        cols = [
            config['dasdb']['cachecollection'],
            config['dasdb']['mrcollection'], config['dasdb']['mergecollection']
        ]

    @property
    def col(self):
        "col property provides access to DAS cache collection"
        conn = db_connection(self.dburi)
        mdb = conn[self.dbname]
        colnames = mdb.collection_names()
        if not colnames or self.col_ not in colnames:
            try:
                mdb.create_collection(self.col_)
            except OperationFailure:
                pass
        mdb.add_son_manipulator(self.das_son_manipulator)
        return mdb[self.col_]

    @property
    def merge(self):
        "merge property provides access to DAS merge collection"
        conn = db_connection(self.dburi)
        mdb = conn[self.dbname]
        colnames = mdb.collection_names()
        if not colnames or self.merge_ not in colnames:
            try:
                mdb.create_collection(self.merge_)
            except OperationFailure:
                pass
        mdb.add_son_manipulator(self.das_son_manipulator)
        return mdb[self.merge_]

    @property
    def mrcol(self):
        "mrcol property provides access to DAS map-reduce collection"
        conn = db_connection(self.dburi)
        mdb = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        return mdb[self.mrcol_]

    def get_dataset_hashes(self, dasquery):
        "Get dataset hashes from DBS database"
        spec = dasquery.mongo_query.get('spec', {})
        inst = dasquery.instance
        conn = db_connection(self.dburi)
        if spec and inst:
            dataset = spec.get('dataset.name', None)
            if dataset:
                if dataset.find('*') != -1:
                    cond = {'dataset': re.compile(dataset.replace('*', '.*'))}
                else:
                    cond = {'dataset': dataset}
                for row in conn['dbs'][inst].find(cond):
                    if 'qhash' in row:
                        yield row['qhash']

    def check_datasets(self, dasquery):
        "Check dataset presence in DAS cache for given das query"
        hashes = [r for r in self.get_dataset_hashes(dasquery)]
        if hashes:
            spec = {'qhash': {'$in': hashes}}
            if len(hashes) == self.merge.find(spec, **PYMONGO_OPTS).count():
                dasquery._hashes = hashes

    def get_superset_keys(self, key, value):
        """
        This is a special-case version of similar_keys,
        intended for analysers that want to quickly
        find possible superset queries of a simple
        query of the form key=value.
        """

        msg = "%s=%s" % (key, value)
        self.logger.debug(msg)
        cond = {'query.spec.key': key}
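        # scan cached query records whose spec uses this key and yield cached
        # values that the requested value matches shell-style, e.g.
        # fnmatch.fnmatch('T1_CH_CERN', 'T1_*') is True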
        for row in self.col.find(cond, **PYMONGO_OPTS):
            mongo_query = decode_mongo_query(row['query'])
            for thiskey, thisvalue in mongo_query.items():
                if thiskey == key:
                    if fnmatch.fnmatch(value, thisvalue):
                        yield thisvalue

    def get_fields(self, dasquery):
        "Prepare fields to extract from MongoDB"
        fields = dasquery.mongo_query.get('fields', [])
        if fields and 'records' in fields:
            fields = None  # look-up all records
        filters = dasquery.filters
        cond = {}
        if filters:
            new_fields = []
            for dasfilter in filters:
                if dasfilter == 'unique':
                    continue
                if  fields and dasfilter not in fields and \
                    dasfilter not in new_fields:
                    if  dasfilter.find('=') == -1 and dasfilter.find('<') == -1\
                    and dasfilter.find('>') == -1:
                        new_fields.append(dasfilter)
                    else:
                        cond = parse_filters(dasquery.mongo_query)
            if not new_fields and fields:
                new_fields = list(fields)
            return new_fields, cond
        return fields, cond

    def remove_expired(self, dasquery, collection):
        """
        Remove expired records from DAS cache. We need to perform this
        operation very carefully since we don't use transactions and on-going
        commits can invoke this method (see das_core.py). Therefore we use
        the MongoDB $or operator to wipe out queries which match the DASQuery
        hash and have already expired, or queries which lived in the cache
        longer than the rec_ttl config parameter. The latter operation just
        prevents the DAS cache from growing.
        """
        conn = db_connection(self.dburi)
        mdb = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        col = mdb[collection]
        # use an additional delta to check data record expiration;
        # we add this delta to ensure that there are no records close to the
        # current timestamp which may expire during request processing
        spec = {
            'qhash': dasquery.qhash,
            'das.expire': {
                '$lt': time.time() + self.del_ttl
            }
        }
        col.delete_many(spec)

    def check_services(self, dasquery):
        """
        Check if DAS cache contains DAS records with service response for
        given query.
        """
        das_rec = self.find(dasquery)
        if not das_rec:
            return False
        if 'das' not in das_rec:
            return False
        if 'services' not in das_rec['das']:
            return False
        spec = {
            'qhash': dasquery.qhash,
            'das.system': {
                '$ne': 'das'
            },
            'das.expire': {
                '$gt': time.time()
            }
        }
        nres = self.col.find(spec, **PYMONGO_OPTS).count()
        if nres:
            return True
        return False

    def find(self, dasquery):
        """
        Find provided query in DAS cache.
        """
        cond = {
            'qhash': dasquery.qhash,
            'das.system': 'das',
            'das.expire': {
                '$gt': time.time()
            }
        }
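        # a single non-expired DAS query record (das.system == 'das') is
        # expected per qhash; find_one returns it or None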
        return find_one(self.col, cond)

    def find_specs(self, dasquery, system='das'):
        """
        Check if cache has query whose specs are identical to provided query.
        Return all matches.
        """
        if dasquery.hashes:
            cond = {'qhash': {'$in': dasquery.hashes}}
        else:
            cond = {'qhash': dasquery.qhash}
        if system:
            cond.update({'das.system': system})
        cond.update({'das.expire': {'$gt': time.time()}})
        return self.col.find(cond, **PYMONGO_OPTS)

    def get_das_ids(self, dasquery):
        """
        Return list of DAS ids associated with given query
        """
        das_ids = []
        try:
            das_ids = \
                [r['_id'] for r in self.find_specs(dasquery, system='')]
        except:
            pass
        return das_ids

    def update_das_expire(self, dasquery, timestamp):
        "Update timestamp of all DAS data records for given query"
        nval = {'$set': {'das.expire': timestamp}}
        spec = {'qhash': dasquery.qhash}
        self.col.update_many(spec, nval)
        self.merge.update_many(spec, nval)

    def das_record(self, dasquery):
        "Retrieve DAS record for given query"
        cond = {'qhash': dasquery.qhash, 'das.expire': {'$gt': time.time()}}
        return find_one(self.col, cond)

    def find_records(self, das_id):
        " Return all the records matching a given das_id"
        return self.col.find({'das_id': das_id}, **PYMONGO_OPTS)

    def is_error_in_records(self, dasquery, collection='cache'):
        "Scan DAS cache for error records and return true or not"
        if collection == 'cache':
            results = self.col.find({'qhash': dasquery.qhash}, **PYMONGO_OPTS)
        else:
            results = self.merge.find({'qhash': dasquery.qhash},
                                      **PYMONGO_OPTS)
        error = None
        reason = None
        for row in results:
            if 'error' in row:
                error = row.get('error')
                reason = row.get('reason', '')
                break
        return error, reason

    def add_to_record(self, dasquery, info, system=None):
        "Add to existing DAS record provided info"
        if system:
            self.col.update_one(
                {
                    'query': dasquery.storage_query,
                    'das.system': system
                }, {'$set': info},
                upsert=True)
        else:
            self.col.update_one({'query': dasquery.storage_query},
                                {'$set': info},
                                upsert=True)

    def find_min_expire(self, dasquery):
        """Find minimal expire timestamp across all records for given DAS query"""
        spec = {'qhash': dasquery.qhash}
        min_expire = 2 * time.time()  # upper bound, will update
        for rec in self.col.find(spec, **PYMONGO_OPTS):
            if 'das' in rec and 'expire' in rec['das']:
                estamp = rec['das']['expire']
                if min_expire > estamp:
                    min_expire = estamp
        return long(min_expire)

    def find_query_record(self, dasquery):
        "Find DAS query records and return them to the caller"
        spec = {
            'qhash': dasquery.qhash,
            'das.record': record_codes('query_record')
        }
        return self.col.find(spec, **PYMONGO_OPTS)

    def update_query_record(self, dasquery, status, header=None, reason=None):
        "Update DAS record for provided query"
        ctime = time.time()
        das_spec = {'qhash': dasquery.qhash, 'das.system': 'das'}
        min_expire = self.find_min_expire(dasquery)
        if header:
            system = header['das']['system']
            sts = header['das']['status']
            expire = header['das']['expire']
            spec = {'qhash': dasquery.qhash, 'das.system': system}
            new_expire = None
            for rec in self.col.find(spec, **PYMONGO_OPTS):
                if 'das' in rec and 'expire' in rec['das']:
                    if rec['das']['expire'] > expire:
                        new_expire = expire
                        ndict = {'das.expire': expire, 'das.status': status}
                        cdict = {'das.ctime': ctime}
                        udict = {'$set': ndict, '$push': cdict}
                        oid = ObjectId(rec['_id'])
                        self.col.update_one({'_id': oid}, udict)
            if new_expire:
                udict = {
                    '$set': {
                        'das.expire': new_expire
                    },
                    '$push': {
                        'das.ctime': ctime
                    }
                }
                self.col.update_one(das_spec, udict)
        else:
            udict = {
                '$set': {
                    'das.status': status,
                    'das.expire': min_expire
                },
                '$push': {
                    'das.ctime': ctime
                }
            }
            self.col.update_one(das_spec, udict)
        if reason:
            udict = {'$set': {'das.reason': reason}}
            self.col.update_one(das_spec, udict)
        # align all expire timestamps when we receive ok status
        if status == 'ok':
            udict = {'$set': {'das.expire': min_expire}}
            self.col.update_one(das_spec, udict)

    def apilist(self, dasquery):
        "Return list of apis for given dasquery"
        spec = {
            'qhash': dasquery.qhash,
            'das.record': record_codes('query_record')
        }
        apis = []
        for row in self.col.find(spec, ['das.api'], **PYMONGO_OPTS):
            try:
                apis += row['das']['api']
            except Exception as _err:
                pass
        return apis

    def incache(self,
                dasquery,
                collection='merge',
                system=None,
                api=None,
                query_record=False):
        """
        Check if we have query results in cache, otherwise return null.
        Please note, input parameter query means MongoDB query, please
        consult MongoDB API for more details,
        http://api.mongodb.org/python/
        """
        if query_record:
            record = record_codes('query_record')
        else:
            record = spec4data_records()
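        # query records and data records are distinguished by the das.record
        # code stored with every document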
        spec = {
            'qhash': dasquery.qhash,
            'das.record': record,
            'das.expire': {
                '$gt': time.time()
            }
        }
        if system:
            spec.update({'das.system': system})
        if api:
            spec.update({'das.api': api})
        conn = db_connection(self.dburi)
        mdb = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        col = mdb[collection]
        res = col.find(spec, **PYMONGO_OPTS).count()
        msg = "(%s, coll=%s) found %s results" % (dasquery, collection, res)
        self.logger.info(msg)
        if res:
            return True
        return False

    def nresults(self, dasquery, collection='merge'):
        """Return number of results for given query."""
        if dasquery.aggregators:
            return len(dasquery.aggregators)
        # Distinguish 2 use cases: unique filter and general query. In the
        # first one we should count only unique records, in the latter we
        # can rely on the DB count() method. Please keep in mind that the
        # usage of fields in find doesn't affect counting, since it
        # is a view over records found with spec, so we don't need to use it.
        fields, filter_cond = self.get_fields(dasquery)
        if not fields:
            spec = dasquery.mongo_query.get('spec', {})
        elif dasquery.hashes:
            spec = {
                'qhash': {
                    '$in': dasquery.hashes
                },
                'das.record': spec4data_records()
            }
        else:
            spec = {'qhash': dasquery.qhash, 'das.record': spec4data_records()}
        if filter_cond:
            spec.update(filter_cond)
        conn = db_connection(self.dburi)
        mdb = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        col = mdb[collection]
        if dasquery.unique_filter:
            skeys = self.mongo_sort_keys(collection, dasquery)
            if skeys:
                gen = col.find(spec, **PYMONGO_OPTS).sort(skeys)
            else:
                gen = col.find(spec, **PYMONGO_OPTS)
            res = len([r for r in unique_filter(gen)])
        else:
            res = col.find(spec, **PYMONGO_OPTS).count()
            if not res:  # double check that this is really the case
                time.sleep(1)
                res = col.find(spec, **PYMONGO_OPTS).count()
        msg = "%s" % res
        self.logger.info(msg)
        return res

    def mongo_sort_keys(self, collection, dasquery):
        """
        Find list of sort keys for a given DAS query. Check existing
        indexes and either use fields or spec keys to find them out.
        Return list of mongo sort keys in a form of (key, order).
        """
        # try to get sort keys all the time to get ordered list of
        # docs which allow unique_filter to apply afterwards
        fields = dasquery.mongo_query.get('fields')
        spec = dasquery.mongo_query.get('spec')
        skeys = dasquery.sortkeys
        mongo_skeys = []
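        # the result is a list of (key, direction) tuples which can be passed
        # directly to pymongo's cursor.sort(), e.g.
        # [('run.run_number', ASCENDING), ('file.size', DESCENDING)]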
        if skeys:
            for key in skeys:
                if key.find('-') != -1:  # reverse order, e.g. desc
                    mongo_skeys.append((key.replace('-', ''), DESCENDING))
                else:
                    mongo_skeys.append((key, ASCENDING))
        else:
            existing_idx = [i for i in self.existing_indexes(collection)]
            if fields:
                lkeys = []
                for key in fields:
                    for mkey in self.mapping.mapkeys(key):
                        if mkey not in lkeys:
                            lkeys.append(mkey)
            else:
                lkeys = list(spec.keys())
            keys = [k for k in lkeys \
                if k.find('das') == -1 and k.find('_id') == -1 and \
                        k in existing_idx]
            mongo_skeys = [(k, ASCENDING) for k in keys]
        return mongo_skeys

    def existing_indexes(self, collection='merge'):
        """
        Get list of existing indexes in DB. They are returned by
        index_information API in the following form:

        .. doctest::

            {u'_id_': {u'key': [(u'_id', 1)], u'v': 0},
             u'das.expire_1': {u'key': [(u'das.expire', 1)], u'v': 0},
             ...
             u'tier.name_-1': {u'key': [(u'tier.name', -1)], u'v': 0}}
        """
        conn = db_connection(self.dburi)
        mdb = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        col = mdb[collection]
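        # index_information() maps index name to metadata; only the indexed
        # field names are of interest here, e.g. 'das.expire' out of
        # [('das.expire', 1)]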
        for val in col.index_information().values():
            for idx in val['key']:
                yield idx[0]  # index name

    def get_records(self, coll, spec, fields, skeys, idx, limit, unique=False):
        "Generator to get records from MongoDB."
        try:
            conn = db_connection(self.dburi)
            mdb = conn[self.dbname]
            mdb.add_son_manipulator(self.das_son_manipulator)
            col = mdb[coll]
            nres = col.find(spec, **PYMONGO_OPTS).count()
            if nres == 1 or nres <= limit:
                limit = 0
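            # in pymongo limit=0 means 'no limit', so small result sets are
            # streamed in full without an explicit skip/limit window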
            if limit:
                res = col.find(spec, fields, sort=skeys, skip=idx, limit=limit)
            else:
                res = col.find(spec, fields, sort=skeys, **PYMONGO_OPTS)
            if unique:
                res = unique_filter(res)
            for row in res:
                yield row
        except Exception as exp:
            print_exc(exp)
            row = {'exception': str(exp)}
            res = []
            yield row

    def get_from_cache(self, dasquery, idx=0, limit=0, collection='merge'):
        "Generator which retrieves results from the cache"
        if dasquery.service_apis_map():  # valid DAS query
            result = self.get_das_records(dasquery, idx, limit, collection)
            for row in result:
                yield row
        else:  # pure MongoDB query
            fields = dasquery.mongo_query.get('fields', [])
            if fields == None:
                fields = []
            spec = dasquery.mongo_query.get('spec', {})
            if dasquery.filters:
                if not fields:
                    fields = []
                fields += dasquery.filters
                pkeys = [k.split('.')[0] for k in fields]
            fields += das_record_keys()
            if 'records' in dasquery.query:
                fields = None  # special case for DAS 'records' keyword
            skeys = self.mongo_sort_keys(collection, dasquery)
            result  = self.get_records(collection, spec, fields, skeys, \
                            idx, limit, dasquery.unique_filter)
            for row in result:
                if dasquery.filters:
                    if pkeys and set(pkeys) & set(row.keys()):
                        yield row
                else:
                    yield row

    def get_das_records(self, dasquery, idx=0, limit=0, collection='merge'):
        "Generator which retrieves DAS records from the cache"
        msg = "(%s, %s, %s, coll=%s)" % (dasquery, idx, limit, collection)
        self.logger.info(msg)

        idx = int(idx)
        fields, filter_cond = self.get_fields(dasquery)
        if fields == None:
            fields = []
        if not fields:
            spec = dasquery.mongo_query.get('spec', {})
        elif dasquery.hashes:
            spec = {
                'qhash': {
                    '$in': dasquery.hashes
                },
                'das.record': spec4data_records()
            }
        else:
            spec = {'qhash': dasquery.qhash, 'das.record': spec4data_records()}
        if filter_cond:
            spec.update(filter_cond)
        if 'records' in dasquery.query:
            fields = None  # retrieve all fields for records DAS query
        else:
            # be sure to extract das internal keys
            fields += das_record_keys()
        # try to get sort keys all the time to get ordered list of
        # docs which allow unique_filter to apply afterwards
        skeys = self.mongo_sort_keys(collection, dasquery)
        res     = self.get_records(collection, spec, fields, skeys, \
                        idx, limit, dasquery.unique_filter)
        counter = 0
        for row in res:
            counter += 1
            yield row
        msg = 'qhash %s, found %s record(s) in %s collection' \
                % (dasquery.qhash, counter, collection)
        print(dastimestamp('DAS INFO '), msg)

        if counter:
            msg = "yield %s record(s)" % counter
            self.logger.info(msg)

        # if no raw records were yielded we look up possible error records
        # and reset the timestamp for the record with system:['das']
        if not counter:
            spec = {'qhash': dasquery.qhash}
            nrec = self.col.find(spec, **PYMONGO_OPTS).count()
            if nrec:
                msg = "for query %s, found %s non-result record(s)" \
                        % (dasquery, nrec)
                print(dastimestamp('DAS WARNING'), msg)
                for rec in self.col.find(spec, **PYMONGO_OPTS):
                    if 'query' in rec:
                        print(dastimestamp('DAS das record'), rec)
            self.update_das_expire(dasquery, etstamp())

    def map_reduce(self, mr_input, dasquery, collection='merge'):
        """
        Wrapper around _map_reduce to allow sequential map/reduce
        operations, e.g. map/reduce out of map/reduce.

        mr_input is either alias name or list of alias names for
        map/reduce functions.

        Input dasquery which is applied to first
        iteration of map/reduce functions.
        """
        # NOTE: I need to revisit mapreduce.
        spec = dasquery.mongo_query['spec']
        if not isinstance(mr_input, list):
            mrlist = [mr_input]
        else:
            mrlist = mr_input
        conn = db_connection(self.dburi)
        mdb = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        coll = mdb[collection]
        for mapreduce in mrlist:
            if mapreduce == mrlist[0]:
                cond = spec
            else:
                cond = None
            coll = self._map_reduce(coll, mapreduce, cond)
        for row in coll.find():
            yield row

    def _map_reduce(self, coll, mapreduce, spec=None):
        """
        Perform map/reduce operation over DAS cache using provided
        collection, mapreduce name and optional conditions.
        """
        self.logger.debug("(%s, %s)" % (mapreduce, spec))
        record = find_one(self.mrcol, {'name': mapreduce})
        if not record:
            raise Exception("Map/reduce function '%s' not found" % mapreduce)
        fmap = record['map']
        freduce = record['reduce']
        if spec:
            result = coll.map_reduce(Code(fmap), Code(freduce), query=spec)
        else:
            result = coll.map_reduce(Code(fmap), Code(freduce))
        msg = "found %s records in %s" % (result.count(), result.name)
        self.logger.info(msg)
        self.logger.debug(fmap)
        self.logger.debug(freduce)
        return result

    def get_map_reduce(self, name=None):
        """
        Return definition of map/reduce functions for provided name
        or gives full list.
        """
        spec = {}
        if name:
            spec = {'name': name}
        result = self.mrcol.find(spec, **PYMONGO_OPTS)
        for row in result:
            yield row

    def merge_records(self, dasquery, attempt=0):
        """
        Merge DAS records for provided query. We perform the following
        steps:
        1. get all queries from das.cache by ordering them by primary key
        2. run aggregator function to merge neighbors
        3. insert records into das.merge
        """
        ### TMP for asyncio
        #         time.sleep(attempt+3) # pymongo 3.2 doesn't yet flush in time

        # remove any entries in merge collection for this query
        self.merge.delete_many({'qhash': dasquery.qhash})
        # proceed
        self.logger.debug(dasquery)
        id_list = []
        expire = 9999999999  # future
        # get all API records for given DAS query
        spec = {
            'qhash': dasquery.qhash,
            'das.expire': {
                '$gt': time.time()
            },
            'das.record': record_codes('query_record')
        }
        records = self.col.find(spec, **PYMONGO_OPTS)
        for row in records:
            # find smallest expire timestamp to be used by aggregator
            rexpire = row.get('das', {}).get('expire', expire)
            if rexpire < expire:
                expire = rexpire
            if row['_id'] not in id_list:
                id_list.append(row['_id'])
        inserted = 0
        lookup_keys = set()
        fields = dasquery.mongo_query.get('fields')
        if not fields:  # Mongo
            fields = []
        for key in fields:
            for pkey in self.mapping.mapkeys(key):
                lookup_keys.add(pkey)
        for pkey in lookup_keys:
            skey = [(pkey, DESCENDING)]
            # lookup all service records
            spec = {'das_id': {'$in': id_list}, 'das.primary_key': pkey}
            if self.verbose:
                nrec = self.col.find(spec, **PYMONGO_OPTS).sort(skey).count()
                msg = "merging %s records, for %s key" % (nrec, pkey)
            else:
                msg = "merging records, for %s key" % pkey
            self.logger.debug(msg)
            # use exhaust=False since we process all records in the aggregator
            # and there can be a delay in processing
            records = self.col.find(spec, **PYMONGO_NOEXHAUST).sort(skey)
            # aggregate all records
            agen = aggregator(dasquery, records, expire)
            # diff aggregated records
            gen = das_diff(agen, self.mapping.diff_keys(pkey.split('.')[0]))
            # insert all records into das.merge using bulk insert
            size = self.cache_size
            try:
                res = self.merge.insert_many(gen)
                inserted += len(res.inserted_ids)
            except InvalidDocument as exp:
                print(dastimestamp('DAS WARNING'),
                      'InvalidDocument during merge', str(exp))
                msg = "Caught bson error: " + str(exp)
                self.logger.info(msg)
                records = self.col.find(spec, **PYMONGO_OPTS).sort(skey)
                gen = aggregator(dasquery, records, expire)
                genrows = parse2gridfs(self.gfs, pkey, gen, self.logger)
                das_dict = {
                    'das': {
                        'expire': expire,
                        'das.record': record_codes('gridfs_record'),
                        'primary_key': [k for k in lookup_keys],
                        'system': ['gridfs']
                    },
                    'qhash': dasquery.qhash,
                    'cache_id': [],
                    'das_id': id_list
                }
                for row in genrows:
                    row.update(das_dict)
                    self.merge.insert(row)
            except InvalidOperation as exp:
                pass
            except DuplicateKeyError as err:
                print(dastimestamp('DAS WARNING'),
                      'DuplicateKeyError during merge')
                if not isinstance(gen, list):
                    raise err
        status = 'fail'
        if inserted:
            status = 'ok'
        elif not lookup_keys:  # we get query w/o fields
            msg = 'qhash %s, no lookup_keys' % dasquery.qhash
            print(dastimestamp('DAS WARNING'), msg)
            status = 'ok'
        else:  # we didn't merge anything, it is DB look-up failure
            msg  = 'qhash %s, did not insert into das.merge, attempt %s' \
                    % (dasquery.qhash, attempt)
            print(dastimestamp('DAS WARNING'), msg)
            empty_expire = etstamp()
            lkeys = list(lookup_keys)
            das = dict(expire=empty_expire,
                       primary_key=lkeys[0],
                       condition_keys=lkeys,
                       instance=dasquery.instance,
                       system=['das'],
                       services=dasquery.services,
                       record=record_codes('empty_record'),
                       ts=time.time(),
                       api=[])
            empty_record = {
                'das': das,
                'qhash': dasquery.qhash,
                'cache_id': [],
                'das_id': id_list
            }
            for key in lkeys:
                empty_record.update({key.split('.')[0]: []})
            for key, val in dasquery.mongo_query['spec'].items():
                if key.find('.') == -1:
                    empty_record[key] = []
                else:  # it is compound key, e.g. site.name
                    newkey, newval = convert_dot_notation(key, val)
                    empty_record[newkey] = adjust_mongo_keyvalue(newval)
            self.merge.insert(empty_record)
            # update DAS records (both meta and data ones, by using qhash)
            nval = {'$set': {'das.expire': empty_expire}}
            spec = {'qhash': dasquery.qhash}
            self.col.update_many(spec, nval)
        return status

    def update_cache(self, dasquery, results, header, system, api):
        """
        Insert results into cache. Use bulk insert controlled by
        self.cache_size. Upon completion ensure indexes.
        """
        # update results records in DAS cache
        gen = self.generate_records(dasquery, results, header)
        inserted = 0
        # bulk insert
        try:
            res = self.col.insert_many(gen,
                                       ordered=False,
                                       bypass_document_validation=True)
            inserted += len(res.inserted_ids)
        except InvalidOperation:
            pass

        # update query record for this sub-system
        self.update_query_record_system(dasquery, system, api, 'ok')

        if dasquery.qcache:  # custom DASQuery cache
            self.update_das_expire(dasquery, expire_timestamp(dasquery.qcache))

    def update_query_record_system(self, dasquery, system, api, status):
        "Update system status of dasquery in das.cache collection"
        spec = {
            'qhash': dasquery.qhash,
            'das.system': system,
            'das.api': api,
            'das.record': record_codes('query_record')
        }
        udict = {'$set': {'das.status': status}}
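        # ReturnDocument.AFTER makes find_one_and_update return the document
        # as it looks after the update (handy for the commented-out debug
        # print below)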
        #         print("### update_query_record", spec)
        doc = self.col.find_one_and_update(
            spec, udict, return_document=ReturnDocument.AFTER)
#         print(doc)

    def insert_query_record(self, dasquery, header):
        """
        Insert query record into DAS cache.
        """
        # check presence of API record in a cache
        dasheader = header['das']
        system = dasheader['system']
        api = dasheader['api']
        collection = 'cache'
        check_query = True
        expire = dasheader.get('expire', None)
        if expire:
            dasheader['expire'] = adjust_expire(expire)
        if not self.incache(dasquery, collection, system, api, check_query):
            msg = "query=%s, header=%s" % (dasquery, header)
            self.logger.debug(msg)
            q_record = dict(das=dasheader, query=dasquery.storage_query)
            q_record['das']['record'] = record_codes('query_record')
            q_record['das']['status'] = "requested"
            q_record['qhash'] = dasquery.qhash
            q_record['das']['ctime'] = [time.time()]
            res = self.col.insert_one(q_record)
            if not res:
                msg = 'unable to insert query record'
                print(dastimestamp('DAS ERROR '), dasquery, msg,
                      ', will retry')
                time.sleep(1)
                res = self.col.insert(q_record)
                if not res:
                    print(dastimestamp('DAS ERROR '), dasquery, msg)

    def generate_records(self, dasquery, results, header):
        """
        Iterate over provided results, update records and yield them
        to next level (update_cache)
        """
        self.logger.debug("(%s) store to cache" % dasquery)
        if not results:
            return

        dasheader = header['das']
        expire = adjust_expire(dasheader['expire'])
        system = dasheader['system']  # DAS service names, e.g. combined
        services = dasheader['services']  # CMS services used to get data
        api = dasheader['api']
        prim_key = header.get('prim_key', None)
        if not prim_key:
            # get primary key from a list of lookup keys which has the
            # following structure [{'api':[keys]}, {...}]
            lup_keys = header['lookup_keys']
            lkeys = [l for i in lup_keys for k in i.values() for l in k]
            prim_key = lkeys[0] if 'summary' not in lkeys else 'summary'
        cond_keys = list(dasquery.mongo_query['spec'].keys())
        # get API record id
        spec = {
            'qhash': dasquery.qhash,
            'das.system': system,
            'das.expire': {
                '$gt': time.time()
            },
            'das.record': record_codes('query_record')
        }
        counter = 0
        rids = [str(r['_id']) for r in \
                self.col.find(spec, ['_id'], **PYMONGO_OPTS)]
        if rids:
            if isinstance(results, list) or isinstance(results, GeneratorType):
                for item in results:
                    counter += 1
                    if 'das' in item:
                        expire = item.get('das').get('expire', expire)
                        dasheader['expire'] = expire
                    item['das'] = dict(expire=expire,
                                       primary_key=prim_key,
                                       condition_keys=cond_keys,
                                       instance=dasquery.instance,
                                       system=system,
                                       services=services,
                                       record=record_codes('data_record'),
                                       ts=time.time(),
                                       api=api)
                    item['das_id'] = rids
                    item['qhash'] = dasquery.qhash
                    yield item
            else:
                print("\n\n ### results = ", str(results))
                raise Exception(
                    'Provided results is not a list/generator type')
        if expire != dasheader['expire']:  # update DAS records
            header['das']['expire'] = expire
        # update das record with new status
        status = 'Update DAS cache, %s API' % header['das']['api'][0]
        self.update_query_record(dasquery, status, header)

        msg = "\n%s yield %s rows" % (dasheader['system'], counter)
        self.logger.info(msg)

    def remove_from_cache(self, dasquery):
        """
        Remove query from DAS cache. To do so, we retrieve API record
        and remove all data records from das.cache and das.merge
        """
        records = self.col.find({'qhash': dasquery.qhash}, **PYMONGO_OPTS)
        id_list = []
        for row in records:
            if row['_id'] not in id_list:
                id_list.append(row['_id'])
        spec = {'das_id': {'$in': id_list}}
        self.merge.remove(spec)
        self.merge.remove({'qhash': dasquery.qhash})
        self.col.remove(spec)
        self.col.remove({'qhash': dasquery.qhash})

    def clean_cache(self, collection=None):
        """
        Clean expired docs in das.cache and das.merge.
        """
        current_time = time.time()
        query = {'das.expire': {'$lt': current_time}}
        if not collection or collection == 'merge':
            self.merge.remove(query)
        if not collection or collection == 'cache':
            self.col.remove(query)

    def delete_cache(self):
        """
        Delete all results in DAS cache/merge collection, including
        internal indexes.
        """
        self.col.remove({})
        try:
            self.col.drop_indexes()
        except:
            pass
        self.merge.remove({})
        try:
            self.merge.drop_indexes()
        except:
            pass
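
# Hedged usage sketch (not part of the original example). The configuration
# below is inferred from DASMongocache.__init__ above; the MongoDB URI, the
# collection names and the dasmapping placeholder are illustrative
# assumptions, not values taken from a real DAS deployment.
def _example_mongocache_usage(dasquery, dasmapping):
    "Illustrative only: build a DASMongocache and probe the cache for a query"
    config = {
        'verbose': 1,
        'das': {'emptyset_expire': 5},
        'mongodb': {'dburi': 'mongodb://localhost:27017',
                    'bulkupdate_size': 5000},
        'dasdb': {'dbname': 'das',
                  'cachecollection': 'cache',
                  'mrcollection': 'mapreduce',
                  'mergecollection': 'merge'},
        'dasmapping': dasmapping,  # a DASMapping instance in real code
    }
    cache = DASMongocache(config)
    # True if merged results for this DASQuery are already cached
    return cache.incache(dasquery, collection='merge')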
Ejemplo n.º 33
0
class DASKeyLearning(object):
    """
    This class manages DAS key-learning DB.

    Key-learning is an intermittent process (triggered infrequently
    by a task running in the analytics framework) which involves
    searching through the raw cache for output documents (a subset
    chosen for maximum primary key coverage), generating the set of
    all data members (in a dotted-dict fashion) and storing those as
    primary-key:data-member records (with an associated
    last-updated time).

    """
    def __init__(self, config):
        self.verbose = config['verbose']
        self.logger = PrintManager('DASKeyLearning', self.verbose)
        self.services = config['services']
        self.dburi = config['mongodb']['dburi']
        self.dbname = config['keylearningdb']['dbname']
        self.colname = config['keylearningdb']['collname']

        self.mapping = config['dasmapping']

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        self.das_son_manipulator = DAS_SONManipulator()
        index_list = [('system', ASCENDING), ('urn', ASCENDING), \
                ('members', ASCENDING), ('stems', ASCENDING)]
        create_indexes(self.col, index_list)

    @property
    def col(self):
        "col property provides access to DAS keylearning collection"
        conn = db_connection(self.dburi)
        mdb = conn[self.dbname]
        colnames = mdb.collection_names()
        if not colnames or self.colname not in colnames:
            try:
                mdb.create_collection(self.colname)
            except OperationFailure:
                pass
        mdb.add_son_manipulator(self.das_son_manipulator)
        return mdb[self.colname]

    def add_record(self, dasquery, rec):
        """
        Add/update to keylearning DB keys/attributes from given record.
        To do so, we parse it and call add_members method.
        """
        if not ('das' in rec and 'system' in rec['das']):
            return
        das = rec['das']
        if 'system' not in das or 'api' not in das or 'primary_key' not in das:
            return
        systems = das['system']
        apis = das['api']
        pkey = das['primary_key'].split('.')[0]
        data = rec.get(pkey, [])
        members = dict_members(data, pkey)
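        # dict_members flattens the record into dotted attribute names under
        # pkey (cf. the class docstring), e.g. something like
        # ['file.name', 'file.size'] for a file record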
        for srv, api in zip(systems, apis):
            self.add_members(srv, api, members)
        # insert new record for query pattern
        fields = dasquery.mongo_query.get('fields', [])
        if fields:
            for field in fields:
                if field in DAS_RECORD_KEYS:
                    continue
                new_members = [m for m in dict_members(rec[field], field) if m]
                members += new_members
        for attr in members:
            spec = {'member': attr}
            doc = {'query_pat': dasquery.query_pat}
            self.col.update(spec, {'$addToSet': doc}, upsert=True)

    def add_members(self, system, urn, members):
        """
        Add a list of data members for a given API (system, urn, url),
        and generate stems for them, which are stored as separate records.
        """
        msg = "system=%s, urn=%s, members=%s)" % (system, urn, members)
        self.logger.info(msg)

        result = self.col.find_one({'system': system, 'urn': urn})
        if result:
            self.col.update({'_id': ObjectId(result['_id'])},
                            {'$addToSet': {
                                'members': {
                                    '$each': members
                                }
                            }})
        else:
            keys = self.mapping.api2daskey(system, urn)
            self.col.insert({
                'system': system,
                'urn': urn,
                'keys': keys,
                'members': members
            })

        for member in members:
            if not self.col.find_one({'member': member}):
                self.col.insert({'member': member, 'stems': stem(member)})

    def text_search(self, text):
        """
        Perform a text search for data members matching a string. The input is
        split if it already includes dotted elements (in which case we need to
        find a member matching all the split elements), otherwise we look for
        any member whose stem list contains the text.
        """
        text = text.lower()
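        # a dotted term such as 'file.size' must match all of its components
        # as stems ($all), while a bare term matches any member whose stem
        # list contains it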
        if '.' in text:
            possible_members = self.col.find(\
                    {'stems': {'$all': text.split('.')}}, fields=['member'])
        else:
            possible_members = self.col.find({'stems': text},\
                                             fields=['member'])
        return [doc['member'] for doc in possible_members]

    def attributes(self):
        """
        Return full list of keyword attributes known in DAS.
        """
        spec = {'member': {'$exists': True}}
        return self.col.find(spec)

    def member_info(self, member):
        """
        Once the text search has identified a member that might be a match,
        return which systems, APIs and hence DAS keys this points to.
        """
        result = []
        for doc in self.col.find({'members': member},
                                 fields=['system', 'urn', 'keys']):

            result.append({
                'system': doc['system'],
                'urn': doc['urn'],
                'keys': doc['keys']
            })
        return result

    def key_search(self, text, limitkey=None):
        """
        Try and find suggested DAS keys, by performing a member search and then
        mapping back to the DAS keys those are produced by.
        """
        text = text.lower()
        result = collections.defaultdict(set)
        for member in self.text_search(text):
            for info in self.member_info(member):
                result[tuple(info['keys'])].add(member)
        if limitkey:
            # copy the keys first: deleting from a dict while iterating over
            # it raises a RuntimeError
            for key in list(result):
                if limitkey not in key:
                    del result[key]
        return result

    def members_for_keys(self, keys):
        """
        Return all the members that exactly match the set of keys
        """
        result = []
        for doc in self.col.find({'keys': {
                '$all': keys,
                '$size': len(keys)
        }},
                                 fields=['members']):
            result += doc['members']
        return result

    def has_member(self, member):
        """
        Return true if we know anything about the given member.
        """
        if self.col.find_one({'member': member}):
            return True
        else:
            return False

    def list_members(self):
        "Return list of members in keylearning collection"
        return self.col.find({
            'members': {
                '$exists': 'True'
            },
            'system': {
                '$exists': 'True'
            },
            'urn': {
                '$exists': 'True'
            }
        })
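
# Hedged usage sketch (not part of the original example): once key-learning
# records exist, key_search maps a free-text term back to the DAS keys whose
# members mention it. The config argument is assumed to carry the same keys
# read in DASKeyLearning.__init__ above; the search term is a placeholder.
def _example_keylearning_search(config):
    "Illustrative only: suggest DAS keys for a keyword"
    learner = DASKeyLearning(config)
    # e.g. {('dataset',): set(['dataset.nevents'])} after learning has run
    return learner.key_search('nevents')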
Ejemplo n.º 34
0
    def __init__(self, config):
        self.config  = config
        self.emptyset_expire = \
                expire_timestamp(config['das'].get('emptyset_expire', 5))
        self.dburi   = config['mongodb']['dburi']
        self.cache_size = config['mongodb']['bulkupdate_size']
        self.dbname  = config['dasdb']['dbname']
        self.verbose = config['verbose']
        self.logger  = PrintManager('DASMongocache', self.verbose)
        self.mapping = config['dasmapping']
        self.logging = config['dasdb'].get('logging', False)
        self.rec_ttl = config['dasdb'].get('record_ttl', 24*60*60)
        self.del_ttl = config['dasdb'].get('delta_ttl', 60)
        self.cleanup_del_ttl = config['dasdb'].get('cleanup_delta_ttl', 3600)
        self.retry   = config['dasdb'].get('retry', 3)
        self.das_son_manipulator = DAS_SONManipulator()

        # Initialize MongoDB connection
        self.col_    = self.config['dasdb']['cachecollection']
        self.mrcol_  = self.config['dasdb']['mrcollection']
        self.merge_  = self.config['dasdb']['mergecollection']
        self.gfs     = db_gridfs(self.dburi)

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        # ensure that we have the following indexes
        common_idx = [
                      ('file.name', DESCENDING),
                      ('dataset.name', DESCENDING),
                      ('block.name', DESCENDING),
                      ('run.run_number', DESCENDING),
                      ]
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('das.system', ASCENDING),
                      ('qhash', DESCENDING),
                      ('das.record', ASCENDING)]
        create_indexes(self.col, index_list + common_idx)
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('qhash', DESCENDING),
                      ('das.record', ASCENDING),
                      ('das.ts', ASCENDING)]
        create_indexes(self.merge, index_list)
        # NOTE: I found that creating an index on the merge collection leads
        # to a MongoDB error when records contain multiple arrays on indexed
        # keys. For example, when we query file,run,lumi both file and run
        # are arrays in MongoDB. In this case the final sort in MongoDB
        # fails with the following message:
        # cannot sort with keys that are parallel arrays
        # It looks like there is no fix for that yet, see
        # http://stackoverflow.com/questions/6516725/how-do-i-index-two-arrays-in-mongodb
        # Therefore I temporarily disabled the create_indexes call on the
        # merge collection, which was used to ease the final sort,
        # especially when a lot of records correspond to the initial
        # query, e.g. file records.
        # On the other hand, the most common use case where sort fails is
        # getting file records, so I can add one compound key to ease the
        # sort, but I can't add another compound key on an array field,
        # e.g. run.
        common_idx = [[('qhash', DESCENDING), ('file.name', DESCENDING)]]
        create_indexes(self.merge, index_list + common_idx)

        # thread which cleans up DAS collections
        thname = 'mongocache_cleanup'
        cols   = [config['dasdb']['cachecollection'],
                  config['dasdb']['mrcollection'],
                  config['dasdb']['mergecollection']]
Ejemplo n.º 35
0
class DASMongocache(object):
    """
    DAS cache based on MongoDB.
    """
    def __init__(self, config):
        self.emptyset_expire = expire_timestamp(\
            config['das'].get('emptyset_expire', 5))
        self.dburi   = config['mongodb']['dburi']
        self.cache_size = config['mongodb']['bulkupdate_size']
        self.dbname  = config['dasdb']['dbname']
        self.verbose = config['verbose']
        self.logger  = PrintManager('DASMongocache', self.verbose)
        self.mapping = config['dasmapping']

        self.conn    = db_connection(self.dburi)
        self.mdb     = self.conn[self.dbname]
        self.col     = self.mdb[config['dasdb']['cachecollection']]
        self.mrcol   = self.mdb[config['dasdb']['mrcollection']]
        self.merge   = self.mdb[config['dasdb']['mergecollection']]
        self.gfs     = db_gridfs(self.dburi)

        self.logdb   = DASLogdb(config)

        self.das_internal_keys = ['das_id', 'das', 'cache_id', 'qhash']

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        self.add_manipulator()

        # ensure that we have the following indexes
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('das.system', ASCENDING),
                      ('qhash', DESCENDING),
                      ('das.empty_record', ASCENDING)]
        create_indexes(self.col, index_list)
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('qhash', DESCENDING),
                      ('das.empty_record', ASCENDING), ('das.ts', ASCENDING)]
        create_indexes(self.merge, index_list)
        
    def add_manipulator(self):
        """
        Add DAS-specific MongoDB SON manipulator to perform
        conversion of inserted data into DAS cache.
        """
        das_son_manipulator = DAS_SONManipulator()
        self.mdb.add_son_manipulator(das_son_manipulator)
        msg = "DAS_SONManipulator %s" \
        % das_son_manipulator
        self.logger.debug(msg)

    def similar_queries(self, dasquery):
        """
        Check if we have query results in cache whose conditions are
        a superset of the provided query. The method only works for a single
        key whose value is a substring of the value in the input query.
        For example, if the cache contains records about T1 sites,
        then the input query T1_CH_CERN is a subset of the results stored
        in the cache.
        """
        spec = dasquery.mongo_query.get('spec', {})
        cond = {'query.spec.key': {'$in' : spec.keys()}, 'qhash':dasquery.qhash}
        for row in self.col.find(cond):
            found_query = DASQuery(row['query'])
            if  dasquery.qhash == found_query.qhash:
                msg = "%s similar to %s" % (dasquery, found_query)
                self.logger.info(msg)
                return found_query
        return False
    
    def get_superset_keys(self, key, value):
        """
        This is a special-case version of similar_keys,
        intended for analysers that want to quickly
        find possible superset queries of a simple
        query of the form key=value.
        """
        
        msg = "%s=%s" % (key, value)
        self.logger.debug(msg)
        cond = {'query.spec.key': key}
        for row in self.col.find(cond):
            mongo_query = decode_mongo_query(row['query'])
            for thiskey, thisvalue in mongo_query.iteritems():
                if thiskey == key:
                    if fnmatch.fnmatch(value, thisvalue):
                        yield thisvalue

    def get_fields(self, dasquery):
        "Prepare fields to extract from MongoDB"
        fields     = dasquery.mongo_query.get('fields', None)
        if  fields == ['records']:
            fields = None # look-up all records
        filters    = dasquery.filters
        cond       = {}
        if  filters:
            new_fields = []
            for dasfilter in filters:
                if  dasfilter == 'unique':
                    continue
                if  dasfilter not in fields and \
                    dasfilter not in new_fields:
                    if  dasfilter.find('=') == -1 and dasfilter.find('<') == -1\
                    and dasfilter.find('>') == -1:
                        new_fields.append(dasfilter)
                    else:
                        cond = parse_filters(dasquery.mongo_query)
            if  not new_fields and fields:
                new_fields = list(fields)
            return new_fields, cond
        return fields, cond

    def remove_expired(self, collection):
        """
        Remove expired records from DAS cache.
        """
        timestamp = int(time.time())
        col  = self.mdb[collection]
        spec = {'das.expire' : {'$lt' : timestamp}}
        if  self.verbose:
            nrec = col.find(spec).count()
            msg  = "will remove %s records" % nrec
            msg += ", localtime=%s" % timestamp
            self.logger.debug(msg)
        self.logdb.insert(collection, {'delete': self.col.find(spec).count()})
        col.remove(spec)

    def find(self, dasquery):
        """
        Find provided query in DAS cache.
        """
        cond = {'qhash': dasquery.qhash, 'das.system':'das'}
        return self.col.find_one(cond)

    def find_specs(self, dasquery, system='das'):
        """
        Check if cache has query whose specs are identical to provided query.
        Return all matches.
        """
        cond = {'qhash': dasquery.qhash}
        if  system:
            cond.update({'das.system': system})
        return self.col.find(cond)

    def get_das_ids(self, dasquery):
        """
        Return list of DAS ids associated with given query
        """
        das_ids = []
        try:
            das_ids = \
                [r['_id'] for r in self.find_specs(dasquery, system='')]
        except:
            pass
        return das_ids

    def update_das_expire(self, dasquery, timestamp):
        "Update timestamp of all DAS data records for given query"
        nval = {'$set': {'das.expire':timestamp}}
        spec = {'qhash' : dasquery.qhash}
        self.col.update(spec, nval, multi=True, safe=True)
        self.merge.update(spec, nval, multi=True, safe=True)

    def das_record(self, dasquery):
        "Retrieve DAS record for given query"
        return self.col.find_one({'qhash': dasquery.qhash})
    
    def find_records(self, das_id):
        " Return all the records matching a given das_id"
        return self.col.find({'das_id': das_id})

    def add_to_record(self, dasquery, info, system=None):
        "Add to existing DAS record provided info"
        if  system:
            self.col.update({'query': dasquery.storage_query,
                             'das.system':system},
                            {'$set': info}, upsert=True, safe=True)
        else:
            self.col.update({'query': dasquery.storage_query},
                            {'$set': info}, upsert=True, safe=True)

    def update_query_record(self, dasquery, status, header=None):
        "Update DAS record for provided query"
        if  header:
            system = header['das']['system']
            spec1  = {'qhash': dasquery.qhash, 'das.system': 'das'}
            dasrecord = self.col.find_one(spec1)
            spec2  = {'qhash': dasquery.qhash, 'das.system': system}
            sysrecord = self.col.find_one(spec2)
            hexpire = header['das']['expire']
            dexpire = hexpire
            if  dasrecord and dasrecord.has_key('das'):
                dexpire = dasrecord['das'].get('expire', None)
            if  dexpire and hexpire > dexpire:
                expire = dexpire
            else:
                expire = hexpire
            if  sysrecord:
                api  = header['das']['api']
                url  = header['das']['url']
                sapi = sysrecord['das'].get('api', [])
                surl = sysrecord['das'].get('url', [])
                if  set(api) & set(sapi) == set(api) and \
                    set(url) & set(surl) == set(url):
                    self.col.update({'_id':ObjectId(sysrecord['_id'])},
                        {'$set': {'das.expire':expire, 'das.status':status}},
                        safe=True)
                else:
                    self.col.update({'_id':ObjectId(sysrecord['_id'])},
                        {'$pushAll':{'das.api':header['das']['api'],
                                     'das.urn':header['das']['api'],
                                     'das.url':header['das']['url'],
                                     'das.ctime':header['das']['ctime'],
                                    },
                         '$set': {'das.expire':expire, 'das.status':status}},
                        safe=True)
            if  dasrecord:
                self.col.update({'_id':ObjectId(dasrecord['_id'])},
                     {'$set': {'das.expire':expire}}, safe=True)
        else:
            self.col.update({'qhash': dasquery.qhash,
                             'das.system':'das'},
                            {'$set': {'das.status': status}}, safe=True)

    def incache(self, dasquery, collection='merge', system=None):
        """
        Check if we have query results in the cache and return True/False.
        Please note, the input parameter query means a MongoDB query; please
        consult the MongoDB API for more details,
        http://api.mongodb.org/python/
        """
        self.remove_expired(collection)
        col  = self.mdb[collection]
        spec = {'qhash':dasquery.qhash}
        if  system:
            spec.update({'das.system': system})
        res  = col.find(spec=spec).count()
        msg  = "(%s, coll=%s) found %s results" % (dasquery, collection, res)
        self.logger.info(msg)
        if  res:
            return True
        return False

    def nresults(self, dasquery, collection='merge'):
        """Return number of results for given query."""
        if  dasquery.aggregators:
            return len(dasquery.aggregators)
        # Distinguish two use cases: unique filter and general query.
        # In the first we should count only unique records, in the latter
        # we can rely on the DB count() method. Please keep in mind that
        # usage of fields in find() does not affect counting, since it
        # is a view over records found with spec, so we don't need to use it.
        col  = self.mdb[collection]
        fields, filter_cond = self.get_fields(dasquery)
        if  not fields:
            spec = dasquery.mongo_query.get('spec', {})
        else:
            spec = {'qhash':dasquery.qhash, 'das.empty_record':0}
        if  filter_cond:
            spec.update(filter_cond)
        if  dasquery.unique_filter:
            skeys = self.mongo_sort_keys(collection, dasquery)
            if  skeys:
                gen = col.find(spec=spec).sort(skeys)
            else:
                gen = col.find(spec=spec)
            res = len([r for r in unique_filter(gen)])
        else:
            res = col.find(spec=spec).count()
        msg = "%s" % res
        self.logger.info(msg)
        return res

    def mongo_sort_keys(self, collection, dasquery):
        """
        Find list of sort keys for a given DAS query. Check existing
        indexes and either use fields or spec keys to find them out.
        Return list of mongo sort keys in a form of (key, order).
        """
        # try to get sort keys all the time to get ordered list of
        # docs which allow unique_filter to apply afterwards
        fields = dasquery.mongo_query.get('fields')
        spec   = dasquery.mongo_query.get('spec')
        skeys  = dasquery.sortkeys
        mongo_skeys = []
        if  skeys:
            for key in skeys:
                if  key.find('-') != -1: # reverse order, e.g. desc
                    mongo_skeys.append((key.replace('-', ''), DESCENDING))
                else:
                    mongo_skeys.append((key, ASCENDING))
        else:
            existing_idx = [i for i in self.existing_indexes(collection)]
            if  fields:
                lkeys = []
                for key in fields:
                    for mkey in self.mapping.mapkeys(key):
                        if  mkey not in lkeys:
                            lkeys.append(mkey)
            else:
                lkeys = spec.keys()
            keys = [k for k in lkeys \
                if k.find('das') == -1 and k.find('_id') == -1 and \
                        k in existing_idx]
            mongo_skeys = [(k, ASCENDING) for k in keys]
        return mongo_skeys
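
    # Illustrative note (assumption, not from the original source): for a
    # hypothetical DAS query with sortkeys ['run.run_number', '-dataset.size']
    # the loop above yields [('run.run_number', ASCENDING),
    # ('dataset.size', DESCENDING)], i.e. a leading '-' flips the pymongo
    # sort order for that key.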

    def existing_indexes(self, collection='merge'):
        """
        Get list of existing indexes in DB. They are returned by the
        index_information API in the following form:

        .. doctest::

            {u'_id_': {u'key': [(u'_id', 1)], u'v': 0},
             u'das.expire_1': {u'key': [(u'das.expire', 1)], u'v': 0},
             ...
             u'tier.name_-1': {u'key': [(u'tier.name', -1)], u'v': 0}}
        """
        col = self.mdb[collection]
        for val in col.index_information().values():
            for idx in val['key']:
                yield idx[0] # index name

    def get_records(self, col, spec, fields, skeys, idx, limit, unique=False):
        "Generator to get records from MongoDB. It correctly applies"
        if  fields:
            for key in fields: # ensure that fields keys will be presented
                if  key not in self.das_internal_keys and \
                    not spec.has_key(key):
                    spec.update({key: {'$exists':True}})
        try:
            res = col.find(spec=spec, fields=fields)
            if  skeys:
                res = res.sort(skeys)
            if  not unique:
                if  idx:
                    res = res.skip(idx)
                if  limit:
                    res = res.limit(limit)
        except Exception as exp:
            print_exc(exp)
            row = {'exception': str(exp)}
            res = []
            yield row
        if  unique:
            if  limit:
                gen = itertools.islice(unique_filter(res), idx, idx+limit)
            else:
                gen = unique_filter(res)
            for row in gen:
                yield row
        else:
            for row in res:
                yield row
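
    # Note on get_records (descriptive of the code above): if col.find()
    # raises, a single {'exception': ...} row is yielded and res is reset to
    # an empty list, so callers always receive an iterable and can surface
    # the error as a regular record; when a limit is given with unique=True,
    # the idx/limit window is applied after unique_filter via islice.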

    def get_from_cache(self, dasquery, idx=0, limit=0, collection='merge'):
        "Generator which retrieves results from the cache"
        if  dasquery.service_apis_map(): # valid DAS query
            result = self.get_das_records(dasquery, idx, limit, collection)
        else: # pure MongoDB query
            coll    = self.mdb[collection]
            fields  = dasquery.mongo_query.get('fields', None)
            spec    = dasquery.mongo_query.get('spec', {})
            if  dasquery.filters:
                if  fields == None:
                    fields = dasquery.filters
                else:
                    fields += dasquery.filters
            skeys   = self.mongo_sort_keys(collection, dasquery)
            result  = self.get_records(coll, spec, fields, skeys, \
                            idx, limit, dasquery.unique_filter)
        for row in result:
            yield row

    def get_das_records(self, dasquery, idx=0, limit=0, collection='merge'):
        "Generator which retrieves DAS records from the cache"
        col = self.mdb[collection]
        msg = "(%s, %s, %s, coll=%s)" % (dasquery, idx, limit, collection)
        self.logger.info(msg)

        idx = int(idx)
        fields, filter_cond = self.get_fields(dasquery)
        if  not fields:
            spec = dasquery.mongo_query.get('spec', {})
        else:
            spec = {'qhash':dasquery.qhash, 'das.empty_record':0}
        if  filter_cond:
            spec.update(filter_cond)
        if  fields: # be sure to extract das internal keys
            fields += self.das_internal_keys
        # try to get sort keys all the time to get ordered list of
        # docs which allow unique_filter to apply afterwards
        skeys   = self.mongo_sort_keys(collection, dasquery)
        res     = self.get_records(col, spec, fields, skeys, \
                        idx, limit, dasquery.unique_filter)
        counter = 0
        for row in res:
            counter += 1
            yield row

        if  counter:
            msg = "yield %s record(s)" % counter
            self.logger.info(msg)

        # if no raw records were yielded we look up possible error records
        if  not counter:
            nrec = self.col.find({'qhash':dasquery.qhash}).count()
            if  nrec:
                msg = "for query %s, found %s non-result record(s)" \
                        % (dasquery, nrec)
                prf = 'DAS WARNING, mongocache:get_from_cache '
                print dastimestamp(prf), msg

    def map_reduce(self, mr_input, dasquery, collection='merge'):
        """
        Wrapper around _map_reduce to allow sequential map/reduce
        operations, e.g. map/reduce applied to the output of map/reduce.

        mr_input is either an alias name or a list of alias names for
        map/reduce functions.

        The input dasquery is applied to the first iteration of
        map/reduce functions.
        """
        # NOTE: I need to revisit mapreduce.
        spec = dasquery.mongo_query['spec']
        if  not isinstance(mr_input, list):
            mrlist = [mr_input]
        else:
            mrlist = mr_input
        coll = self.mdb[collection]
        for mapreduce in mrlist:
            if  mapreduce == mrlist[0]:
                cond = spec
            else:
                cond = None
            coll = self._map_reduce(coll, mapreduce, cond)
        for row in coll.find():
            yield row

    def _map_reduce(self, coll, mapreduce, spec=None):
        """
        Perform map/reduce operation over DAS cache using provided
        collection, mapreduce name and optional conditions.
        """
        self.logger.debug("(%s, %s)" % (mapreduce, spec))
        record = self.mrcol.find_one({'name':mapreduce})
        if  not record:
            raise Exception("Map/reduce function '%s' not found" % mapreduce)
        fmap = record['map']
        freduce = record['reduce']
        if  spec:
            result = coll.map_reduce(Code(fmap), Code(freduce), query=spec)
        else:
            result = coll.map_reduce(Code(fmap), Code(freduce))
        msg = "found %s records in %s" % (result.count(), result.name)
        self.logger.info(msg)
        self.logger.debug(fmap)
        self.logger.debug(freduce)
        return result

    def get_map_reduce(self, name=None):
        """
        Return the definition of map/reduce functions for the provided name
        or give the full list.
        """
        spec = {}
        if  name:
            spec = {'name':name}
        result = self.mrcol.find(spec)
        for row in result:
            yield row

    def merge_records(self, dasquery):
        """
        Merge DAS records for provided query. We perform the following
        steps:
        1. get all records from das.cache, ordered by primary key
        2. run the aggregator function to merge neighbors
        3. insert merged records into das.merge
        """
        self.logger.debug(dasquery)
        id_list = []
        expire  = 9999999999 # future
        # get all API records for given DAS query
        spec    = {'qhash':dasquery.qhash, 'query':{'$exists':True}}
        records = self.col.find(spec)
        for row in records:
            # find smallest expire timestamp to be used by aggregator
            if  row['das']['expire'] < expire:
                expire = row['das']['expire']
            if  row['_id'] not in id_list:
                id_list.append(row['_id'])
        inserted = 0
        lookup_keys = set()
        fields = dasquery.mongo_query.get('fields')
        if  not fields: # Mongo
            fields = []
        for key in fields:
            for pkey in self.mapping.mapkeys(key):
                lookup_keys.add(pkey)
        for pkey in lookup_keys:
            skey = [(pkey, DESCENDING)]
            # lookup all service records
            spec = {'das_id': {'$in': id_list}, 'das.primary_key': pkey}
            if  self.verbose:
                nrec = self.col.find(spec).sort(skey).count()
                msg  = "merging %s records, for %s key" % (nrec, pkey) 
            else:
                msg  = "merging records, for %s key" % pkey
            self.logger.debug(msg)
            records = self.col.find(spec).sort(skey)
            # aggregate all records
            agen = aggregator(dasquery, records, expire)
            # diff aggregated records
            gen  = das_diff(agen, self.mapping.diff_keys(pkey.split('.')[0]))
            # insert all records into das.merge using bulk insert
            size = self.cache_size
            try:
                while True:
                    nres = self.merge.insert(\
                        itertools.islice(gen, size), safe=True)
                    if  nres and isinstance(nres, list):
                        inserted += len(nres)
                    else:
                        break
            except InvalidDocument as exp:
                msg = "Caught bson error: " + str(exp)
                self.logger.info(msg)
                records = self.col.find(spec).sort(skey)
                gen = aggregator(dasquery, records, expire)
                genrows = parse2gridfs(self.gfs, pkey, gen, self.logger)
                das_dict = {'das':{'expire':expire, 'empty_record': 0,
                        'primary_key':[k for k in lookup_keys],
                        'system': ['gridfs']}, 'qhash':dasquery.qhash,
                        'cache_id':[], 'das_id': id_list}
                for row in genrows:
                    row.update(das_dict)
                    self.merge.insert(row, safe=True)
            except InvalidOperation:
                pass
        if  inserted:
            self.logdb.insert('merge', {'insert': inserted})
        elif  not lookup_keys: # we get query w/o fields
            pass
        else: # we didn't merge anything, it is DB look-up failure
            empty_expire = time.time() + 20 # secs, short enough to expire
            empty_record = {'das':{'expire':empty_expire,
                                   'primary_key':list(lookup_keys),
                                   'empty_record': 1},
                            'cache_id':[], 'das_id': id_list}
            for key, val in dasquery.mongo_query['spec'].iteritems():
                if  key.find('.') == -1:
                    empty_record[key] = []
                else: # it is compound key, e.g. site.name
                    newkey, newval = convert_dot_notation(key, val)
                    empty_record[newkey] = adjust_mongo_keyvalue(newval)
            self.merge.insert(empty_record, safe=True)
            # update DAS records (both meta and data ones, by using qhash)
            nval = {'$set': {'das.expire':empty_expire}}
            spec = {'qhash':dasquery.qhash}
            self.col.update(spec, nval, multi=True, safe=True)
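            # Descriptive note: the empty_record above acts as a short-lived
            # (~20 seconds) negative-cache entry, so repeated identical
            # queries do not re-trigger a full look-up, yet it expires
            # quickly once data may become available.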

    def update_cache(self, dasquery, results, header):
        """
        Insert results into cache. Use bulk inserts controlled by
        self.cache_size. Upon completion ensure indexes.
        """
        # insert/check query record in DAS cache
        self.insert_query_record(dasquery, header)

        # update results records in DAS cache
        gen  = self.generate_records(dasquery, results, header)
        inserted = 0
        # bulk insert
        try:
            while True:
                nres = self.col.insert(\
                        itertools.islice(gen, self.cache_size), safe=True)
                if  nres and isinstance(nres, list):
                    inserted += len(nres)
                else:
                    break
        except InvalidOperation:
            pass
        if  inserted:
            self.logdb.insert('cache', {'insert': inserted})

    def insert_query_record(self, dasquery, header):
        """
        Insert query record into DAS cache.
        """
        dasheader  = header['das']
        # check presence of API record in a cache
        system     = dasheader['system']
        if  not self.incache(dasquery, collection='cache', system=system):
            msg = "query=%s, header=%s" % (dasquery, header)
            self.logger.debug(msg)
            q_record = dict(das=dasheader, query=dasquery.storage_query)
            q_record['das']['empty_record'] = 0
            q_record['das']['status'] = "requested"
            q_record['qhash'] = dasquery.qhash
            self.col.insert(q_record, safe=True)

    def generate_records(self, dasquery, results, header):
        """
        Iterate over provided results, update records and yield them
        to next level (update_cache)
        """
        self.logger.debug("(%s) store to cache" % dasquery)
        if  not results:
            return
        # update das record with new status
        status = 'Update DAS cache, %s API' % header['das']['api'][0]
        self.update_query_record(dasquery, status, header)

        dasheader  = header['das']
        expire     = dasheader['expire']
        system     = dasheader['system']
        rec        = [k for i in header['lookup_keys'] for k in i.values()]
        cond_keys  = dasquery.mongo_query['spec'].keys()
        # get API record id
        spec       = {'qhash':dasquery.qhash, 'das.system':system}
        record     = self.col.find_one(spec, fields=['_id'])
        counter    = 0
        prim_key   = rec[0][0] # use rec instead of lkeys[0], which re-orders items
        if  record:
            objid  = record['_id']
            if  isinstance(results, list) or isinstance(results, GeneratorType):
                for item in results:
                    counter += 1
                    item['das'] = dict(expire=expire, primary_key=prim_key,
                                       condition_keys=cond_keys,
                                       instance=dasquery.instance,
                                       system=system, empty_record=0)
                    item['das_id'] = str(objid)
                    item['qhash'] = dasquery.qhash
                    yield item
            else:
                print "\n\n ### results = ", str(results)
                raise Exception('Provided results is not a list/generator type')
        self.logger.info("\n")
        msg = "%s yield %s rows" % (dasheader['system'], counter)
        self.logger.info(msg)

    def remove_from_cache(self, dasquery):
        """
        Remove query from DAS cache. To do so, we retrieve API record
        and remove all data records from das.cache and das.merge
        """
        records = self.col.find({'qhash':dasquery.qhash})
        id_list = []
        for row in records:
            if  row['_id'] not in id_list:
                id_list.append(row['_id'])
        spec = {'das_id':{'$in':id_list}}
        self.logdb.insert('merge', {'delete': self.col.find(spec).count()})
        self.merge.remove(spec)
        self.logdb.insert('cache', {'delete': self.col.find(spec).count()})
        self.col.remove(spec)
        self.col.remove({'qhash':dasquery.qhash})

    def clean_cache(self):
        """
        Clean expired docs in das.cache and das.merge. 
        """
        current_time = time.time()
        query = {'das.expire': { '$lt':current_time} }
        self.logdb.insert('merge', {'delete': self.merge.find(query).count()})
        self.merge.remove(query)
        self.logdb.insert('cache', {'delete': self.col.find(query).count()})
        self.col.remove(query)

    def delete_cache(self):
        """
        Delete all results in DAS cache/merge collection, including
        internal indexes.
        """
        self.logdb.insert('cache', {'delete': self.col.count()})
        self.col.remove({})
        try: 
            self.col.drop_indexes()
        except:
            pass
        self.logdb.insert('merge', {'delete': self.merge.count()})
        self.merge.remove({})
        try:
            self.merge.drop_indexes()
        except:
            pass
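
# A minimal usage sketch of the cache layer above (illustration only, not part
# of the original DAS source); it assumes already constructed DASMongocache
# and DASQuery instances with the interfaces shown in this example.
def cached_rows_sketch(cache, dasquery, limit=10):
    "Return up to `limit` merged rows for `dasquery` if they are already cached"
    if not cache.incache(dasquery, collection='merge'):
        return []  # nothing cached yet; DASCore.call would populate it first
    return list(cache.get_from_cache(dasquery, idx=0, limit=limit))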
Ejemplo n.º 36
0
Archivo: das_core.py Proyecto: ktf/DAS
class DASCore(object):
    """
    DAS core class.
    """
    def __init__(self, config=None, debug=0,
                nores=False, logger=None, engine=None, multitask=True):
        if  config:
            dasconfig = config
        else:
            dasconfig = das_readconfig()
        verbose       = dasconfig['verbose']
        self.stdout   = debug
        if  isinstance(debug, int):
            self.verbose = debug
            dasconfig['verbose'] = debug
        else:
            self.verbose = verbose
        das_timer('DASCore::init', self.verbose)
        self.operators = das_operators()

        # set noresults option
        self.noresults = False
        if  nores:
            dasconfig['write_cache'] = True
            self.noresults = nores

        self.multitask = dasconfig['das'].get('multitask', True)
        if  debug or self.verbose:
            self.multitask = False # in verbose mode do not use multitask
            dasconfig['das']['multitask'] = False
        if  not multitask: # explicitly call DASCore ctor, e.g. in analytics
            self.multitask = False
            dasconfig['das']['multitask'] = False
        dasconfig['engine'] = engine
        if  self.multitask:
            nworkers = dasconfig['das'].get('core_workers', 5)
            if  engine:
                thr_name = 'DASCore:PluginTaskManager'
                self.taskmgr = PluginTaskManager(\
                        engine, nworkers=nworkers, name=thr_name)
                self.taskmgr.subscribe()
            else:
                thr_name = 'DASCore:TaskManager'
                self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        if  logger:
            self.logger = logger
        else:
            self.logger = PrintManager('DASCore', self.verbose)

        # define Mapping/Analytics/Parser in this order since Parser depends
        # on first two
        dasmapping = DASMapping(dasconfig)
        dasconfig['dasmapping'] = dasmapping
        self.mapping = dasmapping

        self.analytics = DASAnalytics(dasconfig)
        dasconfig['dasanalytics'] = self.analytics

        self.keylearning = DASKeyLearning(dasconfig)
        dasconfig['keylearning'] = self.keylearning

        # init DAS cache
        self.rawcache = DASMongocache(dasconfig)
        dasconfig['rawcache'] = self.rawcache

        # plug-in architecture: loop over registered data-services in
        # dasconfig; load appropriate module/class; register data
        # service with DASCore.
        self.systems = dasmapping.list_systems()
        # pointer to the DAS top level directory
        dasroot = '/'.join(__file__.split('/')[:-3])
        for name in self.systems:
            try:
                klass  = 'DAS/services/%s/%s_service.py' \
                    % (name, name)
                srvfile = os.path.join(dasroot, klass)
                with file(srvfile) as srvclass:
                    for line in srvclass:
                        if  line.find('(DASAbstractService)') != -1:
                            klass = line.split('(DASAbstractService)')[0]
                            klass = klass.split('class ')[-1] 
                            break
                mname  = 'DAS.services.%s.%s_service' % (name, name)
                module = __import__(mname, fromlist=[klass])
                obj = getattr(module, klass)(dasconfig)
                setattr(self, name, obj)
                SERVICES[name] = obj
            except IOError as err:
                if  debug > 1:
                    # we have virtual services, so IOError can be correct
                    print_exc(err)
                try:
                    mname  = 'DAS.services.generic_service'
                    module = __import__(mname, fromlist=['GenericService'])
                    obj    = module.GenericService(name, dasconfig)
                    setattr(self, name, obj)
                except Exception as exc:
                    print_exc(exc)
                    msg = "Unable to load %s data-service plugin" % name
                    raise Exception(msg)
            except Exception as exc:
                print_exc(exc)
                msg = "Unable to load %s data-service plugin" % name
                raise Exception(msg)

        # loop over systems and get system keys, add mapping keys to final list
        self.service_keys = {}
        self.service_parameters = {}
        for name in self.systems: 
            skeys = getattr(self, name).keys()
            self.service_keys[getattr(self, name).name] = skeys
            sparams = getattr(self, name).parameters()
            self.service_parameters[getattr(self, name).name] = sparams

        self.service_keys['special'] = das_special_keys()
        self.dasconfig = dasconfig
        das_timer('DASCore::init', self.verbose)

    def keys(self):
        """
        Return map of data service keys
        """
        return self.service_keys

    def das_keys(self):
        """
        Return map of data service keys
        """
        _keys = ['records']
        for values in self.service_keys.values():
            for key in values:
                if  key not in _keys:
                    _keys.append(key)
        return _keys

    def result(self, query, idx=0, limit=None):
        """
        Get results either from cache or from explicit call
        """
        self.logger.info('input query=%s' % query)
        results = []
        dasquery = DASQuery(query)
        dasquery.add_to_analytics()
        query    = dasquery.mongo_query
        # check if we have any service which cover the query
        # otherwise decompose it into list of queries
        service_map = dasquery.service_apis_map()
        if  not service_map:
            msg  = 'no APIs found to answer input query, will decompose it'
            self.logger.info(msg)
            skeys = query['fields']
            if  not skeys:
                skeys = []
            for key in skeys:
                newquery = DASQuery(dict(fields=[key], spec=query['spec']))
                self.call(newquery) # process query
        else:
            self.call(dasquery) # process query

        # lookup provided query in a cache
        if  not self.noresults:
            results = self.get_from_cache(dasquery, idx, limit)
        return results

    def remove_from_cache(self, dasquery):
        """
        Delete in cache entries about input query
        """
        self.rawcache.remove_from_cache(dasquery)

    def get_status(self, dasquery):
        """
        Look-up status of provided query in a cache.
        Return the status of the query request along with error and reason.
        """
        status = None
        error  = None
        reason = None
        for col in ['merge', 'cache']:
            self.rawcache.remove_expired(dasquery, col)
        if  dasquery and 'fields' in dasquery.mongo_query:
            fields = dasquery.mongo_query['fields']
            if  fields and isinstance(fields, list) and 'queries' in fields:
                return 'ok', error, reason
        record = self.rawcache.find(dasquery)
        error, reason = self.rawcache.is_error_in_records(dasquery)
        try:
            if  record and 'das' in record and 'status' in record['das']:
                status = record['das']['status']
                if  not error:
                    error = record['das'].get('error', error)
                if  not reason:
                    reason = record['das'].get('reason', reason)
                return status, error, reason
        except Exception as exc:
            print_exc(exc)
            status = error = reason = None
            self.rawcache.remove_from_cache(dasquery)
        return status, error, reason

    def worker(self, srv, dasquery):
        """Main worker function which calls data-srv call function"""
        self.logger.info('##### %s ######\n' % srv)
        das_timer(srv, self.verbose)
        getattr(getattr(self, srv), 'call')(dasquery)
        das_timer(srv, self.verbose)

    def insert_query_records(self, dasquery):
        """
        Insert DAS query records into DAS cache and return list of services
        which will answer this query
        """
        services = dasquery.services
        self.logger.info('Potential services = %s' % services)
        if  not services:
            msg  = 'No data-services for query %s, ' % dasquery
            msg += 'mongo_query: %s, ' % dasquery.mongo_query
            msg += 'params: %s' % dasquery.params()
            print dastimestamp('DAS WARNING '), msg

        # get list of URI which can answer this query
        ack_services = []
        for srv in services:
            gen = [t for t in getattr(getattr(self, srv), 'apimap')(dasquery)]
            for url, api, args, iformat, expire in gen:
                header = dasheader(srv, dasquery, expire, api, url, ctime=0)
                self.rawcache.insert_query_record(dasquery, header)
                if  srv not in ack_services:
                    ack_services.append(srv)
        if  not ack_services:
            ack_services = services
        if  dasquery.query.find('records ') != -1:
            srv_status = True # skip DAS queries w/ records request
        expire = 2*60 # 2 minutes, it should be overwritten by data-srv
        header = dasheader("das", dasquery, expire, api='das_core',
                services=dict(das=ack_services))
        header['lookup_keys'] = []
        self.rawcache.insert_query_record(dasquery, header)
        das_timer('das_record', self.verbose)
        return ack_services

    def call(self, query, add_to_analytics=True, **kwds):
        """
        Top level DAS API which executes a given query using underlying
        data-services. It follows the following steps:

            - parse input query
            - identify data-services based on selection keys
              and where clause conditions
            - construct DAS workflow and execute data-service 
              API calls. At this step individual 
              data-services store results into DAS cache.

        Return the query status depending on the success of the calls; it can
        be used by workers on the cache server.

        kwds is provided for compatibility with web layer, e.g. it
        may invoke this method with additional pid parameter.
        """
        def update_das_query(dasquery, status, reason=None):
            "Update DAS query record with given status and reason"
            self.rawcache.update_query_record(dasquery, status, reason=reason)
            self.rawcache.add_to_record(\
                    dasquery, {'das.timer': get_das_timer()}, system='das')
            # make sure that the das record is updated; we retry several times
            # with growing sleep intervals, which sum up to roughly a minute,
            # to cover the default syncdelay value of the mongo server (in the
            # future it would be better to find this syncdelay value
            # programmatically, but it seems the pymongo driver does not
            # provide any API for it).
            for idx in xrange(1, 7):
                spec = {'qhash':dasquery.qhash, 'das.system':['das']}
                res = self.rawcache.col.find_one(spec)
                if  res:
                    dbstatus = res.get('das', {}).get('status', None)
                    if  dbstatus == status:
                        break
                    msg = 'qhash %s, das.status=%s, status=%s, wait for update' \
                            % (dasquery.qhash, dbstatus, status)
                    print dastimestamp('DAS WARNING'), msg
                time.sleep(idx*idx)
                self.rawcache.update_query_record(dasquery, status, reason=reason)

        self.logger.info('input query=%s' % query)
        das_timer('DASCore::call', self.verbose)
        if  isinstance(query, object) and hasattr(query, '__class__')\
            and query.__class__.__name__ == 'DASQuery':
            dasquery = query
        else:
            dasquery = DASQuery(query)
        for col in ['merge', 'cache']:
            self.rawcache.remove_expired(dasquery, col)
        if  add_to_analytics:
            dasquery.add_to_analytics()
        query  = dasquery.mongo_query
        spec   = query.get('spec')
        fields = query.get('fields')
        if  fields == ['records']:
            msg = 'look-up all records in cache'
            self.logger.info(msg)
            return 'in cache'
        if  spec == dict(records='*'):
            self.logger.info("look-up everything in cache")
            return 'in cache'
        for record in self.rawcache.find_specs(dasquery):
            status = record['das']['status']
            msg = 'found query %s in cache, status=%s\n' \
                        % (record['query'], status)
            self.logger.info(msg)
            print dastimestamp('DAS INFO'), msg
            return status

        self.logger.info(dasquery)
        das_timer('das_record', self.verbose)
        services = self.insert_query_records(dasquery)
        if  not services:
            msg = 'unable to locate data-services to fulfill this request'
            msg += ', will iterate over all registered services'
            print dastimestamp('DAS WARNING '), dasquery, msg
            services = dasquery.services if dasquery.services else self.systems
        try:
            if  self.multitask:
                jobs = []
                for srv in sorted(services):
                    jobs.append(self.taskmgr.spawn(self.worker, srv, dasquery))
                self.taskmgr.joinall(jobs)
            else:
                for srv in services:
                    self.worker(srv, dasquery)
        except Exception as exc:
            print_exc(exc)
            return 'fail'
        self.logger.info('\n##### merging ######\n')
        update_das_query(dasquery, 'merging')
        das_timer('merge', self.verbose)
        self.rawcache.merge_records(dasquery)
        das_timer('merge', self.verbose)
        # check if we have service records and properly setup status
        self.logger.info('\n##### check services ######\n')
        das_services = self.rawcache.check_services(dasquery)
        reason = ''
        status = 'ok'
        if  not das_services:
            if  'records' in dasquery.query:
                status = 'ok' # keep status ok for 'records' queries
            else:
                reason = 'no data records found in DAS cache'
                status = 'fail'
                print dastimestamp('DAS ERROR '), dasquery, reason
        update_das_query(dasquery, status, reason)
        das_timer('DASCore::call', self.verbose)
        return status

    def processing_time(self, dasquery):
        "Look-up and return DAS query processing time"
        query_record = self.rawcache.find(dasquery)
        if  query_record:
            das = query_record.get('das', None)
            if  isinstance(das, dict):
                ctime = das.get('ctime', [])
                if  ctime:
                    return ctime[-1]-ctime[0]
        return None

    def nresults(self, dasquery, coll='merge'):
        """
        Return total number of results (count) for provided query.
        Code should match the body of the get_from_cache method.
        """
        fields = dasquery.mongo_query.get('fields', None)
        if  dasquery.mapreduce:
            result = self.rawcache.map_reduce(dasquery.mapreduce, dasquery)
            return len([1 for _ in result])
        elif dasquery.aggregators:
            return len(dasquery.aggregators)
        elif isinstance(fields, list) and 'queries' in fields:
            return len([1 for _ in self.get_queries(dasquery)])
        return self.rawcache.nresults(dasquery, coll)

    def apilist(self, dasquery):
        "Return list of APIs answer given das query"
        return self.rawcache.apilist(dasquery)

    def incache(self, dasquery, coll='merge'):
        """
        Answer whether a given query is in the DAS cache or not
        """
        return self.rawcache.incache(dasquery, collection=coll)

    def get_from_cache(self, dasquery, idx=0, limit=0, collection='merge'):
        """
        Look-up results from the merge cache and yield them for
        further processing.
        """
        das_timer('DASCore::get_from_cache', self.verbose)
        msg = 'col=%s, query=%s, idx=%s, limit=%s'\
                % (collection, dasquery, idx, limit)
        self.logger.info(msg)

        fields  = dasquery.mongo_query.get('fields', None)

        if  dasquery.mapreduce:
            res = self.rawcache.map_reduce(dasquery.mapreduce, dasquery)
        elif dasquery.aggregators:
            # extract das information from rawcache
            rows  = self.rawcache.get_from_cache(\
                    dasquery, collection=collection)
            first = rows.next()
            sinfo = das_sinfo(first)
            # to perform aggregation we need:
            # - loop over all aggregator functions
            # - loop over all data-services
            # - loop over all APIs within a data-services
            # the code below does that, it applies aggregator
            # to selected (based on key/srv/api) records
            res = []
            _id = 0
            time0  = time.time()
            expire = 300 # min expire
            for func, key in dasquery.aggregators:
                afunc = getattr(das_aggregator, 'das_%s' % func)
                found = False
                for srv, apis, in sinfo.items():
                    for api in apis:
                        rows  = self.rawcache.get_from_cache(\
                                dasquery, collection=collection)
                        gen   = api_rows(rows, api)
                        data  = afunc(key, gen)
                        ctime = time.time() - time0
                        das   = dasheader(srv, dasquery, expire, api=api,
                                ctime=ctime)
                        if  isinstance(data, dict) and data['value'] != 'N/A':
                            aggr = {'_id':_id, 'function': func,
                                    'key': key, 'result': data}
                            aggr.update(das)
                            res.append(aggr)
                            _id += 1
                            found = True
                if  not found: # when we got nothing add empty result record
                    empty = {'value':'N/A'}
                    ctime = time.time() - time0
                    das = dasheader('das', dasquery, expire, api='das_core',
                            ctime=ctime)
                    rec = {'_id':0, 'function':func, 'key':key, 'result':empty}
                    rec.update(das)
                    res.append(rec)
        elif isinstance(fields, list) and 'queries' in fields:
            res = itertools.islice(self.get_queries(dasquery), idx, idx+limit)
        else:
            res = self.rawcache.get_from_cache(dasquery, idx, limit, \
                    collection=collection)
        for row in res:
            fix_times(row)
            yield row
        das_timer('DASCore::get_from_cache', self.verbose)

    def get_queries(self, dasquery):
        """
        Look-up (popular) queries in DAS analytics/logging db
        """
        das_timer('DASCore::get_queries', self.verbose)
        fields = dasquery.mongo_query.get('fields')
        spec   = dasquery.mongo_query.get('spec')
        if  'popular' in fields:
            res = self.analytics.get_popular_queries(spec)
        else:
            datestamp = spec.get('date')
            if  isinstance(datestamp, dict):
                value = datestamp.get('$in')
                res = \
                self.analytics.list_queries(after=value[0], before=value[1])
            elif isinstance(datestamp, int):
                res = self.analytics.list_queries(after=datestamp)
            elif not datestamp:
                res = self.analytics.list_queries()
            else:
                msg = 'Unsupported date value: %s' % datestamp
                raise Exception(msg)
        for row in res:
            rid = row.pop('_id')
            yield dict(das_query=row, _id=rid)
        das_timer('DASCore::get_queries', self.verbose)
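
# A minimal end-to-end sketch of the workflow above (illustration only, not
# part of the original DAS source); the query string is a hypothetical example.
def fetch_sketch(query='dataset=/ZMM*/*/*', limit=10):
    "Run a DAS query through DASCore and return the first merged rows"
    dascore = DASCore(multitask=False)   # single-threaded, simpler to debug
    status = dascore.call(query)         # populates das.cache and das.merge
    if status == 'fail':
        return []
    dasquery = DASQuery(query)           # same DASQuery class used above
    return list(dascore.get_from_cache(dasquery, idx=0, limit=limit))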
Ejemplo n.º 37
0
    def __init__(self,
                 config=None,
                 debug=0,
                 nores=False,
                 logger=None,
                 engine=None,
                 multitask=True):
        if config:
            dasconfig = config
        else:
            dasconfig = das_readconfig()
        verbose = dasconfig['verbose']
        self.stdout = debug
        if isinstance(debug, int) and debug:
            self.verbose = debug
            dasconfig['verbose'] = debug
        else:
            self.verbose = verbose
        das_timer('DASCore::init', self.verbose)
        self.operators = das_operators()
        self.collect_wait_time = dasconfig['das'].get('collect_wait_time', 120)

        # set noresults option
        self.noresults = False
        if nores:
            dasconfig['write_cache'] = True
            self.noresults = nores

        self.init_expire = dasconfig['das'].get('init_expire', 5 * 60)
        self.multitask = dasconfig['das'].get('multitask', True)
        if debug or self.verbose:
            self.multitask = False  # in verbose mode do not use multitask
            dasconfig['das']['multitask'] = False
        if not multitask:  # explicitly call DASCore ctor
            self.multitask = False
            dasconfig['das']['multitask'] = False
        dasconfig['engine'] = engine
        if self.multitask:
            nworkers = dasconfig['das'].get('core_workers', 5)
            #             if  engine:
            #                 thr_name = 'DASCore:PluginTaskManager'
            #                 self.taskmgr = PluginTaskManager(\
            #                         engine, nworkers=nworkers, name=thr_name)
            #                 self.taskmgr.subscribe()
            #             else:
            #                 thr_name = 'DASCore:TaskManager'
            #                 self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
            thr_name = 'DASCore:TaskManager'
            self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        if logger:
            self.logger = logger
        else:
            self.logger = PrintManager('DASCore', self.verbose)

        # define Mapping/Analytics/Parser in this order since Parser depends
        # on first two
        dasmapping = DASMapping(dasconfig)
        dasconfig['dasmapping'] = dasmapping
        self.mapping = dasmapping

        self.keylearning = DASKeyLearning(dasconfig)
        dasconfig['keylearning'] = self.keylearning

        # init DAS cache
        self.rawcache = DASMongocache(dasconfig)
        dasconfig['rawcache'] = self.rawcache

        # plug-in architecture: loop over registered data-services in
        # dasconfig; load appropriate module/class; register data
        # service with DASCore.
        self.systems = dasmapping.list_systems()
        # pointer to the DAS top level directory
        dasroot = '/'.join(__file__.split('/')[:-3])
        for name in self.systems:
            try:
                klass  = 'DAS/services/%s/%s_service.py' \
                    % (name, name)
                srvfile = os.path.join(dasroot, klass)
                with open(srvfile) as srvclass:
                    for line in srvclass:
                        if line.find('(DASAbstractService)') != -1:
                            klass = line.split('(DASAbstractService)')[0]
                            klass = klass.split('class ')[-1]
                            break
                mname = 'DAS.services.%s.%s_service' % (name, name)
                module = __import__(mname, fromlist=[klass])
                obj = getattr(module, klass)(dasconfig)
                setattr(self, name, obj)
            except IOError as err:
                if debug > 1:
                    # we have virtual services, so IOError can be correct
                    print_exc(err)
                try:
                    mname = 'DAS.services.generic_service'
                    module = __import__(mname, fromlist=['GenericService'])
                    obj = module.GenericService(name, dasconfig)
                    setattr(self, name, obj)
                except Exception as exc:
                    print_exc(exc)
                    msg = "Unable to load %s data-service plugin" % name
                    raise Exception(msg)
            except Exception as exc:
                print_exc(exc)
                msg = "Unable to load %s data-service plugin" % name
                raise Exception(msg)

        # loop over systems and get system keys, add mapping keys to final list
        self.service_keys = {}
        self.service_parameters = {}
        for name in self.systems:
            skeys = list(getattr(self, name).keys())
            self.service_keys[getattr(self, name).name] = skeys
            sparams = getattr(self, name).parameters()
            self.service_parameters[getattr(self, name).name] = sparams

        self.service_keys['special'] = das_special_keys()
        self.dasconfig = dasconfig
        das_timer('DASCore::init', self.verbose)
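
# Configuration sketch (illustration only, not part of the original source):
# the constructor above reads these tunables from dasconfig['das']; the values
# shown are hypothetical and simply mirror the .get() fallbacks used above.
das_section_sketch = {
    'multitask': True,          # enable the TaskManager thread pool
    'core_workers': 5,          # nworkers passed to TaskManager
    'collect_wait_time': 120,   # seconds, stored as self.collect_wait_time
    'init_expire': 5 * 60,      # seconds, stored as self.init_expire
}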
Ejemplo n.º 38
0
class DASKeyLearning(object):
    """
    This class manages DAS key-learning DB.
    
    Key-learning is an intermittent process (triggered infrequently
    by a task running in the analytics framework), which involves
    searching through the raw cache for all output documents (a subset
    chosen to maximize primary-key coverage), generating the set of all
    data members (in a dotted-dict fashion) and storing those as
    primary-key:data-member records (with an associated
    last-updated time).
    
    """
    def __init__(self, config):
        self.verbose  = config['verbose']
        self.logger   = PrintManager('DASKeyLearning', self.verbose)
        self.services = config['services']
        self.dburi    = config['mongodb']['dburi']
        self.dbname   = config['keylearningdb']['dbname']
        self.colname  = config['keylearningdb']['collname']
        
        self.mapping  = config['dasmapping']

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)
        
        self.col = None
        self.create_db()
        
        

    def create_db(self):
        """
        Establish connection to MongoDB back-end and create DB.
        """
        self.col = db_connection(self.dburi)[self.dbname][self.colname]
        
    def add_members(self, system, urn, members):
        """
        Add a list of data members for a given API (system, urn, url),
        and generate, which are stored as separate records.
        """
        msg = "system=%s, urn=%s, members=%s)" % (system, urn, members)
        self.logger.info(msg)
        
        result = self.col.find_one({'system': system, 'urn': urn})
        if result:       
            self.col.update({'_id': result['_id']},
                            {'$addToSet': {'members': {'$each': members}}})
        else:
            keys = self.mapping.api2daskey(system, urn)
            self.col.insert({'system': system,
                             'urn': urn,
                             'keys': keys,
                             'members': members})
                
        for member in members:
            if not self.col.find_one({'member': member}):
                self.col.insert({'member': member,
                                 'stems': self.stem(member)})
                
        index_list = [('system', 1), ('urn', 1), ('members', 1), ('stems', 1)]
        create_indexes(self.col, index_list)
        
    def stem(self, member):
        """
        Produce an extended set of strings which can be used for text-search.
        TODO: Use PyStemmer or something more sophisticated here.
        """
        
        return member.lower().split('.')
    
    def text_search(self, text):
        """
        Perform a text search for data members matching a string. The input is
        split if it already includes dotted elements (in which case we need to find
        a member matching all the split elements), otherwise we look for any member
        whose stem list contains the text.
        """
        text = text.lower()
        if '.' in text:
            possible_members = self.col.find({'stems': {'$all': text.split('.')}}, 
                                             fields=['member'])
        else:
            possible_members = self.col.find({'stems': text}, 
                                             fields=['member'])
        
        return [doc['member'] for doc in possible_members]
        
    
    def member_info(self, member):
        """
        Once the text search has identified a member that might be a match,
        return which systems, APIs and hence DAS keys this points to.
        """
        result = []
        for doc in self.col.find({'members': member}, 
                                 fields=['system', 'urn', 'keys']):
            
            result.append({'system': doc['system'],
                           'urn': doc['urn'],
                           'keys': doc['keys']})
        return result
    
    def key_search(self, text, limitkey=None):
        """
        Try to find suggested DAS keys by performing a member search and
        then mapping back to the DAS keys those members are produced by.
        """
        text = text.lower()
        result = collections.defaultdict(set)
        for member in self.text_search(text):
            for info in self.member_info(member):
                result[tuple(info['keys'])].add(member)
        if limitkey:
            for key in list(result.keys()):
                if limitkey not in key:
                    del result[key]
        return result
    
    def members_for_keys(self, keys):
        """
        Return all the members that exactly match the set of keys
        """
        result = []
        for doc in self.col.find({'keys': {'$all': keys, '$size': len(keys)}},
                                 fields=['members']):
            result += doc['members']
        return result
         
    
    def has_member(self, member):
        """
        Return true if we know anything about the given member.
        """
        if self.col.find_one({'member': member}):
            return True
        else:
            return False
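
# Usage sketch (illustration only, not part of the original source): the
# system, urn and member names below are hypothetical.
def keylearning_sketch(keylearning):
    "Record a few data members and look them up by free text"
    keylearning.add_members('dbs', 'listBlocks',
                            ['block.name', 'block.size', 'block.nevents'])
    members = keylearning.text_search('block.size')  # members whose stems
                                                     # contain 'block' and 'size'
    keys = keylearning.key_search('size')            # map DAS keys -> members
    return members, keys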
Ejemplo n.º 39
0
class DASParserDB(object):
    """
    Caching layer for the PLY parser.
    """
    def __init__(self, config):
        self.verbose  = config['verbose']
        self.logger   = PrintManager('DASParserDB', self.verbose)
        self.dburi    = config['mongodb']['dburi']
        self.dbname   = config['parserdb']['dbname']
        self.sizecap  = config['parserdb'].get('sizecap', 5*1024*1024)
        self.colname  = config['parserdb']['collname']
        
        msg = "DASParserCache::__init__ %s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)
        
        self.col = None
        self.create_db()

    def create_db(self):
        """
        Create db collection
        """
        conn = db_connection(self.dburi)
        dbn  = conn[self.dbname]
        if  self.colname not in dbn.collection_names():
            dbn.create_collection(self.colname, capped=True, size=self.sizecap)
        self.col = dbn[self.colname]

    def lookup_query(self, rawtext):
        """
        Check the parser cache for a given rawtext query.
        Search is done with the hash of this string.
        
        Returns a tuple (status, value) for the cases
        (PARSERCACHE_VALID, mongo_query) - valid query found
        (PARSERCACHE_INVALID, error) - error message for invalid query
        (PARSERCACHE_NOTFOUND, None) - not in the cache
        """
        result = self.col.find_one({'hash':genkey(rawtext)},
                        fields=['query', 'error'])

        if result and result['query']:
            if self.verbose:
                self.logger.debug("DASParserCache: found valid %s->%s" %\
                                  (rawtext, result['query']))
            
            query = decode_mongo_query(result['query'])
            return (PARSERCACHE_VALID, query)
        elif result and result['error']:
            if self.verbose:
                self.logger.debug("DASParserCache: found invalid %s->%s" %\
                                  (rawtext, result['error']))
            return (PARSERCACHE_INVALID, result['error'])
        else:
            if self.verbose:
                self.logger.debug("DASParserCache: not found %s" %\
                                  (rawtext))
            return (PARSERCACHE_NOTFOUND, None)

    def insert_valid_query(self, rawtext, query):
        "Insert a query that was successfully transformed"	
        self._insert_query(rawtext, query, None)

    def insert_invalid_query(self, rawtext, error):
        "Insert the error message for an invalid query"
        self._insert_query(rawtext, None, error)

    def _insert_query(self, rawtext, query, error):
        """Internal method to insert a query"""
        if  self.verbose:
            self.logger.debug("DASParserCache: insert %s->%s/%s" %\
                              (rawtext, query, error))
        # since MongoDB does not support insertion of $ sign in queries
        # we need to encode inserted query
        if  query:
            encquery = encode_mongo_query(query)
        else:
            encquery = ""
        self.col.insert({'raw':rawtext, 'hash':genkey(rawtext),
                         'query':encquery, 'error':str(error)})
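
# Usage sketch (illustration only, not part of the original source): a simple
# round-trip through the parser cache above; `mongo_query` stands for an
# already parsed query document.
def parser_cache_sketch(parserdb, rawtext, mongo_query):
    "Cache a parsed query and read it back"
    status, value = parserdb.lookup_query(rawtext)
    if status == PARSERCACHE_NOTFOUND:
        parserdb.insert_valid_query(rawtext, mongo_query)
        status, value = parserdb.lookup_query(rawtext)
    return status, value   # expected: (PARSERCACHE_VALID, decoded query)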
Ejemplo n.º 40
0
class DASParserDB(object):
    """
    Caching layer for the PLY parser.
    """
    def __init__(self, config):
        self.verbose = config['verbose']
        self.logger = PrintManager('DASParserDB', self.verbose)
        self.dburi = config['mongodb']['dburi']
        self.dbname = config['parserdb']['dbname']
        self.sizecap = config['parserdb'].get('sizecap', 5 * 1024 * 1024)
        self.colname = config['parserdb']['collname']
        msg = "DASParserCache::__init__ %s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)
        self.create_db()

    def create_db(self):
        """
        Create db collection
        """
        conn = db_connection(self.dburi)
        dbn = conn[self.dbname]
        if self.colname not in dbn.collection_names():
            dbn.create_collection(self.colname, capped=True, size=self.sizecap)
        col = dbn[self.colname]
        index_list = [('qhash', DESCENDING)]
        create_indexes(col, index_list)

    @property
    def col(self):
        "Collection object to MongoDB"
        conn = db_connection(self.dburi)
        dbn = conn[self.dbname]
        col = dbn[self.colname]
        return col

    def lookup_query(self, rawtext):
        """
        Check the parser cache for a given rawtext query.
        Search is done with the qhash of this string.
        Returns a tuple (status, value) for the cases
        (PARSERCACHE_VALID, mongo_query) - valid query found
        (PARSERCACHE_INVALID, error) - error message for invalid query
        (PARSERCACHE_NOTFOUND, None) - not in the cache
        """
        result = find_one(self.col, {'qhash':genkey(rawtext)}, \
                        fields=['query', 'error'])

        if result and result['query']:
            if self.verbose:
                self.logger.debug("DASParserCache: found valid %s->%s" %\
                                  (rawtext, result['query']))
            query = decode_mongo_query(result['query'])
            return (PARSERCACHE_VALID, query)
        elif result and result['error']:
            if self.verbose:
                self.logger.debug("DASParserCache: found invalid %s->%s" %\
                                  (rawtext, result['error']))
            return (PARSERCACHE_INVALID, result['error'])
        else:
            if self.verbose:
                self.logger.debug("DASParserCache: not found %s" %\
                                  (rawtext))
            return (PARSERCACHE_NOTFOUND, None)

    def insert_valid_query(self, rawtext, query):
        "Insert a query that was successfully transformed"
        self._insert_query(rawtext, query, None)

    def insert_invalid_query(self, rawtext, error):
        "Insert the error message for an invalid query"
        self._insert_query(rawtext, None, error)

    def _insert_query(self, rawtext, query, error):
        """Internal method to insert a query"""
        if self.verbose:
            self.logger.debug("DASParserCache: insert %s->%s/%s" %\
                           (rawtext, query, error))
        # since MongoDB does not support insertion of $ sign in queries
        # we need to encode inserted query
        if query:
            encquery = encode_mongo_query(query)
        else:
            encquery = ""
        self.col.insert({
            'raw': rawtext,
            'qhash': genkey(rawtext),
            'query': encquery,
            'error': str(error)
        })
Ejemplo n.º 41
0
class DASMapping(object):
    """
    This class manages DAS mapping DB.
    """

    def __init__(self, config):
        self.verbose = config["verbose"]
        self.logger = PrintManager("DASMapping", self.verbose)
        self.services = config["services"]
        self.dburi = config["mongodb"]["dburi"]
        self.dbname = config["mappingdb"]["dbname"]
        self.colname = config["mappingdb"]["collname"]

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        self.create_db()

        self.keymap = {}  # to be filled at run time
        self.presentationcache = {}  # to be filled at run time
        self.reverse_presentation = {}  # to be filled at run time
        self.notationcache = {}  # to be filled at run time
        self.diffkeycache = {}  # to be filled at run time
        self.apicache = {}  # to be filled at run time
        self.apiinfocache = {}  # to be filled at run time
        self.init_notationcache()
        self.init_presentationcache()

    # ===============
    # Management APIs
    # ===============
    def init_notationcache(self):
        """
        Initialize notation cache by reading notations.
        """
        for system, notations in self.notations().iteritems():
            for row in notations:
                key = system, row["notation"]
                if self.notationcache.has_key(key):
                    self.notationcache[key] += [(row["api"], row["map"])]
                else:
                    self.notationcache[key] = [(row["api"], row["map"])]
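
    # After initialization the notation cache maps (system, notation) pairs
    # to lists of (api, map) tuples, schematically (hedged example values):
    #
    #     {('dbs', 'storage_element_name'): [('', 'site')]}
    #
    # where an empty api string, as in the notation record example shown in
    # add() below, is used for notations not tied to a particular API.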

    def init_presentationcache(self):
        """
        Initialize presentation cache by reading presentation map.
        """
        query = {"presentation": {"$ne": None}}
        data = self.col.find_one(query)
        if data:
            self.presentationcache = data["presentation"]
            for daskey, uilist in self.presentationcache.iteritems():
                for row in uilist:
                    link = None
                    if row.has_key("link"):
                        link = row["link"]
                    if row.has_key("diff"):
                        self.diffkeycache[daskey] = row["diff"]
                    tdict = {daskey: {"mapkey": row["das"], "link": link}}
                    if self.reverse_presentation.has_key(row["ui"]):
                        self.reverse_presentation[row["ui"]].update(tdict)
                    else:
                        self.reverse_presentation[row["ui"]] = {daskey: {"mapkey": row["das"], "link": link}}

    def create_db(self):
        """
        Establish connection to MongoDB back-end and create DB.
        """
        self.conn = db_connection(self.dburi)
        self.db = self.conn[self.dbname]
        self.col = self.db[self.colname]

    def delete_db(self):
        """
        Delete mapping DB in MongoDB back-end.
        """
        self.conn.drop_database(self.dbname)

    def delete_db_collection(self):
        """
        Delete mapping DB collection in MongoDB.
        """
        self.db.drop_collection(self.colname)

    def check_maps(self):
        """
        Check if there are records in Mapping DB
        """
        return self.col.count()

    def remove(self, spec):
        """
        Remove record in DAS Mapping DB for provided Mongo spec.
        """
        self.col.remove(spec)

    def add(self, record):
        """
        Add new record into mapping DB. Example of URI record

        .. doctest::

            {
             system: dbs,
             urn: listBlocks,
             url: "http://a.b.com/api",
             params: [
                 {"apiversion": 1_2_2, test: "*"}
             ],
             daskeys: [
                 {"key": "block", "map": "block.name", "pattern": ""}
             ],
             das2api: [
                 {"das_key": "site", "api_param": "se",
                  "pattern": "re.compile('^T[0-3]_')"}
             ]
            }

        Example of notation record:

        .. doctest::

             notations: [
                 {"notation" : "storage_element_name", "map":"site", "api": ""},
             ]
        """
        msg = "record=%s" % record
        self.logger.debug(msg)
        self.col.insert(record)
        index = None
        if record.has_key("urn"):
            index = [("system", DESCENDING), ("daskeys", DESCENDING), ("urn", DESCENDING)]
        elif record.has_key("notations"):
            index = [("system", DESCENDING), ("notations.api_param", DESCENDING)]
        elif record.has_key("presentation"):
            index = []
        else:
            msg = "Invalid record %s" % record
            raise Exception(msg)
        if index:
            create_indexes(self.col, index)

    # ==================
    # Informational APIs
    # ==================
    def list_systems(self):
        """
        List all DAS systems.
        """
        cond = {"system": {"$ne": None}}
        gen = (row["system"] for row in self.col.find(cond, ["system"]))
        return list(set(gen2list(gen)) & set(self.services))

    def list_apis(self, system=None):
        """
        List all APIs.
        """
        if self.apicache and self.apicache.has_key(system):
            return self.apicache[system]
        cond = {"urn": {"$ne": None}}
        if system:
            cond["system"] = system
        gen = (row["urn"] for row in self.col.find(cond, ["urn"]))
        self.apicache[system] = gen2list(gen)
        return self.apicache[system]

    def api_info(self, api_name):
        """
        Return full API info record.
        """
        return self.apiinfocache.get(api_name, self.col.find_one({"urn": api_name}))

    def relational_keys(self, system1, system2):
        """
        Return a list of relational keys between provided systems
        """
        for system, keys in self.daskeys().iteritems():
            if system == system1:
                keys1 = keys
            if system == system2:
                keys2 = keys
        return list(set(keys1) & set(keys2))

    def daskeys(self, das_system=None):
        """
        Return a dict with all known DAS keys.
        """
        cond = {"system": {"$ne": None}}
        if das_system:
            cond = {"system": das_system}
        gen = (row["system"] for row in self.col.find(cond, ["system"]))
        kdict = {}
        for system in gen:
            query = {"system": system, "urn": {"$ne": None}}
            keys = []
            for row in self.col.find(query):
                for entry in row["daskeys"]:
                    if entry["key"] not in keys:
                        keys.append(entry["key"])
            kdict[system] = keys
        return kdict

    # ============
    # Look-up APIs
    # ============
    def primary_key(self, das_system, urn):
        """
        Return DAS primary key for provided system and urn
        """
        cond = {"system": das_system, "urn": urn}
        daskeys = self.col.find(cond, ["daskeys.key"])
        for row in daskeys:
            if row and row.has_key("daskeys"):
                for dkey in row["daskeys"]:
                    if dkey.has_key("key"):
                        return dkey["key"]

    def primary_mapkey(self, das_system, urn):
        """
        Return DAS primary map key for provided system and urn
        """
        cond = {"system": das_system, "urn": urn}
        mapkeys = self.col.find(cond, ["daskeys.map"])
        for row in mapkeys:
            if row and row.has_key("daskeys"):
                for mkey in row["daskeys"]:
                    if mkey.has_key("map"):
                        return mkey["map"]

    def find_daskey(self, das_system, map_key, value=None):
        """
        Find das key for given system and map key.
        """
        msg = "system=%s\n" % das_system
        cond = {"system": das_system, "daskeys.map": map_key}
        daskeys = []
        for row in self.col.find(cond, ["daskeys"]):
            if row and row.has_key("daskeys"):
                for dkey in row["daskeys"]:
                    if dkey.has_key("key"):
                        if value:
                            pval = dkey.get("pattern", "")
                            if pval:
                                pat = re.compile(pval)
                                if pat.match(str(value)):
                                    daskeys.append(dkey["key"])
                                else:
                                    msg += "-- reject key=%s, val=%s, pat=%s\n" % (map_key, value, pval)
                                    self.logger.debug(msg)
                            else:
                                daskeys.append(dkey["key"])
                        else:
                            daskeys.append(dkey["key"])
        return daskeys
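
    # Illustrative behaviour of the pattern check above (hedged values):
    # given a stored daskeys entry such as
    #
    #     {"key": "site", "map": "site.name", "pattern": "^T[0-3]_"}
    #
    # find_daskey(system, "site.name", value="T1_CH_CERN") returns ['site'],
    # while value="X1_CH_CERN" fails the regexp match, is logged as rejected
    # and yields an empty list.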

    def find_mapkey(self, das_system, das_key, value=None):
        """
        Find map key for given system and das key.
        """
        msg = "system=%s\n" % das_system
        cond = {"system": das_system, "daskeys.key": das_key}
        for row in self.col.find(cond, ["daskeys", "urn"]):
            if row and row.has_key("daskeys"):
                for key in row["daskeys"]:
                    if key.has_key("map") and key["key"] == das_key:
                        if value:
                            pval = key.get("pattern", "")
                            pat = re.compile(pval)
                            if pat.match(str(value)):
                                return key["map"]
                            else:
                                msg += "-- reject key=%s, val=%s, pat=%s\n" % (das_key, value, key["pattern"])
                                self.logger.debug(msg)
                                continue
                        else:
                            return key["map"]

    def mapkeys(self, daskey):
        """
        Find map keys for a given daskey
        """
        if self.keymap.has_key(daskey):
            return self.keymap[daskey]
        spec = {"daskeys.key": daskey}
        mapkeys = []
        for row in self.col.find(spec, ["daskeys"]):
            for kmap in row["daskeys"]:
                if kmap["key"] == daskey and kmap["map"] not in mapkeys:
                    mapkeys.append(kmap["map"])
        self.keymap[daskey] = mapkeys
        return self.keymap[daskey]

    def find_apis(self, das_system, map_key):
        """
        Find list of apis which correspond to provided
        system and das map key.
        """
        cond = {"system": das_system, "daskeys.map": map_key}
        apilist = []
        for row in self.col.find(cond, ["urn"]):
            if row.has_key("urn") and row["urn"] not in apilist:
                apilist.append(row["urn"])
        return apilist

    def check_dasmap(self, system, urn, das_map, value=None):
        """
        Check if provided system/urn/das_map is a valid combination
        in mapping db. If value for das_map key is provided we verify
        it against pattern in DB.
        """
        if not value:
            cond = {"system": system, "daskeys.map": das_map, "urn": urn}
            return self.col.find(cond).count()
        cond = {"system": system, "daskeys.map": das_map, "urn": urn}
        for row in self.col.find(cond, ["daskeys.pattern"]):
            for item in row["daskeys"]:
                pat = re.compile(item["pattern"])
                if pat.match(str(value)):
                    return True
        return False

    def find_system(self, key):
        """
        Return system name for provided DAS key.
        """
        cond = {"daskeys.key": key}
        gen = (row["system"] for row in self.col.find(cond, ["system"]))
        systems = []
        for system in gen:
            if system not in systems:
                systems.append(system)
        systems.sort()
        return systems

    def lookup_keys(self, system, daskey, api=None, value=None):
        """
        Returns lookup keys for given system and provided
        selection DAS key, e.g. block => block.name
        """
        query = {"system": system, "daskeys.key": daskey}
        if api:
            query["urn"] = api
        lookupkeys = []
        for row in self.col.find(query):
            for kdict in row["daskeys"]:
                if kdict["key"] == daskey:
                    lkey = kdict["map"]
                else:
                    continue
                if value and kdict["pattern"]:
                    pat = re.compile(kdict["pattern"])
                    if pat.match(str(value)):
                        if lkey not in lookupkeys:
                            lookupkeys.append(lkey)
                else:
                    if lkey not in lookupkeys:
                        lookupkeys.append(lkey)
        if not lookupkeys:
            msg = "Unable to find look-up key for "
            msg += "system=%s, daskey=%s, api=%s, value=%s" % (system, daskey, api, value)
            raise Exception(msg)
        return lookupkeys

    def api2das(self, system, api_input_name):
        """
        Translates data-service API input parameter into DAS QL key,
        e.g. run_number => run.
        """
        query = {"system": system, "das2api.api_param": api_input_name}
        names = []
        for adas in self.col.find(query, ["das2api"]):
            for row in adas["das2api"]:
                try:
                    aparam = row["api_param"]
                    daskey = row["das_key"]
                    if aparam == api_input_name and daskey not in names:
                        names.append(daskey)
                except Exception as err:
                    print("ERROR: look-up api_param/das_key in %s" % row)
                    raise err
        return names
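
A minimal usage sketch for DASMapping; the configuration keys follow __init__ above, while all values (MongoDB URI, db/collection names, service list, API name) are illustrative assumptions:

config = {
    "verbose": 0,
    "services": ["dbs", "phedex"],                       # assumed service list
    "mongodb": {"dburi": "mongodb://localhost:27017"},   # assumed MongoDB URI
    "mappingdb": {"dbname": "mapping", "collname": "db"},
}
dasmapping = DASMapping(config)
print(dasmapping.list_systems())                    # systems present in the map
print(dasmapping.list_apis("dbs"))                  # APIs registered for "dbs"
print(dasmapping.primary_key("dbs", "listBlocks"))  # e.g. 'block'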
Ejemplo n.º 42
0
class DASCore(object):
    """
    DAS core class.
    """
    def __init__(self,
                 config=None,
                 debug=0,
                 nores=False,
                 logger=None,
                 engine=None,
                 multitask=True):
        if config:
            dasconfig = config
        else:
            dasconfig = das_readconfig()
        verbose = dasconfig['verbose']
        self.stdout = debug
        if isinstance(debug, int) and debug:
            self.verbose = debug
            dasconfig['verbose'] = debug
        else:
            self.verbose = verbose
        das_timer('DASCore::init', self.verbose)
        self.operators = das_operators()
        self.collect_wait_time = dasconfig['das'].get('collect_wait_time', 120)

        # set noresults option
        self.noresults = False
        if nores:
            dasconfig['write_cache'] = True
            self.noresults = nores

        self.init_expire = dasconfig['das'].get('init_expire', 5 * 60)
        self.multitask = dasconfig['das'].get('multitask', True)
        if debug or self.verbose:
            self.multitask = False  # in verbose mode do not use multitask
            dasconfig['das']['multitask'] = False
        if not multitask:  # multitask explicitly disabled via DASCore ctor argument
            self.multitask = False
            dasconfig['das']['multitask'] = False
        dasconfig['engine'] = engine
        if self.multitask:
            nworkers = dasconfig['das'].get('core_workers', 5)
            #             if  engine:
            #                 thr_name = 'DASCore:PluginTaskManager'
            #                 self.taskmgr = PluginTaskManager(\
            #                         engine, nworkers=nworkers, name=thr_name)
            #                 self.taskmgr.subscribe()
            #             else:
            #                 thr_name = 'DASCore:TaskManager'
            #                 self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
            thr_name = 'DASCore:TaskManager'
            self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        if logger:
            self.logger = logger
        else:
            self.logger = PrintManager('DASCore', self.verbose)

        # define Mapping/Analytics/Parser in this order since Parser depends
        # on first two
        dasmapping = DASMapping(dasconfig)
        dasconfig['dasmapping'] = dasmapping
        self.mapping = dasmapping

        self.keylearning = DASKeyLearning(dasconfig)
        dasconfig['keylearning'] = self.keylearning

        # init DAS cache
        self.rawcache = DASMongocache(dasconfig)
        dasconfig['rawcache'] = self.rawcache

        # plug-in architecture: loop over registered data-services in
        # dasconfig; load appropriate module/class; register data
        # service with DASCore.
        self.systems = dasmapping.list_systems()
        # pointer to the DAS top level directory
        dasroot = '/'.join(__file__.split('/')[:-3])
        for name in self.systems:
            try:
                klass  = 'DAS/services/%s/%s_service.py' \
                    % (name, name)
                srvfile = os.path.join(dasroot, klass)
                with open(srvfile) as srvclass:
                    for line in srvclass:
                        if line.find('(DASAbstractService)') != -1:
                            klass = line.split('(DASAbstractService)')[0]
                            klass = klass.split('class ')[-1]
                            break
                mname = 'DAS.services.%s.%s_service' % (name, name)
                module = __import__(mname, fromlist=[klass])
                obj = getattr(module, klass)(dasconfig)
                setattr(self, name, obj)
            except IOError as err:
                if debug > 1:
                    # we have virtual services, so IOError can be correct
                    print_exc(err)
                try:
                    mname = 'DAS.services.generic_service'
                    module = __import__(mname, fromlist=['GenericService'])
                    obj = module.GenericService(name, dasconfig)
                    setattr(self, name, obj)
                except Exception as exc:
                    print_exc(exc)
                    msg = "Unable to load %s data-service plugin" % name
                    raise Exception(msg)
            except Exception as exc:
                print_exc(exc)
                msg = "Unable to load %s data-service plugin" % name
                raise Exception(msg)

        # loop over systems and get system keys, add mapping keys to final list
        self.service_keys = {}
        self.service_parameters = {}
        for name in self.systems:
            skeys = list(getattr(self, name).keys())
            self.service_keys[getattr(self, name).name] = skeys
            sparams = getattr(self, name).parameters()
            self.service_parameters[getattr(self, name).name] = sparams

        self.service_keys['special'] = das_special_keys()
        self.dasconfig = dasconfig
        das_timer('DASCore::init', self.verbose)

    def keys(self):
        """
        Return map of data service keys
        """
        return self.service_keys

    def das_keys(self):
        """
        Return map of data service keys
        """
        _keys = ['records']
        for values in self.service_keys.values():
            for key in values:
                if key not in _keys:
                    _keys.append(key)
        return _keys

    def result(self, query, idx=0, limit=None):
        """
        Get results either from cache or from explicit call
        """
        self.logger.info('input query=%s' % query)
        results = []
        dasquery = DASQuery(query)
        query = dasquery.mongo_query
        # check if we have any service which covers the query;
        # otherwise decompose it into a list of queries
        service_map = dasquery.service_apis_map()
        if not service_map:
            msg = 'no APIs found to answer input query, will decompose it'
            self.logger.info(msg)
            skeys = query['fields']
            if not skeys:
                skeys = []
            for key in skeys:
                newquery = DASQuery(dict(fields=[key], spec=query['spec']))
                self.call(newquery)  # process query
        else:
            self.call(dasquery)  # process query

        # lookup provided query in a cache
        if not self.noresults:
            results = self.get_from_cache(dasquery, idx, limit)
        return results

    def remove_from_cache(self, dasquery):
        """
        Delete in cache entries about input query
        """
        self.rawcache.remove_from_cache(dasquery)

    def get_status(self, dasquery):
        """
        Look-up status of provided query in a cache.
        Return status of the query request and its hash.
        """
        status = None
        error = None
        reason = None
        if dasquery and 'fields' in dasquery.mongo_query:
            fields = dasquery.mongo_query['fields']
            if fields and isinstance(fields, list) and 'queries' in fields:
                return 'ok', error, reason
        record = self.rawcache.find(dasquery)
        error, reason = self.rawcache.is_error_in_records(dasquery)
        try:
            if record and 'das' in record and 'status' in record['das']:
                status = record['das']['status']
                if not error:
                    error = record['das'].get('error', error)
                if not reason:
                    reason = record['das'].get('reason', reason)
                return status, error, reason
        except Exception as exc:
            print_exc(exc)
            status = error = reason = None
            self.rawcache.remove_from_cache(dasquery)
        return status, error, reason

    def status(self):
        "Return status of given service"
        sdict = {'das': self.taskmgr.status()}
        for srv in sorted(self.systems):
            sdict[srv] = getattr(getattr(self, srv), 'status')()
        return sdict

    def worker(self, srv, dasquery):
        """Main worker function which calls data-srv call function"""
        self.logger.info('##### %s ######\n' % srv)
        das_timer(srv, self.verbose)
        getattr(getattr(self, srv), 'call')(dasquery)
        das_timer(srv, self.verbose)

    def insert_query_records(self, dasquery):
        """
        Insert DAS query records into DAS cache and return list of services
        which will answer this query
        """
        services = dasquery.services
        self.logger.info('Potential services = %s' % services)
        if not services:
            msg = 'No data-services for query %s, ' % dasquery
            msg += 'mongo_query: %s, ' % dasquery.mongo_query
            msg += 'params: %s' % dasquery.params()
            print(dastimestamp('DAS WARNING '), msg)

        # get list of URI which can answer this query
        ack_services = []
        for srv in services:
            gen = [t for t in getattr(getattr(self, srv), 'apimap')(dasquery)]
            for url, api, args, iformat, expire in gen:
                header = dasheader(srv, dasquery, expire, api, url, ctime=0)
                self.rawcache.insert_query_record(dasquery, header)
                if srv not in ack_services:
                    ack_services.append(srv)
        if not ack_services:
            ack_services = services
        if dasquery.query.find('records ') != -1:
            srv_status = True  # skip DAS queries w/ records request
        # create das record with initial expire tstamp
        expire = time.time() + self.init_expire
        header = dasheader("das",
                           dasquery,
                           expire,
                           api='das_core',
                           services=dict(das=ack_services))
        header['lookup_keys'] = []
        self.rawcache.insert_query_record(dasquery, header)
        das_timer('das_record', self.verbose)
        return ack_services

    def call(self, query, **kwds):
        """
        Top level DAS api which execute a given query using underlying
        data-services. It follows the following steps:

            - parse input query
            - identify data-services based on selection keys
              and where clause conditions
            - construct DAS workflow and execute data-service 
              API calls. At this step individual 
              data-services store results into DAS cache.

        Return status 0/1 depending on success of the calls, can be
        used by workers on cache server.

        kwds is provided for compatibility with web layer, e.g. it
        may invoke this method with additional pid parameter.
        """
        def update_das_query(dasquery, status, reason=None):
            "Update DAS query record with given status and reason"
            self.rawcache.update_query_record(dasquery, status, reason=reason)
            self.rawcache.add_to_record(\
                    dasquery, {'das.timer': get_das_timer()}, system='das')

        self.logger.info('input query=%s' % query)
        das_timer('DASCore::call', self.verbose)
        if  isinstance(query, object) and hasattr(query, '__class__')\
            and query.__class__.__name__ == 'DASQuery':
            dasquery = query
        else:
            dasquery = DASQuery(query)
        for col in ['merge', 'cache']:
            self.rawcache.remove_expired(dasquery, col)
        query = dasquery.mongo_query
        spec = query.get('spec')
        fields = query.get('fields')
        if fields == ['records']:
            msg = 'look-up all records in cache'
            self.logger.info(msg)
            return 'in cache'
        if spec == dict(records='*'):
            self.logger.info("look-up everything in cache")
            return 'in cache'
        for record in self.rawcache.find_specs(dasquery):
            status = record['das']['status']
            msg = 'found query %s in cache, status=%s\n' \
                        % (record['query'], status)
            self.logger.info(msg)
            print(dastimestamp('DAS INFO'), msg)
            return status

        self.logger.info(dasquery)
        das_timer('das_record', self.verbose)
        services = self.insert_query_records(dasquery)
        if not services:
            msg = 'unable to locate data-services to fulfill this request'
            msg += ', will iterate over all registered services'
            print(dastimestamp('DAS WARNING '), dasquery, msg)
            services = dasquery.services if dasquery.services else self.systems
        try:
            if self.multitask:
                jobs = []
                for srv in sorted(services):
                    jobs.append(self.taskmgr.spawn(self.worker, srv, dasquery))
                self.taskmgr.joinall(jobs)
            else:
                for srv in services:
                    self.worker(srv, dasquery)
        except Exception as exc:
            print_exc(exc)
            return 'fail'
        self.logger.info('\n##### merging ######\n')
        update_das_query(dasquery, 'merging')
        das_timer('merge', self.verbose)

        # check that all query record statuses are ok, i.e. we did insert records
        # this status is set by self.rawcache.update_cache
        for idx in range(self.collect_wait_time):
            records = self.rawcache.find_query_record(dasquery)
            statuses = []
            for row in records:
                system = row['das']['system']
                status = row['das']['status']
                self.logger.info("### query record status %s %s %s" %
                                 (dasquery.qhash, system, status))
                statuses.append(status)
            all_statuses = sorted(list(set(statuses)))
            # at this point we expect all services to report status 'ok'
            # and the das record status to be 'merging'
            if len(all_statuses) == 2 and all_statuses == ['merging', 'ok']:
                break
            time.sleep(1)

        # now we can merge records
        status = self.rawcache.merge_records(dasquery)
        das_timer('merge', self.verbose)
        # check if we have service records and properly setup status
        self.logger.info('\n##### check services ######\n')
        das_services = self.rawcache.check_services(dasquery)
        reason = ''
        status = 'ok'
        if not das_services:
            if 'records' in dasquery.query:
                status = 'ok'  # keep status ok for 'records' queries
            else:
                reason = 'no data records found in DAS cache'
                status = 'fail'
                print(dastimestamp('DAS ERROR '), dasquery, reason)
        update_das_query(dasquery, status, reason)
        das_timer('DASCore::call', self.verbose)
        return status

    def processing_time(self, dasquery):
        "Look-up and return DAS query processing time"
        query_record = self.rawcache.find(dasquery)
        if query_record:
            das = query_record.get('das', None)
            if isinstance(das, dict):
                ctime = das.get('ctime', [])
                if ctime:
                    return ctime[-1] - ctime[0]
        return None

    def nresults(self, dasquery, coll='merge'):
        """
        Return total number of results (count) for provided query
        Code should match body of get_from_cache method.
        """
        fields = dasquery.mongo_query.get('fields', None)
        if dasquery.mapreduce:
            result = self.rawcache.map_reduce(dasquery.mapreduce, dasquery)
            return len([1 for _ in result])
        elif dasquery.aggregators:
            return len(dasquery.aggregators)
        return self.rawcache.nresults(dasquery, coll)

    def apilist(self, dasquery):
        "Return list of APIs answer given das query"
        return self.rawcache.apilist(dasquery)

    def incache(self, dasquery, coll='merge'):
        """
        Tell whether the given query is present in the DAS cache or not
        """
        return self.rawcache.incache(dasquery, collection=coll)

    def get_from_cache(self, dasquery, idx=0, limit=0, collection='merge'):
        """
        Look-up results from the merge cache and yield them for
        further processing.
        """
        das_timer('DASCore::get_from_cache', self.verbose)
        msg = 'col=%s, query=%s, idx=%s, limit=%s'\
                % (collection, dasquery, idx, limit)
        self.logger.info(msg)

        fields = dasquery.mongo_query.get('fields', None)

        if dasquery.mapreduce:
            res = self.rawcache.map_reduce(dasquery.mapreduce, dasquery)
        elif dasquery.aggregators:
            # extract das information from rawcache
            rows  = self.rawcache.get_from_cache(\
                    dasquery, collection=collection)
            first = next(rows)
            sinfo = das_sinfo(first)
            # to perform aggregation we need:
            # - loop over all aggregator functions
            # - loop over all data-services
            # - loop over all APIs within a data-services
            # the code below does that, it applies aggregator
            # to selected (based on key/srv/api) records
            res = []
            _id = 0
            time0 = time.time()
            expire = 300  # min expire
            for func, key in dasquery.aggregators:
                afunc = getattr(das_aggregator, 'das_%s' % func)
                found = False
                for srv, apis in sinfo.items():
                    for api in apis:
                        rows  = self.rawcache.get_from_cache(\
                                dasquery, collection=collection)
                        gen = api_rows(rows, api)
                        data = afunc(key, gen)
                        ctime = time.time() - time0
                        das = dasheader(srv,
                                        dasquery,
                                        expire,
                                        api=api,
                                        ctime=ctime)
                        if isinstance(data, dict) and data['value'] != 'N/A':
                            aggr = {
                                '_id': _id,
                                'function': func,
                                'key': key,
                                'result': data
                            }
                            aggr.update(das)
                            res.append(aggr)
                            _id += 1
                            found = True
                if not found:  # when we got nothing add empty result record
                    empty = {'value': 'N/A'}
                    ctime = time.time() - time0
                    das = dasheader('das',
                                    dasquery,
                                    expire,
                                    api='das_core',
                                    ctime=ctime)
                    rec = {
                        '_id': 0,
                        'function': func,
                        'key': key,
                        'result': empty
                    }
                    rec.update(das)
                    res.append(rec)
        else:
            res = self.rawcache.get_from_cache(dasquery, idx, limit, \
                    collection=collection)
        # we assume that all records from single query will have
        # identical structure, therefore it will be sufficient to update
        # keylearning DB only with first record
        count = 0
        for row in res:
            if not count:
                self.keylearning.add_record(dasquery, row)
            fix_times(row)
            yield row
            count += 1
        das_timer('DASCore::get_from_cache', self.verbose)
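
A hedged end-to-end sketch for DASCore; the DAS-QL query string is purely illustrative and the default configuration is read via das_readconfig() as in __init__ above:

core = DASCore()                          # uses das_readconfig() by default
# result() parses the query, calls the data-services and then yields
# merged records from the DAS cache
for row in core.result('dataset=/ZMM*/*/*', idx=0, limit=10):
    print(row)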
Ejemplo n.º 43
0
class DASAnalytics(object):
    """
    DAS analytics DB manager.
    """
    def __init__(self, config):
        self.verbose = config['verbose']
        self.logger  = PrintManager('DASAnalytics', self.verbose)
        self.dburi   = config['mongodb']['dburi']
        self.dbname  = config['analyticsdb']['dbname']        
        self.colname = config['analyticsdb']['collname']
        self.history = config['analyticsdb']['history']
        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)
        self.create_db()

    def create_db(self):
        """
        Create analytics DB in MongoDB back-end.
        """
        self.conn = db_connection(self.dburi)
        database  = self.conn[self.dbname]
        das_son_manipulator = DAS_SONManipulator()
        database.add_son_manipulator(das_son_manipulator)
        self.col  = database[self.colname]
#        if  self.dbname not in self.conn.database_names():
#            capped_size = 104857600
#            options   = {'capped':True, 'size': capped_size}
#            database  = self.conn[self.dbname]
#            database.create_collection('self.colname', **options)
#            print "####CREATE CAPPED ANALYTICS"
#        self.col  = self.conn[self.dbname][self.colname] 

    def delete_db(self):
        """
        Delete analytics DB in MongoDB back-end.
        """
        self.conn.drop_database(self.dbname)

    def delete_db_collection(self):
        """
        Delete analytics DB collection in MongoDB.
        """
        self.conn[self.dbname].drop_collection(self.colname)

    def add_query(self, query, mongoquery):
        """
        Add DAS-QL/MongoDB-QL queries into analytics.
        
        A unique record is kept for each (qhash, dhash) pair.
        For each pair an array of call times is stored.
        """
        if  isinstance(mongoquery, dict):
            mongoquery = encode_mongo_query(mongoquery)
        msg = 'query=%s, mongoquery=%s' % (query, mongoquery)
        self.logger.debug(msg)
        dhash = genkey(query)
        qhash = genkey(mongoquery)

        now = time.time()

        existing = self.col.find_one({'qhash': qhash, 'dhash': dhash})
        if existing:
            # check if times contains very old timestamps
            rec = self.col.find({'_id': ObjectId(existing['_id']), 
                                 'times':{'$lt' : now - self.history}})
            if  rec:
                self.col.update({'_id': ObjectId(existing['_id'])},
                    {'$pull': {'times': {'$lt' : now - self.history}}})
            # update times array with new timestamp
            self.col.update({'_id': ObjectId(existing['_id'])},
                            {'$push': {'times': now}})
        else:
            record = dict(query=query, mongoquery=mongoquery,
                        qhash=qhash, dhash=dhash, times=[now])
            self.col.insert(record)

        index = [('qhash', DESCENDING),
                 ('dhash', DESCENDING)]
        create_indexes(self.col, index)
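
        # Schematic shape of the resulting analytics record (hedged):
        #
        #     {'query': <DAS-QL text>, 'mongoquery': <encoded query>,
        #      'qhash': genkey(mongoquery), 'dhash': genkey(query),
        #      'times': [t1, t2, ...]}
        #
        # repeated calls for the same (qhash, dhash) pair only push a new
        # timestamp onto 'times' and prune entries older than self.history.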
        
    def clean_queries(self):
        """
        Standalone method to clean up expired call-times from query records,
        since otherwise only the active record is cleaned.
        
        This is too expensive to do with every operation, and MongoDB
        does not allow multiple modifications to a single field in a single
        update operation (i.e. we cannot do $push and $pull in one update),
        so it should probably be done asynchronously at fixed intervals.
        """
        
        self.logger.debug('')
        
        now = time.time()
        
        #clean out the times array
        self.col.update({'times': {'$exists': True}},
                        {'$pull': {'times': {'$lt': now - self.history}}})
        #now delete any with no times
        self.col.remove({'times': {'$size': 0}})
        #and should maybe delete anything with the same qhash here?

    def remove_expired(self):
        "Moved from AbstractService -  remove old apicall records"
        spec = {'apicall.expire':{'$lt' : int(time.time())}}
        self.col.remove(spec)

    def add_summary(self, identifier, start, finish, **payload):
        """
        Add an analyzer summary, with given analyzer identifier,
        start and finish times and payload.
        
        It is intended that a summary document is deposited on
        each run of an analyzer (if desirable) and is thereafter
        immutable.
        """
        msg = '(%s, %s->%s, %s)' % (identifier, start, finish, payload)
        self.logger.debug(msg)
        
        # clean-up analyzer records whose start timestamp is too old
        spec = {'start':{'$lt':time.time()-self.history},
                'analyzer': {'$exists': True}}
        self.col.remove(spec)

        # insert new analyzer record
        record = {'analyzer':identifier,
                  'start': start,
                  'finish': finish}
        payload.update(record) #ensure key fields are set correctly
        self.col.insert(payload)
        # ensure summary items are indexed for quick extract
        create_indexes(self.col, [('analyzer', DESCENDING), ('start', ASCENDING)])

    def get_summary(self, identifier, after=None, before=None, **query):
        """
        Retrieve a summary document for a given analyzer-identifier,
        optionally specifying a time range.
        """
        cond = {'analyzer': identifier}
        if after:
            cond['start'] = {'$gte': after}
        if before:
            cond['finish'] = {'$lte': before}
        if query:
            cond.update(query)
        return list(self.col.find(cond))

    def add_api(self, system, query, api, args):
        """
        Add API info to analytics DB. 
        Here args is a dict of API parameters.
        """
        orig_query = query
        if  isinstance(query, dict):
            query = encode_mongo_query(query)
        msg = '(%s, %s, %s, %s)' % (system, query, api, args)
        self.logger.debug(msg)
        # find query record
        qhash = genkey(query)
        record = self.col.find_one({'qhash':qhash}, fields=['dasquery'])
        if  not record:
            self.add_query("", orig_query)
        # find api record
        record = self.col.find_one({'qhash':qhash, 'system':system,
                        'api.name':api, 'api.params':args}) 
        apidict = dict(name=api, params=args)
        if  record:
            self.col.update({'_id':record['_id']}, {'$inc':{'counter':1}})
        else:
            record = dict(system=system, api=apidict, qhash=qhash, counter=1)
            self.col.insert(record)
        index = [('system', DESCENDING), ('dasquery', DESCENDING),
                 ('api.name', DESCENDING), ('qhash', DESCENDING) ]
        create_indexes(self.col, index)
        
    def insert_apicall(self, system, query, url, api, api_params, expire):
        """
        Remove obsolete apicall records and
        insert into Analytics DB provided information about API call.
        Moved from AbstractService.
        
        Updated so that we do not create multiple records when performing
        forced updates (i.e. when the old record has not yet expired): we
        look for an existing record with the same parameters (some of the
        fields are indexed, which should keep the look-up reasonably fast)
        and, if one exists, simply update its expiry; otherwise we insert
        a new record.
        """
        msg = 'query=%s, url=%s,' % (query, url)
        msg += 'api=%s, args=%s, expire=%s' % (api, api_params, expire)
        self.logger.debug(msg)
        expire = expire_timestamp(expire)
        query = encode_mongo_query(query)
        qhash = genkey(query)
        self.remove_expired()
        existing = self.col.find_one({'apicall.system':     system,
                                      'apicall.url':        url,
                                      'apicall.api':        api,
                                      'apicall.api_params': api_params,
                                      'apicall.qhash':      qhash})
        if existing:
            self.logger.debug("updating")
            self.col.update({'_id': existing['_id']},
                            {'$set':{'apicall.expire': expire}})
        else:
            self.col.insert({'apicall':{'api_params':   api_params,
                                        'url':          url,
                                        'api':          api,
                                        'system':       system,
                                        'expire':       expire,
                                        'qhash':        qhash}})
        index_list = [('apicall.url', DESCENDING),
                      ('apicall.api', DESCENDING),
                      ('qhash', DESCENDING)]
        create_indexes(self.col, index_list)
        
    def update_apicall(self, query, das_dict):
        """
        Update apicall record with provided DAS dict.
        Moved from AbstractService
        """
        msg = 'DASAnalytics::update_apicall, query=%s, das_dict=%s'\
                % (query, das_dict)
        self.logger.debug(msg)
        spec = {'apicall.qhash':genkey(encode_mongo_query(query))} 
        record = self.col.find_one(spec)
        self.col.update({'_id':ObjectId(record['_id'])},
            {'$set':{'dasapi':das_dict,
                     'apicall.expire':das_dict['response_expires']}})

    def update(self, system, query):
        """
        Update records for given system/query.
        """
        if  isinstance(query, dict):
            query = encode_mongo_query(query)
        msg = 'system=%s, query=%s' % (system, query)
        self.logger.debug(msg)
        qhash = genkey(query)
        if  system:
            cond = {'qhash':qhash, 'system':system}
        else:
            cond = {'qhash':qhash}
        self.col.update(cond, {'$inc' : {'counter':1}}, multi=True)

    def list_systems(self):
        """
        List all DAS systems.
        """
        cond = { 'system' : { '$ne' : None } }
        gen = (row['system'] for row in self.col.find(cond, ['system']))
        return gen2list(gen)

    def list_queries(self, qhash=None, dhash=None, query_regex=None,
                     key=None, after=None, before=None):
        """
        List inserted queries based on many criteria.
        """
        cond = {'mongoquery': {'$exists': True}}
        if qhash:
            cond['qhash'] = qhash
        if dhash:
            cond['dhash'] = dhash
        if query_regex:
            cond['dasquery'] = {'$regex':query_regex}
        if key:
            cond['mongoquery.spec.key'] = key
        # in this case we need a specific element to be within the range,
        # so we need to use elemMatch
        if before and after:
            cond['times'] = {'$gt': after, '$lt': before}
        # in these cases we only need to match any element
        elif after:
            cond['times'] = {'$gt': after}
        elif before:
            cond['times'] = {'$lt': before}
        
        return self.col.find(cond)
            
    def get_popular_queries(self, spec):
        """
        Get popular queries based on provided spec, which can be
        in a form of time stamp range, etc.
        """
        cond = {'counter':{'$exists':True}}
        for row in self.col.find(fields=['qhash'], spec=cond).\
                sort('counter', DESCENDING):
            spec = {'qhash': row['qhash'], 'counter':{'$exists': False}}
            for res in self.col.find(spec=spec):
                yield res

    def list_apis(self, system=None):
        """
        List all APIs.
        """
        cond = { 'api.name' : { '$ne' : None } }
        if  system:
            cond['system'] = system
        gen = (row['api']['name'] for row in \
                self.col.find(cond, ['api.name']))
        return gen2list(gen)
    
    def list_apicalls(self, qhash=None, api=None, url=None):
        "Replace ad-hoc calls in AbstractService"
        cond = {}
        if qhash:
            cond['apicall.qhash'] = qhash
        if api:
            cond['apicall.api'] = api
        if url:
            cond['apicall.url'] = url
        
        return list(self.col.find(cond))

    def api_params(self, api):
        """
        Retrieve API parameters from analytics DB
        """
        cond = {'api.name':api}
        gen = (row['api']['params'] for row in \
                self.col.find(cond, ['api.params']))
        return gen2list(gen)

    def api_counter(self, api, args=None):
        """
        Retrieve API counter from analytics DB. User must supply
        API name and optional dict of parameters.
        """
        cond = {'api.name': api}
        if  args:
            for key, val in args.iteritems():
                cond[key] = val
        return self.col.find_one(cond, ['counter'])['counter']
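
A minimal usage sketch for DASAnalytics; the configuration keys mirror __init__ above and every value (URI, db/collection names, history window, query) is an illustrative assumption:

config = {
    "verbose": 0,
    "mongodb": {"dburi": "mongodb://localhost:27017"},     # assumed URI
    "analyticsdb": {"dbname": "analytics", "collname": "db",
                    "history": 30 * 24 * 3600},            # keep 30 days of calls
}
analytics = DASAnalytics(config)
mongoquery = {"fields": None, "spec": {"dataset.name": "/A/B/C"}}
analytics.add_query("dataset=/A/B/C", mongoquery)
for rec in analytics.list_queries():
    print("%s %s" % (rec["qhash"], rec["times"]))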
Ejemplo n.º 44
0
class HotspotBase(object):
    """
    This is a base-class for periodically-running
    analyzers that want to examine the moving average
    of some key->counter map, and pick the top few
    for further attention.

    DAS queries are extracted from the analytics DB. The selected items
    are passed to the generate_task callback implemented in subclasses.
    It looks up the DAS query expiration timestamp and, if necessary,
    calls DAS to get it (along with the results of the query).
    """
    def __init__(self, **kwargs):
        self.logger = PrintManager('HotspotBase', kwargs.get('verbose', 0))
        self.das = kwargs['DAS']
        self.fraction = float(kwargs.get('fraction', 0.15))
        self.mode = kwargs.get('mode','calls').lower()
        self.period = int(kwargs.get('period', 86400*30))
        self.interval = kwargs['interval']
        self.allowed_gap = int(kwargs.get('allowed_gap', 3600))
        self.identifier = kwargs['identifier']

    def __call__(self):
        """
        Perform a hotspot-like analysis. Subclasses shouldn't
        need to reimplement this method.

        We start by building the selection chain. It consists of
        analytics summaries -> items -> preselected items ->
        selected items -> mutated items. The final set of items is
        passed to the task generation step (implemented in subclasses).
        The final report is generated and returned.
        """

        epoch_end = time.time()
        epoch_start = epoch_end - self.period

        summaries = self.get_summaries(epoch_start, epoch_end)
        self.logger.info("Got %s summaries" % len(summaries))

        items = self.get_all_items(summaries)
        self.logger.info("Got %s items" % len(items))

        items = self.preselect_items(items)
        self.logger.info("Preselected to %s items" % len(items))

        items = self.select_items(items)
        self.logger.info("Selected %s items (%s:%s)" \
                         % (len(items), self.mode, self.fraction))

        items = self.mutate_items(items)
        self.logger.info("Mutated to %s items" % len(items))

        retval = {'mode': self.mode,
                  'fraction': self.fraction,
                  'epoch_start': epoch_start,
                  'epoch_end': epoch_end,
                  'summaries': len(summaries),
                  'selected': dict(items).items()}

        new_tasks = []
        failed_items = []
        for item, count in items.items():
            try:
                self.logger.info("Generating task for %s" % item)
                for task in \
                    self.generate_task(item, count, epoch_start, epoch_end):
                    new_tasks.append(task)
            except Exception as exc:
                failed_items.append((item, count, str(exc)))
        retval['new_tasks'] = new_tasks
        retval['failed_items'] = failed_items

        retval.update(self.report())

        return retval

    def generate_task(self, item, count, epoch_start, epoch_end):
        """
        For the given selected key, generate an appropriate task
        dictionary as understood by taskscheduler.

        Should be a generator or return an iterable
        """
        raise NotImplementedError

    def report(self):
        """
        Generate some extra keys to go in the job report, if desired.
        """
        return {}

    def preselect_items(self, items):
        """
        This is a part of selection chain.

        Optionally, preselect the items for consideration.
        A subclass wishing to exclude certain key types could
        do so here (but could also do so in make_one_summary).

        This is a good place to implement clustering algorithm
        for selected items. For example, if several queries are
        selected, we may analyze who has more weight and only
        pass those for task generation step.
        """
        return items

    def mutate_items(self, items):
        """
        This is a last part of selection chain.

        Optionally, mutate the selected items.
        A subclass wishing to merge together keys should
        do so here.
        """
        return items

    def get_all_items(self, summaries):
        """
        Merge the summary dictionaries.
        """
        items = collections.defaultdict(int)
        for summary in summaries:
            for key, val in summary.items():
                items[key] += val
        return items

    def select_items(self, items):
        """
        Take a mapping of item->count pairs and determine
        which are "hot" based on the selected mode.
        """
        sorted_keys = sorted(items.keys(), key=lambda x: items[x], reverse=True)
        selected_items = {}
        if self.mode == 'calls':
            total_calls = sum(items.values())
            running_total = 0
            for key in sorted_keys:
                running_total += items[key]
                selected_items[key] = items[key]
                if running_total > total_calls * self.fraction:
                    break
        elif self.mode == 'keys':
            selected_items = dict([(k, items[k])
               for k in sorted_keys[0:int(len(sorted_keys)*self.fraction)]])
        elif self.mode == 'fixed':
            selected_items = dict([(k, items[k])
               for k in sorted_keys[0:int(self.fraction)]])
        else:
            raise NotImplementedError
        return selected_items
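
    # Worked example of the three modes (hedged, illustrative numbers): with
    #
    #     items = {'a': 60, 'b': 30, 'c': 10} and fraction = 0.5
    #
    # mode 'calls' walks keys by descending count and stops once the running
    # total exceeds 50% of all calls, selecting {'a': 60}; mode 'keys' keeps
    # the top half of the keys, also {'a': 60}; mode 'fixed' treats fraction
    # as a count, so fraction = 2 keeps the two busiest keys,
    # {'a': 60, 'b': 30}.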

    def get_summaries(self, epoch_start, epoch_end):
        """
        Fetch all the available pre-computed summaries
        and determine if any need to be constructed at this time.
        """
        # get all the summaries we can for this time range
        try:
            summaries = self.das.analytics.get_summary(self.identifier,
                                                       after=epoch_start,
                                                       before=epoch_end)
            self.logger.info("Found %s summary documents." % len(summaries))
        except:
            summaries = []
        #see how much coverage of the requested period we have
        summaries = sorted(summaries, key=lambda x: x['start'])
        extra_summaries = []
        last_time = epoch_start
        for summary in summaries:
            if last_time < summary['start']:
                result = self.make_summary(last_time, summary['start'])
                extra_summaries.extend(result)
            last_time = summary['finish']
        result = self.make_summary(last_time, epoch_end)
        extra_summaries.extend(result)
        summaries = [dict(s['keys']) for s in summaries]
        summaries += extra_summaries

        return summaries

    def make_summary(self, start, finish):
        """
        Split the summary requests into interval-sized chunks and decide
        whether they are necessary at all.
        """
        self.logger.info("Found summary gap: %s->%s (%s)" \
                         % (start, finish, finish-start))
        result = []
        delta = finish - start
        if delta > self.allowed_gap:
            if delta > self.interval:
                blocks = int(delta/self.interval)
                span = delta/blocks
                self.logger.info("Gap longer than interval, " +\
                                 "creating %s summaries." % blocks)
                for i in xrange(blocks):
                    try:
                        summary = self.make_one_summary(start+span*i,
                                                        start+span*(i+1))
                        self.das.analytics.add_summary(self.identifier,
                                               start+span*i,
                                               start+span*(i+1),
                                               keys=(dict(summary)).items())
                        result.append(summary)
                    except:
                        pass

            else:
                try:
                    summary = self.make_one_summary(start, finish)
                    self.das.analytics.add_summary(self.identifier,
                                                   start,
                                                   finish,
                                                   keys=(dict(summary)).items())
                    result.append(summary)
                except:
                    pass
        else:
            self.logger.info("...short enough to ignore.")

        return result

    def make_one_summary(self, start, finish):
        """
        Actually make a summary of item->count pairs
        for the specified time range. Subclasses need to
        implement this for the analysis in question.
        """
        raise NotImplementedError
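
A hedged subclass sketch: only generate_task and make_one_summary are abstract above; the analytics calls mirror those already used by get_summaries/add_summary, while the counting logic, the task dictionary schema and das_core_instance are assumptions for illustration:

import collections

class QueryHotspot(HotspotBase):
    """Illustrative HotspotBase subclass; not part of DAS itself."""
    def make_one_summary(self, start, finish):
        # count analytics query calls per qhash inside [start, finish)
        counts = collections.defaultdict(int)
        for rec in self.das.analytics.list_queries(after=start, before=finish):
            times = [t for t in rec.get('times', []) if start <= t < finish]
            counts[rec['qhash']] += len(times)
        return dict(counts)

    def generate_task(self, item, count, epoch_start, epoch_end):
        # emit one task per hot qhash; this task schema is an assumption
        yield {'classname': 'QueryMaintainer',
               'name': 'refresh-%s' % item,
               'kwargs': {'qhash': item, 'count': count}}

hotspot = QueryHotspot(DAS=das_core_instance, identifier='query_hotspot',
                       interval=4 * 3600, fraction=0.15, mode='calls')
report = hotspot()       # runs the selection chain described in __call__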
Ejemplo n.º 45
0
class DASAbstractService(object):
    """
    Abstract class describing a DAS service. It is initialized with a name
    which is used to identify service parameters in the DAS configuration
    file. Those parameters are keys, verbosity level and URL of the
    data-service.
    """
    def __init__(self, name, config):
        self.name = name
        try:
            self.verbose      = config['verbose']
            title             = 'DASAbstactService_%s' % self.name
            self.logger       = PrintManager(title, self.verbose)
            self.dasmapping   = config['dasmapping']
            self.write2cache  = config.get('write_cache', True)
            self.multitask    = config['das'].get('multitask', True)
            self.error_expire = config['das'].get('error_expire', 300) 
            self.dbs_global   = None # to be configured at run time
            self.dburi        = config['mongodb']['dburi']
            engine            = config.get('engine', None)
            self.gfs          = db_gridfs(self.dburi)
        except Exception as exc:
            print_exc(exc)
            raise Exception('fail to parse DAS config')

        # read key/cert info
        try:
            self.ckey, self.cert = get_key_cert()
        except Exception as exc:
            print_exc(exc)
            self.ckey = None
            self.cert = None

        if  self.multitask:
            nworkers = config['das'].get('api_workers', 3)
            thr_weights = config['das'].get('thread_weights', [])
            for system_weight in thr_weights:
                system, weight = system_weight.split(':')
                if  system == self.name:
                    nworkers *= int(weight)
            if  engine:
                thr_name = 'DASAbstractService:%s:PluginTaskManager' % self.name
                self.taskmgr = PluginTaskManager(\
                        engine, nworkers=nworkers, name=thr_name)
                self.taskmgr.subscribe()
            else:
                thr_name = 'DASAbstractService:%s:TaskManager' % self.name
                self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        self.map        = {}   # to be defined by data-service implementation
        self._keys      = None # to be defined at run-time in self.keys
        self._params    = None # to be defined at run-time in self.parameters
        self._notations = {}   # to be defined at run-time in self.notations

        self.logger.info('initialized')
        # define internal cache manager to put 'raw' results into cache
        if  'rawcache' in config and config['rawcache']:
            self.localcache   = config['rawcache']
        else:
            msg = 'Undefined rawcache, please check your configuration'
            raise Exception(msg)

    def services(self):
        """
        Return sub-systems used to retrieve data records. It is used
        in the dasheader call to set up the das.services field. This method
        can be overridden in sub-classes; otherwise it returns a dict of the
        service name and the CMS systems used to retrieve data records.
        """
        return {self.name:[self.name]}

    def version(self):
        """Return data-services version, should be implemented in sub-classes"""
        return ''

    def keys(self):
        """
        Return service keys
        """
        if  self._keys:
            return self._keys
        srv_keys = []
        for _api, params in self.map.items():
            for key in params['keys']:
                if  not key in srv_keys:
                    srv_keys.append(key)
        self._keys = srv_keys
        return srv_keys

    def parameters(self):
        """
        Return mapped service parameters
        """
        if  self._params:
            return self._params
        srv_params = []
        for _api, params in self.map.items():
            for key in params['params']:
                param_list = self.dasmapping.api2das(self.name, key)
                for par in param_list:
                    if  not par in srv_params:
                        srv_params.append(par)
        self._params = srv_params
        return srv_params

    def notations(self):
        """
        Return a map of system notations.
        """
        if  self._notations:
            return self._notations
        for _, rows in self.dasmapping.notations(self.name).items():
            for row in rows:
                api  = row['api']
                nmap = row['rec_key']
                notation = row['api_output']
                if  api in self._notations:
                    self._notations[api].update({notation:nmap})
                else:
                    self._notations[api] = {notation:nmap}
        return self._notations

    def getdata(self, url, params, expire, headers=None, post=None):
        """URL call wrapper"""
        if  url.find('https:') != -1:
            return getdata(url, params, headers, expire, post,
                self.error_expire, self.verbose, self.ckey, self.cert,
                system=self.name)
        else:
            return getdata(url, params, headers, expire, post,
                self.error_expire, self.verbose, system=self.name)

    def call(self, dasquery):
        """
        Invoke the service API to execute the given query.
        Results are stored into the local DAS cache.
        """
        self.logger.info(dasquery)
        # check the cache for records with given query/system
        res = self.localcache.incache(dasquery,
                                      collection='cache',
                                      system=self.name)
        if  res:
            msg  = "found records in local cache"
            self.logger.info(msg)
            return
        # ask the data-service api to get results; they will be stored in
        # the cache, so at the end we return what we have in the cache.
        self.api(dasquery)

    def write_to_cache(self, dasquery, expire, url, api, args, gen, ctime):
        """
        Write provided result set into DAS cache.
        """
        if  not self.write2cache:
            return

        # before going to cache we should check/set possible misses, e.g.
        # primary key when error is thrown
        result = self.set_misses(dasquery, api, gen)

        # update the cache
        header = dasheader(self.name, dasquery, expire, api, url,
                services=self.services())
        header['lookup_keys'] = self.lookup_keys(api)
        header['prim_key'] = self.dasmapping.primary_mapkey(self.name, api)
        header['ctime'] = ctime
        self.localcache.update_cache(dasquery, result, header)

        msg  = 'cache has been updated,\n'
        self.logger.debug(msg)

    def adjust_params(self, api, kwds, instance=None):
        """
        Data-service specific parser to adjust parameters according to
        the service specification. For example, the DQ service accepts a
        string of parameters rather than a parameter set, while DBS2 can
        reuse some parameters for different APIs, e.g. a dataset path can
        be passed to listPrimaryDatasets as a primary_dataset pattern.
        """
        pass

    def lookup_keys(self, api):
        """
        Return look-up keys of data output for given data-service API.
        """
        lkeys = self.dasmapping.lookup_keys(self.name, api)
        return [{api:lkeys}]

    def inspect_params(self, api, args):
        """
        Perform API parameter inspection. Check if the API accepts a range
        of parameters, etc.
        """
        for key, value in args.items():
            if  isinstance(value, dict):
                minval = None
                maxval = None
                for oper, val in value.items():
                    if  oper == '$in':
                        minval = int(val[0])
                        maxval = int(val[-1])
                        args[key] = range(minval, maxval)
                    elif oper == '$lt':
                        maxval = int(val)
                        args[key] = maxval
                    elif oper == '$lte':
                        maxval = int(val)
                        args[key] = maxval
                    elif oper == '$gt':
                        minval = int(val)
                        args[key] = minval
                    elif oper == '$gte':
                        minval = int(val)
                        args[key] = minval
                    else:
                        msg = '%s does not support operator %s' % (api, oper)
                        raise Exception(msg)
        return args

    def get_notations(self, api):
        """Return notations used for given API"""
        notationmap = self.notations()
        if  not notationmap:
            return {}
        notations = {}
        if  '' in notationmap:
            notations = dict(notationmap['']) # notations applied to all APIs
            if  api in notationmap: # overwrite the one for provided API
                notations.update(notationmap[api])
        return notations

    def parser(self, dasquery, dformat, data, api):
        """
        DAS data parser. Input parameters:

        - *query* input DAS query
        - *dformat* is a data format, e.g. XML, JSON
        - *data* is a data source, either file-like object or
          actual data
        - *api* is API name
        """
        prim_key  = self.dasmapping.primary_key(self.name, api)
        counter   = 0
        if  dformat.lower() == 'xml':
            tags = self.dasmapping.api2daskey(self.name, api)
            gen  = xml_parser(data, prim_key, tags)
            for row in gen:
                counter += 1
                yield row
        elif dformat.lower() == 'json' or dformat.lower() == 'dasjson':
            gen  = json_parser(data, self.logger)
            das_dict = {}
            for row in gen:
                if  dformat.lower() == 'dasjson':
                    for key, val in row.items():
                        if  key != 'results':
                            das_dict[key] = val
                    row = row['results']
                if  isinstance(row, list):
                    for item in row:
                        if  item:
                            if  prim_key in item:
                                counter += 1
                                yield item
                            else:
                                counter += 1
                                yield {prim_key:item}
                else:
                    if  prim_key in row:
                        counter += 1
                        yield row
                    else:
                        counter += 1
                        yield {prim_key:row}
        else:
            msg = 'Unsupported data format="%s", API="%s"' % (dformat, api)
            raise Exception(msg)
        msg  = "api=%s, format=%s " % (api, dformat)
        msg += "prim_key=%s yield %s rows" % (prim_key, counter)
        self.logger.info(msg)

    def translator(self, api, genrows):
        """
        Convert raw results into DAS records. 
        """
        prim_key  = self.dasmapping.primary_key(self.name, api)
        count = 0
        for row in genrows:
            row2das(self.dasmapping.notation2das, self.name, api, row)
            count += 1
            # check for primary key existence, since it can be overridden
            # by row2das. For example, DBS3 uses a flat namespace, so we
            # override dataset=>name, while dataset is still a primary key
            if  isinstance(row, list):
                yield {prim_key:row}
            elif  prim_key in row:
                if  prim_key in row[prim_key]:
                    yield row[prim_key] # remapping may create nested dict
                else:
                    yield row
            else:
                yield {prim_key:row}
        msg = "yield %s rows" % count
        self.logger.debug(msg)

    def set_misses(self, dasquery, api, genrows):
        """
        Check and adjust DAS records wrt the input query. If some of the DAS
        keys are missing, add them with their values to the DAS record.
        """
        # look-up primary key
        prim_key  = self.dasmapping.primary_key(self.name, api)

        # Scan all docs and store those whose size above MongoDB limit into
        # GridFS
        map_key = self.dasmapping.primary_mapkey(self.name, api)
        genrows = parse2gridfs(self.gfs, map_key, genrows, self.logger)

        spec  = dasquery.mongo_query['spec']
        row   = next(genrows)
        ddict = DotDict(row)
        keys2adjust = []
        for key in spec.keys():
            val = ddict.get(key)
            if  spec[key] != val and key not in keys2adjust:
                keys2adjust.append(key)
        msg   = "adjust keys %s" % keys2adjust
        self.logger.debug(msg)
        count = 0
        if  keys2adjust:
            # adjust of the rows
            for row in yield_rows(row, genrows):
                ddict = DotDict(row)
                pval  = ddict.get(map_key)
                if  isinstance(pval, dict) and 'error' in pval:
                    ddict[map_key] = ''
                    ddict.update({prim_key: pval})
                for key in keys2adjust:
                    value = spec[key]
                    existing_value = ddict.get(key)
                    # the way to deal with proximity/pattern/condition results
                    if  (isinstance(value, str) or isinstance(value, unicode))\
                        and value.find('*') != -1: # we got pattern
                        if  existing_value:
                            value = existing_value
                    elif isinstance(value, dict) or \
                        isinstance(value, list): # we got condition
                        if  existing_value:
                            value = existing_value
                        elif isinstance(value, dict) and \
                        '$in' in value: # we got a range {'$in': []}
                            value = value['$in']
                        elif isinstance(value, dict) and \
                        '$lte' in value and '$gte' in value:
                            # we got a between range
                            value = [value['$gte'], value['$lte']]
                        else: 
                            value = json.dumps(value) 
                    elif existing_value and value != existing_value:
                        # we got proximity results
                        if  'proximity' in ddict:
                            proximity = DotDict({key:existing_value})
                            ddict['proximity'].update(proximity)
                        else:
                            proximity = DotDict({})
                            proximity[key] = existing_value
                            ddict['proximity'] = proximity
                    else:
                        if  existing_value:
                            value = existing_value
                    ddict[key] = value
                yield ddict
                count += 1
        else:
            yield row
            for row in genrows:
                yield row
                count += 1
        msg   = "yield %s rows" % count
        self.logger.debug(msg)
            
    def api(self, dasquery):
        """
        Data-service api method, can be redefined by the data-service class.
        It parses the input query and invokes the appropriate data-service
        API calls. All results are stored into the DAS cache, and the api
        call is inserted into the Analytics DB.
        """
        self.logger.info(dasquery)
        genrows = self.apimap(dasquery)
        if  not genrows:
            return
        jobs = []
        for url, api, args, dformat, expire in genrows:
            # insert DAS query record for given API
            header = dasheader(self.name, dasquery, expire, api, url)
            self.localcache.insert_query_record(dasquery, header)
            # fetch DAS data records
            if  self.multitask:
                jobs.append(self.taskmgr.spawn(self.apicall, \
                            dasquery, url, api, args, dformat, expire))
            else:
                self.apicall(dasquery, url, api, args, dformat, expire)
        if  self.multitask:
            self.taskmgr.joinall(jobs)

    def apicall(self, dasquery, url, api, args, dformat, expire):
        """
        Data-service api call method, can be redefined by the data-service
        class. It parses the input query and invokes the appropriate
        data-service API call. All results are stored into the DAS cache,
        and the api call is inserted into the Analytics DB.

        We explicitly invoke close on our datastream instead of using a
        context manager, since this method, as well as getdata/parser,
        can be overridden by child classes.
        """
        datastream  = None
        try:
            args    = self.inspect_params(api, args)
            time0   = time.time()
            headers = make_headers(dformat)
            datastream, expire = self.getdata(url, args, expire, headers)
            self.logger.info("%s expire %s" % (api, expire))
            rawrows = self.parser(dasquery, dformat, datastream, api)
            dasrows = self.translator(api, rawrows)
            ctime   = time.time() - time0
            self.write_to_cache(dasquery, expire, url, api, args,
                    dasrows, ctime)
        except Exception as exc:
            msg  = 'Fail to process: url=%s, api=%s, args=%s' \
                    % (url, api, args)
            print(msg)
            print_exc(exc)
        close(datastream)

    def url_instance(self, url, _instance):
        """
        Virtual method to adjust URL for a given instance,
        must be implemented in service classes
        """
        return url

    def adjust_url(self, url, instance):
        """
        Adjust the data-service URL wrt the provided instance, e.g.
        DBS carries several instances
        """
        if  instance:
            url = self.url_instance(url, instance)
        return url

    def apimap(self, dasquery):
        """
        Analyze input query and yield url, api, args, format, expire
        for further processing.
        """
        srv   = self.name # get local copy to avoid threading issues
        cond  = getarg(dasquery.mongo_query, 'spec', {})
        instance = dasquery.mongo_query.get('instance', self.dbs_global)
        skeys = getarg(dasquery.mongo_query, 'fields', [])
        if  not skeys:
            skeys = []
        self.logger.info("\n")
        for api, value in self.map.items():
            expire = value['expire']
            iformat = value['format']
            url    = self.adjust_url(value['url'], instance)
            if  not url:
                msg = '--- rejects API %s, no URL' % api
                self.logger.info(msg)
                continue
            args   = dict(value['params']) # make new copy, since we'll adjust
            wild   = value.get('wild_card', '*')
            found  = 0
            # check if input parameters are covered by API
            if  not self.dasmapping.check_api_match(srv, api, cond):
                msg = '--- rejects API %s, does not cover input condition keys' \
                        % api
                self.logger.info(msg)
                continue
            # once we know that the API covers the input set of parameters we check
            # every input parameter for pattern matching
            for key, val in cond.items():
                # check if keys from conditions are accepted by API
                # need to convert key (which is daskeys.map) into
                # input api parameter
                for apiparam in self.dasmapping.das2api(srv, api, key, val):
                    if  apiparam in args:
                        args[apiparam] = val
                        found += 1
            # VK 20160708, wrong statement, it caused to pass
            # datasets API for query dataset in [path1, path2]
            # I'll leave block here until I test and verify that
            # commented out block will not cause other issues
            #
            # check the case when we only have single condition key
            # and it is the key we look-up
#             if  not found and skeys == [k.split('.')[0] for k in cond.keys()]:
#                 found = 1
            # check if number of keys on cond and args are the same
            if  len(cond.keys()) != found:
                msg = "--- reject API %s, not all condition keys are covered" \
                        % api
                self.logger.info(msg)
                msg = 'args=%s' % args
                self.logger.debug(msg)
                continue
            if  not found:
                msg = "--- rejects API %s, parameters don't match" % api
                self.logger.info(msg)
                msg = 'args=%s' % args
                self.logger.debug(msg)
                continue
            self.adjust_params(api, args, instance)
            # delete args keys whose value is optional
            delete_keys(args, 'optional')
            # check that there is no "required" parameter left in args,
            # since such api will not work
            if 'required' in args.values():
                msg = '--- rejects API %s, parameter is required' % api
                self.logger.info(msg)
                msg = 'args=%s' % args
                self.logger.debug(msg)
                continue
            # adjust pattern symbols in arguments
            if  wild != '*':
                for key, val in args.items():
                    if  isinstance(val, str) or isinstance(val, unicode):
                        val   = val.replace('*', wild)
                    args[key] = val

            # compare query selection keys with API look-up keys
            api_lkeys = self.dasmapping.api_lkeys(srv, api)
            if  set(api_lkeys) != set(skeys):
                msg = "--- rejects API %s, api_lkeys(%s)!=skeys(%s)"\
                        % (api, api_lkeys, skeys)
                self.logger.info(msg)
                continue

            msg = '+++ %s passes API %s' % (srv, api)
            self.logger.info(msg)
            msg = 'args=%s' % args
            self.logger.debug(msg)

            msg  = "yield "
            msg += "system ***%s***, url=%s, api=%s, args=%s, format=%s, " \
                % (srv, url, api, args, iformat)
            msg += "expire=%s, wild_card=%s" \
                % (expire, wild)
            self.logger.debug(msg)

            yield url, api, args, iformat, expire
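
The methods above (keys, parameters, apimap) all read from self.map, which
each concrete service fills in. A minimal, purely illustrative mapping might
look like the sketch below; the API name, URL and parameter names are
assumptions, while the field names (url, params, keys, format, expire,
wild_card) and the 'required'/'optional' placeholder values mirror what
apimap actually inspects:

# hypothetical mapping for a single 'datasets' API of some service
service_map = {
    'datasets': {
        'url'      : 'https://example.org/dbs/datasets',  # made-up endpoint
        'params'   : {'dataset': 'required', 'detail': 'optional'},
        'keys'     : ['dataset'],            # DAS keys served by this API
        'format'   : 'JSON',                 # parsed by self.parser
        'expire'   : 3600,                   # cache lifetime in seconds
        'wild_card': '*',
    },
}

With such a map, keys() would report ['dataset'], and apimap would (roughly)
accept only queries whose condition keys map onto the 'dataset' parameter.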
Ejemplo n.º 46
0
class DASMongocache(object):
    """
    DAS cache based on MongoDB.
    """
    def __init__(self, config):
        self.config  = config
        self.emptyset_expire = \
                expire_timestamp(config['das'].get('emptyset_expire', 5))
        self.dburi   = config['mongodb']['dburi']
        self.cache_size = config['mongodb']['bulkupdate_size']
        self.dbname  = config['dasdb']['dbname']
        self.verbose = config['verbose']
        self.logger  = PrintManager('DASMongocache', self.verbose)
        self.mapping = config['dasmapping']
        self.logging = config['dasdb'].get('logging', False)
        self.rec_ttl = config['dasdb'].get('record_ttl', 24*60*60)
        self.del_ttl = config['dasdb'].get('delta_ttl', 60)
        self.cleanup_del_ttl = config['dasdb'].get('cleanup_delta_ttl', 3600)
        self.retry   = config['dasdb'].get('retry', 3)
        self.das_son_manipulator = DAS_SONManipulator()

        # Initialize MongoDB connection
        self.col_    = self.config['dasdb']['cachecollection']
        self.mrcol_  = self.config['dasdb']['mrcollection']
        self.merge_  = self.config['dasdb']['mergecollection']
        self.gfs     = db_gridfs(self.dburi)

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        # ensure that we have the following indexes
        common_idx = [
                      ('file.name', DESCENDING),
                      ('dataset.name', DESCENDING),
                      ('block.name', DESCENDING),
                      ('run.run_number', DESCENDING),
                      ]
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('das.system', ASCENDING),
                      ('qhash', DESCENDING),
                      ('das.record', ASCENDING)]
        create_indexes(self.col, index_list + common_idx)
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('qhash', DESCENDING),
                      ('das.record', ASCENDING),
                      ('das.ts', ASCENDING)]
        create_indexes(self.merge, index_list)
        # NOTE: I found that creating an index in the merge collection leads
        # to a MongoDB error when records contain multiple arrays on indexed
        # keys. For example, when we query file,run,lumi both file and run
        # are arrays in MongoDB. In this case the final sort in MongoDB
        # barks with the following message:
        # cannot sort with keys that are parallel arrays
        # it looks like there is no fix for that yet, see
        # http://stackoverflow.com/questions/6516725/how-do-i-index-two-arrays-in-mongodb
        # Therefore I temporarily disabled the create_indexes call on the
        # merge collection which was used to have an index to ease the final
        # sort, especially in a case when a lot of records correspond to the
        # initial query, e.g. file records.
        # On the other hand, the most common use case where the sort fails is
        # getting file records, and I can add one compound key to ease the
        # sort, but I can't add another compound key on an array field,
        # e.g. run
        common_idx = [[('qhash', DESCENDING), ('file.name', DESCENDING)]]
        create_indexes(self.merge, index_list + common_idx)

        # thread which cleans up DAS collections
        thname = 'mongocache_cleanup'
        cols   = [config['dasdb']['cachecollection'],
                  config['dasdb']['mrcollection'],
                  config['dasdb']['mergecollection']]

    @property
    def col(self):
        "col property provides access to DAS cache collection"
        conn = db_connection(self.dburi)
        mdb  = conn[self.dbname]
        colnames = mdb.collection_names()
        if  not colnames or self.col_ not in colnames:
            try:
                mdb.create_collection(self.col_)
            except OperationFailure:
                pass
        mdb.add_son_manipulator(self.das_son_manipulator)
        return mdb[self.col_]

    @property
    def merge(self):
        "merge property provides access to DAS merge collection"
        conn = db_connection(self.dburi)
        mdb  = conn[self.dbname]
        colnames = mdb.collection_names()
        if  not colnames or self.merge_ not in colnames:
            try:
                mdb.create_collection(self.merge_)
            except OperationFailure:
                pass
        mdb.add_son_manipulator(self.das_son_manipulator)
        return mdb[self.merge_]

    @property
    def mrcol(self):
        "mrcol property provides access to DAS map-reduce collection"
        conn = db_connection(self.dburi)
        mdb  = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        return mdb[self.mrcol_]

    def get_dataset_hashes(self, dasquery):
        "Get dataset hashes from DBS database"
        spec = dasquery.mongo_query.get('spec', {})
        inst = dasquery.instance
        conn = db_connection(self.dburi)
        if  spec and inst:
            dataset = spec.get('dataset.name', None)
            if  dataset:
                if  dataset.find('*') != -1:
                    cond = {'dataset':re.compile(dataset.replace('*', '.*'))}
                else:
                    cond = {'dataset': dataset}
                for row in conn['dbs'][inst].find(cond):
                    if  'qhash' in row:
                        yield row['qhash']

    def check_datasets(self, dasquery):
        "Check dataset presence in DAS cache for given das query"
        hashes = [r for r in self.get_dataset_hashes(dasquery)]
        if  hashes:
            spec = {'qhash': {'$in': hashes}}
            if  len(hashes) == self.merge.find(spec, **PYMONGO_OPTS).count():
                dasquery._hashes = hashes

    def get_superset_keys(self, key, value):
        """
        This is a special-case version of similar_keys,
        intended for analysers that want to quickly
        find possible superset queries of a simple
        query of the form key=value.
        """

        msg = "%s=%s" % (key, value)
        self.logger.debug(msg)
        cond = {'query.spec.key': key}
        for row in self.col.find(cond, **PYMONGO_OPTS):
            mongo_query = decode_mongo_query(row['query'])
            for thiskey, thisvalue in mongo_query.items():
                if thiskey == key:
                    if fnmatch.fnmatch(value, thisvalue):
                        yield thisvalue

    def get_fields(self, dasquery):
        "Prepare fields to extract from MongoDB"
        fields     = dasquery.mongo_query.get('fields', [])
        if  fields and 'records' in fields:
            fields = None # look-up all records
        filters    = dasquery.filters
        cond       = {}
        if  filters:
            new_fields = []
            for dasfilter in filters:
                if  dasfilter == 'unique':
                    continue
                if  fields and dasfilter not in fields and \
                    dasfilter not in new_fields:
                    if  dasfilter.find('=') == -1 and dasfilter.find('<') == -1\
                    and dasfilter.find('>') == -1:
                        new_fields.append(dasfilter)
                    else:
                        cond = parse_filters(dasquery.mongo_query)
            if  not new_fields and fields:
                new_fields = list(fields)
            return new_fields, cond
        return fields, cond

    def remove_expired(self, dasquery, collection):
        """
        Remove expired records from the DAS cache. We need to perform this
        operation very carefully since we don't use transactions and on-going
        commits can invoke this method (see das_core.py). Therefore we use
        the MongoDB $or operator to wipe out queries which match the DASQuery
        hash and have already expired, or queries which lived in the cache
        for more than the rec_ttl config parameter. The latter operation just
        prevents the DAS cache from growing.
        """
        conn   = db_connection(self.dburi)
        mdb    = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        col    = mdb[collection]
        # use additional delta to check data record expiration
        # we add this delta to ensure that there are no records close to
        # the current timestamp which may expire during request processing
        spec = {'qhash':dasquery.qhash,
                'das.expire':{'$lt':time.time()+self.del_ttl}}
        col.delete_many(spec)

    def check_services(self, dasquery):
        """
        Check if DAS cache contains DAS records with service response for
        given query.
        """
        das_rec  = self.find(dasquery)
        if  not das_rec:
            return False
        if  'das' not in das_rec:
            return False
        if  'services' not in das_rec['das']:
            return False
        spec = {'qhash':dasquery.qhash, 'das.system':{'$ne':'das'},
                'das.expire':{'$gt':time.time()}}
        nres = self.col.find(spec, **PYMONGO_OPTS).count()
        if  nres:
            return True
        return False

    def find(self, dasquery):
        """
        Find provided query in DAS cache.
        """
        cond = {'qhash': dasquery.qhash, 'das.system':'das',
                'das.expire': {'$gt':time.time()}}
        return find_one(self.col, cond)

    def find_specs(self, dasquery, system='das'):
        """
        Check if cache has query whose specs are identical to provided query.
        Return all matches.
        """
        if dasquery.hashes:
            cond = {'qhash':{'$in':dasquery.hashes}}
        else:
            cond = {'qhash': dasquery.qhash}
        if  system:
            cond.update({'das.system': system})
        cond.update({'das.expire':{'$gt':time.time()}})
        return self.col.find(cond, **PYMONGO_OPTS)

    def get_das_ids(self, dasquery):
        """
        Return list of DAS ids associated with given query
        """
        das_ids = []
        try:
            das_ids = \
                [r['_id'] for r in self.find_specs(dasquery, system='')]
        except Exception:
            pass
        return das_ids

    def update_das_expire(self, dasquery, timestamp):
        "Update timestamp of all DAS data records for given query"
        nval = {'$set': {'das.expire':timestamp}}
        spec = {'qhash' : dasquery.qhash}
        self.col.update_many(spec, nval)
        self.merge.update_many(spec, nval)

    def das_record(self, dasquery):
        "Retrieve DAS record for given query"
        cond = {'qhash': dasquery.qhash, 'das.expire':{'$gt':time.time()}}
        return find_one(self.col, cond)

    def find_records(self, das_id):
        " Return all the records matching a given das_id"
        return self.col.find({'das_id': das_id}, **PYMONGO_OPTS)

    def is_error_in_records(self, dasquery, collection='cache'):
        "Scan DAS cache for error records and return true or not"
        if  collection == 'cache':
            results = self.col.find({'qhash':dasquery.qhash}, **PYMONGO_OPTS)
        else:
            results = self.merge.find({'qhash':dasquery.qhash}, **PYMONGO_OPTS)
        error  = None
        reason = None
        for row in results:
            if 'error' in row:
                error  = row.get('error')
                reason = row.get('reason', '')
                break
        return error, reason

    def add_to_record(self, dasquery, info, system=None):
        "Add to existing DAS record provided info"
        if  system:
            self.col.update_one({'query': dasquery.storage_query,
                             'das.system':system},
                            {'$set': info}, upsert=True)
        else:
            self.col.update_one({'query': dasquery.storage_query},
                            {'$set': info}, upsert=True)

    def find_min_expire(self, dasquery):
        """Find minimal expire timestamp across all records for given DAS query"""
        spec   = {'qhash': dasquery.qhash}
        min_expire = 2*time.time() # upper bound, will update
        for rec in self.col.find(spec, **PYMONGO_OPTS):
            if  'das' in rec and 'expire' in rec['das']:
                estamp = rec['das']['expire']
                if  min_expire > estamp:
                    min_expire = estamp
        return long(min_expire)

    def find_query_record(self, dasquery):
        "Find DAS query records and return them to the caller"
        spec = {'qhash':dasquery.qhash,
                'das.record':record_codes('query_record')}
        return self.col.find(spec, **PYMONGO_OPTS)

    def update_query_record(self, dasquery, status, header=None, reason=None):
        "Update DAS record for provided query"
        ctime = time.time()
        das_spec = {'qhash': dasquery.qhash, 'das.system':'das'}
        min_expire = self.find_min_expire(dasquery)
        if  header:
            system = header['das']['system']
            sts    = header['das']['status']
            expire = header['das']['expire']
            spec   = {'qhash': dasquery.qhash, 'das.system': system}
            new_expire = None
            for rec in self.col.find(spec, **PYMONGO_OPTS):
                if  'das' in rec and 'expire' in rec['das']:
                    if  rec['das']['expire'] > expire:
                        new_expire = expire
                        ndict = {'das.expire':expire, 'das.status':status}
                        cdict = {'das.ctime':ctime}
                        udict = {'$set':ndict, '$push':cdict}
                        oid   = ObjectId(rec['_id'])
                        self.col.update_one({'_id':oid}, udict)
            if  new_expire:
                udict = {'$set': {'das.expire': new_expire},
                         '$push': {'das.ctime':ctime}}
                self.col.update_one(das_spec, udict)
        else:
            udict = {'$set': {'das.status':status, 'das.expire': min_expire},
                     '$push': {'das.ctime':ctime}}
            self.col.update_one(das_spec, udict)
        if  reason:
            udict = {'$set': {'das.reason':reason}}
            self.col.update_one(das_spec, udict)
        # align all expire timestamps when we receive ok status
        if  status == 'ok':
            udict = {'$set': {'das.expire': min_expire}}
            self.col.update_one(das_spec, udict)

    def apilist(self, dasquery):
        "Return list of apis for given dasquery"
        spec = {'qhash':dasquery.qhash,
                'das.record':record_codes('query_record')}
        apis = []
        for row in self.col.find(spec, ['das.api'], **PYMONGO_OPTS):
            try:
                apis += row['das']['api']
            except Exception as _err:
                pass
        return apis

    def incache(self, dasquery, collection='merge', system=None, api=None,
            query_record=False):
        """
        Check if we have query results in cache, otherwise return null.
        Please note, input parameter query means MongoDB query, please
        consult MongoDB API for more details,
        http://api.mongodb.org/python/
        """
        if  query_record:
            record = record_codes('query_record')
        else:
            record = spec4data_records()
        spec = {'qhash':dasquery.qhash, 'das.record':record,
                'das.expire':{'$gt':time.time()}}
        if  system:
            spec.update({'das.system': system})
        if  api:
            spec.update({'das.api': api})
        conn = db_connection(self.dburi)
        mdb  = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        col  = mdb[collection]
        res  = col.find(spec, **PYMONGO_OPTS).count()
        msg  = "(%s, coll=%s) found %s results" % (dasquery, collection, res)
        self.logger.info(msg)
        if  res:
            return True
        return False

    def nresults(self, dasquery, collection='merge'):
        """Return number of results for given query."""
        if  dasquery.aggregators:
            return len(dasquery.aggregators)
        # Distinguish 2 use cases, unique filter and general query:
        # in the first one we should count only unique records, in the latter
        # we can rely on the DB count() method. Please keep in mind that
        # the usage of fields in find doesn't affect counting, since it
        # is a view over records found with spec, so we don't need to use it.
        fields, filter_cond = self.get_fields(dasquery)
        if  not fields:
            spec = dasquery.mongo_query.get('spec', {})
        elif dasquery.hashes:
            spec = {'qhash':{'$in':dasquery.hashes},
                    'das.record': spec4data_records()}
        else:
            spec = {'qhash':dasquery.qhash,
                    'das.record': spec4data_records()}
        if  filter_cond:
            spec.update(filter_cond)
        conn = db_connection(self.dburi)
        mdb  = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        col  = mdb[collection]
        if  dasquery.unique_filter:
            skeys = self.mongo_sort_keys(collection, dasquery)
            if  skeys:
                gen = col.find(spec, **PYMONGO_OPTS).sort(skeys)
            else:
                gen = col.find(spec, **PYMONGO_OPTS)
            res = len([r for r in unique_filter(gen)])
        else:
            res = col.find(spec, **PYMONGO_OPTS).count()
            if  not res: # double check that this is really the case
                time.sleep(1)
                res = col.find(spec, **PYMONGO_OPTS).count()
        msg = "%s" % res
        self.logger.info(msg)
        return res

    def mongo_sort_keys(self, collection, dasquery):
        """
        Find list of sort keys for a given DAS query. Check existing
        indexes and either use fields or spec keys to find them out.
        Return list of mongo sort keys in a form of (key, order).
        """
        # try to get sort keys all the time to get ordered list of
        # docs which allow unique_filter to apply afterwards
        fields = dasquery.mongo_query.get('fields')
        spec   = dasquery.mongo_query.get('spec')
        skeys  = dasquery.sortkeys
        mongo_skeys = []
        if  skeys:
            for key in skeys:
                if  key.find('-') != -1: # reverse order, e.g. desc
                    mongo_skeys.append((key.replace('-', ''), DESCENDING))
                else:
                    mongo_skeys.append((key, ASCENDING))
        else:
            existing_idx = [i for i in self.existing_indexes(collection)]
            if  fields:
                lkeys = []
                for key in fields:
                    for mkey in self.mapping.mapkeys(key):
                        if  mkey not in lkeys:
                            lkeys.append(mkey)
            else:
                lkeys = list(spec.keys())
            keys = [k for k in lkeys \
                if k.find('das') == -1 and k.find('_id') == -1 and \
                        k in existing_idx]
            mongo_skeys = [(k, ASCENDING) for k in keys]
        return mongo_skeys

    def existing_indexes(self, collection='merge'):
        """
        Get list of existing indexes in DB. They are returned by
        index_information API in the following form:

        .. doctest::

            {u'_id_': {u'key': [(u'_id', 1)], u'v': 0},
             u'das.expire_1': {u'key': [(u'das.expire', 1)], u'v': 0},
             ...
             u'tier.name_-1': {u'key': [(u'tier.name', -1)], u'v': 0}}
        """
        conn = db_connection(self.dburi)
        mdb  = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        col = mdb[collection]
        for val in col.index_information().values():
            for idx in val['key']:
                yield idx[0] # index name

    def get_records(self, coll, spec, fields, skeys, idx, limit, unique=False):
        "Generator to get records from MongoDB."
        try:
            conn = db_connection(self.dburi)
            mdb  = conn[self.dbname]
            mdb.add_son_manipulator(self.das_son_manipulator)
            col = mdb[coll]
            nres = col.find(spec, **PYMONGO_OPTS).count()
            if  nres == 1 or nres <= limit:
                limit = 0
            if  limit:
                res = col.find(spec, fields, sort=skeys, skip=idx, limit=limit)
            else:
                res = col.find(spec, fields, sort=skeys, **PYMONGO_OPTS)
            if  unique:
                res = unique_filter(res)
            for row in res:
                yield row
        except Exception as exp:
            print_exc(exp)
            row = {'exception': str(exp)}
            res = []
            yield row

    def get_from_cache(self, dasquery, idx=0, limit=0, collection='merge'):
        "Generator which retrieves results from the cache"
        if  dasquery.service_apis_map(): # valid DAS query
            result = self.get_das_records(dasquery, idx, limit, collection)
            for row in result:
                yield row
        else: # pure MongoDB query
            fields  = dasquery.mongo_query.get('fields', [])
            if  fields == None:
                fields = []
            spec    = dasquery.mongo_query.get('spec', {})
            if  dasquery.filters:
                if  not fields:
                    fields = []
                fields += dasquery.filters
                pkeys   = [k.split('.')[0] for k in fields]
            fields += das_record_keys()
            if  'records' in dasquery.query:
                fields = None # special case for DAS 'records' keyword
            skeys   = self.mongo_sort_keys(collection, dasquery)
            result  = self.get_records(collection, spec, fields, skeys, \
                            idx, limit, dasquery.unique_filter)
            for row in result:
                if  dasquery.filters:
                    if  pkeys and set(pkeys) & set(row.keys()):
                        yield row
                else:
                    yield row

    def get_das_records(self, dasquery, idx=0, limit=0, collection='merge'):
        "Generator which retrieves DAS records from the cache"
        msg = "(%s, %s, %s, coll=%s)" % (dasquery, idx, limit, collection)
        self.logger.info(msg)

        idx = int(idx)
        fields, filter_cond = self.get_fields(dasquery)
        if  fields == None:
            fields = []
        if  not fields:
            spec = dasquery.mongo_query.get('spec', {})
        elif dasquery.hashes:
            spec = {'qhash':{'$in':dasquery.hashes},
                    'das.record': spec4data_records()}
        else:
            spec = {'qhash':dasquery.qhash,
                    'das.record': spec4data_records()}
        if  filter_cond:
            spec.update(filter_cond)
        if  'records' in dasquery.query:
            fields  = None # retrieve all fields for records DAS query
        else:
            # be sure to extract das internal keys
            fields += das_record_keys()
        # try to get sort keys all the time to get ordered list of
        # docs which allow unique_filter to apply afterwards
        skeys   = self.mongo_sort_keys(collection, dasquery)
        res     = self.get_records(collection, spec, fields, skeys, \
                        idx, limit, dasquery.unique_filter)
        counter = 0
        for row in res:
            counter += 1
            yield row
        msg = 'qhash %s, found %s record(s) in %s collection' \
                % (dasquery.qhash, counter, collection)
        print(dastimestamp('DAS INFO '), msg)

        if  counter:
            msg = "yield %s record(s)" % counter
            self.logger.info(msg)

        # if no raw records were yield we look-up possible error records
        # and reset timestamp for record with system:['das']
        if  not counter:
            spec = {'qhash':dasquery.qhash}
            nrec = self.col.find(spec, **PYMONGO_OPTS).count()
            if  nrec:
                msg = "for query %s, found %s non-result record(s)" \
                        % (dasquery, nrec)
                print(dastimestamp('DAS WARNING'), msg)
                for rec in self.col.find(spec, **PYMONGO_OPTS):
                    if  'query' in rec:
                        print(dastimestamp('DAS das record'), rec)
            self.update_das_expire(dasquery, etstamp())

    def map_reduce(self, mr_input, dasquery, collection='merge'):
        """
        Wrapper around _map_reduce to allow sequential map/reduce
        operations, e.g. map/reduce out of map/reduce.

        mr_input is either an alias name or a list of alias names for
        map/reduce functions.

        The input dasquery is applied to the first
        iteration of the map/reduce functions.
        """
        # NOTE: I need to revisit mapreduce.
        spec = dasquery.mongo_query['spec']
        if  not isinstance(mr_input, list):
            mrlist = [mr_input]
        else:
            mrlist = mr_input
        conn = db_connection(self.dburi)
        mdb  = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        coll = mdb[collection]
        for mapreduce in mrlist:
            if  mapreduce == mrlist[0]:
                cond = spec
            else:
                cond = None
            coll = self._map_reduce(coll, mapreduce, cond)
        for row in coll.find():
            yield row

    def _map_reduce(self, coll, mapreduce, spec=None):
        """
        Perform map/reduce operation over DAS cache using provided
        collection, mapreduce name and optional conditions.
        """
        self.logger.debug("(%s, %s)" % (mapreduce, spec))
        record = find_one(self.mrcol, {'name':mapreduce})
        if  not record:
            raise Exception("Map/reduce function '%s' not found" % mapreduce)
        fmap = record['map']
        freduce = record['reduce']
        if  spec:
            result = coll.map_reduce(Code(fmap), Code(freduce), query=spec)
        else:
            result = coll.map_reduce(Code(fmap), Code(freduce))
        msg = "found %s records in %s" % (result.count(), result.name)
        self.logger.info(msg)
        self.logger.debug(fmap)
        self.logger.debug(freduce)
        return result

    def get_map_reduce(self, name=None):
        """
        Return definition of map/reduce functions for provided name
        or gives full list.
        """
        spec = {}
        if  name:
            spec = {'name':name}
        result = self.mrcol.find(spec, **PYMONGO_OPTS)
        for row in result:
            yield row

    def merge_records(self, dasquery, attempt=0):
        """
        Merge DAS records for provided query. We perform the following
        steps:
        1. get all queries from das.cache by ordering them by primary key
        2. run aggregator function to merge neighbors
        3. insert records into das.merge
        """
        ### TMP for asyncio
#         time.sleep(attempt+3) # pymongo 3.2 don't yet flush in time

        # remove any entries in merge collection for this query
        self.merge.delete_many({'qhash':dasquery.qhash})
        # proceed
        self.logger.debug(dasquery)
        id_list = []
        expire  = 9999999999 # future
        # get all API records for given DAS query
        spec    = {'qhash':dasquery.qhash,
                   'das.expire':{'$gt':time.time()},
                   'das.record':record_codes('query_record')}
        records = self.col.find(spec, **PYMONGO_OPTS)
        for row in records:
            # find smallest expire timestamp to be used by aggregator
            rexpire = row.get('das', {}).get('expire', expire)
            if  rexpire < expire:
                expire = rexpire
            if  row['_id'] not in id_list:
                id_list.append(row['_id'])
        inserted = 0
        lookup_keys = set()
        fields = dasquery.mongo_query.get('fields')
        if  not fields: # Mongo
            fields = []
        for key in fields:
            for pkey in self.mapping.mapkeys(key):
                lookup_keys.add(pkey)
        for pkey in lookup_keys:
            skey = [(pkey, DESCENDING)]
            # lookup all service records
            spec = {'das_id': {'$in': id_list}, 'das.primary_key': pkey}
            if  self.verbose:
                nrec = self.col.find(spec, **PYMONGO_OPTS).sort(skey).count()
                msg  = "merging %s records, for %s key" % (nrec, pkey)
            else:
                msg  = "merging records, for %s key" % pkey
            self.logger.debug(msg)
            # use exhaust=False since we process all records in aggregator
            # and there can be a delay in processing
            records = self.col.find(spec, **PYMONGO_NOEXHAUST).sort(skey)
            # aggregate all records
            agen = aggregator(dasquery, records, expire)
            # diff aggregated records
            gen = das_diff(agen, self.mapping.diff_keys(pkey.split('.')[0]))
            # insert all records into das.merge using bulk insert
            size = self.cache_size
            try:
                res = self.merge.insert_many(gen)
                inserted += len(res.inserted_ids)
            except InvalidDocument as exp:
                print(dastimestamp('DAS WARNING'), 'InvalidDocument during merge', str(exp))
                msg = "Caught bson error: " + str(exp)
                self.logger.info(msg)
                records = self.col.find(spec, **PYMONGO_OPTS).sort(skey)
                gen = aggregator(dasquery, records, expire)
                genrows = parse2gridfs(self.gfs, pkey, gen, self.logger)
                das_dict = {'das':{'expire':expire,
                        'das.record': record_codes('gridfs_record'),
                        'primary_key':[k for k in lookup_keys],
                        'system': ['gridfs']}, 'qhash':dasquery.qhash,
                        'cache_id':[], 'das_id': id_list}
                for row in genrows:
                    row.update(das_dict)
                    self.merge.insert(row)
            except InvalidOperation as exp:
                pass
            except DuplicateKeyError as err:
                print(dastimestamp('DAS WARNING'), 'DuplicateKeyError during merge')
                if  not isinstance(gen, list):
                    raise err
        status = 'fail'
        if  inserted:
            status = 'ok'
        elif  not lookup_keys: # we get query w/o fields
            msg = 'qhash %s, no lookup_keys' % dasquery.qhash
            print(dastimestamp('DAS WARNING'), msg)
            status = 'ok'
        else: # we didn't merge anything, it is DB look-up failure
            msg  = 'qhash %s, did not insert into das.merge, attempt %s' \
                    % (dasquery.qhash, attempt)
            print(dastimestamp('DAS WARNING'), msg)
            empty_expire = etstamp()
            lkeys = list(lookup_keys)
            das = dict(expire=empty_expire, primary_key=lkeys[0],
                       condition_keys=lkeys,
                       instance=dasquery.instance,
                       system=['das'], services=dasquery.services,
                       record=record_codes('empty_record'),
                       ts=time.time(), api=[])
            empty_record = {'das':das, 'qhash': dasquery.qhash,
                            'cache_id':[], 'das_id': id_list}
            for key in lkeys:
                empty_record.update({key.split('.')[0]:[]})
            for key, val in dasquery.mongo_query['spec'].items():
                if  key.find('.') == -1:
                    empty_record[key] = []
                else: # it is compound key, e.g. site.name
                    newkey, newval = convert_dot_notation(key, val)
                    empty_record[newkey] = adjust_mongo_keyvalue(newval)
            self.merge.insert(empty_record)
            # update DAS records (both meta and data ones, by using qhash)
            nval = {'$set': {'das.expire':empty_expire}}
            spec = {'qhash':dasquery.qhash}
            self.col.update_many(spec, nval)
        return status

    def update_cache(self, dasquery, results, header, system, api):
        """
        Insert results into the cache. Use bulk inserts controlled by
        self.cache_size. Upon completion ensure indexes.
        """
        # update results records in DAS cache
        gen  = self.generate_records(dasquery, results, header)
        inserted = 0
        # bulk insert
        try:
            res = self.col.insert_many(gen, ordered=False, bypass_document_validation=True)
            inserted += len(res.inserted_ids)
        except InvalidOperation:
            pass

        # update query record for this sub-system
        self.update_query_record_system(dasquery, system, api, 'ok')

        if  dasquery.qcache: # custom DASQuery cache
            self.update_das_expire(dasquery, expire_timestamp(dasquery.qcache))

    def update_query_record_system(self, dasquery, system, api, status):
        "Update system status of dasquery in das.cache collection"
        spec = {'qhash': dasquery.qhash, 'das.system': system, 'das.api': api,
                'das.record':record_codes('query_record')}
        udict = {'$set': {'das.status':status}}
#         print("### update_query_record", spec)
        doc=self.col.find_one_and_update(spec, udict, return_document=ReturnDocument.AFTER)
#         print(doc)

    def insert_query_record(self, dasquery, header):
        """
        Insert query record into DAS cache.
        """
        # check presence of API record in a cache
        dasheader   = header['das']
        system      = dasheader['system']
        api         = dasheader['api']
        collection  = 'cache'
        check_query = True
        expire = dasheader.get('expire', None)
        if  expire:
            dasheader['expire'] = adjust_expire(expire)
        if  not self.incache(dasquery, collection, system, api, check_query):
            msg = "query=%s, header=%s" % (dasquery, header)
            self.logger.debug(msg)
            q_record = dict(das=dasheader, query=dasquery.storage_query)
            q_record['das']['record'] = record_codes('query_record')
            q_record['das']['status'] = "requested"
            q_record['qhash'] = dasquery.qhash
            q_record['das']['ctime'] = [time.time()]
            res = self.col.insert_one(q_record)
            if  not res:
                msg = 'unable to insert query record'
                print(dastimestamp('DAS ERROR '), dasquery, msg, ', will retry')
                time.sleep(1)
                res = self.col.insert(q_record)
                if  not res:
                    print(dastimestamp('DAS ERROR '), dasquery, msg)

    def generate_records(self, dasquery, results, header):
        """
        Iterate over provided results, update records and yield them
        to next level (update_cache)
        """
        self.logger.debug("(%s) store to cache" % dasquery)
        if  not results:
            return

        dasheader  = header['das']
        expire     = adjust_expire(dasheader['expire'])
        system     = dasheader['system'] # DAS service names, e.g. combined
        services   = dasheader['services'] # CMS services used to get data
        api        = dasheader['api']
        prim_key   = header.get('prim_key', None)
        if  not prim_key:
            # get primary key from a list of lookup keys which has the
            # following structure [{'api':[keys]}, {...}]
            lup_keys = header['lookup_keys']
            lkeys    = [l for i in lup_keys for k in i.values() for l in k]
            prim_key = lkeys[0] if 'summary' not in lkeys else 'summary'
        cond_keys  = list(dasquery.mongo_query['spec'].keys())
        # get API record id
        spec       = {'qhash':dasquery.qhash, 'das.system':system,
                      'das.expire': {'$gt':time.time()},
                      'das.record': record_codes('query_record')}
        counter    = 0
        rids = [str(r['_id']) for r in \
                self.col.find(spec, ['_id'], **PYMONGO_OPTS)]
        if  rids:
            if  isinstance(results, list) or isinstance(results, GeneratorType):
                for item in results:
                    counter += 1
                    if  'das' in item:
                        expire = item.get('das').get('expire', expire)
                        dasheader['expire'] = expire
                    item['das'] = dict(expire=expire, primary_key=prim_key,
                                       condition_keys=cond_keys,
                                       instance=dasquery.instance,
                                       system=system, services=services,
                                       record=record_codes('data_record'),
                                       ts=time.time(), api=api)
                    item['das_id'] = rids
                    item['qhash'] = dasquery.qhash
                    yield item
            else:
                print("\n\n ### results = ", str(results))
                raise Exception('Provided results are not of list/generator type')
        if  expire != dasheader['expire']: # update DAS records
            header['das']['expire'] = expire
        # update das record with new status
        status = 'Update DAS cache, %s API' % header['das']['api'][0]
        self.update_query_record(dasquery, status, header)

        msg = "\n%s yielded %s rows" % (dasheader['system'], counter)
        self.logger.info(msg)
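For orientation, an illustrative example (all values made up) of a data record as yielded by generate_records: the service payload plus the 'das' metadata sub-document, the parent query-record ids in 'das_id' and the query hash in 'qhash'.

example_record = {
    'block': {'name': '/a/b/c#123', 'size': 42},          # service payload
    'das': {'expire': 1700000600.0, 'primary_key': 'block.name',
            'condition_keys': ['block.name'], 'instance': 'prod/global',
            'system': 'dbs3', 'services': ['dbs3'],
            'record': 1,                                   # record_codes('data_record') in the real code
            'ts': 1700000000.0, 'api': ['blocks']},
    'das_id': ['5f43c0ffee...'],                           # _id(s) of the matching query record(s)
    'qhash': 'abc123',
}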

    def remove_from_cache(self, dasquery):
        """
        Remove query from DAS cache. To do so, we retrieve the API record
        and remove all associated data records from das.cache and das.merge.
        """
        records = self.col.find({'qhash':dasquery.qhash}, **PYMONGO_OPTS)
        id_list = []
        for row in records:
            if  row['_id'] not in id_list:
                id_list.append(row['_id'])
        spec = {'das_id':{'$in':id_list}}
        self.merge.delete_many(spec)
        self.merge.delete_many({'qhash':dasquery.qhash})
        self.col.delete_many(spec)
        self.col.delete_many({'qhash':dasquery.qhash})
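A compact standalone variant of the removal above, written against the pymongo delete_many API; the collection handles and qhash are assumptions supplied by the caller.

def remove_query(cache_col, merge_col, qhash):
    "Drop all cache/merge documents that belong to a given query hash"
    ids = cache_col.distinct('_id', {'qhash': qhash})
    for coll in (merge_col, cache_col):
        coll.delete_many({'das_id': {'$in': ids}})
        coll.delete_many({'qhash': qhash})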

    def clean_cache(self, collection=None):
        """
        Clean expired docs in das.cache and das.merge.
        """
        current_time = time.time()
        query = {'das.expire': { '$lt':current_time} }
        if  not collection or collection == 'merge':
            self.merge.delete_many(query)
        if  not collection or collection == 'cache':
            self.col.delete_many(query)
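A hypothetical helper mirroring clean_cache for a single collection, returning the number of expired documents that were dropped:

import time

def clean_expired(coll):
    "Delete documents whose das.expire timestamp is in the past"
    res = coll.delete_many({'das.expire': {'$lt': time.time()}})
    return res.deleted_count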

    def delete_cache(self):
        """
        Delete all results in DAS cache/merge collection, including
        internal indexes.
        """
        self.col.delete_many({})
        try:
            self.col.drop_indexes()
        except Exception:
            pass
        self.merge.delete_many({})
        try:
            self.merge.drop_indexes()
        except Exception:
            pass
Ejemplo n.º 47
0
Archivo: das_core.py Proyecto: ktf/DAS
    def __init__(self, config=None, debug=0,
                nores=False, logger=None, engine=None, multitask=True):
        if  config:
            dasconfig = config
        else:
            dasconfig = das_readconfig()
        verbose       = dasconfig['verbose']
        self.stdout   = debug
        if  isinstance(debug, int):
            self.verbose = debug
            dasconfig['verbose'] = debug
        else:
            self.verbose = verbose
        das_timer('DASCore::init', self.verbose)
        self.operators = das_operators()

        # set noresults option
        self.noresults = False
        if  nores:
            dasconfig['write_cache'] = True
            self.noresults = nores

        self.multitask = dasconfig['das'].get('multitask', True)
        if  debug or self.verbose:
            self.multitask = False # in verbose mode do not use multitask
            dasconfig['das']['multitask'] = False
        if  not multitask: # explicitly call DASCore ctor, e.g. in analytics
            self.multitask = False
            dasconfig['das']['multitask'] = False
        dasconfig['engine'] = engine
        if  self.multitask:
            nworkers = dasconfig['das'].get('core_workers', 5)
            if  engine:
                thr_name = 'DASCore:PluginTaskManager'
                self.taskmgr = PluginTaskManager(\
                        engine, nworkers=nworkers, name=thr_name)
                self.taskmgr.subscribe()
            else:
                thr_name = 'DASCore:TaskManager'
                self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        if  logger:
            self.logger = logger
        else:
            self.logger = PrintManager('DASCore', self.verbose)

        # define Mapping/Analytics/Parser in this order since Parser depends
        # on first two
        dasmapping = DASMapping(dasconfig)
        dasconfig['dasmapping'] = dasmapping
        self.mapping = dasmapping

        self.analytics = DASAnalytics(dasconfig)
        dasconfig['dasanalytics'] = self.analytics

        self.keylearning = DASKeyLearning(dasconfig)
        dasconfig['keylearning'] = self.keylearning

        # init DAS cache
        self.rawcache = DASMongocache(dasconfig)
        dasconfig['rawcache'] = self.rawcache

        # plug-in architecture: loop over registered data-services in
        # dasconfig; load appropriate module/class; register data
        # service with DASCore.
        self.systems = dasmapping.list_systems()
        # pointer to the DAS top level directory
        dasroot = '/'.join(__file__.split('/')[:-3])
        for name in self.systems:
            try:
                klass  = 'DAS/services/%s/%s_service.py' \
                    % (name, name)
                srvfile = os.path.join(dasroot, klass)
                with open(srvfile) as srvclass:
                    for line in srvclass:
                        if  line.find('(DASAbstractService)') != -1:
                            klass = line.split('(DASAbstractService)')[0]
                            klass = klass.split('class ')[-1] 
                            break
                mname  = 'DAS.services.%s.%s_service' % (name, name)
                module = __import__(mname, fromlist=[klass])
                obj = getattr(module, klass)(dasconfig)
                setattr(self, name, obj)
                SERVICES[name] = obj
            except IOError as err:
                if  debug > 1:
                    # we have virtual services, so IOError can be correct
                    print_exc(err)
                try:
                    mname  = 'DAS.services.generic_service'
                    module = __import__(mname, fromlist=['GenericService'])
                    obj    = module.GenericService(name, dasconfig)
                    setattr(self, name, obj)
                except Exception as exc:
                    print_exc(exc)
                    msg = "Unable to load %s data-service plugin" % name
                    raise Exception(msg)
            except Exception as exc:
                print_exc(exc)
                msg = "Unable to load %s data-service plugin" % name
                raise Exception(msg)

        # loop over systems and get system keys, add mapping keys to final list
        self.service_keys = {}
        self.service_parameters = {}
        for name in self.systems: 
            skeys = getattr(self, name).keys()
            self.service_keys[getattr(self, name).name] = skeys
            sparams = getattr(self, name).parameters()
            self.service_parameters[getattr(self, name).name] = sparams

        self.service_keys['special'] = das_special_keys()
        self.dasconfig = dasconfig
        das_timer('DASCore::init', self.verbose)
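A minimal sketch (hypothetical helper, using importlib instead of __import__) of the plugin-discovery idiom in the loop above: scan the service module for its DASAbstractService subclass name, then import and instantiate it dynamically.

import importlib

def load_service(dasroot, name, dasconfig):
    "Locate, import and instantiate the data-service plugin for the given name"
    srvfile = '%s/DAS/services/%s/%s_service.py' % (dasroot, name, name)
    klass = None
    with open(srvfile) as stream:
        for line in stream:
            if '(DASAbstractService)' in line:
                klass = line.split('(DASAbstractService)')[0].split('class ')[-1].strip()
                break
    if klass is None:
        raise IOError('no DASAbstractService subclass found in %s' % srvfile)
    module = importlib.import_module('DAS.services.%s.%s_service' % (name, name))
    return getattr(module, klass)(dasconfig)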
Ejemplo n.º 48
0
    def setUp(self):
        """
        set up DAS core module
        """
        debug = 0
        self.db = 'test_mapping.db'
        config = deepcopy(das_readconfig())
        dburi = config['mongodb']['dburi']
        logger = PrintManager('TestDASMapping', verbose=debug)
        config['logger'] = logger
        config['verbose'] = debug
        dbname = 'test_mapping'
        collname = 'db'
        config['mappingdb'] = dict(dburi=dburi,
                                   dbname=dbname,
                                   collname=collname)
        # add some maps to mapping db
        conn = MongoClient(dburi)
        conn.drop_database(dbname)
        self.coll = conn[dbname][collname]
        self.pmap = {
            "presentation": {
                "block": [{
                    "ui": "Block name",
                    "das": "block.name"
                }, {
                    "ui": "Block size",
                    "das": "block.size"
                }]
            },
            "type": "presentation"
        }
        self.coll.insert_one(self.pmap)

        url = 'https://cmsweb.cern.ch/dbs/prod/global/DBSReader/acquisitioneras/'
        dformat = 'JSON'
        system = 'dbs3'
        expire = 100
        rec = {
            'system': system,
            'urn': 'acquisitioneras',
            'format': dformat,
            'instances': ['prod/global'],
            'url': url,
            'expire': expire,
            'lookup': 'era',
            'params': {},
            'das_map': [{
                "das_key": "era",
                "rec_key": "era.name",
                "api_arg": "era"
            }],
            'type': 'service'
        }
        self.coll.insert_one(rec)

        ver_token = verification_token(self.coll.find(**PYMONGO_OPTS))
        rec = {'verification_token': ver_token, 'type': 'verification_token'}
        self.coll.insert_one(rec)

        self.mgr = DASMapping(config)
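A hypothetical standalone check (the dburi is assumed to point at a local MongoDB) that the fixture documents above actually landed in the test mapping collection:

from pymongo import MongoClient

coll = MongoClient('mongodb://localhost:27017')['test_mapping']['db']   # assumed dburi
assert coll.find_one({'type': 'presentation'}) is not None
assert coll.find_one({'system': 'dbs3', 'urn': 'acquisitioneras'}) is not None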