Example #1
 def test_info(self):
     "Test logger info method"
     old_stdout = sys.stdout
     logger = PrintManager(self.name, verbose=1)
     sys.stdout = StringIO.StringIO()
     logger.info('test')
     result = sys.stdout.getvalue()
     expect = 'INFO %s:%s test\n' % (self.name, funcname())
     self.assertEqual(expect, result)
     sys.stdout = old_stdout
Example #2
 def test_info(self):
     "Test logger info method"
     old_stdout = sys.stdout
     logger = PrintManager(self.name, verbose=1)
     sys.stdout = StringIO()
     logger.info('test')
     result = sys.stdout.getvalue()
     expect = 'INFO %s:%s test\n' % (self.name, funcname())
     self.assertEqual(expect, result)
     sys.stdout = old_stdout
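
Examples #1 and #2 are the same unit test written against the Python 2 and Python 3 standard libraries respectively: the only difference is whether the in-memory stream comes from StringIO.StringIO or io.StringIO, and both work by swapping sys.stdout before calling PrintManager.info. Below is a minimal sketch of the same idea using contextlib.redirect_stdout; the import path for PrintManager is an assumption, adjust it to wherever the class lives in your checkout.

import io
from contextlib import redirect_stdout

# assumed import path for PrintManager; adjust to your checkout
from DAS.utils.logger import PrintManager

def capture_info(name, message):
    "Run PrintManager.info with stdout captured and return what was printed"
    logger = PrintManager(name, verbose=1)
    buf = io.StringIO()
    with redirect_stdout(buf):          # no manual sys.stdout bookkeeping
        logger.info(message)
    return buf.getvalue()

print(capture_info('test_logger', 'test'))   # e.g. 'INFO test_logger:<caller> test'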
Example #3
class QueryRunner(object):
    "Replaces das_robot"
    task_options = [{'name':'query', 'type':'string', 'default':None,
                   'help':'Query to issue using das_core::call'}]
    def __init__(self, **kwargs):
        self.logger = PrintManager('QueryRunner', kwargs.get('verbose', 0))
        self.das = kwargs['DAS']
        self.dasquery = DASQuery(kwargs['dasquery'])
    def __call__(self):
        "__call__ implementation"
        self.logger.info("Issuing query %s" % self.dasquery)
        result = self.das.call(self.dasquery, add_to_analytics=False)
        return {'result':result}
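
QueryRunner is a thin analytics task around the DAS core: the constructor pulls the core object and the query out of kwargs, and __call__ issues the query via das_core::call and wraps the result in a dict. A hedged sketch of how such a task might be driven follows; the import path and the query text are assumptions, not part of the example above.

from DAS.core.das_core import DASCore        # assumed import path

task = QueryRunner(DAS=DASCore(), dasquery='dataset=/ZMM*/*/*', verbose=1)
output = task()                              # {'result': <whatever DAS.call returned>}
print(output['result'])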
Example #4
File: Test.py Project: ktf/DAS
class Test(object):
    """
    This is a test task that prints a message (supplied in kwargs) to stdout.
    It will occasionally raise an exception (5%), spawn subtasks (4%), or
    disable resubmission of itself (1%).
    """
    task_title = "Test task"
    task_options = [{"name":"message", "type":"string",
                     "default":"hello world", "help":"Message to print"}]

    def __init__(self, **kwargs):
        self.logger = PrintManager('Test', kwargs.get('verbose', 0))
        self.name = kwargs['name']
        self.message = kwargs['message']
        self.index = kwargs['index']

    def __call__(self):
        self.logger.info('%s from index=%s' % (self.message, self.index))

        result = {}

        effect = random.random()
        if effect > 0.99:
            self.logger.info('..disabling resubmission')
            result['resubmit'] = False
        elif effect > 0.95:
            task = {'name':'spawn-of-%s' % self.index,
                    'classname':'Test',
                    'interval': random.randint(1,30),
                    'kwargs':{'message':'spawn-of-%s' % self.message}}
            effect2 = random.random()
            if effect2 > 0.50:
                task['only_once'] =  True
                self.logger.info('..spawning run-once task')
            elif effect2 > 0.25:
                task['max_runs'] = random.randint(1, 5)
                self.logger.info(\
                '..spawning task to run %s times' % task['max_runs'])
            else:
                task['only_before'] = time.time() + random.randint(1, 120)
                self.logger.info(\
                '..spawning task to run until %s' % task['only_before'])
            result['new_tasks'] = [task]
        elif effect > 0.90:
            self.logger.error('..raising an exception')
            # fail deliberately; an explicit exception is clearer than a bare
            # raise, which has no active exception to re-raise at this point
            raise RuntimeError('Test task raised a deliberate exception')

        return result
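
The Test task illustrates the return-value protocol of analytics tasks: the dict it returns may carry 'resubmit' (stop rescheduling) or 'new_tasks' (task specs for the scheduler to enqueue), and roughly 5% of calls raise on purpose. The snippet above also assumes module-level imports of random, time and PrintManager that the listing omits. A hedged driver for the class (the kwargs mirror its constructor; the scheduler behaviour in the comments is an assumption):

task = Test(name='demo', message='hello world', index=1, verbose=1)
try:
    result = task()                  # prints 'hello world from index=1'
except Exception:
    result = {}                      # ~5% of calls fail deliberately
if result.get('resubmit') is False:
    print('scheduler would stop resubmitting this task')
for spec in result.get('new_tasks', []):
    print('scheduler would enqueue', spec['name'])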
Example #5
class DASCore(object):
    """
    DAS core class.
    """
    def __init__(self, config=None, debug=None, 
                nores=False, logger=None, engine=None, multitask=True):
        if  config:
            dasconfig = config
        else:
            dasconfig = das_readconfig()
        verbose       = dasconfig['verbose']
        self.stdout   = debug
        if  isinstance(debug, int):
            self.verbose = debug
            dasconfig['verbose'] = debug
        else:
            self.verbose = verbose
        das_timer('DASCore::init', self.verbose)
        self.operators = das_operators()

        # set noresults option
        self.noresults = False
        if  nores:
            dasconfig['write_cache'] = True
            self.noresults = nores

        self.multitask = dasconfig['das'].get('multitask', True)
        if  debug or self.verbose:
            self.multitask = False # in verbose mode do not use multitask
            dasconfig['das']['multitask'] = False
        if  not multitask: # explicitly call DASCore ctor, e.g. in analytics
            self.multitask = False
            dasconfig['das']['multitask'] = False
        dasconfig['engine'] = engine
        if  self.multitask:
            nworkers = dasconfig['das'].get('core_workers', 5)
            if  engine:
                thr_name = 'DASCore:PluginTaskManager'
                self.taskmgr = PluginTaskManager(\
                        engine, nworkers=nworkers, name=thr_name)
                self.taskmgr.subscribe()
            else:
                thr_name = 'DASCore:TaskManager'
                self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        if  logger:
            self.logger = logger
        else:
            self.logger = PrintManager('DASCore', self.verbose)

        # define Mapping/Analytics/Parser in this order since Parser depends
        # on first two
        dasmapping = DASMapping(dasconfig)
        dasconfig['dasmapping'] = dasmapping
        self.mapping = dasmapping

        self.analytics = DASAnalytics(dasconfig)
        dasconfig['dasanalytics'] = self.analytics

        self.mongoparser = ql_manager(dasconfig)
        dasconfig['mongoparser'] = self.mongoparser

        self.keylearning = DASKeyLearning(dasconfig)
        dasconfig['keylearning'] = self.keylearning

        # init DAS cache
        self.rawcache = DASMongocache(dasconfig)
        dasconfig['rawcache'] = self.rawcache

        # plug-in architecture: loop over registered data-services in
        # dasconfig; load appropriate module/class; register data
        # service with DASCore.
        self.systems = dasmapping.list_systems()
        # pointer to the DAS top level directory
        dasroot = '/'.join(__file__.split('/')[:-3])
        for name in self.systems:
            try:
                klass  = 'DAS/services/%s/%s_service.py' \
                    % (name, name)
                srvfile = os.path.join(dasroot, klass)
                with file(srvfile) as srvclass:
                    for line in srvclass:
                        if  line.find('(DASAbstractService)') != -1:
                            klass = line.split('(DASAbstractService)')[0]
                            klass = klass.split('class ')[-1] 
                            break
                mname  = 'DAS.services.%s.%s_service' % (name, name)
                module = __import__(mname, fromlist=[klass])
                obj = getattr(module, klass)(dasconfig)
                setattr(self, name, obj)
                SERVICES[name] = obj
            except IOError as err:
                if  debug > 1:
                    # we have virtual services, so an IOError can be expected
                    print_exc(err)
                try:
                    mname  = 'DAS.services.generic_service'
                    module = __import__(mname, fromlist=['GenericService'])
                    obj    = module.GenericService(name, dasconfig)
                    setattr(self, name, obj)
                except Exception as exc:
                    print_exc(exc)
                    msg = "Unable to load %s data-service plugin" % name
                    raise Exception(msg)
            except Exception as exc:
                print_exc(exc)
                msg = "Unable to load %s data-service plugin" % name
                raise Exception(msg)

        # loop over systems and get system keys, add mapping keys to final list
        self.service_keys = {}
        self.service_parameters = {}
        for name in self.systems: 
            skeys = getattr(self, name).keys()
            self.service_keys[getattr(self, name).name] = skeys
            sparams = getattr(self, name).parameters()
            self.service_parameters[getattr(self, name).name] = sparams

        self.service_keys['special'] = das_special_keys()
        self.dasconfig = dasconfig
        das_timer('DASCore::init', self.verbose)

    def keys(self):
        """
        Return map of data service keys
        """
        return self.service_keys

    def das_keys(self):
        """
        Return map of data service keys
        """
        _keys = ['records']
        for values in self.service_keys.values():
            for key in values:
                if  key not in _keys:
                    _keys.append(key)
        return _keys

    def result(self, query, idx=0, limit=None):
        """
        Get results either from cache or from explicit call
        """
        self.logger.info('input query=%s' % query)
        results = []
        dasquery = DASQuery(query, mongoparser=self.mongoparser)
        dasquery.add_to_analytics()
        query    = dasquery.mongo_query
        # check if we have any service which covers the query,
        # otherwise decompose it into a list of queries
        service_map = dasquery.service_apis_map()
        if  not service_map:
            msg  = 'no APIs found to answer input query, will decompose it'
            self.logger.info(msg)
            skeys = query['fields']
            if  not skeys:
                skeys = []
            for key in skeys:
                newquery = DASQuery(dict(fields=[key], spec=query['spec']),
                                        mongoparser=self.mongoparser)
                self.call(newquery) # process query
        else:
            self.call(dasquery) # process query

        # lookup provided query in a cache
        if  not self.noresults:
            results = self.get_from_cache(dasquery, idx, limit)
        return results

    def remove_from_cache(self, dasquery):
        """
        Delete cache entries for the input query
        """
        self.rawcache.remove_from_cache(dasquery)

    def get_status(self, dasquery):
        """
        Look-up status of provided query in a cache.
        Return status of the query request and its hash.
        """
        if  dasquery and dasquery.mongo_query.has_key('fields'):
            fields = dasquery.mongo_query['fields']
            if  fields and isinstance(fields, list) and 'queries' in fields:
                return 'ok', dasquery.qhash
        status = 0
        record = self.rawcache.find(dasquery)
        try:
            if  record and record.has_key('das') and \
                record['das'].has_key('status'):
                status = record['das']['status']
                return status, record['qhash']
        except:
            pass

        similar_dasquery = self.rawcache.similar_queries(dasquery)
        if  similar_dasquery:
            record = self.rawcache.find(similar_dasquery)
            if  record and record.has_key('das') and \
                record['das'].has_key('status'):
                similar_query_status = record['das']['status']
                return similar_query_status, record['qhash']
        return status, 0

    def worker(self, srv, dasquery):
        """Main worker function which calls data-srv call function"""
        self.logger.info('##### %s ######\n' % srv)
        das_timer(srv, self.verbose)
        getattr(getattr(self, srv), 'call')(dasquery)
        das_timer(srv, self.verbose)

    def call(self, query, add_to_analytics=True, **kwds):
        """
        Top level DAS api which executes a given query using the underlying
        data-services. It performs the following steps:

            - parse input query
            - identify data-services based on selection keys
              and where clause conditions
            - construct DAS workflow and execute data-service 
              API calls. At this step individual 
              data-services store results into DAS cache.

        Return the status of the call ('ok', 'fail', 'in cache', or a cached
        query status); it can be used by workers on the cache server.

        kwds is provided for compatibility with web layer, e.g. it
        may invoke this method with additional pid parameter.
        """
        self.logger.info('input query=%s' % query)
        das_timer('DASCore::call', self.verbose)
        services = []
        if  isinstance(query, object) and hasattr(query, '__class__')\
            and query.__class__.__name__ == 'DASQuery':
            dasquery = query
        else:
            dasquery = DASQuery(query, mongoparser=self.mongoparser)
        if  add_to_analytics:
            dasquery.add_to_analytics()
        query = dasquery.mongo_query
        if  dasquery.mongo_query.has_key('system'):
            system = query['system']
            if  isinstance(system, str) or isinstance(system, unicode):
                services = [system]
            elif isinstance(system, list):
                services = system
            else:
                msg = 'Unsupported system=%s type=%s in DAS query' \
                        % (system, type(system))
                raise Exception(msg)
        spec   = query.get('spec')
        fields = query.get('fields')
        if  fields == ['records']:
            msg = 'look-up all records in cache'
            self.logger.info(msg)
            return 'in cache'
        if  spec == dict(records='*'):
            self.logger.info("look-up everything in cache")
            return 'in cache'
        for record in self.rawcache.find_specs(dasquery):
            status = record['das']['status']
            msg = 'found query %s in cache, status=%s\n' \
                        % (record['query'], status)
            self.logger.info(msg)
            return status
        similar_dasquery = self.rawcache.similar_queries(dasquery)
        if  similar_dasquery:
            for record in self.rawcache.find_specs(similar_dasquery):
                if  record:
                    try:
                        status = record['das']['status']
                    except:
                        status = 'N/A'
                        msg = 'Fail to look-up das.status, record=%s' % record
                        self.logger.info(msg)
                msg  = 'found SIMILAR query in cache,'
                msg += 'query=%s, status=%s\n' % (record['query'], status)
                self.logger.info(msg)
                return status

        self.logger.info(dasquery)
        params = dasquery.params()
        if  not services:
            services = params['services']
        self.logger.info('services = %s' % services)
        das_timer('das_record', self.verbose)
        # initial expire tstamp 1 day (long enough to be overwritten by data-srv)
        expire = expire_timestamp(time.time()+1*24*60*60)
        header = dasheader("das", dasquery, expire)
        header['lookup_keys'] = []
        self.rawcache.insert_query_record(dasquery, header)
        das_timer('das_record', self.verbose)
        try:
            if  self.multitask:
                jobs = []
                for srv in services:
                    jobs.append(self.taskmgr.spawn(self.worker, srv, dasquery))
                self.taskmgr.joinall(jobs)
            else:
                for srv in services:
                    self.worker(srv, dasquery)
        except Exception as exc:
            print_exc(exc)
            return 'fail'
        self.logger.info('\n##### merging ######\n')
        self.rawcache.update_query_record(dasquery, 'merging')
        das_timer('merge', self.verbose)
        self.rawcache.merge_records(dasquery)
        das_timer('merge', self.verbose)
        self.rawcache.update_query_record(dasquery, 'ok')
        self.rawcache.add_to_record(\
                dasquery, {'das.timer': get_das_timer()}, system='das')
        das_timer('DASCore::call', self.verbose)
        return 'ok'

    def nresults(self, dasquery, coll='merge'):
        """
        Return total number of results (count) for provided query
        Code should match body of get_from_cache method.
        """
        fields = dasquery.mongo_query.get('fields', None)
        if  dasquery.mapreduce:
            result = self.rawcache.map_reduce(dasquery.mapreduce, dasquery)
            return len([1 for _ in result])
        elif dasquery.aggregators:
            return len(dasquery.aggregators)
        elif isinstance(fields, list) and 'queries' in fields:
            return len([1 for _ in self.get_queries(dasquery)])
        return self.rawcache.nresults(dasquery, coll)

    def incache(self, dasquery, coll='merge'):
        """
        Answer whether the given query is in the DAS cache or not
        """
        return self.rawcache.incache(dasquery, collection=coll)

    def get_from_cache(self, dasquery, idx=0, limit=0, collection='merge'):
        """
        Look-up results from the merge cache and yield them for
        further processing.
        """
        das_timer('DASCore::get_from_cache', self.verbose)
        msg = 'col=%s, query=%s, idx=%s, limit=%s'\
                % (collection, dasquery, idx, limit)
        self.logger.info(msg)

        fields  = dasquery.mongo_query.get('fields', None)

        if  dasquery.mapreduce:
            res = self.rawcache.map_reduce(dasquery.mapreduce, dasquery)
        elif dasquery.aggregators:
            res = []
            _id = 0
            for func, key in dasquery.aggregators:
                rows = self.rawcache.get_from_cache(\
                        dasquery, collection=collection)
                data = getattr(das_aggregator, 'das_%s' % func)(key, rows)
                res += \
                [{'_id':_id, 'function': func, 'key': key, 'result': data}]
                _id += 1
        elif isinstance(fields, list) and 'queries' in fields:
            res = itertools.islice(self.get_queries(dasquery), idx, idx+limit)
        else:
            res = self.rawcache.get_from_cache(dasquery, idx, limit, \
                    collection=collection)
        for row in res:
            fix_times(row)
            yield row
        das_timer('DASCore::get_from_cache', self.verbose)

    def get_queries(self, dasquery):
        """
        Look-up (popular) queries in DAS analytics/logging db
        """
        das_timer('DASCore::get_queries', self.verbose)
        fields = dasquery.mongo_query.get('fields')
        spec   = dasquery.mongo_query.get('spec')
        if  'popular' in fields:
            res = self.analytics.get_popular_queries(spec)
        else:
            datestamp = spec.get('date')
            if  isinstance(datestamp, dict):
                value = datestamp.get('$in')
                res = \
                self.analytics.list_queries(after=value[0], before=value[1])
            elif isinstance(datestamp, int):
                res = self.analytics.list_queries(after=datestamp)
            elif not datestamp:
                res = self.analytics.list_queries()
            else:
                msg = 'Unsupported date value: %s' % datestamp
                raise Exception(msg)
        for row in res:
            rid = row.pop('_id')
            yield dict(das_query=row, _id=rid)
        das_timer('DASCore::get_queries', self.verbose)
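
Example #5 is the heart of DAS: DASCore wires together the mapping, analytics, query parser, key-learning and Mongo cache components, loads one plugin per registered data-service, and exposes call(), result() and get_from_cache() on top of them. A hedged sketch of typical use; the import path and the query text are assumptions.

from DAS.core.das_core import DASCore        # assumed import path

das = DASCore(debug=1)                       # an int debug also sets verbosity
for row in das.result('dataset=/ZMM*/*/*', idx=0, limit=10):
    print(row)                               # result() issues the call and then
                                             # reads back from the merge cache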
Example #6
class DASMongocache(object):
    """
    DAS cache based on MongoDB.
    """
    def __init__(self, config):
        self.config  = config
        self.emptyset_expire = \
                expire_timestamp(config['das'].get('emptyset_expire', 5))
        self.dburi   = config['mongodb']['dburi']
        self.cache_size = config['mongodb']['bulkupdate_size']
        self.dbname  = config['dasdb']['dbname']
        self.verbose = config['verbose']
        self.logger  = PrintManager('DASMongocache', self.verbose)
        self.mapping = config['dasmapping']
        self.logging = config['dasdb'].get('logging', False)
        self.rec_ttl = config['dasdb'].get('record_ttl', 24*60*60)
        self.del_ttl = config['dasdb'].get('delta_ttl', 60)
        self.cleanup_del_ttl = config['dasdb'].get('cleanup_delta_ttl', 3600)
        self.retry   = config['dasdb'].get('retry', 3)
        self.das_son_manipulator = DAS_SONManipulator()

        # Initialize MongoDB connection
        self.col_    = self.config['dasdb']['cachecollection']
        self.mrcol_  = self.config['dasdb']['mrcollection']
        self.merge_  = self.config['dasdb']['mergecollection']
        self.gfs     = db_gridfs(self.dburi)

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        # ensure that we have the following indexes
        common_idx = [
                      ('file.name', DESCENDING),
                      ('dataset.name', DESCENDING),
                      ('block.name', DESCENDING),
                      ('run.run_number', DESCENDING),
                      ]
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('das.system', ASCENDING),
                      ('qhash', DESCENDING),
                      ('das.record', ASCENDING)]
        create_indexes(self.col, index_list + common_idx)
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('qhash', DESCENDING),
                      ('das.record', ASCENDING),
                      ('das.ts', ASCENDING)]
        create_indexes(self.merge, index_list)
        # NOTE: I found that creating an index in the merge collection leads
        # to a MongoDB error when records contain multiple arrays on indexed
        # keys. For example, when we query file,run,lumi both file and run
        # are arrays in MongoDB. In this case the final sort in MongoDB
        # fails with the following message:
        # cannot sort with keys that are parallel arrays
        # it looks like there is no fix for that yet, see
        # http://stackoverflow.com/questions/6516725/how-do-i-index-two-arrays-in-mongodb
        # therefore I temporarily disabled the create_indexes call on the
        # merge collection which was used to create an index to ease the
        # final sort, especially when a lot of records correspond to the
        # initial query, e.g. file records.
        # On the other hand, the most common use case where the sort fails is
        # getting file records, and I can add one compound key to ease the
        # sort, but I can't add another compound key on an array field,
        # e.g. run
        common_idx = [[('qhash', DESCENDING), ('file.name', DESCENDING)]]
        create_indexes(self.merge, index_list + common_idx)
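        # Illustrative sketch of the limitation described in the NOTE above
        # (documentation only, not executed): with merged records where both
        # 'file' and 'run' are arrays, sorting over the two array keys, e.g.
        #   self.merge.find({'qhash': qhash}).sort([('file.name', DESCENDING),
        #                                           ('run.run_number', DESCENDING)])
        # fails in MongoDB with "cannot sort with keys that are parallel
        # arrays", which is why only the single (qhash, file.name) compound
        # index is created above.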

        # thread which cleans up DAS collections
        thname = 'mongocache_cleanup'
        cols   = [config['dasdb']['cachecollection'],
                  config['dasdb']['mrcollection'],
                  config['dasdb']['mergecollection']]

    @property
    def col(self):
        "col property provides access to DAS cache collection"
        conn = db_connection(self.dburi)
        mdb  = conn[self.dbname]
        colnames = mdb.collection_names()
        if  not colnames or self.col_ not in colnames:
            try:
                mdb.create_collection(self.col_)
            except OperationFailure:
                pass
        mdb.add_son_manipulator(self.das_son_manipulator)
        return mdb[self.col_]

    @property
    def merge(self):
        "merge property provides access to DAS merge collection"
        conn = db_connection(self.dburi)
        mdb  = conn[self.dbname]
        colnames = mdb.collection_names()
        if  not colnames or self.merge_ not in colnames:
            try:
                mdb.create_collection(self.merge_)
            except OperationFailure:
                pass
        mdb.add_son_manipulator(self.das_son_manipulator)
        return mdb[self.merge_]

    @property
    def mrcol(self):
        "mrcol property provides access to DAS map-reduce collection"
        conn = db_connection(self.dburi)
        mdb  = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        return mdb[self.mrcol_]

    def get_dataset_hashes(self, dasquery):
        "Get dataset hashes from DBS database"
        spec = dasquery.mongo_query.get('spec', {})
        inst = dasquery.instance
        conn = db_connection(self.dburi)
        if  spec and inst:
            dataset = spec.get('dataset.name', None)
            if  dataset:
                if  dataset.find('*') != -1:
                    cond = {'dataset':re.compile(dataset.replace('*', '.*'))}
                else:
                    cond = {'dataset': dataset}
                for row in conn['dbs'][inst].find(cond):
                    if  'qhash' in row:
                        yield row['qhash']

    def check_datasets(self, dasquery):
        "Check dataset presence in DAS cache for given das query"
        hashes = [r for r in self.get_dataset_hashes(dasquery)]
        if  hashes:
            spec = {'qhash': {'$in': hashes}}
            if  len(hashes) == self.merge.find(spec, **PYMONGO_OPTS).count():
                dasquery._hashes = hashes

    def get_superset_keys(self, key, value):
        """
        This is a special-case version of similar_keys,
        intended for analysers that want to quickly
        find possible superset queries of a simple
        query of the form key=value.
        """

        msg = "%s=%s" % (key, value)
        self.logger.debug(msg)
        cond = {'query.spec.key': key}
        for row in self.col.find(cond, **PYMONGO_OPTS):
            mongo_query = decode_mongo_query(row['query'])
            for thiskey, thisvalue in mongo_query.items():
                if thiskey == key:
                    if fnmatch.fnmatch(value, thisvalue):
                        yield thisvalue

    def get_fields(self, dasquery):
        "Prepare fields to extract from MongoDB"
        fields     = dasquery.mongo_query.get('fields', [])
        if  fields and 'records' in fields:
            fields = None # look-up all records
        filters    = dasquery.filters
        cond       = {}
        if  filters:
            new_fields = []
            for dasfilter in filters:
                if  dasfilter == 'unique':
                    continue
                if  fields and dasfilter not in fields and \
                    dasfilter not in new_fields:
                    if  dasfilter.find('=') == -1 and dasfilter.find('<') == -1\
                    and dasfilter.find('>') == -1:
                        new_fields.append(dasfilter)
                    else:
                        cond = parse_filters(dasquery.mongo_query)
            if  not new_fields and fields:
                new_fields = list(fields)
            return new_fields, cond
        return fields, cond

    def remove_expired(self, dasquery, collection):
        """
        Remove expired records from DAS cache. We need to perform this
        operation very carefully since we don't use transactions and on-going
        commits can invoke this method (see das_core.py). Therefore we use
        the MongoDB $or operator to wipe out queries which match the DASQuery
        hash and have already expired, or queries which lived in the cache for
        more than the rec_ttl config parameter. The latter operation just
        prevents the DAS cache from growing.
        """
        conn   = db_connection(self.dburi)
        mdb    = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        col    = mdb[collection]
        # use additional delta to check data record expiration
        # we add this delta to ensure that there are no records close to
        # the current timestamp which may expire during request processing
        spec = {'qhash':dasquery.qhash,
                'das.expire':{'$lt':time.time()+self.del_ttl}}
        col.delete_many(spec)

    def check_services(self, dasquery):
        """
        Check if DAS cache contains DAS records with service response for
        given query.
        """
        das_rec  = self.find(dasquery)
        if  not das_rec:
            return False
        if  'das' not in das_rec:
            return False
        if  'services' not in das_rec['das']:
            return False
        spec = {'qhash':dasquery.qhash, 'das.system':{'$ne':'das'},
                'das.expire':{'$gt':time.time()}}
        nres = self.col.find(spec, **PYMONGO_OPTS).count()
        if  nres:
            return True
        return False

    def find(self, dasquery):
        """
        Find provided query in DAS cache.
        """
        cond = {'qhash': dasquery.qhash, 'das.system':'das',
                'das.expire': {'$gt':time.time()}}
        return find_one(self.col, cond)

    def find_specs(self, dasquery, system='das'):
        """
        Check if cache has query whose specs are identical to provided query.
        Return all matches.
        """
        if dasquery.hashes:
            cond = {'qhash':{'$in':dasquery.hashes}}
        else:
            cond = {'qhash': dasquery.qhash}
        if  system:
            cond.update({'das.system': system})
        cond.update({'das.expire':{'$gt':time.time()}})
        return self.col.find(cond, **PYMONGO_OPTS)

    def get_das_ids(self, dasquery):
        """
        Return list of DAS ids associated with given query
        """
        das_ids = []
        try:
            das_ids = \
                [r['_id'] for r in self.col.find_specs(dasquery, system='')]
        except:
            pass
        return das_ids

    def update_das_expire(self, dasquery, timestamp):
        "Update timestamp of all DAS data records for given query"
        nval = {'$set': {'das.expire':timestamp}}
        spec = {'qhash' : dasquery.qhash}
        self.col.update_many(spec, nval)
        self.merge.update_many(spec, nval)

    def das_record(self, dasquery):
        "Retrieve DAS record for given query"
        cond = {'qhash': dasquery.qhash, 'das.expire':{'$gt':time.time()}}
        return find_one(self.col, cond)

    def find_records(self, das_id):
        " Return all the records matching a given das_id"
        return self.col.find({'das_id': das_id}, **PYMONGO_OPTS)

    def is_error_in_records(self, dasquery, collection='cache'):
        "Scan DAS cache for error records and return true or not"
        if  collection == 'cache':
            results = self.col.find({'qhash':dasquery.qhash}, **PYMONGO_OPTS)
        else:
            results = self.merge.find({'qhash':dasquery.qhash}, **PYMONGO_OPTS)
        error  = None
        reason = None
        for row in results:
            if 'error' in row:
                error  = row.get('error')
                reason = row.get('reason', '')
                break
        return error, reason

    def add_to_record(self, dasquery, info, system=None):
        "Add to existing DAS record provided info"
        if  system:
            self.col.update_one({'query': dasquery.storage_query,
                             'das.system':system},
                            {'$set': info}, upsert=True)
        else:
            self.col.update_one({'query': dasquery.storage_query},
                            {'$set': info}, upsert=True)

    def find_min_expire(self, dasquery):
        """Find minimal expire timestamp across all records for given DAS query"""
        spec   = {'qhash': dasquery.qhash}
        min_expire = 2*time.time() # upper bound, will update
        for rec in self.col.find(spec, **PYMONGO_OPTS):
            if  'das' in rec and 'expire' in rec['das']:
                estamp = rec['das']['expire']
                if  min_expire > estamp:
                    min_expire = estamp
        return long(min_expire)

    def find_query_record(self, dasquery):
        "Find DAS query records and return them to the caller"
        spec = {'qhash':dasquery.qhash,
                'das.record':record_codes('query_record')}
        return self.col.find(spec, **PYMONGO_OPTS)

    def update_query_record(self, dasquery, status, header=None, reason=None):
        "Update DAS record for provided query"
        ctime = time.time()
        das_spec = {'qhash': dasquery.qhash, 'das.system':'das'}
        min_expire = self.find_min_expire(dasquery)
        if  header:
            system = header['das']['system']
            sts    = header['das']['status']
            expire = header['das']['expire']
            spec   = {'qhash': dasquery.qhash, 'das.system': system}
            new_expire = None
            for rec in self.col.find(spec, **PYMONGO_OPTS):
                if  'das' in rec and 'expire' in rec['das']:
                    if  rec['das']['expire'] > expire:
                        new_expire = expire
                        ndict = {'das.expire':expire, 'das.status':status}
                        cdict = {'das.ctime':ctime}
                        udict = {'$set':ndict, '$push':cdict}
                        oid   = ObjectId(rec['_id'])
                        self.col.update_one({'_id':oid}, udict)
            if  new_expire:
                udict = {'$set': {'das.expire': new_expire},
                         '$push': {'das.ctime':ctime}}
                self.col.update_one(das_spec, udict)
        else:
            udict = {'$set': {'das.status':status, 'das.expire': min_expire},
                     '$push': {'das.ctime':ctime}}
            self.col.update_one(das_spec, udict)
        if  reason:
            udict = {'$set': {'das.reason':reason}}
            self.col.update_one(das_spec, udict)
        # align all expire timestamps when we receive ok status
        if  status == 'ok':
            udict = {'$set': {'das.expire': min_expire}}
            self.col.update_one(das_spec, udict)

    def apilist(self, dasquery):
        "Return list of apis for given dasquery"
        spec = {'qhash':dasquery.qhash,
                'das.record':record_codes('query_record')}
        apis = []
        for row in self.col.find(spec, ['das.api'], **PYMONGO_OPTS):
            try:
                apis += row['das']['api']
            except Exception as _err:
                pass
        return apis

    def incache(self, dasquery, collection='merge', system=None, api=None,
            query_record=False):
        """
        Check if we have query results in cache; return True if so, False otherwise.
        Please note, input parameter query means MongoDB query, please
        consult MongoDB API for more details,
        http://api.mongodb.org/python/
        """
        if  query_record:
            record = record_codes('query_record')
        else:
            record = spec4data_records()
        spec = {'qhash':dasquery.qhash, 'das.record':record,
                'das.expire':{'$gt':time.time()}}
        if  system:
            spec.update({'das.system': system})
        if  api:
            spec.update({'das.api': api})
        conn = db_connection(self.dburi)
        mdb  = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        col  = mdb[collection]
        res  = col.find(spec, **PYMONGO_OPTS).count()
        msg  = "(%s, coll=%s) found %s results" % (dasquery, collection, res)
        self.logger.info(msg)
        if  res:
            return True
        return False

    def nresults(self, dasquery, collection='merge'):
        """Return number of results for given query."""
        if  dasquery.aggregators:
            return len(dasquery.aggregators)
        # Distinguish 2 use cases, unique filter and general query:
        # in the first one we should count only unique records, in the latter
        # we can rely on the DB count() method. Please keep in mind that
        # passing fields to find() doesn't affect counting, since it is only
        # a projection over records found with spec, so we don't need it here.
        fields, filter_cond = self.get_fields(dasquery)
        if  not fields:
            spec = dasquery.mongo_query.get('spec', {})
        elif dasquery.hashes:
            spec = {'qhash':{'$in':dasquery.hashes},
                    'das.record': spec4data_records()}
        else:
            spec = {'qhash':dasquery.qhash,
                    'das.record': spec4data_records()}
        if  filter_cond:
            spec.update(filter_cond)
        conn = db_connection(self.dburi)
        mdb  = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        col  = mdb[collection]
        if  dasquery.unique_filter:
            skeys = self.mongo_sort_keys(collection, dasquery)
            if  skeys:
                gen = col.find(spec, **PYMONGO_OPTS).sort(skeys)
            else:
                gen = col.find(spec, **PYMONGO_OPTS)
            res = len([r for r in unique_filter(gen)])
        else:
            res = col.find(spec, **PYMONGO_OPTS).count()
            if  not res: # double check that this is really the case
                time.sleep(1)
                res = col.find(spec, **PYMONGO_OPTS).count()
        msg = "%s" % res
        self.logger.info(msg)
        return res

    def mongo_sort_keys(self, collection, dasquery):
        """
        Find list of sort keys for a given DAS query. Check existing
        indexes and either use fields or spec keys to find them out.
        Return list of mongo sort keys in a form of (key, order).
        """
        # try to get sort keys all the time to get ordered list of
        # docs which allow unique_filter to apply afterwards
        fields = dasquery.mongo_query.get('fields')
        spec   = dasquery.mongo_query.get('spec')
        skeys  = dasquery.sortkeys
        mongo_skeys = []
        if  skeys:
            for key in skeys:
                if  key.find('-') != -1: # reverse order, e.g. desc
                    mongo_skeys.append((key.replace('-', ''), DESCENDING))
                else:
                    mongo_skeys.append((key, ASCENDING))
        else:
            existing_idx = [i for i in self.existing_indexes(collection)]
            if  fields:
                lkeys = []
                for key in fields:
                    for mkey in self.mapping.mapkeys(key):
                        if  mkey not in lkeys:
                            lkeys.append(mkey)
            else:
                lkeys = list(spec.keys())
            keys = [k for k in lkeys \
                if k.find('das') == -1 and k.find('_id') == -1 and \
                        k in existing_idx]
            mongo_skeys = [(k, ASCENDING) for k in keys]
        return mongo_skeys

    def existing_indexes(self, collection='merge'):
        """
        Get list of existing indexes in DB. They are returned by the
        index_information API in the following form:

        .. doctest::

            {u'_id_': {u'key': [(u'_id', 1)], u'v': 0},
             u'das.expire_1': {u'key': [(u'das.expire', 1)], u'v': 0},
             ...
             u'tier.name_-1': {u'key': [(u'tier.name', -1)], u'v': 0}}
        """
        conn = db_connection(self.dburi)
        mdb  = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        col = mdb[collection]
        for val in col.index_information().values():
            for idx in val['key']:
                yield idx[0] # index name

    def get_records(self, coll, spec, fields, skeys, idx, limit, unique=False):
        "Generator to get records from MongoDB."
        try:
            conn = db_connection(self.dburi)
            mdb  = conn[self.dbname]
            mdb.add_son_manipulator(self.das_son_manipulator)
            col = mdb[coll]
            nres = col.find(spec, **PYMONGO_OPTS).count()
            if  nres == 1 or nres <= limit:
                limit = 0
            if  limit:
                res = col.find(spec, fields, sort=skeys, skip=idx, limit=limit)
            else:
                res = col.find(spec, fields, sort=skeys, **PYMONGO_OPTS)
            if  unique:
                res = unique_filter(res)
            for row in res:
                yield row
        except Exception as exp:
            print_exc(exp)
            row = {'exception': str(exp)}
            res = []
            yield row

    def get_from_cache(self, dasquery, idx=0, limit=0, collection='merge'):
        "Generator which retrieves results from the cache"
        if  dasquery.service_apis_map(): # valid DAS query
            result = self.get_das_records(dasquery, idx, limit, collection)
            for row in result:
                yield row
        else: # pure MongoDB query
            fields  = dasquery.mongo_query.get('fields', [])
            if  fields == None:
                fields = []
            spec    = dasquery.mongo_query.get('spec', {})
            if  dasquery.filters:
                if  not fields:
                    fields = []
                fields += dasquery.filters
                pkeys   = [k.split('.')[0] for k in fields]
            fields += das_record_keys()
            if  'records' in dasquery.query:
                fields = None # special case for DAS 'records' keyword
            skeys   = self.mongo_sort_keys(collection, dasquery)
            result  = self.get_records(collection, spec, fields, skeys, \
                            idx, limit, dasquery.unique_filter)
            for row in result:
                if  dasquery.filters:
                    if  pkeys and set(pkeys) & set(row.keys()):
                        yield row
                else:
                    yield row

    def get_das_records(self, dasquery, idx=0, limit=0, collection='merge'):
        "Generator which retrieves DAS records from the cache"
        msg = "(%s, %s, %s, coll=%s)" % (dasquery, idx, limit, collection)
        self.logger.info(msg)

        idx = int(idx)
        fields, filter_cond = self.get_fields(dasquery)
        if  fields == None:
            fields = []
        if  not fields:
            spec = dasquery.mongo_query.get('spec', {})
        elif dasquery.hashes:
            spec = {'qhash':{'$in':dasquery.hashes},
                    'das.record': spec4data_records()}
        else:
            spec = {'qhash':dasquery.qhash,
                    'das.record': spec4data_records()}
        if  filter_cond:
            spec.update(filter_cond)
        if  'records' in dasquery.query:
            fields  = None # retrieve all fields for records DAS query
        else:
            # be sure to extract das internal keys
            fields += das_record_keys()
        # try to get sort keys all the time to get ordered list of
        # docs which allow unique_filter to apply afterwards
        skeys   = self.mongo_sort_keys(collection, dasquery)
        res     = self.get_records(collection, spec, fields, skeys, \
                        idx, limit, dasquery.unique_filter)
        counter = 0
        for row in res:
            counter += 1
            yield row
        msg = 'qhash %s, found %s record(s) in %s collection' \
                % (dasquery.qhash, counter, collection)
        print(dastimestamp('DAS INFO '), msg)

        if  counter:
            msg = "yield %s record(s)" % counter
            self.logger.info(msg)

        # if no raw records were yielded we look up possible error records
        # and reset timestamp for record with system:['das']
        if  not counter:
            spec = {'qhash':dasquery.qhash}
            nrec = self.col.find(spec, **PYMONGO_OPTS).count()
            if  nrec:
                msg = "for query %s, found %s non-result record(s)" \
                        % (dasquery, nrec)
                print(dastimestamp('DAS WARNING'), msg)
                for rec in self.col.find(spec, **PYMONGO_OPTS):
                    if  'query' in rec:
                        print(dastimestamp('DAS das record'), rec)
            self.update_das_expire(dasquery, etstamp())

    def map_reduce(self, mr_input, dasquery, collection='merge'):
        """
        Wrapper around _map_reduce to allow sequential map/reduce
        operations, e.g. map/reduce out of map/reduce.

        mr_input is either alias name or list of alias names for
        map/reduce functions.

        The input dasquery is applied to the first
        iteration of the map/reduce functions.
        """
        # NOTE: I need to revisit mapreduce.
        spec = dasquery.mongo_query['spec']
        if  not isinstance(mr_input, list):
            mrlist = [mr_input]
        else:
            mrlist = mr_input
        conn = db_connection(self.dburi)
        mdb  = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        coll = mdb[collection]
        for mapreduce in mrlist:
            if  mapreduce == mrlist[0]:
                cond = spec
            else:
                cond = None
            coll = self._map_reduce(coll, mapreduce, cond)
        for row in coll.find():
            yield row

    def _map_reduce(self, coll, mapreduce, spec=None):
        """
        Perform map/reduce operation over DAS cache using provided
        collection, mapreduce name and optional conditions.
        """
        self.logger.debug("(%s, %s)" % (mapreduce, spec))
        record = find_one(self.mrcol, {'name':mapreduce})
        if  not record:
            raise Exception("Map/reduce function '%s' not found" % mapreduce)
        fmap = record['map']
        freduce = record['reduce']
        if  spec:
            result = coll.map_reduce(Code(fmap), Code(freduce), query=spec)
        else:
            result = coll.map_reduce(Code(fmap), Code(freduce))
        msg = "found %s records in %s" % (result.count(), result.name)
        self.logger.info(msg)
        self.logger.debug(fmap)
        self.logger.debug(freduce)
        return result

    def get_map_reduce(self, name=None):
        """
        Return the definition of map/reduce functions for the provided name,
        or give the full list.
        """
        spec = {}
        if  name:
            spec = {'name':name}
        result = self.mrcol.find(spec, **PYMONGO_OPTS)
        for row in result:
            yield row

    def merge_records(self, dasquery, attempt=0):
        """
        Merge DAS records for provided query. We perform the following
        steps:
        1. get all queries from das.cache by ordering them by primary key
        2. run aggregator function to merge neighbors
        3. insert records into das.merge
        """
        ### TMP for asyncio
#         time.sleep(attempt+3) # pymongo 3.2 don't yet flush in time

        # remove any entries in merge collection for this query
        self.merge.delete_many({'qhash':dasquery.qhash})
        # proceed
        self.logger.debug(dasquery)
        id_list = []
        expire  = 9999999999 # future
        # get all API records for given DAS query
        spec    = {'qhash':dasquery.qhash,
                   'das.expire':{'$gt':time.time()},
                   'das.record':record_codes('query_record')}
        records = self.col.find(spec, **PYMONGO_OPTS)
        for row in records:
            # find smallest expire timestamp to be used by aggregator
            rexpire = row.get('das', {}).get('expire', expire)
            if  rexpire < expire:
                expire = rexpire
            if  row['_id'] not in id_list:
                id_list.append(row['_id'])
        inserted = 0
        lookup_keys = set()
        fields = dasquery.mongo_query.get('fields')
        if  not fields: # Mongo
            fields = []
        for key in fields:
            for pkey in self.mapping.mapkeys(key):
                lookup_keys.add(pkey)
        for pkey in lookup_keys:
            skey = [(pkey, DESCENDING)]
            # lookup all service records
            spec = {'das_id': {'$in': id_list}, 'das.primary_key': pkey}
            if  self.verbose:
                nrec = self.col.find(spec, **PYMONGO_OPTS).sort(skey).count()
                msg  = "merging %s records, for %s key" % (nrec, pkey)
            else:
                msg  = "merging records, for %s key" % pkey
            self.logger.debug(msg)
            # use exhaust=False since we process all records in aggregator
            # and there can be a delay in processing
            records = self.col.find(spec, **PYMONGO_NOEXHAUST).sort(skey)
            # aggregate all records
            agen = aggregator(dasquery, records, expire)
            # diff aggregated records
            gen = das_diff(agen, self.mapping.diff_keys(pkey.split('.')[0]))
            # insert all records into das.merge using bulk insert
            size = self.cache_size
            try:
                res = self.merge.insert_many(gen)
                inserted += len(res.inserted_ids)
            except InvalidDocument as exp:
                print(dastimestamp('DAS WARNING'), 'InvalidDocument during merge', str(exp))
                msg = "Caught bson error: " + str(exp)
                self.logger.info(msg)
                records = self.col.find(spec, **PYMONGO_OPTS).sort(skey)
                gen = aggregator(dasquery, records, expire)
                genrows = parse2gridfs(self.gfs, pkey, gen, self.logger)
                das_dict = {'das':{'expire':expire,
                        'das.record': record_codes('gridfs_record'),
                        'primary_key':[k for k in lookup_keys],
                        'system': ['gridfs']}, 'qhash':dasquery.qhash,
                        'cache_id':[], 'das_id': id_list}
                for row in genrows:
                    row.update(das_dict)
                    self.merge.insert(row)
            except InvalidOperation as exp:
                pass
            except DuplicateKeyError as err:
                print(dastimestamp('DAS WARNING'), 'DuplicateKeyError during merge')
                if  not isinstance(gen, list):
                    raise err
        status = 'fail'
        if  inserted:
            status = 'ok'
        elif  not lookup_keys: # we get query w/o fields
            msg = 'qhash %s, no lookup_keys' % dasquery.qhash
            print(dastimestamp('DAS WARNING'), msg)
            status = 'ok'
        else: # we didn't merge anything, it is DB look-up failure
            msg  = 'qhash %s, did not insert into das.merge, attempt %s' \
                    % (dasquery.qhash, attempt)
            print(dastimestamp('DAS WARNING'), msg)
            empty_expire = etstamp()
            lkeys = list(lookup_keys)
            das = dict(expire=empty_expire, primary_key=lkeys[0],
                       condition_keys=lkeys,
                       instance=dasquery.instance,
                       system=['das'], services=dasquery.services,
                       record=record_codes('empty_record'),
                       ts=time.time(), api=[])
            empty_record = {'das':das, 'qhash': dasquery.qhash,
                            'cache_id':[], 'das_id': id_list}
            for key in lkeys:
                empty_record.update({key.split('.')[0]:[]})
            for key, val in dasquery.mongo_query['spec'].items():
                if  key.find('.') == -1:
                    empty_record[key] = []
                else: # it is compound key, e.g. site.name
                    newkey, newval = convert_dot_notation(key, val)
                    empty_record[newkey] = adjust_mongo_keyvalue(newval)
            self.merge.insert(empty_record)
            # update DAS records (both meta and data ones, by using qhash)
            nval = {'$set': {'das.expire':empty_expire}}
            spec = {'qhash':dasquery.qhash}
            self.col.update_many(spec, nval)
        return status

    def update_cache(self, dasquery, results, header, system, api):
        """
        Insert results into cache. Use bulk inserts controlled by
        self.cache_size. Upon completion ensure indexes.
        """
        # update results records in DAS cache
        gen  = self.generate_records(dasquery, results, header)
        inserted = 0
        # bulk insert
        try:
            res = self.col.insert_many(gen, ordered=False, bypass_document_validation=True)
            inserted += len(res.inserted_ids)
        except InvalidOperation:
            pass

        # update query record for this sub-system
        self.update_query_record_system(dasquery, system, api, 'ok')

        if  dasquery.qcache: # custom DASQuery cache
            self.update_das_expire(dasquery, expire_timestamp(dasquery.qcache))

    def update_query_record_system(self, dasquery, system, api, status):
        "Update system status of dasquery in das.cache collection"
        spec = {'qhash': dasquery.qhash, 'das.system': system, 'das.api': api,
                'das.record':record_codes('query_record')}
        udict = {'$set': {'das.status':status}}
#         print("### update_query_record", spec)
        doc=self.col.find_one_and_update(spec, udict, return_document=ReturnDocument.AFTER)
#         print(doc)

    def insert_query_record(self, dasquery, header):
        """
        Insert query record into DAS cache.
        """
        # check presence of API record in a cache
        dasheader   = header['das']
        system      = dasheader['system']
        api         = dasheader['api']
        collection  = 'cache'
        check_query = True
        expire = dasheader.get('expire', None)
        if  expire:
            dasheader['expire'] = adjust_expire(expire)
        if  not self.incache(dasquery, collection, system, api, check_query):
            msg = "query=%s, header=%s" % (dasquery, header)
            self.logger.debug(msg)
            q_record = dict(das=dasheader, query=dasquery.storage_query)
            q_record['das']['record'] = record_codes('query_record')
            q_record['das']['status'] = "requested"
            q_record['qhash'] = dasquery.qhash
            q_record['das']['ctime'] = [time.time()]
            res = self.col.insert_one(q_record)
            if  not res:
                msg = 'unable to insert query record'
                print(dastimestamp('DAS ERROR '), dasquery, msg, ', will retry')
                time.sleep(1)
                res = self.col.insert(q_record)
                if  not res:
                    print(dastimestamp('DAS ERROR '), dasquery, msg)

    def generate_records(self, dasquery, results, header):
        """
        Iterate over provided results, update records and yield them
        to next level (update_cache)
        """
        self.logger.debug("(%s) store to cache" % dasquery)
        if  not results:
            return

        dasheader  = header['das']
        expire     = adjust_expire(dasheader['expire'])
        system     = dasheader['system'] # DAS service names, e.g. combined
        services   = dasheader['services'] # CMS services used to get data
        api        = dasheader['api']
        prim_key   = header.get('prim_key', None)
        if  not prim_key:
            # get primary key from a list of lookup keys which has the
            # following structure [{'api':[keys]}, {...}]
            lup_keys = header['lookup_keys']
            lkeys    = [l for i in lup_keys for k in i.values() for l in k]
            prim_key = lkeys[0] if 'summary' not in lkeys else 'summary'
        cond_keys  = list(dasquery.mongo_query['spec'].keys())
        # get API record id
        spec       = {'qhash':dasquery.qhash, 'das.system':system,
                      'das.expire': {'$gt':time.time()},
                      'das.record': record_codes('query_record')}
        counter    = 0
        rids = [str(r['_id']) for r in \
                self.col.find(spec, ['_id'], **PYMONGO_OPTS)]
        if  rids:
            if  isinstance(results, list) or isinstance(results, GeneratorType):
                for item in results:
                    counter += 1
                    if  'das' in item:
                        expire = item.get('das').get('expire', expire)
                        dasheader['expire'] = expire
                    item['das'] = dict(expire=expire, primary_key=prim_key,
                                       condition_keys=cond_keys,
                                       instance=dasquery.instance,
                                       system=system, services=services,
                                       record=record_codes('data_record'),
                                       ts=time.time(), api=api)
                    item['das_id'] = rids
                    item['qhash'] = dasquery.qhash
                    yield item
            else:
                print("\n\n ### results = ", str(results))
                raise Exception('Provided results is not a list/generator type')
        if  expire != dasheader['expire']: # update DAS records
            header['das']['expire'] = expire
        # update das record with new status
        status = 'Update DAS cache, %s API' % header['das']['api'][0]
        self.update_query_record(dasquery, status, header)

        msg = "\n%s yield %s rows" % (dasheader['system'], counter)
        self.logger.info(msg)

    def remove_from_cache(self, dasquery):
        """
        Remove query from DAS cache. To do so, we retrieve API record
        and remove all data records from das.cache and das.merge
        """
        records = self.col.find({'qhash':dasquery.qhash}, **PYMONGO_OPTS)
        id_list = []
        for row in records:
            if  row['_id'] not in id_list:
                id_list.append(row['_id'])
        spec = {'das_id':{'$in':id_list}}
        self.merge.remove(spec)
        self.merge.remove({'qhash':dasquery.qhash})
        self.col.remove(spec)
        self.col.remove({'qhash':dasquery.qhash})

    def clean_cache(self, collection=None):
        """
        Clean expired docs in das.cache and das.merge.
        """
        current_time = time.time()
        query = {'das.expire': { '$lt':current_time} }
        if  not collection or collection == 'merge':
            self.merge.remove(query)
        if  not collection or collection == 'cache':
            self.col.remove(query)

    def delete_cache(self):
        """
        Delete all results in DAS cache/merge collection, including
        internal indexes.
        """
        self.col.remove({})
        try:
            self.col.drop_indexes()
        except Exception:
            pass
        self.merge.remove({})
        try:
            self.merge.drop_indexes()
        except Exception:
            pass
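
# --- Hedged usage sketch (not part of the class above) ----------------------
# A minimal, self-contained illustration of what generate_records does to
# each service row: it attaches 'das', 'das_id' and 'qhash' bookkeeping
# fields.  All names below (decorate_rows, toy_rows, the fake ids) are
# made-up placeholders, not DAS APIs.
import time

def decorate_rows(rows, rids, qhash, prim_key, expire):
    "Attach das/das_id/qhash fields to each row, mirroring generate_records"
    for row in rows:
        row['das'] = {'expire': expire, 'primary_key': prim_key,
                      'record': 1,  # stand-in for record_codes('data_record')
                      'ts': time.time()}
        row['das_id'] = rids    # ids of the query records this row belongs to
        row['qhash'] = qhash    # hash of the originating DAS query
        yield row

toy_rows = [{'block': {'name': '/a/b/c#1'}}, {'block': {'name': '/a/b/c#2'}}]
for rec in decorate_rows(toy_rows, ['fake-id'], 'abc123', 'block.name',
                         time.time() + 3600):
    print(rec['qhash'], rec['das_id'], rec['block']['name'])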
Beispiel #7
0
class DASParserDB(object):
    """
    Caching layer for the PLY parser.
    """
    def __init__(self, config):
        self.verbose  = config['verbose']
        self.logger   = PrintManager('DASParserDB', self.verbose)
        self.dburi    = config['mongodb']['dburi']
        self.dbname   = config['parserdb']['dbname']
        self.sizecap  = config['parserdb'].get('sizecap', 5*1024*1024)
        self.colname  = config['parserdb']['collname']
        
        msg = "DASParserCache::__init__ %s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)
        
        self.col = None
        self.create_db()

    def create_db(self):
        """
        Create db collection
        """
        conn = db_connection(self.dburi)
        dbn  = conn[self.dbname]
        if  self.colname not in dbn.collection_names():
            dbn.create_collection(self.colname, capped=True, size=self.sizecap)
        self.col = dbn[self.colname]

    def lookup_query(self, rawtext):
        """
        Check the parser cache for a given rawtext query.
        Search is done with the hash of this string.
        
        Returns a tuple (status, value) for the cases
        (PARSERCACHE_VALID, mongo_query) - valid query found
        (PARSERCACHE_INVALID, error) - error message for invalid query
        (PARSERCACHE_NOTFOUND, None) - not in the cache
        """
        result = self.col.find_one({'hash':genkey(rawtext)},
                        fields=['query', 'error'])

        if result and result['query']:
            if self.verbose:
                self.logger.debug("DASParserCache: found valid %s->%s" %\
                                  (rawtext, result['query']))
            
            query = decode_mongo_query(result['query'])
            return (PARSERCACHE_VALID, query)
        elif result and result['error']:
            if self.verbose:
                self.logger.debug("DASParserCache: found invalid %s->%s" %\
                                  (rawtext, result['error']))
            return (PARSERCACHE_INVALID, result['error'])
        else:
            if self.verbose:
                self.logger.debug("DASParserCache: not found %s" %\
                                  (rawtext))
            return (PARSERCACHE_NOTFOUND, None)

    def insert_valid_query(self, rawtext, query):
        "Insert a query that was successfully transformed"	
        self._insert_query(rawtext, query, None)

    def insert_invalid_query(self, rawtext, error):
        "Insert the error message for an invalid query"
        self._insert_query(rawtext, None, error)

    def _insert_query(self, rawtext, query, error):
        """Internal method to insert a query"""
        if  self.verbose:
            self.logger.debug("DASParserCache: insert %s->%s/%s" %\
                              (rawtext, query, error))
        # since MongoDB does not support insertion of $ sign in queries
        # we need to encode inserted query
        if  query:
            encquery = encode_mongo_query(query)
        else:
            encquery = ""
        self.col.insert({'raw':rawtext, 'hash':genkey(rawtext),
                         'query':encquery, 'error':str(error)})
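
# --- Hedged usage sketch -----------------------------------------------------
# The intended call pattern around DASParserDB.  The config values are
# illustrative placeholders and assume a MongoDB instance reachable at the
# given dburi; PARSERCACHE_* constants come from the surrounding DAS module.
config = {'verbose': 0,
          'mongodb': {'dburi': 'mongodb://localhost:27017'},
          'parserdb': {'dbname': 'parser', 'collname': 'db',
                       'sizecap': 1024 * 1024}}
pcache = DASParserDB(config)
status, value = pcache.lookup_query('dataset=/a/b/c')
if status == PARSERCACHE_NOTFOUND:
    # parse the raw text elsewhere, then record the outcome for next time
    pcache.insert_valid_query('dataset=/a/b/c',
                              {'spec': {'dataset.name': '/a/b/c'}, 'fields': None})
elif status == PARSERCACHE_INVALID:
    print('bad query:', value)
else:
    print('cached mongo query:', value)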
Beispiel #8
0
class key_learning(object):
    """
    This is the asynchronous part of the key-learning system. It is intended
    to run periodically (probably not much more than daily) once the
    key-learning DB has been filled.

    This searches through the DAS raw cache for all API output records,
    recording up to `redundancy` das_ids for each primary_key found.

    These das_ids are then used to fetch the query record, which records
    the API system and urn of each of the records in question.

    These documents are then processed to extract all the unique member
    names they contained, which are then injected into the DAS keylearning
    system.
    """
    task_options = [
        {'name': 'redundancy',
         'type': 'int',
         'default': 2,
         'help': 'Number of records to examine per DAS primary key'}]

    def __init__(self, **kwargs):
        self.logger = PrintManager('KeyLearning', kwargs.get('verbose', 0))
        self.das = kwargs['DAS']
        self.redundancy = kwargs.get('redundancy', 10)

    def __call__(self):
        """__call__ implementation"""
        self.das.rawcache.clean_cache("cache")
        rawcache = self.das.rawcache.col
        autodeque = lambda: collections.deque(maxlen=self.redundancy)
        found_ids = collections.defaultdict(autodeque)

        self.logger.info("finding das_ids")
        for doc in rawcache.find({'das.record': record_codes('data_record'),
                                  'das.primary_key': {'$exists': True}},
                                 fields=['das.primary_key', 'das_id']):
            for das_id in doc['das_id']:
                found_ids[doc['das']['primary_key']].append(das_id)

        hit_ids = set()
        self.logger.info("found %s primary_keys" % len(found_ids))
        for key in found_ids:
            self.logger.info("primary_key=%s" % key)
            for das_id in found_ids[key]:
                if _DEBUG:
                    print('-======= DAS ID ======')
                    pprint(das_id)
                    print('-======= HIT ID (ALREADY VISITED) ======')
                    pprint(hit_ids)

                if das_id not in hit_ids:
                    self.logger.info("das_id=%s" % das_id)
                    hit_ids.add(das_id)
                    doc = rawcache.find_one({'_id': ObjectId(das_id)})
                    if doc:
                        self.process_query_record(doc)
                    else:
                        self.logger.warning("no record for das_id=%s" % das_id)

        if _DEBUG:
            print('result attributes (all):')
            for row in self.das.keylearning.list_members():
                pprint(row)
                res_t = self.das.mapping.primary_key(row['system'], row['urn'])
                print(row.get('keys', ''), '-->', res_t, ':',
                      ', '.join([m for m in row.get('members', [])]))

        return {}

    def process_query_record(self, doc):
        """
        Process a rawcache document, extracting the called
        system, urn and url, then looking up the individual data records.
        """
        das_id = str(doc['_id'])
        systems = doc['das']['system']
        urns = doc['das']['urn']

        result = self.das.rawcache.find_records(das_id)

        if _DEBUG:
            print('in process_query_record. (das_id, systems, urns)=',
                  (das_id, systems, urns))
            print('result count=', result.count(), '~= systems=', len(systems))
            print('len(systems)=', len(systems), '~= len(urns)', len(urns))

        if _DEBUG:
            print('doc:')
            pprint(doc)
            result = [r for r in result]
            print('results in doc:')
            pprint(result)
            print('-----------------------------------')

        # TODO: it seems these conditions are non-sense!!!
        if len(systems) == len(urns) and len(systems) == 1:
            for _, record in enumerate(result):
                self.process_document(systems[0], urns[0], record)
        else:
            self.logger.warning("got inconsistent system/urn/das_id length")

    def process_document(self, system, urn, doc):
        """
        Process a rawcache document record coming from one API of a service.
        Find all the unique output fields and insert them into the cache.
        """
        self.logger.info("%s::%s" % (system, urn))
        members = set()
        for key in doc.keys():
            if key not in ('das', '_id', 'das_id'):
                members |= self.recursive_walk(doc[key], key)

        if _DEBUG:
            print('process_document(): das.keylearning.add_members(system=',
                  system, ', urn=', urn, 'members:', list(members))
        self.das.keylearning.add_members(system, urn, list(members))

    @classmethod
    def recursive_walk(cls, doc, prefix):
        """
        Recurse through a nested data structure, finding all
        the unique endpoint names. Lists are iterated over but do
        not add anything to the prefix, e.g.:

        a: {b: 1, c: {d: 1, e: 1}, f: [{g: 1}, {h: 1}]} ->
        a.b, a.c.d, a.c.e, a.f.g, a.f.h

        (although normally we would expect each member of a list to
        have the same structure)
        """
        result = set()
        if isinstance(doc, dict):
            for key in doc.keys():
                result |= cls.recursive_walk(doc[key], prefix + '.' + key)
        elif isinstance(doc, list):
            for item in doc:
                result |= cls.recursive_walk(item, prefix)
        else:
            result.add(prefix)
        return result
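
# --- Quick standalone check --------------------------------------------------
# Illustrates recursive_walk on the docstring example above; this snippet is
# not part of the task itself and only uses the classmethod defined above.
doc = {'a': {'b': 1, 'c': {'d': 1, 'e': 1}, 'f': [{'g': 1}, {'h': 1}]}}
members = set()
for key in doc:
    members |= key_learning.recursive_walk(doc[key], key)
print(sorted(members))   # ['a.b', 'a.c.d', 'a.c.e', 'a.f.g', 'a.f.h']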
Beispiel #9
0
class HotspotBase(object):
    """
    This is a base-class for periodically-running
    analyzers that want to examine the moving average
    of some key->counter map, and pick the top few
    for further attention.

    DASQueries are extracted from the analytics DB. The selected items
    are passed to the generate_task callback implemented in subclasses.
    It looks up the DAS query expiration timestamp and, if necessary,
    calls DAS to obtain it (along with the results of the query).
    """
    def __init__(self, **kwargs):
        self.logger = PrintManager('HotspotBase', kwargs.get('verbose', 0))
        self.das = kwargs['DAS']
        self.fraction = float(kwargs.get('fraction', 0.15))
        self.mode = kwargs.get('mode','calls').lower()
        self.period = int(kwargs.get('period', 86400*30))
        self.interval = kwargs['interval']
        self.allowed_gap = int(kwargs.get('allowed_gap', 3600))
        self.identifier = kwargs['identifier']

    def __call__(self):
        """
        Perform a hotspot-like analysis. Subclasses shouldn't
        need to reimplement this method.

        We start by building the selection chain. It consists of
        analytics summaries -> items -> preselected items ->
        mutated items. The final set of items is passed to the
        task-generation step (implemented in subclasses).
        The final report is generated and returned.
        """

        epoch_end = time.time()
        epoch_start = epoch_end - self.period

        summaries = self.get_summaries(epoch_start, epoch_end)
        self.logger.info("Got %s summaries" % len(summaries))

        items = self.get_all_items(summaries)
        self.logger.info("Got %s items" % len(items))

        items = self.preselect_items(items)
        self.logger.info("Preselected to %s items" % len(items))

        items = self.select_items(items)
        self.logger.info("Selected %s items (%s:%s)" \
                         % (len(items), self.mode, self.fraction))

        items = self.mutate_items(items)
        self.logger.info("Mutated to %s items" % len(items))

        retval = {'mode': self.mode,
                  'fraction': self.fraction,
                  'epoch_start': epoch_start,
                  'epoch_end': epoch_end,
                  'summaries': len(summaries),
                  'selected': dict(items).items()}

        new_tasks = []
        failed_items = []
        for item, count in items.items():
            try:
                self.logger.info("Generating task for %s" % item)
                for task in \
                    self.generate_task(item, count, epoch_start, epoch_end):
                    new_tasks.append(task)
            except Exception as exc:
                failed_items.append((item, count, str(exc)))
        retval['new_tasks'] = new_tasks
        retval['failed_items'] = failed_items

        retval.update(self.report())

        return retval

    def generate_task(self, item, count, epoch_start, epoch_end):
        """
        For the given selected key, generate an appropriate task
        dictionary as understood by taskscheduler.

        Should be a generator or return an iterable
        """
        raise NotImplementedError

    def report(self):
        """
        Generate some extra keys to go in the job report, if desired.
        """
        return {}

    def preselect_items(self, items):
        """
        This is part of the selection chain.

        Optionally, preselect the items for consideration.
        A subclass wishing to exclude certain key types could
        do so here (but could also do so in make_one_summary).

        This is a good place to implement a clustering algorithm
        for the selected items. For example, if several queries are
        selected, we may analyze which carry more weight and only
        pass those to the task-generation step.
        """
        return items

    def mutate_items(self, items):
        """
        This is the last part of the selection chain.

        Optionally, mutate the selected items.
        A subclass wishing to merge together keys should
        do so here.
        """
        return items

    def get_all_items(self, summaries):
        """
        Merge the summary dictionaries.
        """
        items = collections.defaultdict(int)
        for summary in summaries:
            for key, val in summary.items():
                items[key] += val
        return items

    def select_items(self, items):
        """
        Take a mapping of item->count pairs and determine
        which are "hot" based on the selected mode.
        """
        sorted_keys = sorted(items.keys(), key=lambda x: items[x], reverse=True)
        selected_items = {}
        if self.mode == 'calls':
            total_calls = sum(items.values())
            running_total = 0
            for key in sorted_keys:
                running_total += items[key]
                selected_items[key] = items[key]
                if running_total > total_calls * self.fraction:
                    break
        elif self.mode == 'keys':
            selected_items = dict([(k, items[k])
               for k in sorted_keys[0:int(len(sorted_keys)*self.fraction)]])
        elif self.mode == 'fixed':
            selected_items = dict([(k, items[k])
               for k in sorted_keys[0:int(self.fraction)]])
        else:
            raise NotImplementedError
        return selected_items

    def get_summaries(self, epoch_start, epoch_end):
        """
        Fetch all the available pre-computed summaries
        and determine if any need to be constructed at this time.
        """
        #get all the summaries we can from this time
        try:
            summaries = self.das.analytics.get_summary(self.identifier,
                                                       after=epoch_start,
                                                       before=epoch_end)
            self.logger.info("Found %s summary documents." % len(summaries))
        except Exception:
            summaries = []
        #see how much coverage of the requested period we have
        summaries = sorted(summaries, key=lambda x: x['start'])
        extra_summaries = []
        last_time = epoch_start
        for summary in summaries:
            if last_time < summary['start']:
                result = self.make_summary(last_time, summary['start'])
                extra_summaries.extend(result)
            last_time = summary['finish']
        result = self.make_summary(last_time, epoch_end)
        extra_summaries.extend(result)
        summaries = [dict(s['keys']) for s in summaries]
        summaries += extra_summaries

        return summaries

    def make_summary(self, start, finish):
        """
        Split the summarisation requests into interval-sized chunks and decide
        whether they are necessary at all.
        """
        self.logger.info("Found summary gap: %s->%s (%s)" \
                         % (start, finish, finish-start))
        result = []
        delta = finish - start
        if delta > self.allowed_gap:
            if delta > self.interval:
                blocks = int(delta/self.interval)
                span = delta/blocks
                self.logger.info("Gap longer than interval, " +\
                                 "creating %s summaries." % blocks)
                for i in range(blocks):
                    try:
                        summary = self.make_one_summary(start+span*i,
                                                        start+span*(i+1))
                        self.das.analytics.add_summary(self.identifier,
                                               start+span*i,
                                               start+span*(i+1),
                                               keys=(dict(summary)).items())
                        result.append(summary)
                    except Exception:
                        pass

            else:
                try:
                    summary = self.make_one_summary(start, finish)
                    self.das.analytics.add_summary(self.identifier,
                                                   start,
                                                   finish,
                                                   keys=(dict(summary)).items())
                    result.append(summary)
                except Exception:
                    pass
        else:
            self.logger.info("...short enough to ignore.")

        return result

    def make_one_summary(self, start, finish):
        """
        Actually make a summary of item->count pairs
        for the specified time range. Subclasses need to
        implement this for the analysis in question.
        """
        raise NotImplementedError
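
# --- Hedged usage sketch -----------------------------------------------------
# select_items in isolation: HotspotBase is instantiated with dummy keyword
# arguments (DAS, interval and identifier are not used by select_items), and
# the "hot" items are picked from a toy item->count map in 'calls' mode.
hotspot = HotspotBase(DAS=None, interval=3600, identifier='toy',
                      fraction=0.5, mode='calls')
counts = {'dataset=/a/*': 40, 'site=T1_CH_CERN': 30, 'run=1': 20, 'file=x': 10}
print(hotspot.select_items(counts))
# -> {'dataset=/a/*': 40, 'site=T1_CH_CERN': 30}: keys are taken in
#    descending count order until the running total exceeds fraction*total calls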
Beispiel #10
0
class DASMapping(object):
    """
    This class manages DAS mapping DB.
    """

    def __init__(self, config):
        self.verbose = config["verbose"]
        self.logger = PrintManager("DASMapping", self.verbose)
        self.services = config["services"]
        self.dburi = config["mongodb"]["dburi"]
        self.dbname = config["mappingdb"]["dbname"]
        self.colname = config["mappingdb"]["collname"]

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        self.create_db()

        self.keymap = {}  # to be filled at run time
        self.presentationcache = {}  # to be filled at run time
        self.reverse_presentation = {}  # to be filled at run time
        self.notationcache = {}  # to be filled at run time
        self.diffkeycache = {}  # to be filled at run time
        self.apicache = {}  # to be filled at run time
        self.apiinfocache = {}  # to be filled at run time
        self.init_notationcache()
        self.init_presentationcache()

    # ===============
    # Management APIs
    # ===============
    def init_notationcache(self):
        """
        Initialize notation cache by reading notations.
        """
        for system, notations in self.notations().items():
            for row in notations:
                key = system, row["notation"]
                if key in self.notationcache:
                    self.notationcache[key] += [(row["api"], row["map"])]
                else:
                    self.notationcache[key] = [(row["api"], row["map"])]

    def init_presentationcache(self):
        """
        Initialize presentation cache by reading presentation map.
        """
        query = {"presentation": {"$ne": None}}
        data = self.col.find_one(query)
        if data:
            self.presentationcache = data["presentation"]
            for daskey, uilist in self.presentationcache.items():
                for row in uilist:
                    link = None
                    if "link" in row:
                        link = row["link"]
                    if "diff" in row:
                        self.diffkeycache[daskey] = row["diff"]
                    tdict = {daskey: {"mapkey": row["das"], "link": link}}
                    if row["ui"] in self.reverse_presentation:
                        self.reverse_presentation[row["ui"]].update(tdict)
                    else:
                        self.reverse_presentation[row["ui"]] = tdict

    def create_db(self):
        """
        Establish connection to MongoDB back-end and create DB.
        """
        self.conn = db_connection(self.dburi)
        self.db = self.conn[self.dbname]
        self.col = self.db[self.colname]

    def delete_db(self):
        """
        Delete mapping DB in MongoDB back-end.
        """
        self.conn.drop_database(self.dbname)

    def delete_db_collection(self):
        """
        Delete mapping DB collection in MongoDB.
        """
        self.db.drop_collection(self.colname)

    def check_maps(self):
        """
        Check if there are records in Mapping DB
        """
        return self.col.count()

    def remove(self, spec):
        """
        Remove record in DAS Mapping DB for provided Mongo spec.
        """
        self.col.remove(spec)

    def add(self, record):
        """
        Add new record into mapping DB. Example of URI record

        .. doctest::

            {
             system:dbs, 
             urn : listBlocks, 
             url : "http://a.b.com/api"
             params : [
                 {"apiversion":1_2_2, test:"*"}
             ]
             daskeys: [
                 {"key" : "block", "map":"block.name", "pattern":""}
             ]
             das2api: [
                 {"das_key":"site", "api_param":"se", 
                       "pattern":"re.compile('^T[0-3]_')"}
             ]
            }

        Example of notation record:

        .. doctest::

             notations: [
                 {"notation" : "storage_element_name", "map":"site", "api": ""},
             ]
        """
        msg = "record=%s" % record
        self.logger.debug(msg)
        self.col.insert(record)
        index = None
        if record.has_key("urn"):
            index = [("system", DESCENDING), ("daskeys", DESCENDING), ("urn", DESCENDING)]
        elif record.has_key("notations"):
            index = [("system", DESCENDING), ("notations.api_param", DESCENDING)]
        elif record.has_key("presentation"):
            index = []
        else:
            msg = "Invalid record %s" % record
            raise Exception(msg)
        if index:
            create_indexes(self.col, index)

    # ==================
    # Informational APIs
    # ==================
    def list_systems(self):
        """
        List all DAS systems.
        """
        cond = {"system": {"$ne": None}}
        gen = (row["system"] for row in self.col.find(cond, ["system"]))
        return list(set(gen2list(gen)) & set(self.services))

    def list_apis(self, system=None):
        """
        List all APIs.
        """
        if self.apicache and system in self.apicache:
            return self.apicache[system]
        cond = {"urn": {"$ne": None}}
        if system:
            cond["system"] = system
        gen = (row["urn"] for row in self.col.find(cond, ["urn"]))
        self.apicache[system] = gen2list(gen)
        return self.apicache[system]

    def api_info(self, api_name):
        """
        Return full API info record.
        """
        return self.apiinfocache.get(api_name, self.col.find_one({"urn": api_name}))

    def relational_keys(self, system1, system2):
        """
        Return a list of relational keys between provided systems
        """
        keys1, keys2 = [], []
        for system, keys in self.daskeys().items():
            if system == system1:
                keys1 = keys
            if system == system2:
                keys2 = keys
        return list(set(keys1) & set(keys2))

    def daskeys(self, das_system=None):
        """
        Return a dict with all known DAS keys.
        """
        cond = {"system": {"$ne": None}}
        if das_system:
            cond = {"system": das_system}
        gen = (row["system"] for row in self.col.find(cond, ["system"]))
        kdict = {}
        for system in gen:
            query = {"system": system, "urn": {"$ne": None}}
            keys = []
            for row in self.col.find(query):
                for entry in row["daskeys"]:
                    if entry["key"] not in keys:
                        keys.append(entry["key"])
            kdict[system] = keys
        return kdict

    # ============
    # Look-up APIs
    # ============
    def primary_key(self, das_system, urn):
        """
        Return DAS primary key for provided system and urn
        """
        cond = {"system": das_system, "urn": urn}
        daskeys = self.col.find(cond, ["daskeys.key"])
        for row in daskeys:
            if row and row.has_key("daskeys"):
                for dkey in row["daskeys"]:
                    if dkey.has_key("key"):
                        return dkey["key"]

    def primary_mapkey(self, das_system, urn):
        """
        Return DAS primary map key for provided system and urn
        """
        cond = {"system": das_system, "urn": urn}
        mapkeys = self.col.find(cond, ["daskeys.map"])
        for row in mapkeys:
            if row and row.has_key("daskeys"):
                for mkey in row["daskeys"]:
                    if mkey.has_key("map"):
                        return mkey["map"]

    def find_daskey(self, das_system, map_key, value=None):
        """
        Find das key for given system and map key.
        """
        msg = "system=%s\n" % das_system
        cond = {"system": das_system, "daskeys.map": map_key}
        daskeys = []
        for row in self.col.find(cond, ["daskeys"]):
            if row and row.has_key("daskeys"):
                for dkey in row["daskeys"]:
                    if dkey.has_key("key"):
                        if value:
                            pval = dkey.get("pattern", "")
                            if pval:
                                pat = re.compile(pval)
                                if pat.match(str(value)):
                                    daskeys.append(dkey["key"])
                                else:
                                    msg += "-- reject key=%s, val=%s, pat=%s\n" % (map_key, value, pval)
                                    self.logger.debug(msg)
                            else:
                                daskeys.append(dkey["key"])
                        else:
                            daskeys.append(dkey["key"])
        return daskeys

    def find_mapkey(self, das_system, das_key, value=None):
        """
        Find map key for given system and das key.
        """
        msg = "system=%s\n" % das_system
        cond = {"system": das_system, "daskeys.key": das_key}
        for row in self.col.find(cond, ["daskeys", "urn"]):
            if row and row.has_key("daskeys"):
                for key in row["daskeys"]:
                    if key.has_key("map") and key["key"] == das_key:
                        if value:
                            pval = key.get("pattern", "")
                            pat = re.compile(pval)
                            if pat.match(str(value)):
                                return key["map"]
                            else:
                                msg += "-- reject key=%s, val=%s, pat=%s\n" % (das_key, value, key["pattern"])
                                self.logger.debug(msg)
                                continue
                        else:
                            return key["map"]

    def mapkeys(self, daskey):
        """
        Find map keys for a given daskey
        """
        if daskey in self.keymap:
            return self.keymap[daskey]
        spec = {"daskeys.key": daskey}
        mapkeys = []
        for row in self.col.find(spec, ["daskeys"]):
            for kmap in row["daskeys"]:
                if kmap["key"] == daskey and kmap["map"] not in mapkeys:
                    mapkeys.append(kmap["map"])
        self.keymap[daskey] = mapkeys
        return self.keymap[daskey]

    def find_apis(self, das_system, map_key):
        """
        Find list of apis which correspond to provided
        system and das map key.
        """
        cond = {"system": das_system, "daskeys.map": map_key}
        apilist = []
        for row in self.col.find(cond, ["urn"]):
            if row.has_key("urn") and row["urn"] not in apilist:
                apilist.append(row["urn"])
        return apilist

    def check_dasmap(self, system, urn, das_map, value=None):
        """
        Check if provided system/urn/das_map is a valid combination
        in mapping db. If value for das_map key is provided we verify
        it against pattern in DB.
        """
        if not value:
            cond = {"system": system, "daskeys.map": das_map, "urn": urn}
            return self.col.find(cond).count()
        cond = {"system": system, "daskeys.map": das_map, "urn": urn}
        for row in self.col.find(cond, ["daskeys.pattern"]):
            for item in row["daskeys"]:
                pat = re.compile(item["pattern"])
                if pat.match(str(value)):
                    return True
        return False

    def find_system(self, key):
        """
        Return system name for provided DAS key.
        """
        cond = {"daskeys.key": key}
        gen = (row["system"] for row in self.col.find(cond, ["system"]))
        systems = []
        for system in gen:
            if system not in systems:
                systems.append(system)
        systems.sort()
        return systems

    def lookup_keys(self, system, daskey, api=None, value=None):
        """
        Returns lookup keys for given system and provided
        selection DAS key, e.g. block => block.name
        """
        query = {"system": system, "daskeys.key": daskey}
        if api:
            query["urn"] = api
        lookupkeys = []
        for row in self.col.find(query):
            for kdict in row["daskeys"]:
                if kdict["key"] == daskey:
                    lkey = kdict["map"]
                else:
                    continue
                if value and kdict["pattern"]:
                    pat = re.compile(kdict["pattern"])
                    if pat.match(str(value)):
                        if lkey not in lookupkeys:
                            lookupkeys.append(lkey)
                else:
                    if lkey not in lookupkeys:
                        lookupkeys.append(lkey)
        if not lookupkeys:
            msg = "Unable to find look-up key for "
            msg += "system=%s, daskey=%s, api=%s, value=%s" % (system, daskey, api, value)
            raise Exception(msg)
        return lookupkeys

    def api2das(self, system, api_input_name):
        """
        Translates data-service API input parameter into DAS QL key,
        e.g. run_number => run.
        """
        query = {"system": system, "das2api.api_param": api_input_name}
        names = []
        for adas in self.col.find(query, ["das2api"]):
            for row in adas["das2api"]:
                try:
                    aparam = row["api_param"]
                    daskey = row["das_key"]
                    if aparam == api_input_name and daskey not in names:
                        names.append(daskey)
                except Exception as err:
                    print("ERROR: look-up api_param/das_key in", row)
                    raise err
        return names
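
# --- Illustrative mapping record (hypothetical values) ------------------------
# A record in the shape DASMapping.add() expects, following the docstring
# example above; every value is a placeholder, and the commented calls assume
# a DASMapping instance backed by a configured MongoDB.
record = {
    'system': 'dbs',
    'urn': 'listBlocks',
    'url': 'http://a.b.com/api',
    'params': {'block_name': '*', 'apiversion': '1_2_2'},
    'daskeys': [{'key': 'block', 'map': 'block.name', 'pattern': ''}],
    'das2api': [{'das_key': 'block', 'api_param': 'block_name', 'pattern': ''}],
}
# mapping = DASMapping(config)   # needs 'mongodb' and 'mappingdb' config sections
# mapping.add(record)            # 'urn' present => indexes system/daskeys/urn
# mapping.primary_key('dbs', 'listBlocks')     # -> 'block'
# mapping.primary_mapkey('dbs', 'listBlocks')  # -> 'block.name'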
Beispiel #11
0
class DASAbstractService(object):
    """
    Abstract class describing a DAS service. It is initialized with a name
    which is used to identify service parameters in the DAS configuration
    file. Those parameters are keys, verbosity level, and the URL of the
    data-service.
    """
    def __init__(self, name, config):
        self.name = name
        try:
            self.verbose      = config['verbose']
            title             = 'DASAbstractService_%s' % self.name
            self.logger       = PrintManager(title, self.verbose)
            self.dasmapping   = config['dasmapping']
            self.write2cache  = config.get('write_cache', True)
            self.multitask    = config['das'].get('multitask', True)
            self.error_expire = config['das'].get('error_expire', 300) 
            self.dbs_global   = None # to be configured at run time
            self.dburi        = config['mongodb']['dburi']
            engine            = config.get('engine', None)
            self.gfs          = db_gridfs(self.dburi)
        except Exception as exc:
            print_exc(exc)
            raise Exception('fail to parse DAS config')

        # read key/cert info
        try:
            self.ckey, self.cert = get_key_cert()
        except Exception as exc:
            print_exc(exc)
            self.ckey = None
            self.cert = None

        if  self.multitask:
            nworkers = config['das'].get('api_workers', 3)
            thr_weights = config['das'].get('thread_weights', [])
            for system_weight in thr_weights:
                system, weight = system_weight.split(':')
                if  system == self.name:
                    nworkers *= int(weight)
            if  engine:
                thr_name = 'DASAbstractService:%s:PluginTaskManager' % self.name
                self.taskmgr = PluginTaskManager(\
                        engine, nworkers=nworkers, name=thr_name)
                self.taskmgr.subscribe()
            else:
                thr_name = 'DASAbstractService:%s:TaskManager' % self.name
                self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        self.map        = {}   # to be defined by data-service implementation
        self._keys      = None # to be defined at run-time in self.keys
        self._params    = None # to be defined at run-time in self.parameters
        self._notations = {}   # to be defined at run-time in self.notations

        self.logger.info('initialized')
        # define internal cache manager to put 'raw' results into cache
        if  'rawcache' in config and config['rawcache']:
            self.localcache   = config['rawcache']
        else:
            msg = 'Undefined rawcache, please check your configuration'
            raise Exception(msg)

    def services(self):
        """
        Return subsystems used to retrieve data records. It is used
        in dasheader call to setup das.services field. This method can be
        overwritten in sub-classes, otherwise returns dict of service name
        and CMS systems used to retrieve data records.
        """
        return {self.name:[self.name]}

    def version(self):
        """Return data-services version, should be implemented in sub-classes"""
        return ''

    def keys(self):
        """
        Return service keys
        """
        if  self._keys:
            return self._keys
        srv_keys = []
        for _api, params in self.map.items():
            for key in params['keys']:
                if  not key in srv_keys:
                    srv_keys.append(key)
        self._keys = srv_keys
        return srv_keys

    def parameters(self):
        """
        Return mapped service parameters
        """
        if  self._params:
            return self._params
        srv_params = []
        for _api, params in self.map.items():
            for key in params['params']:
                param_list = self.dasmapping.api2das(self.name, key)
                for par in param_list:
                    if  not par in srv_params:
                        srv_params.append(par)
        self._params = srv_params
        return srv_params

    def notations(self):
        """
        Return a map of system notations.
        """
        if  self._notations:
            return self._notations
        for _, rows in self.dasmapping.notations(self.name).items():
            for row in rows:
                api  = row['api']
                nmap = row['rec_key']
                notation = row['api_output']
                if  api in self._notations:
                    self._notations[api].update({notation:nmap})
                else:
                    self._notations[api] = {notation:nmap}
        return self._notations

    def getdata(self, url, params, expire, headers=None, post=None):
        """URL call wrapper"""
        if  url.find('https:') != -1:
            return getdata(url, params, headers, expire, post,
                self.error_expire, self.verbose, self.ckey, self.cert,
                system=self.name)
        else:
            return getdata(url, params, headers, expire, post,
                self.error_expire, self.verbose, system=self.name)

    def call(self, dasquery):
        """
        Invoke service API to execute given query.
        Return results as a collect list set.
        """
        self.logger.info(dasquery)
        # check the cache for records with given query/system
        res = self.localcache.incache(dasquery,
                                      collection='cache',
                                      system=self.name)
        if  res:
            msg  = "found records in local cache"
            self.logger.info(msg)
            return
        # ask data-service api to get results; they'll be stored in
        # the cache, so return at the end what we have in cache.
        self.api(dasquery)

    def write_to_cache(self, dasquery, expire, url, api, args, gen, ctime):
        """
        Write provided result set into DAS cache.
        """
        if  not self.write2cache:
            return

        # before going to cache we should check/set possible misses, e.g.
        # primary key when error is thrown
        result = self.set_misses(dasquery, api, gen)

        # update the cache
        header = dasheader(self.name, dasquery, expire, api, url,
                services=self.services())
        header['lookup_keys'] = self.lookup_keys(api)
        header['prim_key'] = self.dasmapping.primary_mapkey(self.name, api)
        header['ctime'] = ctime
        self.localcache.update_cache(dasquery, result, header)

        msg  = 'cache has been updated,\n'
        self.logger.debug(msg)

    def adjust_params(self, api, kwds, instance=None):
        """
        Data-service specific hook to adjust parameters according to
        the service specification. For example, the DQ service accepts a
        string of parameters rather than a parameter set, while DBS2 can
        reuse some parameters for different APIs, e.g. a dataset path can
        be passed to listPrimaryDatasets as a primary_dataset pattern.
        """
        pass

    def lookup_keys(self, api):
        """
        Return look-up keys of data output for given data-service API.
        """
        lkeys = self.dasmapping.lookup_keys(self.name, api)
        return [{api:lkeys}]

    def inspect_params(self, api, args):
        """
        Perform API parameter inspection. Check if the API accepts a range
        of parameters, etc.
        """
        for key, value in args.items():
            if  isinstance(value, dict):
                minval = None
                maxval = None
                for oper, val in value.items():
                    if  oper == '$in':
                        minval = int(val[0])
                        maxval = int(val[-1])
                        args[key] = list(range(minval, maxval))
                    elif oper == '$lt':
                        maxval = int(val)
                        args[key] = maxval
                    elif oper == '$lte':
                        maxval = int(val)
                        args[key] = maxval
                    elif oper == '$gt':
                        minval = int(val)
                        args[key] = minval
                    elif oper == '$gte':
                        minval = int(val)
                        args[key] = minval
                    else:
                        msg = '%s does not support operator %s' % (api, oper)
                        raise Exception(msg)
        return args

    def get_notations(self, api):
        """Return notations used for given API"""
        notationmap = self.notations()
        if  not notationmap:
            return {}
        notations = {}
        if  '' in notationmap:
            notations = dict(notationmap['']) # notations applied to all APIs
            if  api in notationmap: # overwrite the one for provided API
                notations.update(notationmap[api])
        return notations

    def parser(self, dasquery, dformat, data, api):
        """
        DAS data parser. Input parameters:

        - *dasquery* input DAS query
        - *dformat* is a data format, e.g. XML, JSON
        - *data* is a data source, either file-like object or
          actual data
        - *api* is API name
        """
        prim_key  = self.dasmapping.primary_key(self.name, api)
        counter   = 0
        if  dformat.lower() == 'xml':
            tags = self.dasmapping.api2daskey(self.name, api)
            gen  = xml_parser(data, prim_key, tags)
            for row in gen:
                counter += 1
                yield row
        elif dformat.lower() == 'json' or dformat.lower() == 'dasjson':
            gen  = json_parser(data, self.logger)
            das_dict = {}
            for row in gen:
                if  dformat.lower() == 'dasjson':
                    for key, val in row.items():
                        if  key != 'results':
                            das_dict[key] = val
                    row = row['results']
                if  isinstance(row, list):
                    for item in row:
                        if  item:
                            if  prim_key in item:
                                counter += 1
                                yield item
                            else:
                                counter += 1
                                yield {prim_key:item}
                else:
                    if  prim_key in row:
                        counter += 1
                        yield row
                    else:
                        counter += 1
                        yield {prim_key:row}
        else:
            msg = 'Unsupported data format="%s", API="%s"' % (dformat, api)
            raise Exception(msg)
        msg  = "api=%s, format=%s " % (api, dformat)
        msg += "prim_key=%s yield %s rows" % (prim_key, counter)
        self.logger.info(msg)

    def translator(self, api, genrows):
        """
        Convert raw results into DAS records. 
        """
        prim_key  = self.dasmapping.primary_key(self.name, api)
        count = 0
        for row in genrows:
            row2das(self.dasmapping.notation2das, self.name, api, row)
            count += 1
            # check for primary key existence, since it can be overridden
            # by row2das. For example DBS3 uses flat namespace, so we
            # override dataset=>name, while dataset still is a primary key
            if  isinstance(row, list):
                yield {prim_key:row}
            elif  prim_key in row:
                if  prim_key in row[prim_key]:
                    yield row[prim_key] # remapping may create nested dict
                else:
                    yield row
            else:
                yield {prim_key:row}
        msg = "yield %s rows" % count
        self.logger.debug(msg)

    def set_misses(self, dasquery, api, genrows):
        """
        Check and adjust DAS records wrt the input query. If some of the DAS
        keys are missing, add them with their values to the DAS record.
        """
        # look-up primary key
        prim_key  = self.dasmapping.primary_key(self.name, api)

        # Scan all docs and store those whose size above MongoDB limit into
        # GridFS
        map_key = self.dasmapping.primary_mapkey(self.name, api)
        genrows = parse2gridfs(self.gfs, map_key, genrows, self.logger)

        spec  = dasquery.mongo_query['spec']
        row   = next(genrows)
        ddict = DotDict(row)
        keys2adjust = []
        for key in spec.keys():
            val = ddict.get(key)
            if  spec[key] != val and key not in keys2adjust:
                keys2adjust.append(key)
        msg   = "adjust keys %s" % keys2adjust
        self.logger.debug(msg)
        count = 0
        if  keys2adjust:
            # adjust of the rows
            for row in yield_rows(row, genrows):
                ddict = DotDict(row)
                pval  = ddict.get(map_key)
                if  isinstance(pval, dict) and 'error' in pval:
                    ddict[map_key] = ''
                    ddict.update({prim_key: pval})
                for key in keys2adjust:
                    value = spec[key]
                    existing_value = ddict.get(key)
                    # the way to deal with proximity/pattern/condition results
                    if  isinstance(value, str) and \
                        value.find('*') != -1: # we got pattern
                        if  existing_value:
                            value = existing_value
                    elif isinstance(value, dict) or \
                        isinstance(value, list): # we got condition
                        if  existing_value:
                            value = existing_value
                        elif isinstance(value, dict) and \
                        '$in' in value: # we got a range {'$in': []}
                            value = value['$in']
                        elif isinstance(value, dict) and \
                        '$lte' in value and '$gte' in value:
                            # we got a between range
                            value = [value['$gte'], value['$lte']]
                        else: 
                            value = json.dumps(value) 
                    elif existing_value and value != existing_value:
                        # we got proximity results
                        if  'proximity' in ddict:
                            proximity = DotDict({key:existing_value})
                            ddict['proximity'].update(proximity)
                        else:
                            proximity = DotDict({})
                            proximity[key] = existing_value
                            ddict['proximity'] = proximity
                    else:
                        if  existing_value:
                            value = existing_value
                    ddict[key] = value
                yield ddict
                count += 1
        else:
            yield row
            for row in genrows:
                yield row
                count += 1
        msg   = "yield %s rows" % count
        self.logger.debug(msg)
            
    def api(self, dasquery):
        """
        Data-service api method; can be overridden by a data-service class.
        It parses the input query and invokes the appropriate data-service
        API calls. All results are stored into the DAS cache and the api
        call is recorded in the Analytics DB.
        """
        self.logger.info(dasquery)
        genrows = self.apimap(dasquery)
        if  not genrows:
            return
        jobs = []
        for url, api, args, dformat, expire in genrows:
            # insert DAS query record for given API
            header = dasheader(self.name, dasquery, expire, api, url)
            self.localcache.insert_query_record(dasquery, header)
            # fetch DAS data records
            if  self.multitask:
                jobs.append(self.taskmgr.spawn(self.apicall, \
                            dasquery, url, api, args, dformat, expire))
            else:
                self.apicall(dasquery, url, api, args, dformat, expire)
        if  self.multitask:
            self.taskmgr.joinall(jobs)

    def apicall(self, dasquery, url, api, args, dformat, expire):
        """
        Data-service api call method; can be overridden by a data-service
        class. It invokes the actual data-service API call for the given
        arguments. All results are stored into the DAS cache and the api
        call is recorded in the Analytics DB.

        We invoke an explicit close call on our datastream instead of
        using a context manager, since this method as well as
        getdata/parser can be overwritten by child classes.
        """
        datastream  = None
        try:
            args    = self.inspect_params(api, args)
            time0   = time.time()
            headers = make_headers(dformat)
            datastream, expire = self.getdata(url, args, expire, headers)
            self.logger.info("%s expire %s" % (api, expire))
            rawrows = self.parser(dasquery, dformat, datastream, api)
            dasrows = self.translator(api, rawrows)
            ctime   = time.time() - time0
            self.write_to_cache(dasquery, expire, url, api, args,
                    dasrows, ctime)
        except Exception as exc:
            msg  = 'Failed to process: url=%s, api=%s, args=%s' \
                    % (url, api, args)
            print(msg)
            print_exc(exc)
        close(datastream)

    def url_instance(self, url, _instance):
        """
        Virtual method to adjust URL for a given instance,
        must be implemented in service classes
        """
        return url

    def adjust_url(self, url, instance):
        """
        Adjust data-service URL wrt provided instance, e.g.
        DBS carry several instances
        """
        if  instance:
            url = self.url_instance(url, instance)
        return url

    def apimap(self, dasquery):
        """
        Analyze input query and yield url, api, args, format, expire
        for further processing.
        """
        srv   = self.name # get local copy to avoid threading issues
        cond  = getarg(dasquery.mongo_query, 'spec', {})
        instance = dasquery.mongo_query.get('instance', self.dbs_global)
        skeys = getarg(dasquery.mongo_query, 'fields', [])
        if  not skeys:
            skeys = []
        self.logger.info("\n")
        for api, value in self.map.items():
            expire = value['expire']
            iformat = value['format']
            url    = self.adjust_url(value['url'], instance)
            if  not url:
                msg = '--- rejects API %s, no URL' % api
                self.logger.info(msg)
                continue
            args   = dict(value['params']) # make new copy, since we'll adjust
            wild   = value.get('wild_card', '*')
            found  = 0
            # check if input parameters are covered by API
            if  not self.dasmapping.check_api_match(srv, api, cond):
                msg = '--- rejects API %s, does not cover input condition keys' \
                        % api
                self.logger.info(msg)
                continue
            # once we know that the API covers the input set of parameters we
            # check every input parameter for pattern matching
            for key, val in cond.items():
                # check if keys from conditions are accepted by API
                # need to convert key (which is daskeys.map) into
                # input api parameter
                for apiparam in self.dasmapping.das2api(srv, api, key, val):
                    if  apiparam in args:
                        args[apiparam] = val
                        found += 1
            # VK 20160708, wrong statement, it caused to pass
            # datasets API for query dataset in [path1, path2]
            # I'll leave block here until I test and verify that
            # commented out block will not cause other issues
            #
            # check the case when we only have single condition key
            # and it is the key we look-up
#             if  not found and skeys == [k.split('.')[0] for k in cond.keys()]:
#                 found = 1
            # check if number of keys on cond and args are the same
            if  len(cond.keys()) != found:
                msg = "--- reject API %s, not all condition keys are covered" \
                        % api
                self.logger.info(msg)
                msg = 'args=%s' % args
                self.logger.debug(msg)
                continue
            if  not found:
                msg = "--- rejects API %s, parameters don't match" % api
                self.logger.info(msg)
                msg = 'args=%s' % args
                self.logger.debug(msg)
                continue
            self.adjust_params(api, args, instance)
            # delete args keys whose value is optional
            delete_keys(args, 'optional')
            # check that there is no "required" parameter left in args,
            # since such api will not work
            if 'required' in args.values():
                msg = '--- rejects API %s, parameter is required' % api
                self.logger.info(msg)
                msg = 'args=%s' % args
                self.logger.debug(msg)
                continue
            # adjust pattern symbols in arguments
            if  wild != '*':
                for key, val in args.items():
                    if  isinstance(val, str):
                        val   = val.replace('*', wild)
                    args[key] = val

            # compare query selection keys with API look-up keys
            api_lkeys = self.dasmapping.api_lkeys(srv, api)
            if  set(api_lkeys) != set(skeys):
                msg = "--- rejects API %s, api_lkeys(%s)!=skeys(%s)"\
                        % (api, api_lkeys, skeys)
                self.logger.info(msg)
                continue

            msg = '+++ %s passes API %s' % (srv, api)
            self.logger.info(msg)
            msg = 'args=%s' % args
            self.logger.debug(msg)

            msg  = "yield "
            msg += "system ***%s***, url=%s, api=%s, args=%s, format=%s, " \
                % (srv, url, api, args, iformat)
            msg += "expire=%s, wild_card=%s" \
                % (expire, wild)
            self.logger.debug(msg)

            yield url, api, args, iformat, expire
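
# --- Standalone sketch --------------------------------------------------------
# Mirrors the '$in' branch of inspect_params above: a MongoDB-style range
# condition is flattened into the explicit run list a data-service API
# expects.  Plain dicts only; no DAS objects or services are assumed.
args = {'run': {'$in': [1000, 1005]}, 'site': 'T1_CH_CERN'}
cond = args['run']
args['run'] = list(range(int(cond['$in'][0]), int(cond['$in'][-1])))
print(args)   # {'run': [1000, 1001, 1002, 1003, 1004], 'site': 'T1_CH_CERN'}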
Beispiel #12
0
class DASMongocache(object):
    """
    DAS cache based on MongoDB.
    """
    def __init__(self, config):
        self.emptyset_expire = expire_timestamp(\
            config['das'].get('emptyset_expire', 5))
        self.dburi   = config['mongodb']['dburi']
        self.cache_size = config['mongodb']['bulkupdate_size']
        self.dbname  = config['dasdb']['dbname']
        self.verbose = config['verbose']
        self.logger  = PrintManager('DASMongocache', self.verbose)
        self.mapping = config['dasmapping']

        self.conn    = db_connection(self.dburi)
        self.mdb     = self.conn[self.dbname]
        self.col     = self.mdb[config['dasdb']['cachecollection']]
        self.mrcol   = self.mdb[config['dasdb']['mrcollection']]
        self.merge   = self.mdb[config['dasdb']['mergecollection']]
        self.gfs     = db_gridfs(self.dburi)

        self.logdb   = DASLogdb(config)

        self.das_internal_keys = ['das_id', 'das', 'cache_id', 'qhash']

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        self.add_manipulator()

        # ensure that we have the following indexes
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('das.system', ASCENDING),
                      ('qhash', DESCENDING),
                      ('das.empty_record', ASCENDING)]
        create_indexes(self.col, index_list)
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('qhash', DESCENDING),
                      ('das.empty_record', ASCENDING), ('das.ts', ASCENDING)]
        create_indexes(self.merge, index_list)
        
    def add_manipulator(self):
        """
        Add DAS-specific MongoDB SON manipulator to perform
        conversion of inserted data into DAS cache.
        """
        das_son_manipulator = DAS_SONManipulator()
        self.mdb.add_son_manipulator(das_son_manipulator)
        msg = "DAS_SONManipulator %s" \
        % das_son_manipulator
        self.logger.debug(msg)

    def similar_queries(self, dasquery):
        """
        Check if we have query results in cache whose conditions are a
        superset of the provided query. The method only works for a single
        key whose value is a substring of the value in the input query.
        For example, if the cache contains records about T1 sites,
        then the input query for T1_CH_CERN yields a subset of the results
        stored in the cache.
        """
        spec = dasquery.mongo_query.get('spec', {})
        cond = {'query.spec.key': {'$in' : spec.keys()}, 'qhash':dasquery.qhash}
        for row in self.col.find(cond):
            found_query = DASQuery(row['query'])
            if  dasquery.qhash == found_query.qhash:
                msg = "%s similar to %s" % (dasquery, found_query)
                self.logger.info(msg)
                return found_query
        return False
    
    def get_superset_keys(self, key, value):
        """
        This is a special-case version of similar_keys,
        intended for analysers that want to quickly
        find possible superset queries of a simple
        query of the form key=value.
        """
        
        msg = "%s=%s" % (key, value)
        self.logger.debug(msg)
        cond = {'query.spec.key': key}
        for row in self.col.find(cond):
            mongo_query = decode_mongo_query(row['query'])
            for thiskey, thisvalue in mongo_query.iteritems():
                if thiskey == key:
                    if fnmatch.fnmatch(value, thisvalue):
                        yield thisvalue

    def get_fields(self, dasquery):
        "Prepare fields to extract from MongoDB"
        fields     = dasquery.mongo_query.get('fields', None)
        if  fields == ['records']:
            fields = None # look-up all records
        filters    = dasquery.filters
        cond       = {}
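        # plain filter names become projection fields; filters that carry a
        # comparison (=, <, >) are turned into query conditions via parse_filters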
        if  filters:
            new_fields = []
            for dasfilter in filters:
                if  dasfilter == 'unique':
                    continue
                if  (not fields or dasfilter not in fields) and \
                    dasfilter not in new_fields:
                    if  dasfilter.find('=') == -1 and dasfilter.find('<') == -1\
                    and dasfilter.find('>') == -1:
                        new_fields.append(dasfilter)
                    else:
                        cond = parse_filters(dasquery.mongo_query)
            if  not new_fields and fields:
                new_fields = list(fields)
            return new_fields, cond
        return fields, cond

    def remove_expired(self, collection):
        """
        Remove expired records from DAS cache.
        """
        timestamp = int(time.time())
        col  = self.mdb[collection]
        spec = {'das.expire' : {'$lt' : timestamp}}
        if  self.verbose:
            nrec = col.find(spec).count()
            msg  = "will remove %s records" % nrec
            msg += ", localtime=%s" % timestamp
            self.logger.debug(msg)
        self.logdb.insert(collection, {'delete': col.find(spec).count()})
        col.remove(spec)

    def find(self, dasquery):
        """
        Find provided query in DAS cache.
        """
        cond = {'qhash': dasquery.qhash, 'das.system':'das'}
        return self.col.find_one(cond)

    def find_specs(self, dasquery, system='das'):
        """
        Check if cache has query whose specs are identical to provided query.
        Return all matches.
        """
        cond = {'qhash': dasquery.qhash}
        if  system:
            cond.update({'das.system': system})
        return self.col.find(cond)

    def get_das_ids(self, dasquery):
        """
        Return list of DAS ids associated with given query
        """
        das_ids = []
        try:
            das_ids = \
                [r['_id'] for r in self.find_specs(dasquery, system='')]
        except:
            pass
        return das_ids

    def update_das_expire(self, dasquery, timestamp):
        "Update timestamp of all DAS data records for given query"
        nval = {'$set': {'das.expire':timestamp}}
        spec = {'qhash' : dasquery.qhash}
        self.col.update(spec, nval, multi=True, safe=True)
        self.merge.update(spec, nval, multi=True, safe=True)

    def das_record(self, dasquery):
        "Retrieve DAS record for given query"
        return self.col.find_one({'qhash': dasquery.qhash})
    
    def find_records(self, das_id):
        " Return all the records matching a given das_id"
        return self.col.find({'das_id': das_id})

    def add_to_record(self, dasquery, info, system=None):
        "Add to existing DAS record provided info"
        if  system:
            self.col.update({'query': dasquery.storage_query,
                             'das.system':system},
                            {'$set': info}, upsert=True, safe=True)
        else:
            self.col.update({'query': dasquery.storage_query},
                            {'$set': info}, upsert=True, safe=True)

    def update_query_record(self, dasquery, status, header=None):
        "Update DAS record for provided query"
        if  header:
            system = header['das']['system']
            spec1  = {'qhash': dasquery.qhash, 'das.system': 'das'}
            dasrecord = self.col.find_one(spec1)
            spec2  = {'qhash': dasquery.qhash, 'das.system': system}
            sysrecord = self.col.find_one(spec2)
            hexpire = header['das']['expire']
            dexpire = hexpire
            if  dasrecord and dasrecord.has_key('das'):
                dexpire = dasrecord['das'].get('expire', None)
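            # keep the earlier of the two expire timestamps so the combined
            # record never outlives its most short-lived data source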
            if  dexpire and hexpire > dexpire:
                expire = dexpire
            else:
                expire = hexpire
            if  sysrecord:
                api  = header['das']['api']
                url  = header['das']['url']
                sapi = sysrecord['das'].get('api', [])
                surl = sysrecord['das'].get('url', [])
                if  set(api) & set(sapi) == set(api) and \
                    set(url) & set(surl) == set(url):
                    self.col.update({'_id':ObjectId(sysrecord['_id'])},
                        {'$set': {'das.expire':expire, 'das.status':status}},
                        safe=True)
                else:
                    self.col.update({'_id':ObjectId(sysrecord['_id'])},
                        {'$pushAll':{'das.api':header['das']['api'],
                                     'das.urn':header['das']['api'],
                                     'das.url':header['das']['url'],
                                     'das.ctime':header['das']['ctime'],
                                    },
                         '$set': {'das.expire':expire, 'das.status':status}},
                        safe=True)
            if  dasrecord:
                self.col.update({'_id':ObjectId(dasrecord['_id'])},
                     {'$set': {'das.expire':expire}}, safe=True)
        else:
            self.col.update({'qhash': dasquery.qhash,
                             'das.system':'das'},
                            {'$set': {'das.status': status}}, safe=True)

    def incache(self, dasquery, collection='merge', system=None):
        """
        Check if we have query results in cache, otherwise return False.
        Note that the input parameter query is a MongoDB query; consult
        the MongoDB API for more details, http://api.mongodb.org/python/
        """
        self.remove_expired(collection)
        col  = self.mdb[collection]
        spec = {'qhash':dasquery.qhash}
        if  system:
            spec.update({'das.system': system})
        res  = col.find(spec=spec).count()
        msg  = "(%s, coll=%s) found %s results" % (dasquery, collection, res)
        self.logger.info(msg)
        if  res:
            return True
        return False

    def nresults(self, dasquery, collection='merge'):
        """Return number of results for given query."""
        if  dasquery.aggregators:
            return len(dasquery.aggregators)
        # Distinguish two use cases: unique filter and general query.
        # In the first we must count only unique records, in the latter
        # we can rely on the DB count() method. Please keep in mind that
        # the fields argument of find() does not affect counting, since it
        # is only a projection over records matched by spec, so we skip it.
        col  = self.mdb[collection]
        fields, filter_cond = self.get_fields(dasquery)
        if  not fields:
            spec = dasquery.mongo_query.get('spec', {})
        else:
            spec = {'qhash':dasquery.qhash, 'das.empty_record':0}
        if  filter_cond:
            spec.update(filter_cond)
        if  dasquery.unique_filter:
            skeys = self.mongo_sort_keys(collection, dasquery)
            if  skeys:
                gen = col.find(spec=spec).sort(skeys)
            else:
                gen = col.find(spec=spec)
            res = len([r for r in unique_filter(gen)])
        else:
            res = col.find(spec=spec).count()
        msg = "%s" % res
        self.logger.info(msg)
        return res

    def mongo_sort_keys(self, collection, dasquery):
        """
        Find list of sort keys for a given DAS query. Check existing
        indexes and either use fields or spec keys to find them out.
        Return list of mongo sort keys in a form of (key, order).
        """
        # try to get sort keys all the time to get ordered list of
        # docs which allow unique_filter to apply afterwards
        fields = dasquery.mongo_query.get('fields')
        spec   = dasquery.mongo_query.get('spec')
        skeys  = dasquery.sortkeys
        mongo_skeys = []
        if  skeys:
            for key in skeys:
                if  key.find('-') != -1: # reverse order, e.g. desc
                    mongo_skeys.append((key.replace('-', ''), DESCENDING))
                else:
                    mongo_skeys.append((key, ASCENDING))
        else:
            existing_idx = [i for i in self.existing_indexes(collection)]
            if  fields:
                lkeys = []
                for key in fields:
                    for mkey in self.mapping.mapkeys(key):
                        if  mkey not in lkeys:
                            lkeys.append(mkey)
            else:
                lkeys = spec.keys()
            keys = [k for k in lkeys \
                if k.find('das') == -1 and k.find('_id') == -1 and \
                        k in existing_idx]
            mongo_skeys = [(k, ASCENDING) for k in keys]
        return mongo_skeys

    def existing_indexes(self, collection='merge'):
        """
        Get list of existing indexes in DB. They are returned by index_information
        API in the following form:

        .. doctest::

            {u'_id_': {u'key': [(u'_id', 1)], u'v': 0},
             u'das.expire_1': {u'key': [(u'das.expire', 1)], u'v': 0},
             ...
             u'tier.name_-1': {u'key': [(u'tier.name', -1)], u'v': 0}}
        """
        col = self.mdb[collection]
        for val in col.index_information().values():
            for idx in val['key']:
                yield idx[0] # indexed field name

    def get_records(self, col, spec, fields, skeys, idx, limit, unique=False):
        "Generator to get records from MongoDB. It correctly applies"
        if  fields:
            for key in fields: # ensure that fields keys will be presented
                if  key not in self.das_internal_keys and \
                    not spec.has_key(key):
                    spec.update({key: {'$exists':True}})
        try:
            res = col.find(spec=spec, fields=fields)
            if  skeys:
                res = res.sort(skeys)
            if  not unique:
                if  idx:
                    res = res.skip(idx)
                if  limit:
                    res = res.limit(limit)
        except Exception as exp:
            print_exc(exp)
            row = {'exception': str(exp)}
            res = []
            yield row
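        # the unique filter has to de-duplicate the full cursor, so idx/limit
        # are applied afterwards via itertools.islice instead of skip()/limit()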
        if  unique:
            if  limit:
                gen = itertools.islice(unique_filter(res), idx, idx+limit)
            else:
                gen = unique_filter(res)
            for row in gen:
                yield row
        else:
            for row in res:
                yield row

    def get_from_cache(self, dasquery, idx=0, limit=0, collection='merge'):
        "Generator which retrieves results from the cache"
        if  dasquery.service_apis_map(): # valid DAS query
            result = self.get_das_records(dasquery, idx, limit, collection)
        else: # pure MongoDB query
            coll    = self.mdb[collection]
            fields  = dasquery.mongo_query.get('fields', None)
            spec    = dasquery.mongo_query.get('spec', {})
            if  dasquery.filters:
                if  fields == None:
                    fields = dasquery.filters
                else:
                    fields += dasquery.filters
            skeys   = self.mongo_sort_keys(collection, dasquery)
            result  = self.get_records(coll, spec, fields, skeys, \
                            idx, limit, dasquery.unique_filter)
        for row in result:
            yield row

    def get_das_records(self, dasquery, idx=0, limit=0, collection='merge'):
        "Generator which retrieves DAS records from the cache"
        col = self.mdb[collection]
        msg = "(%s, %s, %s, coll=%s)" % (dasquery, idx, limit, collection)
        self.logger.info(msg)

        idx = int(idx)
        fields, filter_cond = self.get_fields(dasquery)
        if  not fields:
            spec = dasquery.mongo_query.get('spec', {})
        else:
            spec = {'qhash':dasquery.qhash, 'das.empty_record':0}
        if  filter_cond:
            spec.update(filter_cond)
        if  fields: # be sure to extract das internal keys
            fields += self.das_internal_keys
        # try to get sort keys all the time to get ordered list of
        # docs which allow unique_filter to apply afterwards
        skeys   = self.mongo_sort_keys(collection, dasquery)
        res     = self.get_records(col, spec, fields, skeys, \
                        idx, limit, dasquery.unique_filter)
        counter = 0
        for row in res:
            counter += 1
            yield row

        if  counter:
            msg = "yield %s record(s)" % counter
            self.logger.info(msg)

        # if no raw records were yielded, look up possible error records
        if  not counter:
            nrec = self.col.find({'qhash':dasquery.qhash}).count()
            if  nrec:
                msg = "for query %s, found %s non-result record(s)" \
                        % (dasquery, nrec)
                prf = 'DAS WARNING, mongocache:get_from_cache '
                print dastimestamp(prf), msg

    def map_reduce(self, mr_input, dasquery, collection='merge'):
        """
        Wrapper around _map_reduce to allow sequential map/reduce
        operations, e.g. map/reduce out of map/reduce. 

        mr_input is either alias name or list of alias names for
        map/reduce functions.

        The input dasquery is applied to the first iteration of the
        map/reduce functions.
        """
        # NOTE: I need to revisit mapreduce.
        spec = dasquery.mongo_query['spec']
        if  not isinstance(mr_input, list):
            mrlist = [mr_input]
        else:
            mrlist = mr_input
        coll = self.mdb[collection]
        for mapreduce in mrlist:
            if  mapreduce == mrlist[0]:
                cond = spec
            else:
                cond = None
            coll = self._map_reduce(coll, mapreduce, cond)
        for row in coll.find():
            yield row

    def _map_reduce(self, coll, mapreduce, spec=None):
        """
        Perform map/reduce operation over DAS cache using provided
        collection, mapreduce name and optional conditions.
        """
        self.logger.debug("(%s, %s)" % (mapreduce, spec))
        record = self.mrcol.find_one({'name':mapreduce})
        if  not record:
            raise Exception("Map/reduce function '%s' not found" % mapreduce)
        fmap = record['map']
        freduce = record['reduce']
        if  spec:
            result = coll.map_reduce(Code(fmap), Code(freduce), query=spec)
        else:
            result = coll.map_reduce(Code(fmap), Code(freduce))
        msg = "found %s records in %s" % (result.count(), result.name)
        self.logger.info(msg)
        self.logger.debug(fmap)
        self.logger.debug(freduce)
        return result

    def get_map_reduce(self, name=None):
        """
        Return definition of map/reduce functions for provided name
        or gives full list.
        """
        spec = {}
        if  name:
            spec = {'name':name}
        result = self.mrcol.find(spec)
        for row in result:
            yield row

    def merge_records(self, dasquery):
        """
        Merge DAS records for provided query. We perform the following
        steps:
        1. get all queries from das.cache by ordering them by primary key
        2. run aggregator function to merge neighbors
        3. insert records into das.merge
        """
        self.logger.debug(dasquery)
        id_list = []
        expire  = 9999999999 # future
        # get all API records for given DAS query
        spec    = {'qhash':dasquery.qhash, 'query':{'$exists':True}}
        records = self.col.find(spec)
        for row in records:
            # find smallest expire timestamp to be used by aggregator
            if  row['das']['expire'] < expire:
                expire = row['das']['expire']
            if  row['_id'] not in id_list:
                id_list.append(row['_id'])
        inserted = 0
        lookup_keys = set()
        fields = dasquery.mongo_query.get('fields')
        if  not fields: # Mongo
            fields = []
        for key in fields:
            for pkey in self.mapping.mapkeys(key):
                lookup_keys.add(pkey)
        for pkey in lookup_keys:
            skey = [(pkey, DESCENDING)]
            # lookup all service records
            spec = {'das_id': {'$in': id_list}, 'das.primary_key': pkey}
            if  self.verbose:
                nrec = self.col.find(spec).sort(skey).count()
                msg  = "merging %s records, for %s key" % (nrec, pkey) 
            else:
                msg  = "merging records, for %s key" % pkey
            self.logger.debug(msg)
            records = self.col.find(spec).sort(skey)
            # aggregate all records
            agen = aggregator(dasquery, records, expire)
            # diff aggregated records
            gen  = das_diff(agen, self.mapping.diff_keys(pkey.split('.')[0]))
            # insert all records into das.merge using bulk insert
            size = self.cache_size
            try:
                while True:
                    nres = self.merge.insert(\
                        itertools.islice(gen, size), safe=True)
                    if  nres and isinstance(nres, list):
                        inserted += len(nres)
                    else:
                        break
            except InvalidDocument as exp:
                msg = "Caught bson error: " + str(exp)
                self.logger.info(msg)
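                # the aggregated document cannot be stored as-is (e.g. it
                # exceeds the BSON size limit), so re-read the records and
                # park the oversized payloads in GridFS via parse2gridfs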
                records = self.col.find(spec).sort(skey)
                gen = aggregator(dasquery, records, expire)
                genrows = parse2gridfs(self.gfs, pkey, gen, self.logger)
                das_dict = {'das':{'expire':expire, 'empty_record': 0,
                        'primary_key':[k for k in lookup_keys],
                        'system': ['gridfs']}, 'qhash':dasquery.qhash,
                        'cache_id':[], 'das_id': id_list}
                for row in genrows:
                    row.update(das_dict)
                    self.merge.insert(row, safe=True)
            except InvalidOperation:
                pass
        if  inserted:
            self.logdb.insert('merge', {'insert': inserted})
        elif  not lookup_keys: # we get query w/o fields
            pass
        else: # we didn't merge anything, it is DB look-up failure
            empty_expire = time.time() + 20 # secs, short enough to expire
            empty_record = {'das':{'expire':empty_expire,
                                   'primary_key':list(lookup_keys),
                                   'empty_record': 1},
                            'cache_id':[], 'das_id': id_list}
            for key, val in dasquery.mongo_query['spec'].iteritems():
                if  key.find('.') == -1:
                    empty_record[key] = []
                else: # it is compound key, e.g. site.name
                    newkey, newval = convert_dot_notation(key, val)
                    empty_record[newkey] = adjust_mongo_keyvalue(newval)
            self.merge.insert(empty_record, safe=True)
            # update DAS records (both meta and data ones, by using qhash)
            nval = {'$set': {'das.expire':empty_expire}}
            spec = {'qhash':dasquery.qhash}
            self.col.update(spec, nval, multi=True, safe=True)

    def update_cache(self, dasquery, results, header):
        """
        Insert results into cache. Use bulk inserts controlled by
        self.cache_size. Upon completion ensure indexes.
        """
        # insert/check query record in DAS cache
        self.insert_query_record(dasquery, header)

        # update results records in DAS cache
        gen  = self.generate_records(dasquery, results, header)
        inserted = 0
        # bulk insert
        try:
            while True:
                nres = self.col.insert(\
                        itertools.islice(gen, self.cache_size), safe=True)
                if  nres and isinstance(nres, list):
                    inserted += len(nres)
                else:
                    break
        except InvalidOperation:
            pass
        if  inserted:
            self.logdb.insert('cache', {'insert': inserted})

    def insert_query_record(self, dasquery, header):
        """
        Insert query record into DAS cache.
        """
        dasheader  = header['das']
        # check presence of API record in a cache
        system     = dasheader['system']
        if  not self.incache(dasquery, collection='cache', system=system):
            msg = "query=%s, header=%s" % (dasquery, header)
            self.logger.debug(msg)
            q_record = dict(das=dasheader, query=dasquery.storage_query)
            q_record['das']['empty_record'] = 0
            q_record['das']['status'] = "requested"
            q_record['qhash'] = dasquery.qhash
            self.col.insert(q_record, safe=True)

    def generate_records(self, dasquery, results, header):
        """
        Iterate over provided results, update records and yield them
        to next level (update_cache)
        """
        self.logger.debug("(%s) store to cache" % dasquery)
        if  not results:
            return
        # update das record with new status
        status = 'Update DAS cache, %s API' % header['das']['api'][0]
        self.update_query_record(dasquery, status, header)

        dasheader  = header['das']
        expire     = dasheader['expire']
        system     = dasheader['system']
        rec        = [k for i in header['lookup_keys'] for k in i.values()]
        cond_keys  = dasquery.mongo_query['spec'].keys()
        # get API record id
        spec       = {'qhash':dasquery.qhash, 'das.system':system}
        record     = self.col.find_one(spec, fields=['_id'])
        counter    = 0
        prim_key   = rec[0][0] # use rec instead of lkeys[0], which re-orders items
        if  record:
            objid  = record['_id']
            if  isinstance(results, list) or isinstance(results, GeneratorType):
                for item in results:
                    counter += 1
                    item['das'] = dict(expire=expire, primary_key=prim_key,
                                       condition_keys=cond_keys,
                                       instance=dasquery.instance,
                                       system=system, empty_record=0)
                    item['das_id'] = str(objid)
                    item['qhash'] = dasquery.qhash
                    yield item
            else:
                print "\n\n ### results = ", str(results)
                raise Exception('Provided results is not a list/generator type')
        self.logger.info("\n")
        msg = "%s yield %s rows" % (dasheader['system'], counter)
        self.logger.info(msg)

    def remove_from_cache(self, dasquery):
        """
        Remove query from DAS cache. To do so, we retrieve API record
        and remove all data records from das.cache and das.merge
        """
        records = self.col.find({'qhash':dasquery.qhash})
        id_list = []
        for row in records:
            if  row['_id'] not in id_list:
                id_list.append(row['_id'])
        spec = {'das_id':{'$in':id_list}}
        self.logdb.insert('merge', {'delete': self.merge.find(spec).count()})
        self.merge.remove(spec)
        self.logdb.insert('cache', {'delete': self.col.find(spec).count()})
        self.col.remove(spec)
        self.col.remove({'qhash':dasquery.qhash})

    def clean_cache(self):
        """
        Clean expired docs in das.cache and das.merge. 
        """
        current_time = time.time()
        query = {'das.expire': { '$lt':current_time} }
        self.logdb.insert('merge', {'delete': self.merge.find(query).count()})
        self.merge.remove(query)
        self.logdb.insert('cache', {'delete': self.col.find(query).count()})
        self.col.remove(query)

    def delete_cache(self):
        """
        Delete all results in DAS cache/merge collection, including
        internal indexes.
        """
        self.logdb.insert('cache', {'delete': self.col.count()})
        self.col.remove({})
        try: 
            self.col.drop_indexes()
        except:
            pass
        self.logdb.insert('merge', {'delete': self.merge.count()})
        self.merge.remove({})
        try:
            self.merge.drop_indexes()
        except:
            pass
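Both update_cache and merge_records above push documents into MongoDB in
chunks of self.cache_size by repeatedly slicing a generator with
itertools.islice until it is exhausted. A self-contained sketch of that
pattern, with a plain function standing in for pymongo's collection.insert
(the chunk size and the records are illustrative):

import itertools

def bulk_insert(records, insert, size=3):
    "Insert records from a generator in chunks of `size` until exhausted"
    inserted = 0
    while True:
        chunk = list(itertools.islice(records, size))
        if not chunk:
            break
        inserted += len(insert(chunk))
    return inserted

# toy usage: the fake insert just returns the ids it would have written
gen = ({'qhash': 'abc', 'idx': i} for i in range(10))
print(bulk_insert(gen, lambda docs: [d['idx'] for d in docs]))  # prints 10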
Beispiel #13
0
class DASKeyLearning(object):
    """
    This class manages DAS key-learning DB.

    Key-learning is an intermittent process (triggered infrequently
    by a task running in the analytics framework). It searches the raw
    cache for output documents (a subset chosen for maximum primary-key
    coverage), generates the set of all data members (in dotted-dict
    notation) and stores those as primary-key:data-member records
    (with an associated last-updated time).

    """
    def __init__(self, config):
        self.verbose  = config['verbose']
        self.logger   = PrintManager('DASKeyLearning', self.verbose)
        self.services = config['services']
        self.dburi    = config['mongodb']['dburi']
        self.dbname   = config['keylearningdb']['dbname']
        self.colname  = config['keylearningdb']['collname']

        self.mapping  = config['dasmapping']

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        self.das_son_manipulator = DAS_SONManipulator()
        index_list = [('system', ASCENDING), ('urn', ASCENDING), \
                ('members', ASCENDING), ('stems', ASCENDING)]
        create_indexes(self.col, index_list)

    @property
    def col(self):
        "col property provides access to DAS keylearning collection"
        conn = db_connection(self.dburi)
        mdb  = conn[self.dbname]
        colnames = mdb.collection_names()
        if  not colnames or self.colname not in colnames:
            try:
                mdb.create_collection(self.colname)
            except OperationFailure:
                pass
        mdb.add_son_manipulator(self.das_son_manipulator)
        return mdb[self.colname]

    def add_record(self, dasquery, rec):
        """
        Add/update to keylearning DB keys/attributes from given record.
        To do so, we parse it and call add_members method.
        """
        if  not ('das' in rec and 'system' in rec['das']):
            return
        das = rec['das']
        if  'system' not in das or 'api' not in das or 'primary_key' not in das:
            return
        systems = das['system']
        apis = das['api']
        pkey = das['primary_key'].split('.')[0]
        data = rec.get(pkey, [])
        members = dict_members(data, pkey)
        for srv, api in zip(systems, apis):
            self.add_members(srv, api, members)
        # insert new record for query pattern
        fields = dasquery.mongo_query.get('fields', [])
        if  fields:
            for field in fields:
                if  field in DAS_RECORD_KEYS:
                    continue
                new_members = [m for m in dict_members(rec[field], field) if m]
                members += new_members
        for attr in members:
            spec = {'member': attr}
            doc = {'query_pat': dasquery.query_pat}
            self.col.update(spec, {'$addToSet': doc}, upsert=True)

    def add_members(self, system, urn, members):
        """
        Add a list of data members for a given API (system, urn), and
        generate stems for them, which are stored as separate records.
        """
        msg = "system=%s, urn=%s, members=%s)" % (system, urn, members)
        self.logger.info(msg)

        result = self.col.find_one({'system': system, 'urn': urn})
        if result:
            self.col.update({'_id': ObjectId(result['_id'])},
                            {'$addToSet': {'members': {'$each': members}}})
        else:
            keys = self.mapping.api2daskey(system, urn)
            self.col.insert({'system': system,
                             'urn': urn,
                             'keys': keys,
                             'members': members})

        for member in members:
            if not self.col.find_one({'member': member}):
                self.col.insert({'member': member,
                                 'stems': stem(member)})

    def text_search(self, text):
        """
        Perform a text search for data members matching a string. The input is
        split if it already includes dotted elements (in which case we need to
        find a member matching all the split elements), otherwise we look for
        any member whose stem list contains the text.
        """
        text = text.lower()
        if '.' in text:
            possible_members = self.col.find(\
                    {'stems': {'$all': text.split('.')}}, fields=['member'])
        else:
            possible_members = self.col.find({'stems': text},\
                                             fields=['member'])
        return [doc['member'] for doc in possible_members]

    def attributes(self):
        """
        Return full list of keyword attributes known in DAS.
        """
        spec = {'member':{'$exists':True}}
        return self.col.find(spec)

    def member_info(self, member):
        """
        Once the text search has identified a member that might be a match,
        return which systems, APIs and hence DAS keys this points to.
        """
        result = []
        for doc in self.col.find({'members': member},
                                 fields=['system', 'urn', 'keys']):

            result.append({'system': doc['system'],
                           'urn': doc['urn'],
                           'keys': doc['keys']})
        return result

    def key_search(self, text, limitkey=None):
        """
        Try to find suggested DAS keys by performing a member search and then
        mapping the matches back to the DAS keys they are produced by.
        """
        text = text.lower()
        result = collections.defaultdict(set)
        for member in self.text_search(text):
            for info in self.member_info(member):
                result[tuple(info['keys'])].add(member)
        if limitkey:
            # iterate over a copy of the keys since entries are deleted below
            for key in list(result):
                if limitkey not in key:
                    del result[key]
        return result

    def members_for_keys(self, keys):
        """
        Return all the members that exactly match the set of keys
        """
        result = []
        for doc in self.col.find({'keys': {'$all': keys, '$size': len(keys)}},
                                 fields=['members']):
            result += doc['members']
        return result


    def has_member(self, member):
        """
        Return true if we know anything about the given member.
        """
        if self.col.find_one({'member': member}):
            return True
        else:
            return False

    def list_members(self):
        "Return list of members in keylearning collection"
        return self.col.find({'members': {'$exists': True},
                              'system': {'$exists': True},
                              'urn': {'$exists': True}})
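text_search and key_search above match dotted member names such as
site.name against per-member stem lists stored in the key-learning
collection. A minimal in-memory sketch of the same idea, using a naive
split-based stemmer instead of the real stem() helper (the documents,
the stemmer and the member names are illustrative):

import collections

def naive_stems(member):
    "Toy stand-in for the real stem() helper: split a member on dots"
    return member.lower().split('.')

DOCS = [
    {'member': 'site.name', 'keys': ['site']},
    {'member': 'block.replica.site', 'keys': ['block']},
]

def toy_text_search(text):
    "Return members whose stems contain every dotted part of `text`"
    parts = set(text.lower().split('.'))
    return [d['member'] for d in DOCS
            if parts.issubset(naive_stems(d['member']))]

def toy_key_search(text):
    "Group matched members by the DAS keys they map back to"
    result = collections.defaultdict(set)
    for doc in DOCS:
        if doc['member'] in toy_text_search(text):
            result[tuple(doc['keys'])].add(doc['member'])
    return dict(result)

print(toy_text_search('site.name'))  # ['site.name']
print(toy_key_search('site'))        # both members contain the 'site' stem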
Beispiel #14
0
class DASCore(object):
    """
    DAS core class.
    """

    def __init__(self, config=None, debug=0, nores=False, logger=None, engine=None, multitask=True):
        if config:
            dasconfig = config
        else:
            dasconfig = das_readconfig()
        verbose = dasconfig["verbose"]
        self.stdout = debug
        if isinstance(debug, int):
            self.verbose = debug
            dasconfig["verbose"] = debug
        else:
            self.verbose = verbose
        das_timer("DASCore::init", self.verbose)
        self.operators = das_operators()

        # set noresults option
        self.noresults = False
        if nores:
            dasconfig["write_cache"] = True
            self.noresults = nores

        self.multitask = dasconfig["das"].get("multitask", True)
        if debug or self.verbose:
            self.multitask = False  # in verbose mode do not use multitask
            dasconfig["das"]["multitask"] = False
        if not multitask:  # explicitly call DASCore ctor
            self.multitask = False
            dasconfig["das"]["multitask"] = False
        dasconfig["engine"] = engine
        if self.multitask:
            nworkers = dasconfig["das"].get("core_workers", 5)
            if engine:
                thr_name = "DASCore:PluginTaskManager"
                self.taskmgr = PluginTaskManager(engine, nworkers=nworkers, name=thr_name)
                self.taskmgr.subscribe()
            else:
                thr_name = "DASCore:TaskManager"
                self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        if logger:
            self.logger = logger
        else:
            self.logger = PrintManager("DASCore", self.verbose)

        # define Mapping/Analytics/Parser in this order since Parser depends
        # on first two
        dasmapping = DASMapping(dasconfig)
        dasconfig["dasmapping"] = dasmapping
        self.mapping = dasmapping

        self.keylearning = DASKeyLearning(dasconfig)
        dasconfig["keylearning"] = self.keylearning

        # init DAS cache
        self.rawcache = DASMongocache(dasconfig)
        dasconfig["rawcache"] = self.rawcache

        # plug-in architecture: loop over registered data-services in
        # dasconfig; load appropriate module/class; register data
        # service with DASCore.
        self.systems = dasmapping.list_systems()
        # pointer to the DAS top level directory
        dasroot = "/".join(__file__.split("/")[:-3])
        for name in self.systems:
            try:
                klass = "DAS/services/%s/%s_service.py" % (name, name)
                srvfile = os.path.join(dasroot, klass)
                with open(srvfile) as srvclass:
                    for line in srvclass:
                        if line.find("(DASAbstractService)") != -1:
                            klass = line.split("(DASAbstractService)")[0]
                            klass = klass.split("class ")[-1]
                            break
                mname = "DAS.services.%s.%s_service" % (name, name)
                module = __import__(mname, fromlist=[klass])
                obj = getattr(module, klass)(dasconfig)
                setattr(self, name, obj)
            except IOError as err:
                if debug > 1:
                    # we have virtual services, so IOError can be correct
                    print_exc(err)
                try:
                    mname = "DAS.services.generic_service"
                    module = __import__(mname, fromlist=["GenericService"])
                    obj = module.GenericService(name, dasconfig)
                    setattr(self, name, obj)
                except Exception as exc:
                    print_exc(exc)
                    msg = "Unable to load %s data-service plugin" % name
                    raise Exception(msg)
            except Exception as exc:
                print_exc(exc)
                msg = "Unable to load %s data-service plugin" % name
                raise Exception(msg)

        # loop over systems and get system keys, add mapping keys to final list
        self.service_keys = {}
        self.service_parameters = {}
        for name in self.systems:
            skeys = list(getattr(self, name).keys())
            self.service_keys[getattr(self, name).name] = skeys
            sparams = getattr(self, name).parameters()
            self.service_parameters[getattr(self, name).name] = sparams

        self.service_keys["special"] = das_special_keys()
        self.dasconfig = dasconfig
        das_timer("DASCore::init", self.verbose)

    def keys(self):
        """
        Return map of data service keys
        """
        return self.service_keys

    def das_keys(self):
        """
        Return flat list of all keys known across data services
        """
        _keys = ["records"]
        for values in self.service_keys.values():
            for key in values:
                if key not in _keys:
                    _keys.append(key)
        return _keys

    def result(self, query, idx=0, limit=None):
        """
        Get results either from cache or from explicit call
        """
        self.logger.info("input query=%s" % query)
        results = []
        dasquery = DASQuery(query)
        query = dasquery.mongo_query
        # check if we have any service which cover the query
        # otherwise decompose it into list of queries
        service_map = dasquery.service_apis_map()
        if not service_map:
            msg = "no APIs found to answer input query, will decompose it"
            self.logger.info(msg)
            skeys = query["fields"]
            if not skeys:
                skeys = []
            for key in skeys:
                newquery = DASQuery(dict(fields=[key], spec=query["spec"]))
                self.call(newquery)  # process query
        else:
            self.call(dasquery)  # process query

        # lookup provided query in a cache
        if not self.noresults:
            results = self.get_from_cache(dasquery, idx, limit)
        return results

    def remove_from_cache(self, dasquery):
        """
        Delete in cache entries about input query
        """
        self.rawcache.remove_from_cache(dasquery)

    def get_status(self, dasquery):
        """
        Look-up status of provided query in a cache.
        Return status of the query request and its hash.
        """
        status = None
        error = None
        reason = None
        if dasquery and "fields" in dasquery.mongo_query:
            fields = dasquery.mongo_query["fields"]
            if fields and isinstance(fields, list) and "queries" in fields:
                return "ok", error, reason
        record = self.rawcache.find(dasquery)
        error, reason = self.rawcache.is_error_in_records(dasquery)
        try:
            if record and "das" in record and "status" in record["das"]:
                status = record["das"]["status"]
                if not error:
                    error = record["das"].get("error", error)
                if not reason:
                    reason = record["das"].get("reason", reason)
                return status, error, reason
        except Exception as exc:
            print_exc(exc)
            status = error = reason = None
            self.rawcache.remove_from_cache(dasquery)
        return status, error, reason

    def worker(self, srv, dasquery):
        """Main worker function which calls data-srv call function"""
        self.logger.info("##### %s ######\n" % srv)
        das_timer(srv, self.verbose)
        getattr(getattr(self, srv), "call")(dasquery)
        das_timer(srv, self.verbose)

    def insert_query_records(self, dasquery):
        """
        Insert DAS query records into DAS cache and return list of services
        which will answer this query
        """
        services = dasquery.services
        self.logger.info("Potential services = %s" % services)
        if not services:
            msg = "No data-services for query %s" % dasquery
            msg += "mongo_query: %s" % dasquery.mongo_query
            msg += "params: %s" % dasquery.params()
            print(dastimestamp("DAS WARNING "), msg)

        # get list of URI which can answer this query
        ack_services = []
        for srv in services:
            gen = [t for t in getattr(getattr(self, srv), "apimap")(dasquery)]
            for url, api, args, iformat, expire in gen:
                header = dasheader(srv, dasquery, expire, api, url, ctime=0)
                self.rawcache.insert_query_record(dasquery, header)
                if srv not in ack_services:
                    ack_services.append(srv)
        if not ack_services:
            ack_services = services
        if dasquery.query.find("records ") != -1:
            srv_status = True  # skip DAS queries w/ records request
        # create das record with initial expire tstamp 2 min in the future;
        # it should be sufficient for processing data-srv records
        expire = time.time() + 2 * 60
        header = dasheader("das", dasquery, expire, api="das_core", services=dict(das=ack_services))
        header["lookup_keys"] = []
        self.rawcache.insert_query_record(dasquery, header)
        das_timer("das_record", self.verbose)
        return ack_services

    def call(self, query, **kwds):
        """
        Top level DAS api which execute a given query using underlying
        data-services. It follows the following steps:

            - parse input query
            - identify data-services based on selection keys
              and where clause conditions
            - construct DAS workflow and execute data-service 
              API calls. At this step individual 
              data-services store results into DAS cache.

        Return status 0/1 depending on success of the calls, can be
        used by workers on cache server.

        kwds is provided for compatibility with web layer, e.g. it
        may invoke this method with additional pid parameter.
        """

        def update_das_query(dasquery, status, reason=None):
            "Update DAS query record with given status and reason"
            self.rawcache.update_query_record(dasquery, status, reason=reason)
            self.rawcache.add_to_record(dasquery, {"das.timer": get_das_timer()}, system="das")
            # make sure that the das record is updated; we retry up to 7 times
            # with quadratically growing sleeps (~1.5 minutes in total) to
            # cover the default syncdelay value of the mongo server (in the
            # future it would be better to determine this syncdelay value
            # programmatically, but the pymongo driver does not seem to
            # provide any API for it)
            for idx in range(0, 7):
                spec = {"qhash": dasquery.qhash, "das.system": ["das"]}
                res = self.rawcache.col.find_one(spec)
                if res:
                    dbstatus = res.get("das", {}).get("status", None)
                    if dbstatus == status:
                        break
                    msg = "qhash %s, das.status=%s, status=%s, wait for update" % (dasquery.qhash, dbstatus, status)
                    print(dastimestamp("DAS WARNING"), msg)
                self.rawcache.update_query_record(dasquery, status, reason=reason)
                time.sleep(idx * idx)

        self.logger.info("input query=%s" % query)
        das_timer("DASCore::call", self.verbose)
        if isinstance(query, object) and hasattr(query, "__class__") and query.__class__.__name__ == "DASQuery":
            dasquery = query
        else:
            dasquery = DASQuery(query)
        for col in ["merge", "cache"]:
            self.rawcache.remove_expired(dasquery, col)
        query = dasquery.mongo_query
        spec = query.get("spec")
        fields = query.get("fields")
        if fields == ["records"]:
            msg = "look-up all records in cache"
            self.logger.info(msg)
            return "in cache"
        if spec == dict(records="*"):
            self.logger.info("look-up everything in cache")
            return "in cache"
        for record in self.rawcache.find_specs(dasquery):
            status = record["das"]["status"]
            msg = "found query %s in cache, status=%s\n" % (record["query"], status)
            self.logger.info(msg)
            print(dastimestamp("DAS INFO"), msg)
            return status

        self.logger.info(dasquery)
        das_timer("das_record", self.verbose)
        services = self.insert_query_records(dasquery)
        if not services:
            msg = "unable to locate data-services to fulfill this request"
            msg += ", will iterate over all registered services"
            print(dastimestamp("DAS WARNING "), dasquery, msg)
            services = dasquery.services if dasquery.services else self.systems
        try:
            if self.multitask:
                jobs = []
                for srv in sorted(services):
                    jobs.append(self.taskmgr.spawn(self.worker, srv, dasquery))
                self.taskmgr.joinall(jobs)
            else:
                for srv in services:
                    self.worker(srv, dasquery)
        except Exception as exc:
            print_exc(exc)
            return "fail"
        self.logger.info("\n##### merging ######\n")
        update_das_query(dasquery, "merging")
        das_timer("merge", self.verbose)
        for attempt in range(0, 4):  # try a couple of times to avoid DB problems
            time.sleep(attempt)
            status = self.rawcache.merge_records(dasquery, attempt)
            if status == "ok":
                break
        das_timer("merge", self.verbose)
        # check if we have service records and properly setup status
        self.logger.info("\n##### check services ######\n")
        das_services = self.rawcache.check_services(dasquery)
        reason = ""
        status = "ok"
        if not das_services:
            if "records" in dasquery.query:
                status = "ok"  # keep status ok for 'records' queries
            else:
                reason = "no data records found in DAS cache"
                status = "fail"
                print(dastimestamp("DAS ERROR "), dasquery, reason)
        update_das_query(dasquery, status, reason)
        das_timer("DASCore::call", self.verbose)
        return status

    def processing_time(self, dasquery):
        "Look-up and return DAS query processing time"
        query_record = self.rawcache.find(dasquery)
        if query_record:
            das = query_record.get("das", None)
            if isinstance(das, dict):
                ctime = das.get("ctime", [])
                if ctime:
                    return ctime[-1] - ctime[0]
        return None

    def nresults(self, dasquery, coll="merge"):
        """
        Return total number of results (count) for provided query.
        The code should match the body of the get_from_cache method.
        """
        fields = dasquery.mongo_query.get("fields", None)
        if dasquery.mapreduce:
            result = self.rawcache.map_reduce(dasquery.mapreduce, dasquery)
            return len([1 for _ in result])
        elif dasquery.aggregators:
            return len(dasquery.aggregators)
        return self.rawcache.nresults(dasquery, coll)

    def apilist(self, dasquery):
        "Return list of APIs answer given das query"
        return self.rawcache.apilist(dasquery)

    def incache(self, dasquery, coll="merge"):
        """
        Answer the question if given query in DAS cache or not
        """
        return self.rawcache.incache(dasquery, collection=coll)

    def get_from_cache(self, dasquery, idx=0, limit=0, collection="merge"):
        """
        Look-up results from the merge cache and yield them for
        further processing.
        """
        das_timer("DASCore::get_from_cache", self.verbose)
        msg = "col=%s, query=%s, idx=%s, limit=%s" % (collection, dasquery, idx, limit)
        self.logger.info(msg)

        fields = dasquery.mongo_query.get("fields", None)

        if dasquery.mapreduce:
            res = self.rawcache.map_reduce(dasquery.mapreduce, dasquery)
        elif dasquery.aggregators:
            # extract das information from rawcache
            rows = self.rawcache.get_from_cache(dasquery, collection=collection)
            first = next(rows)
            sinfo = das_sinfo(first)
            # to perform aggregation we need:
            # - loop over all aggregator functions
            # - loop over all data-services
            # - loop over all APIs within a data-services
            # the code below does that, it applies aggregator
            # to selected (based on key/srv/api) records
            res = []
            _id = 0
            time0 = time.time()
            expire = 300  # min expire
            for func, key in dasquery.aggregators:
                afunc = getattr(das_aggregator, "das_%s" % func)
                found = False
                for srv, apis in sinfo.items():
                    for api in apis:
                        rows = self.rawcache.get_from_cache(dasquery, collection=collection)
                        gen = api_rows(rows, api)
                        data = afunc(key, gen)
                        ctime = time.time() - time0
                        das = dasheader(srv, dasquery, expire, api=api, ctime=ctime)
                        if isinstance(data, dict) and data["value"] != "N/A":
                            aggr = {"_id": _id, "function": func, "key": key, "result": data}
                            aggr.update(das)
                            res.append(aggr)
                            _id += 1
                            found = True
                if not found:  # when we got nothing add empty result record
                    empty = {"value": "N/A"}
                    ctime = time.time() - time0
                    das = dasheader("das", dasquery, expire, api="das_core", ctime=ctime)
                    rec = {"_id": 0, "function": func, "key": key, "result": empty}
                    rec.update(das)
                    res.append(rec)
        else:
            res = self.rawcache.get_from_cache(dasquery, idx, limit, collection=collection)
        # we assume that all records from single query will have
        # identical structure, therefore it will be sufficient to update
        # keylearning DB only with first record
        count = 0
        for row in res:
            if not count:
                self.keylearning.add_record(dasquery, row)
            fix_times(row)
            yield row
            count += 1
        das_timer("DASCore::get_from_cache", self.verbose)
Beispiel #15
0
class DASMapping(object):
    """
    This class manages DAS mapping DB.
    """

    def __init__(self, config):
        self.verbose = config["verbose"]
        self.logger = PrintManager("DASMapping", self.verbose)
        self.services = config["services"]
        self.dburi = config["mongodb"]["dburi"]
        self.dbname = config["mappingdb"]["dbname"]
        self.colname = config["mappingdb"]["collname"]
        self.map_test = config.get("map_test", True)
        self.main_dbs = config["das"].get("main_dbs", "dbs")
        self.dbsinsts = config["das"].get("dbs_instances", [])

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        self.init()
        self.on_reload = Event()

        # Monitoring thread which performs auto-reconnection to MongoDB
        thname = "mappingdb_monitor"
        sleep = 5
        reload_time = config["mappingdb"].get("reload_time", 86400)
        reload_time_bad_maps = config["mappingdb"].get("reload_time_bad_maps", 120)
        start_new_thread(
            thname,
            db_monitor,
            (self.dburi, self.init, sleep, self.load_maps, reload_time, self.check_maps, reload_time_bad_maps),
        )

        self.daskeyscache = {}  # to be filled at run time
        self.systems = []  # to be filled at run time
        self.dasmapscache = {}  # to be filled at run time
        self.keymap = {}  # to be filled at run time
        self.presentationcache = {}  # to be filled at run time
        self.reverse_presentation = {}  # to be filled at run time
        self.notationcache = {}  # to be filled at run time
        self.diffkeycache = {}  # to be filled at run time
        self.apicache = {}  # to be filled at run time
        self.dbs_global_url = None  # to be determined at run time
        self.dbs_inst_names = None  # to be determined at run time
        self.load_maps(notify=False)

    @property
    def col(self):
        "Return MongoDB collection object"
        conn = db_connection(self.dburi)
        dbc = conn[self.dbname]
        col = dbc[self.colname]
        return col

    # ===============
    # Management APIs
    # ===============
    def load_maps(self, notify=True):
        "Helper function to reload DAS maps"
        self.init_dasmapscache()
        self.init_notationcache()
        self.init_presentationcache()
        self.systems = None  # re-initialize DAS system list
        self.list_systems()
        self.dbs_global_url = None  # re-initialize DAS dbs global url
        self.dbs_url()
        self.dbs_inst_names = None  # re-initialize DAS dbs instances
        self.dbs_instances()

        if notify:
            self.on_reload()

    def init_dasmapscache(self, records=[]):
        "Read DAS maps and initialize DAS API maps"
        if not records:
            spec = {"type": "service"}
            records = self.col.find(spec, exhaust=True)
        for row in records:
            if "urn" in row:
                api = row["urn"]
                srv = row["system"]
                for dmap in row["das_map"]:
                    for key, val in dmap.iteritems():
                        if key == "pattern":
                            pat = re.compile(val)
                            dmap[key] = pat
                key = (row["system"], row["urn"])
                self.dasmapscache[key] = row

    def init_notationcache(self):
        """
        Initialize notation cache by reading notations.
        """
        for system, notations in self.notations().iteritems():
            for row in notations:
                key = system, row["api_output"]
                if key in self.notationcache:
                    self.notationcache[key] += [(row["api"], row["rec_key"])]
                else:
                    self.notationcache[key] = [(row["api"], row["rec_key"])]

    def init_presentationcache(self):
        """
        Initialize presentation cache by reading presentation map.
        """
        spec = {"type": "presentation"}
        data = find_one(self.col, spec)
        if data:
            self.presentationcache = data["presentation"]
            for daskey, uilist in self.presentationcache.iteritems():
                for row in uilist:
                    link = None
                    if "link" in row:
                        link = row["link"]
                    if "diff" in row:
                        self.diffkeycache[daskey] = row["diff"]
                    tdict = {daskey: {"mapkey": row["das"], "link": link}}
                    if row["ui"] in self.reverse_presentation:
                        self.reverse_presentation[row["ui"]].update(tdict)
                    else:
                        self.reverse_presentation[row["ui"]] = {daskey: {"mapkey": row["das"], "link": link}}

    def das_presentation_map(self):
        "Read DAS presentation map"
        spec = {"type": "presentation"}
        data = find_one(self.col, spec)
        if data:
            for daskey, uilist in data.get("presentation", {}).iteritems():
                for row in uilist:
                    if "link" in row:
                        yield row

    def init(self):
        """
        Establish connection to MongoDB back-end and create DB.
        """
        col = None
        try:
            conn = db_connection(self.dburi)
            if conn:
                dbc = conn[self.dbname]
                col = dbc[self.colname]
        #            print "### DASMapping:init started successfully"
        except ConnectionFailure as _err:
            tstamp = dastimestamp("")
            thread = threading.current_thread()
            print "### MongoDB connection failure thread=%s, id=%s, time=%s" % (thread.name, thread.ident, tstamp)
        except Exception as exc:
            print_exc(exc)
        if col:
            index = [
                ("type", DESCENDING),
                ("system", DESCENDING),
                ("urn", DESCENDING),
                ("das_map.das_key", DESCENDING),
                ("das_map.rec_key", DESCENDING),
                ("das_map.api_arg", DESCENDING),
            ]
            create_indexes(col, index)

    def delete_db(self):
        """
        Delete mapping DB in MongoDB back-end.
        """
        conn = db_connection(self.dburi)
        if conn:
            conn.drop_database(self.dbname)

    def delete_db_collection(self):
        """
        Delete mapping DB collection in MongoDB.
        """
        conn = db_connection(self.dburi)
        if conn:
            dbc = conn[self.dbname]
            dbc.drop_collection(self.colname)

    def check_maps(self):
        """
        Check Mapping DB and return true/false based on its content
        """
        if not self.map_test:
            return True  # do not test DAS maps, useful for unit tests
        udict = defaultdict(int)
        ndict = defaultdict(int)
        pdict = defaultdict(int)
        adict = {}
        maps_hash = False
        for row in self.col.find(exhaust=True):
            check_map_record(row)
            if "urn" in row:
                udict[row["system"]] += 1
            elif "notations" in row:
                ndict[row["system"]] += 1
            elif "presentation" in row:
                pdict["presentation"] += 1
            elif "arecord" in row:
                arec = row["arecord"]
                system = arec["system"]
                rec = {arec["type"]: arec["count"]}
                if system in adict:
                    adict[system].update(rec)
                else:
                    adict[system] = rec
            elif "verification_token" in row:
                maps_hash = row["verification_token"]

        # retrieve uri/notation/presentation maps
        ulist = []
        nlist = []
        for system in adict.keys():
            if "uri" in adict[system]:
                ulist.append(adict[system]["uri"] == udict[system])
                nlist.append(adict[system]["notations"] == ndict[system])
        status_umap = sum(ulist) == len(ulist)
        status_nmap = sum(nlist) == len(nlist)
        status_pmap = adict.get("presentation", {}).get("presentation", 0) == 1
        # verify completeness of maps
        calc_token = verification_token(self.col.find(exhaust=True))
        status_complete = maps_hash and maps_hash == calc_token
        if self.verbose:
            print "### DAS map status, umap=%s, nmap=%s, pmap=%s, complete=%s" % (
                status_umap,
                status_nmap,
                status_pmap,
                status_complete,
            )
        if not status_complete:
            print "### DAS map hash do not match, got=%s calculated=%s" % (maps_hash, calc_token)
        # multiply statuses as a result of this map check
        return status_umap * status_nmap * status_pmap * status_complete

    def remove(self, spec):
        """
        Remove record in DAS Mapping DB for provided Mongo spec.
        """
        self.col.remove(spec)

    def add(self, record):
        """
        Add new record into mapping DB. Example of URI record

        .. doctest::

            {
             system : dbs,
             urn : listBlocks,
             url : "http://a.b.com/api",
             params : [{"apiversion": "1_2_2", "se": "*"}],
             lookup : block,
             das_map: [
                 {"das_key":"block", "rec_key":"block.name"},
                 {"das_key":"site", "rec_key":"site.name", "api_arg":"se", "pattern":"^T[0-3]_"},
             ]
            }

        Example of notation record:

        .. doctest::

             notations: [
                 {"api_output" : "storage_element_name", "rec_key":"site", "api": ""},
             ]
        """
        msg = "record=%s" % record
        self.logger.debug(msg)
        self.col.insert(record)
        self.init_dasmapscache([record])

    # ==================
    # Informational APIs
    # ==================
    def dbs_global_instance(self, system=None):
        "Retrive from mapping DB DBS url and extract DBS instance"
        if not system:
            system = self.main_dbs
        url = self.dbs_url(system)
        return get_dbs_instance(url)

    def dbs_url(self, system=None):
        "Retrive from mapping DB DBS url"
        if not system:
            system = self.main_dbs
        systems = self.list_systems()
        dbses = set(["dbs", "dbs3"])
        if dbses & set(systems) != dbses:
            # use caching only when we operate with single DBS
            if self.dbs_global_url:
                return self.dbs_global_url
        url = None
        for srv in systems:
            if srv == system:
                apis = self.list_apis(srv)
                url = self.api_info(srv, apis[0])["url"]
                url = parse_dbs_url(srv, url)
                self.dbs_global_url = url
                return url
        return url

    def dbs_instances(self, system=None):
        "Retrive from mapping DB DBS instances"
        # use dbs istances from the config
        if self.dbsinsts and not system:
            return self.dbsinsts
        # default dbs
        if not system:
            system = self.main_dbs
        systems = self.list_systems()
        dbses = set(["dbs", "dbs3"])
        if dbses & set(systems) != dbses:
            # use caching only when we operate with single DBS
            if self.dbs_inst_names:
                return self.dbs_inst_names
        insts = []
        for srv in systems:
            if srv == system:
                apis = self.list_apis(srv)
                insts = self.api_info(srv, apis[0])["instances"]
                self.dbs_inst_names = insts
                return insts
        return insts

    def list_systems(self):
        """
        List all DAS systems.
        """
        if not self.systems:
            spec = {"type": "service", "system": {"$ne": None}}
            gen = (row["system"] for row in self.col.find(spec, ["system"], exhaust=True))
            self.systems = list(set(gen2list(gen)) & set(self.services))
        return self.systems

    def list_apis(self, system=None):
        """
        List all APIs.
        """
        if self.apicache and system in self.apicache:
            return self.apicache[system]
        spec = {"type": "service", "urn": {"$ne": None}}
        if system:
            spec["system"] = system
        gen = (row["urn"] for row in self.col.find(spec, ["urn"], exhaust=True))
        self.apicache[system] = gen2list(gen)
        return self.apicache[system]

    def api_info(self, srv, api_name):
        """
        Return full API info record.
        """
        return self.dasmapscache[(srv, api_name)]

    def relational_keys(self, system1, system2):
        """
        Return a list of relational keys between provided systems
        """
        keys1, keys2 = [], []
        for system, keys in self.daskeys().iteritems():
            if system == system1:
                keys1 = keys
            if system == system2:
                keys2 = keys
        return list(set(keys1) & set(keys2))

    def daskeys(self, das_system=None):
        """
        Return a dict with all known DAS keys.
        """
        if das_system in self.daskeyscache:
            return self.daskeyscache[das_system]

        spec = {"type": "service", "system": {"$ne": None}}
        if das_system:
            spec = {"system": das_system}
        gen = (row["system"] for row in self.col.find(spec, ["system"], exhaust=True))
        gen = [r for r in gen]
        kdict = {}
        for system in gen:
            spec = {"system": system, "urn": {"$ne": None}}
            keys = []
            for row in self.col.find(spec, exhaust=True):
                for entry in row["das_map"]:
                    if entry["das_key"] not in keys:
                        keys.append(entry["das_key"])
            kdict[system] = keys
        # cache it
        self.daskeyscache[das_system] = kdict
        return kdict

    # ============
    # Look-up APIs
    # ============
    def api_lkeys(self, das_system, api):
        """
        Return DAS lookup keys for given das system and api
        """
        entry = self.dasmapscache[(das_system, api)]
        skeys = entry["lookup"].split(",")
        return skeys

    def primary_key(self, das_system, urn):
        """
        Return DAS primary key for provided system and urn. The DAS primary key
        is the first entry in the *lookup* attribute of the DAS API record.
        """
        spec = {"system": das_system, "urn": urn}
        record = find_one(self.col, spec)
        if not record:
            return None
        pkey = record["lookup"]
        if pkey.find(",") != -1:
            pkey = pkey.split(",")[0]
        return pkey

    def primary_mapkey(self, das_system, urn):
        """
        Return DAS primary map key for provided system and urn. For example,
        the file DAS key is mapped to file.name, so this API will return
        file.name
        """
        spec = {"system": das_system, "urn": urn}
        record = find_one(self.col, spec)
        mapkey = []
        for row in record["das_map"]:
            lkey = record["lookup"]
            if lkey.find(",") != -1:
                lkey = lkey.split(",")[0]
            if row["das_key"] == lkey:
                return row["rec_key"]
        return mapkey

    def find_daskey(self, das_system, map_key, value=None):
        """
        Find das key for given system and map key.
        """
        msg = "system=%s\n" % das_system
        daskeys = []
        for key, record in self.dasmapscache.iteritems():
            srv, _urn = key
            if das_system != srv:
                continue
            for row in record["das_map"]:
                das_key = row["das_key"]
                rec_key = row["rec_key"]
                if rec_key != map_key:
                    continue
                pat = row.get("pattern", None)
                if value:
                    if pat:
                        if pat.match(str(value)):
                            daskeys.append(das_key)
                        else:
                            msg += "-- reject key=%s, val=%s, pat=%s\n" % (map_key, value, pat.pattern)
                            self.logger.debug(msg)
                    else:
                        daskeys.append(das_key)
                else:
                    daskeys.append(das_key)
        return daskeys

    def find_mapkey(self, das_system, das_key, value=None):
        """
        Find map key for given system and das key.
        """
        msg = "system=%s\n" % das_system
        for key, record in self.dasmapscache.iteritems():
            srv, _urn = key
            if das_system != srv:
                continue
            for row in record["das_map"]:
                if row["das_key"] != das_key:
                    continue
                rec_key = row["rec_key"]
                pat = row.get("pattern", None)
                if value:
                    if pat:
                        if pat.match(str(value)):
                            return rec_key
                        else:
                            msg += "-- reject key=%s, val=%s, pat=%s\n" % (das_key, value, pat.pattern)
                            self.logger.debug(msg)
                            continue
                    else:
                        return rec_key
                else:
                    return rec_key

    def mapkeys(self, daskey):
        """
        Find all lookup keys (primary keys) for a given daskey
        """
        if daskey in self.keymap:
            return self.keymap[daskey]
        spec = {"das_map.das_key": daskey}
        mapkeys = []
        for row in self.col.find(spec, ["das_map"], exhaust=True):
            for kmap in row["das_map"]:
                if kmap["das_key"] == daskey and kmap["rec_key"] not in mapkeys:
                    mapkeys.append(kmap["rec_key"])
        self.keymap[daskey] = mapkeys
        return self.keymap[daskey]

    def find_apis(self, das_system, map_key):
        """
        Find list of apis which correspond to provided
        system and das map key.
        """
        spec = {"system": das_system, "das_map.rec_key": map_key}
        apilist = []
        for row in self.col.find(spec, ["urn"], exhaust=True):
            if "urn" in row and row["urn"] not in apilist:
                apilist.append(row["urn"])
        return apilist

    def find_system(self, key):
        """
        Return list of system names for provided DAS key.
        """
        spec = {"das_map.das_key": key}
        gen = (row["system"] for row in self.col.find(spec, ["system"], exhaust=True))
        systems = []
        for system in gen:
            if system not in systems:
                systems.append(system)
        systems.sort()
        return systems

    def lookup_keys(self, system, api, daskey=None, value=None):
        """
        Returns lookup keys for given system and provided
        selection DAS key, e.g. block => block.name
        """
        entry = self.dasmapscache.get((system, api), None)
        if not entry:
            return []
        lkeys = entry.get("lookup", []).split(",")
        rkeys = []
        if daskey in lkeys:
            for dmap in entry["das_map"]:
                rec_key = dmap["rec_key"]
                if daskey:
                    if dmap["das_key"] == daskey:
                        pat = dmap.get("pattern", None)
                        if value:
                            if pat.match(str(value)):
                                rkeys.append(rec_key)
                        else:
                            if rec_key not in rkeys:
                                rkeys.append(rec_key)
                else:
                    rkeys.append(rec_key)
        return rkeys

    def api2das(self, system, api_input_name):
        """
        Translates data-service API input parameter into DAS QL key,
        e.g. run_number => run.
        """
        query = {"system": system, "das_map.api_arg": api_input_name}
        names = []
        for adas in self.col.find(query, ["das_map"], exhaust=True):
            for row in adas["das_map"]:
                try:
                    if "api_arg" in row:
                        aparam = row["api_arg"]
                        daskey = row["das_key"]
                        if aparam == api_input_name and daskey not in names:
                            names.append(daskey)
                except Exception, err:
                    print "ERROR: look-up api_param/das_key in", row
                    raise err
        return names
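The find_daskey/find_mapkey helpers above select DAS record keys by matching an
optional compiled regex ("pattern") against the query value. A small
self-contained sketch of that selection logic, using hypothetical das_map data
(not taken from the original source):

import re

# hypothetical das_map entries, mirroring the records prepared by
# init_dasmapscache() above (patterns are pre-compiled there as well)
das_map = [
    {"das_key": "block", "rec_key": "block.name"},
    {"das_key": "site", "rec_key": "site.name",
     "api_arg": "se", "pattern": re.compile(r"^T[0-3]_")},
]

def find_mapkey(das_map, das_key, value=None):
    "Stand-alone version of the pattern check used in find_mapkey() above"
    for row in das_map:
        if row["das_key"] != das_key:
            continue
        pat = row.get("pattern")
        if value and pat and not pat.match(str(value)):
            continue  # value rejected by the pattern
        return row["rec_key"]

print(find_mapkey(das_map, "site", "T2_CH_CERN"))  # site.name
print(find_mapkey(das_map, "site", "XYZ"))         # None (pattern rejects value)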
Beispiel #16
0
class DASMapping(object):
    """
    This class manages DAS mapping DB.
    """
    __cached_inst = None
    __cached_params = None

    def __new__(cls, config):
        """
        creates a new instance of the class and cache it or return an existing
         instance if one exists (only when the params match).

        only the last instance is cached, but this simplifies the implementation
        as the param 'config' might be a complex unhashable object.
        """
        # check if we can reuse an existing instance
        if cls.__cached_inst and cls.__cached_params == config:
            if  config['verbose']:
                print("DASMapping::__new__: returning a cached instance")
            return cls.__cached_inst

        # otherwise create and initialize a new instance
        if  config['verbose']:
            print("DASMapping::__new__: creating a new instance")
        self = object.__new__(cls)

        self.verbose  = config['verbose']
        self.logger   = PrintManager('DASMapping', self.verbose)
        self.services = config['services']
        self.dburi    = config['mongodb']['dburi']
        self.dbname   = config['mappingdb']['dbname']
        self.colname  = config['mappingdb']['collname']
        self.map_test = config.get('map_test', True)
        self.main_dbs = config['das'].get('main_dbs', 'dbs3')
        self.dbsinsts = config['das'].get('dbs_instances', [])

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        self.das_son_manipulator = DAS_SONManipulator()
        index = [('type', DESCENDING),\
                 ('system', DESCENDING),\
                 ('urn', DESCENDING),\
                 ('das_map.das_key', DESCENDING),\
                 ('das_map.rec_key', DESCENDING),\
                 ('das_map.api_arg', DESCENDING),\
                 ]
        create_indexes(self.col, index)

        self.daskeyscache = {}         # to be filled at run time
        self.systems = []              # to be filled at run time
        self.dasmapscache = {}         # to be filled at run time
        self.keymap = {}               # to be filled at run time
        self.presentationcache = {}    # to be filled at run time
        self.reverse_presentation = {} # to be filled at run time
        self.notationcache = {}        # to be filled at run time
        self.diffkeycache = {}         # to be filled at run time
        self.apicache = {}             # to be filled at run time
        self.dbs_global_url = None     # to be determined at run time
        self.dbs_inst_names = None     # to be determined at run time
        self.load_maps()

        # cache the instance and return it
        DASMapping.__cached_inst = self
        DASMapping.__cached_params = config
        return self

    @property
    def col(self):
        "col property provides access to DAS mapping collection"
        conn = db_connection(self.dburi)
        mdb  = conn[self.dbname]
        colnames = mdb.collection_names()
        if  not colnames or self.colname not in colnames:
            try:
                mdb.create_collection(self.colname)
            except OperationFailure:
                pass
        mdb.add_son_manipulator(self.das_son_manipulator)
        return mdb[self.colname]

    # ===============
    # Management APIs
    # ===============
    def load_maps(self):
        "Helper function to reload DAS maps"
        self.init_dasmapscache()
        self.init_notationcache()
        self.init_presentationcache()
        self.systems = None        # re-initialize DAS system list
        self.list_systems()
        self.dbs_global_url = None # re-initialize DAS dbs global url
        self.dbs_url()
        self.dbs_inst_names = None # re-initialize DAS dbs instances
        self.dbs_instances()

    def init_dasmapscache(self, records=None):
        "Read DAS maps and initialize DAS API maps"
        if  not records:
            spec = {'type':'service'}
            records = self.col.find(spec, **PYMONGO_OPTS)
        for row in records:
            if  'urn' in row:
                for dmap in row['das_map']:
                    for key, val in dmap.items():
                        if  key == 'pattern':
                            pat = re.compile(val)
                            dmap[key] = pat
                key = (row['system'], row['urn'])
                self.dasmapscache[key] = row

    def init_notationcache(self):
        """
        Initialize notation cache by reading notations.
        """
        for system, notations in self.notations().items():
            for row in notations:
                key = system, row['api_output']
                if  key in self.notationcache:
                    self.notationcache[key] += [ (row['api'], row['rec_key']) ]
                else:
                    self.notationcache[key] = [ (row['api'], row['rec_key']) ]

    def init_presentationcache(self):
        """
        Initialize presentation cache by reading presentation map.
        """
        spec  = {'type':'presentation'}
        data  = find_one(self.col, spec)
        if  data:
            self.presentationcache = data['presentation']
            for daskey, uilist in self.presentationcache.items():
                for row in uilist:
                    link = None
                    if  'link' in row:
                        link = row['link']
                    if  'diff' in row:
                        self.diffkeycache[daskey] = row['diff']
                    tdict = {daskey : {'mapkey': row['das'], 'link': link}}
                    if  row['ui'] in self.reverse_presentation:
                        self.reverse_presentation[row['ui']].update(tdict)
                    else:
                        self.reverse_presentation[row['ui']] = \
                                {daskey : {'mapkey': row['das'], 'link': link}}

    def das_presentation_map(self):
        "Read DAS presentation map"
        spec  = {'type':'presentation'}
        data  = find_one(self.col, spec)
        if  data:
            for _, uilist in data.get('presentation', {}).items():
                for row in uilist:
                    if  'link' in row:
                        yield row

    def delete_db(self):
        """
        Delete mapping DB in MongoDB back-end.
        """
        conn = db_connection(self.dburi)
        if  conn:
            conn.drop_database(self.dbname)

    def delete_db_collection(self):
        """
        Delete mapping DB collection in MongoDB.
        """
        conn = db_connection(self.dburi)
        if  conn:
            dbc  = conn[self.dbname]
            dbc.drop_collection(self.colname)

    def check_maps(self):
        """
        Check Mapping DB and return true/false based on its content
        """
        if  not self.map_test:
            return True # do not test DAS maps, useful for unit tests
        udict = defaultdict(int)
        ndict = defaultdict(int)
        pdict = defaultdict(int)
        adict = {}
        maps_hash = False
        for row in self.col.find(**PYMONGO_OPTS):
            check_map_record(row)
            if  'urn' in row:
                udict[row['system']] += 1
            elif 'notations' in row:
                ndict[row['system']] += 1
            elif 'presentation' in row:
                pdict['presentation'] += 1
            elif 'arecord' in row:
                arec = row['arecord']
                system = arec['system']
                rec = {arec['type']:arec['count']}
                if  system in adict:
                    adict[system].update(rec)
                else:
                    adict[system] = rec
            elif 'verification_token' in row:
                maps_hash = row['verification_token']

        # retrieve uri/notation/presentation maps
        ulist = []
        nlist = []
        for system in adict.keys():
            if  'uri' in adict[system]:
                ulist.append(adict[system]['uri'] == udict[system])
                nlist.append(adict[system]['notations'] == ndict[system])
        status_umap = sum(ulist) == len(ulist)
        status_nmap = sum(nlist) == len(nlist)
        status_pmap = adict.get('presentation', {}).get('presentation', 0) == 1
        # verify completeness of maps
        calc_token = verification_token(self.col.find(**PYMONGO_OPTS))
        status_complete = maps_hash and maps_hash == calc_token
        if  self.verbose:
            print("### DAS map status, umap=%s, nmap=%s, pmap=%s, complete=%s" \
                    % (status_umap, status_nmap, status_pmap, status_complete))
        if not status_complete:
            print("### DAS map hash do not match, got=%s calculated=%s" \
                    % (maps_hash, calc_token))
        # multiply statuses as a result of this map check
        return status_umap*status_nmap*status_pmap*status_complete

    def remove(self, spec):
        """
        Remove record in DAS Mapping DB for provided Mongo spec.
        """
        self.col.remove(spec)

    def add(self, record):
        """
        Add new record into mapping DB. Example of URI record

        .. doctest::

            {
             system : dbs,
             urn : listBlocks,
             url : "http://a.b.com/api",
             params : [{"apiversion": "1_2_2", "se": "*"}],
             lookup : block,
             das_map: [
                 {"das_key":"block", "rec_key":"block.name"},
                 {"das_key":"site", "rec_key":"site.name", "api_arg":"se", "pattern":"^T[0-3]_"},
             ]
            }

        Example of notation record:

        .. doctest::

             notations: [
                 {"api_output" : "storage_element_name", "rec_key":"site", "api": ""},
             ]
        """
        msg = 'record=%s' % record
        self.logger.debug(msg)
        self.col.insert(record)
        self.init_dasmapscache([record])

    # ==================
    # Informational APIs
    # ==================
    def dbs_global_instance(self, system=None):
        "Retrive from mapping DB DBS url and extract DBS instance"
        if  not system:
            system = self.main_dbs
        url = self.dbs_url(system)
        return get_dbs_instance(url)

    def dbs_url(self, system=None):
        "Retrive from mapping DB DBS url"
        if  not system:
            system = self.main_dbs
        systems = self.list_systems()
        dbses   = set(['dbs3'])
        if  dbses & set(systems) != dbses:
            # use caching only when we operate with single DBS
            if  self.dbs_global_url:
                return self.dbs_global_url
        url = None
        for srv in systems:
            if  srv == system:
                apis = self.list_apis(srv)
                url  = self.api_info(srv, apis[0])['url']
                url  = parse_dbs_url(srv, url)
                self.dbs_global_url = url
                return url
        return url

    def dbs_instances(self, system=None):
        "Retrive from mapping DB DBS instances"
        # use dbs istances from the config
        if  self.dbsinsts and not system:
            return self.dbsinsts
        # default dbs
        if  not system:
            system = self.main_dbs
        systems = self.list_systems()
        dbses   = set(['dbs3'])
        if  dbses & set(systems) != dbses:
            # use caching only when we operate with single DBS
            if  self.dbs_inst_names:
                return self.dbs_inst_names
        insts = []
        dbs_global_inst = self.dbs_global_instance(system)
        if  system == 'dbs3' and dbs_global_inst:
            dbs_namespace = dbs_global_inst.split('/')[0]
        else:
            dbs_namespace = None
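        # keep only instances that share the global DBS namespace,
        # e.g. a global instance 'prod/global' yields the namespace 'prod'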
        for srv in systems:
            if  srv == system:
                apis  = self.list_apis(srv)
                insts = self.api_info(srv, apis[0])['instances']
                if  dbs_namespace:
                    insts = [d for d in insts if d.startswith(dbs_namespace)]
                self.dbs_inst_names = insts
                return insts
        return insts

    def list_systems(self):
        """
        List all DAS systems.
        """
        if  not self.systems:
            spec = { 'type': 'service', 'system' : { '$ne' : None } }
            gen  = (row['system'] \
                    for row in self.col.find(spec, ['system'], **PYMONGO_OPTS))
            self.systems = list( set(gen2list(gen)) & set(self.services) )
        return self.systems

    def list_apis(self, system=None):
        """
        List all APIs.
        """
        if  self.apicache and system in self.apicache:
            return self.apicache[system]
        spec = { 'type': 'service', 'urn' : { '$ne' : None } }
        if  system:
            spec['system'] = system
        gen  = (row['urn'] \
                for row in self.col.find(spec, ['urn'], **PYMONGO_OPTS))
        self.apicache[system] = gen2list(gen)
        return self.apicache[system]

    def api_info(self, srv, api_name):
        """
        Return full API info record.
        """
        return self.dasmapscache[(srv, api_name)]

    def relational_keys(self, system1, system2):
        """
        Return a list of relational keys between provided systems
        """
        keys1, keys2 = [], []
        for system, keys in self.daskeys().items():
            if  system == system1:
                keys1 = keys
            if  system == system2:
                keys2 = keys
        return list( set(keys1) & set(keys2) )

    def daskeys(self, das_system=None):
        """
        Return a dict with all known DAS keys.
        """
        if  das_system in self.daskeyscache:
            return self.daskeyscache[das_system]

        spec  = { 'type': 'service', 'system' : { '$ne' : None } }
        if  das_system:
            spec  = { 'system' : das_system }
        gen   = (row['system'] \
                for row in self.col.find(spec, ['system'], **PYMONGO_OPTS))
        gen   = [r for r in gen]
        kdict = {}
        for system in gen:
            spec = {'system':system, 'urn':{'$ne':None}}
            keys = []
            for row in self.col.find(spec, **PYMONGO_OPTS):
                for entry in row['das_map']:
                    if  entry['das_key'] not in keys:
                        keys.append(entry['das_key'])
            kdict[system] = keys
        # cache it
        self.daskeyscache[das_system] = kdict
        return kdict

    # ============
    # Look-up APIs
    # ============
    def api_lkeys(self, das_system, api):
        """
        Return DAS lookup keys for given das system and api
        """
        entry = self.dasmapscache[(das_system, api)]
        skeys = entry['lookup'].split(',')
        return skeys

    def primary_key(self, das_system, urn):
        """
        Return DAS primary key for provided system and urn. The DAS primary key
        is the first entry in the *lookup* attribute of the DAS API record.
        """
        spec = {'system':das_system, 'urn':urn}
        record = find_one(self.col, spec)
        if  not record:
            return None
        pkey = record['lookup']
        if  pkey.find(',') != -1:
            pkey = pkey.split(',')[0]
        return pkey

    def primary_mapkey(self, das_system, urn):
        """
        Return DAS primary map key for provided system and urn. For example,
        the file DAS key is mapped to file.name, so this API will return
        file.name
        """
        spec = {'system':das_system, 'urn':urn}
        record = find_one(self.col, spec)
        mapkey = []
        for row in record['das_map']:
            lkey = record['lookup']
            if  lkey.find(',') != -1:
                lkey = lkey.split(',')[0]
            if  row['das_key'] == lkey:
                return row['rec_key']
        return mapkey

    def find_daskey(self, das_system, map_key, value=None):
        """
        Find das key for given system and map key.
        """
        msg   = 'system=%s\n' % das_system
        daskeys = []
        for key, record in self.dasmapscache.items():
            srv, _ = key
            if  das_system != srv:
                continue
            for row in record['das_map']:
                das_key = row['das_key']
                rec_key = row['rec_key']
                if  rec_key != map_key:
                    continue
                pat = row.get('pattern', None)
                if  value:
                    if  pat:
                        if  pat.match(str(value)):
                            daskeys.append(das_key)
                        else:
                            msg += '-- reject key=%s, val=%s, pat=%s\n'\
                                    % (map_key, value, pat.pattern)
                            self.logger.debug(msg)
                    else:
                        daskeys.append(das_key)
                else:
                    daskeys.append(das_key)
        return daskeys

    def find_mapkey(self, das_system, das_key, value=None):
        """
        Find map key for given system and das key.
        """
        msg   = 'system=%s\n' % das_system
        for key, record in self.dasmapscache.items():
            srv, _ = key
            if  das_system != srv:
                continue
            for row in record['das_map']:
                if  row['das_key'] != das_key:
                    continue
                rec_key = row['rec_key']
                pat = row.get('pattern', None)
                if  value:
                    if  pat:
                        if  pat.match(str(value)):
                            return rec_key
                        else:
                            msg += '-- reject key=%s, val=%s, pat=%s\n'\
                                    % (das_key, value, pat.pattern)
                            self.logger.debug(msg)
                            continue
                    else:
                        return rec_key
                else:
                    return rec_key

    def mapkeys(self, daskey):
        """
        Find all lookup keys (primary keys) for a given daskey
        """
        if  daskey in self.keymap:
            return self.keymap[daskey]
        spec = {'das_map.das_key' : daskey}
        mapkeys = []
        for row in self.col.find(spec, ['das_map'], **PYMONGO_OPTS):
            for kmap in row['das_map']:
                if  kmap['das_key'] == daskey and \
                    kmap['rec_key'] not in mapkeys:
                    mapkeys.append(kmap['rec_key'])
        self.keymap[daskey] = mapkeys
        return self.keymap[daskey]

    def find_apis(self, das_system, map_key):
        """
        Find list of apis which correspond to provided
        system and das map key.
        """
        spec  = { 'system' : das_system, 'das_map.rec_key': map_key }
        apilist = []
        for row in self.col.find(spec, ['urn'], **PYMONGO_OPTS):
            if  'urn' in row and row['urn'] not in apilist:
                apilist.append(row['urn'])
        return apilist

    def find_system(self, key):
        """
        Return list of system names for provided DAS key.
        """
        spec = { 'das_map.das_key' : key }
        gen  = (row['system'] \
                for row in self.col.find(spec, ['system'], **PYMONGO_OPTS))
        systems = []
        for system in gen:
            if  system not in systems:
                systems.append(system)
        systems.sort()
        return systems

    def lookup_keys(self, system, api, daskey=None, value=None):
        """
        Returns lookup keys for given system and provided
        selection DAS key, e.g. block => block.name
        """
        entry = self.dasmapscache.get((system, api), None)
        if  not entry:
            return []
        lkeys = entry.get('lookup', []).split(',')
        rkeys = []
        if  daskey in lkeys:
            for dmap in entry['das_map']:
                rec_key = dmap['rec_key']
                if  daskey:
                    if  dmap['das_key'] == daskey:
                        pat = dmap.get('pattern', None)
                        if  value:
                            # guard against APIs without a pattern defined
                            if  pat is None or pat.match(str(value)):
                                rkeys.append(rec_key)
                        else:
                            if  rec_key not in rkeys:
                                rkeys.append(rec_key)
                else:
                    rkeys.append(rec_key)
        return rkeys

    def api2das(self, system, api_input_name):
        """
        Translates data-service API input parameter into DAS QL key,
        e.g. run_number => run.
        """
        query = {'system':system, 'das_map.api_arg' : api_input_name}
        names = []
        for adas in self.col.find(query, ['das_map'], **PYMONGO_OPTS):
            for row in adas['das_map']:
                try:
                    if  'api_arg' in row:
                        aparam = row['api_arg']
                        daskey = row['das_key']
                        if  aparam == api_input_name and daskey not in names:
                            names.append(daskey)
                except Exception as err:
                    print("ERROR: look-up api_param/das_key in", row)
                    raise err
        return names

    def check_api_match(self, system, api, icond):
        "Check if given API covers condition parameters"
        entry = self.dasmapscache.get((system, api), None)
        if  not entry:
            return False
        ikeys = [k.split('.')[0] for k in icond.keys()]
        dkeys = []
        for row in entry.get('das_map', []):
            if  'api_arg' in row:
                das_key = row['das_key']
                dkeys.append(das_key)
            else:
                dkeys.append(row['das_key'])
        if  set(ikeys) & set(dkeys) == set(ikeys):
            return True
        return False

    def das2api(self, system, api, rec_key, value=None):
        """
        Translates DAS record key into data-service API input parameter,
        e.g. run.number => run_number
        """
        entry = self.dasmapscache.get((system, api), None)
        names = []
        if  not entry:
            return [rec_key]
        for row in entry.get('das_map', []):
            if  'api_arg' in row:
                api_param = row['api_arg']
                pat = row.get('pattern', None)
                if  row['rec_key'] != rec_key:
                    continue
                if  value and pat:
                    if  isinstance(value, dict):
                        if pat.match(json.dumps(list(value.values())[0])):
                            if  api_param not in names:
                                names.append(api_param)
                    if  pat.match(str(value)):
                        if  api_param not in names:
                            names.append(api_param)
                else:
                    if  api_param not in names:
                        names.append(api_param)
            else:
                names.append(row['rec_key'])
        return names

    def notations(self, system=None):
        """
        Return DAS notation map.
        """
        notationmap = {}
        spec = {'type':'notation'}
        if  system:
            spec['system'] = system
        for item in self.col.find(spec, **PYMONGO_OPTS):
            notationmap[item['system']] = item['notations']
        return notationmap

    def notation2das(self, system, api_param, api=""):
        """
        Translates data-service API parameter name into DAS name, e.g.
        run_number=run. If api_param is not present in the DB, it is
        returned unchanged.
        """
        if  not self.notationcache:
            self.init_notationcache()
        name = api_param
        if  (system, api_param) in self.notationcache:
            for item in self.notationcache[(system, api_param)]:
                _api, das_name = item
                if  _api:
                    if  _api == api:
                        name = das_name
                        break
                else: # valid for all API names
                    name = das_name
        return name

    def api2daskey(self, system, api):
        """
        Returns list of DAS keys which cover provided data-service API
        """
        spec = {'system':system, 'urn':api}
        keys = []
        for row in self.col.find(spec, **PYMONGO_OPTS):
            for entry in row['das_map']:
                keys.append(entry['das_key'])
        return keys

    def servicemap(self, system):
        """
        Constructs data-service map, e.g.

        .. doctest::

            {api: {keys:[list of DAS keys], params: args,
             url:url, format:ext, expire:exp} }
        """
        spec = {'system':system, 'urn':{'$ne':None}}
        smap = {}
        for row in self.col.find(spec, **PYMONGO_OPTS):
            url  = row['url']
            exp  = row['expire']
            ext  = row['format']
            api  = row['urn']
            lookup = row['lookup']
            wild = row.get('wild_card', '*')
            ckey = row.get('ckey')
            cert = row.get('cert')
            services = row.get('services', '')
            keys = []
            for entry in row['das_map']:
                keys.append(entry['das_key'])
            params = dict(row['params'])
            smap[api] = dict(keys=keys, params=params, url=url, expire=exp,\
                            format=ext, wild_card=wild, ckey=ckey, cert=cert,\
                            services=services, lookup=lookup)
        return smap

    def presentation(self, daskey):
        """
        Return web UI presentation keys for provided DAS keyword.
        For example, when asked for block we present block.name, block.size, etc.
        """
        if  daskey in self.presentationcache:
            return self.presentationcache[daskey]
        return [daskey]

    def daskey_from_presentation(self, uikey):
        """
        Return triplet (DAS key, DAS access key, link)
        associated with provided UI key.
        """
        if  uikey in self.reverse_presentation:
            return self.reverse_presentation[uikey]

    def diff_keys(self, daskey):
        """
        Return diff keys for provided DAS key.
        """
        if  daskey in self.diffkeycache:
            return self.diffkeycache[daskey]
        return []

    def inputvalues_uris(self):
        """
        Return the info on how to fetch the list of allowed input values for
        certain commonly used input fields (from enabled DAS systems only)
        """
        uris = []
        for row in self.col.find({'type': 'input_values'}, **PYMONGO_OPTS):
            # check that system is active
            if row['system'] not in self.services:
                continue
            uris.extend(row['input_values'])
        return uris
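notation2das() above rewrites data-service output names into DAS record keys
using the notation cache, optionally scoped to a single API. A self-contained
sketch of that translation with hypothetical cache entries (not from the
original source):

# hypothetical notation cache: {(system, api_output): [(api, rec_key), ...]}
notationcache = {
    ("dbs3", "run_num"): [("runs", "run.run_number")],
    ("sitedb", "storage_element_name"): [("", "site.se")],
}

def notation2das(cache, system, api_param, api=""):
    "Stand-alone version of the look-up done by notation2das() above"
    name = api_param
    for _api, das_name in cache.get((system, api_param), []):
        if not _api:          # empty api field applies to all APIs
            name = das_name
        elif _api == api:     # api-specific notation wins
            name = das_name
            break
    return name

print(notation2das(notationcache, "dbs3", "run_num", api="runs"))  # run.run_number
print(notation2das(notationcache, "dbs3", "dataset"))              # dataset (unchanged)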
Beispiel #17
0
class DASAbstractService(object):
    """
    Abstract class describing a DAS service. It is initialized with a name
    which is used to identify service parameters in the DAS configuration file.
    Those parameters are keys, verbosity level, and the URL of the data-service.
    """
    def __init__(self, name, config):
        self.name = name
        try:
            self.verbose = config['verbose']
            title = 'DASAbstactService_%s' % self.name
            self.logger = PrintManager(title, self.verbose)
            self.dasmapping = config['dasmapping']
            self.write2cache = config.get('write_cache', True)
            self.multitask = config['das'].get('multitask', True)
            self.error_expire = config['das'].get('error_expire', 300)
            self.dbs_global = None  # to be configured at run time
            self.dburi = config['mongodb']['dburi']
            engine = config.get('engine', None)
            self.gfs = db_gridfs(self.dburi)
        except Exception as exc:
            print_exc(exc)
            raise Exception('fail to parse DAS config')

        # read key/cert info
        try:
            self.ckey, self.cert = get_key_cert()
        except Exception as exc:
            print_exc(exc)
            self.ckey = None
            self.cert = None

        if self.multitask:
            nworkers = config['das'].get('api_workers', 3)
            thr_weights = config['das'].get('thread_weights', [])
            for system_weight in thr_weights:
                system, weight = system_weight.split(':')
                if system == self.name:
                    nworkers *= int(weight)
#             if  engine:
#                 thr_name = 'DASAbstractService:%s:PluginTaskManager' % self.name
#                 self.taskmgr = PluginTaskManager(\
#                         engine, nworkers=nworkers, name=thr_name)
#                 self.taskmgr.subscribe()
#             else:
#                 thr_name = 'DASAbstractService:%s:TaskManager' % self.name
#                 self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
            thr_name = 'DASAbstractService:%s:TaskManager' % self.name
            self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        self.map = {}  # to be defined by data-service implementation
        self._keys = None  # to be defined at run-time in self.keys
        self._params = None  # to be defined at run-time in self.parameters
        self._notations = {}  # to be defined at run-time in self.notations

        self.logger.info('initialized')
        # define internal cache manager to put 'raw' results into cache
        if 'rawcache' in config and config['rawcache']:
            self.localcache = config['rawcache']
        else:
            msg = 'Undefined rawcache, please check your configuration'
            raise Exception(msg)

    def status(self):
        "Return status of the service"
        return self.taskmgr.status()

    def services(self):
        """
        Return sub-systems used to retrieve data records. It is used
        in the dasheader call to set up the das.services field. This method can
        be overridden in sub-classes; otherwise it returns a dict of the service
        name and the CMS systems used to retrieve data records.
        """
        return {self.name: [self.name]}

    def version(self):
        """Return data-services version, should be implemented in sub-classes"""
        return ''

    def keys(self):
        """
        Return service keys
        """
        if self._keys:
            return self._keys
        srv_keys = []
        for _api, params in self.map.items():
            for key in params['keys']:
                if not key in srv_keys:
                    srv_keys.append(key)
        self._keys = srv_keys
        return srv_keys

    def parameters(self):
        """
        Return mapped service parameters
        """
        if self._params:
            return self._params
        srv_params = []
        for _api, params in self.map.items():
            for key in params['params']:
                param_list = self.dasmapping.api2das(self.name, key)
                for par in param_list:
                    if not par in srv_params:
                        srv_params.append(par)
        self._params = srv_params
        return srv_params

    def notations(self):
        """
        Return a map of system notations.
        """
        if self._notations:
            return self._notations
        for _, rows in self.dasmapping.notations(self.name).items():
            for row in rows:
                api = row['api']
                nmap = row['rec_key']
                notation = row['api_output']
                if api in self._notations:
                    self._notations[api].update({notation: nmap})
                else:
                    self._notations[api] = {notation: nmap}
        return self._notations

    def getdata(self, url, params, expire, headers=None, post=None):
        """URL call wrapper"""
        if url.find('https:') != -1:
            return getdata(url,
                           params,
                           headers,
                           expire,
                           post,
                           self.error_expire,
                           self.verbose,
                           self.ckey,
                           self.cert,
                           system=self.name)
        else:
            return getdata(url,
                           params,
                           headers,
                           expire,
                           post,
                           self.error_expire,
                           self.verbose,
                           system=self.name)

    def call(self, dasquery):
        """
        Invoke service API to execute given query.
        Results are stored in the DAS cache.
        """
        self.logger.info(dasquery)
        # check the cache for records with given query/system
        res = self.localcache.incache(dasquery,
                                      collection='cache',
                                      system=self.name)
        if res:
            msg = "found records in local cache"
            self.logger.info(msg)
            return
        # ask data-service api to get results; they will be stored in the
        # cache, so at the end we return what we have in the cache.
        self.api(dasquery)

    def write_to_cache(self, dasquery, expire, url, api, args, gen, ctime):
        """
        Write provided result set into DAS cache.
        """
        if not self.write2cache:
            return

        # before going to cache we should check/set possible misses, e.g.
        # primary key when error is thrown
        result = self.set_misses(dasquery, api, gen)

        # update the cache
        header = dasheader(self.name,
                           dasquery,
                           expire,
                           api,
                           url,
                           services=self.services())
        header['lookup_keys'] = self.lookup_keys(api)
        header['prim_key'] = self.dasmapping.primary_mapkey(self.name, api)
        header['ctime'] = ctime
        system = self.name
        self.localcache.update_cache(dasquery, result, header, system, api)

        msg = 'cache has been updated,\n'
        self.logger.debug(msg)

    def adjust_params(self, api, kwds, instance=None):
        """
        Data-service specific parser to adjust parameters according to
        its specifications. For example, the DQ service accepts a string
        of parameters rather than a parameter set, while DBS2 can reuse
        some parameters for different APIs, e.g. a dataset path can be
        passed to listPrimaryDatasets as a primary_dataset pattern.
        """
        pass

    def lookup_keys(self, api):
        """
        Return look-up keys of data output for given data-service API.
        """
        lkeys = self.dasmapping.lookup_keys(self.name, api)
        return [{api: lkeys}]

    def inspect_params(self, api, args):
        """
        Perform API parameter inspection. Check if the API accepts a range
        of parameters, etc.
        """
        for key, value in args.items():
            if isinstance(value, dict):
                minval = None
                maxval = None
                for oper, val in value.items():
                    if oper == '$in':
                        minval = int(val[0])
                        maxval = int(val[-1])
                        args[key] = range(minval, maxval)
                    elif oper == '$lt':
                        maxval = int(val)
                        args[key] = maxval
                    elif oper == '$lte':
                        maxval = int(val)
                        args[key] = maxval
                    elif oper == '$gt':
                        minval = int(val)
                        args[key] = minval
                    elif oper == '$gte':
                        minval = int(val)
                        args[key] = minval
                    else:
                        msg = '%s does not support operator %s' % (api, oper)
                        raise Exception(msg)
        return args

    def get_notations(self, api):
        """Return notations used for given API"""
        notationmap = self.notations()
        if not notationmap:
            return {}
        notations = {}
        if '' in notationmap:
            notations = dict(notationmap[''])  # notations applied to all APIs
            if api in notationmap:  # overwrite the one for provided API
                notations.update(notationmap[api])
        return notations

    def parser(self, dasquery, dformat, data, api):
        """
        DAS data parser. Input parameters:

        - *query* input DAS query
        - *dformat* is a data format, e.g. XML, JSON
        - *data* is a data source, either file-like object or
          actual data
        - *api* is API name
        """
        prim_key = self.dasmapping.primary_key(self.name, api)
        counter = 0
        if dformat.lower() == 'xml':
            tags = self.dasmapping.api2daskey(self.name, api)
            gen = xml_parser(data, prim_key, tags)
            for row in gen:
                counter += 1
                yield row
        elif dformat.lower() == 'json' or dformat.lower() == 'dasjson':
            gen = json_parser(data, self.logger)
            das_dict = {}
            for row in gen:
                if dformat.lower() == 'dasjson':
                    for key, val in row.items():
                        if key != 'results':
                            das_dict[key] = val
                    row = row['results']
                if isinstance(row, list):
                    for item in row:
                        if item:
                            if prim_key in item:
                                counter += 1
                                yield item
                            else:
                                counter += 1
                                yield {prim_key: item}
                else:
                    if prim_key in row:
                        counter += 1
                        yield row
                    else:
                        counter += 1
                        yield {prim_key: row}
        else:
            msg = 'Unsupported data format="%s", API="%s"' % (dformat, api)
            raise Exception(msg)
        msg = "api=%s, format=%s " % (api, dformat)
        msg += "prim_key=%s yield %s rows" % (prim_key, counter)
        self.logger.info(msg)

    def translator(self, api, genrows):
        """
        Convert raw results into DAS records. 
        """
        prim_key = self.dasmapping.primary_key(self.name, api)
        count = 0
        for row in genrows:
            row2das(self.dasmapping.notation2das, self.name, api, row)
            count += 1
            # check for primary key existence, since it can be overridden
            # by row2das. For example, DBS3 uses a flat namespace, so we
            # override dataset=>name, while dataset is still the primary key
            if isinstance(row, list):
                yield {prim_key: row}
            elif prim_key in row:
                if prim_key in row[prim_key]:
                    yield row[prim_key]  # remapping may create nested dict
                else:
                    yield row
            else:
                yield {prim_key: row}
        msg = "yield %s rows" % count
        self.logger.debug(msg)

    def set_misses(self, dasquery, api, genrows):
        """
        Check and adjust DAS records wrt input query. If some of the DAS
        keys are missing, add them with their values to the DAS record.
        """
        # look-up primary key
        prim_key = self.dasmapping.primary_key(self.name, api)

        # Scan all docs and store those whose size is above the MongoDB limit
        # into GridFS
        map_key = self.dasmapping.primary_mapkey(self.name, api)
        genrows = parse2gridfs(self.gfs, map_key, genrows, self.logger)

        spec = dasquery.mongo_query['spec']
        row = next(genrows)
        ddict = DotDict(row)
        keys2adjust = []
        for key in spec.keys():
            val = ddict.get(key)
            if spec[key] != val and key not in keys2adjust:
                keys2adjust.append(key)
        msg = "adjust keys %s" % keys2adjust
        self.logger.debug(msg)
        count = 0
        if keys2adjust:
            # adjust of the rows
            for row in yield_rows(row, genrows):
                ddict = DotDict(row)
                pval = ddict.get(map_key)
                if isinstance(pval, dict) and 'error' in pval:
                    ddict[map_key] = ''
                    ddict.update({prim_key: pval})
                for key in keys2adjust:
                    value = spec[key]
                    existing_value = ddict.get(key)
                    # the way to deal with proximity/pattern/condition results
                    if  (isinstance(value, str) or isinstance(value, unicode))\
                        and value.find('*') != -1: # we got pattern
                        if existing_value:
                            value = existing_value
                    elif isinstance(value, dict) or \
                        isinstance(value, list): # we got condition
                        if existing_value:
                            value = existing_value
                        elif isinstance(value, dict) and \
                        '$in' in value: # we got a range {'$in': []}
                            value = value['$in']
                        elif isinstance(value, dict) and \
                        '$lte' in value and '$gte' in value:
                            # we got a between range
                            value = [value['$gte'], value['$lte']]
                        else:
                            value = json.dumps(value)
                    elif existing_value and value != existing_value:
                        # we got proximity results
                        if 'proximity' in ddict:
                            proximity = DotDict({key: existing_value})
                            ddict['proximity'].update(proximity)
                        else:
                            proximity = DotDict({})
                            proximity[key] = existing_value
                            ddict['proximity'] = proximity
                    else:
                        if existing_value:
                            value = existing_value
                    ddict[key] = value
                yield ddict
                count += 1
        else:
            yield row
            for row in genrows:
                yield row
                count += 1
        msg = "yield %s rows" % count
        self.logger.debug(msg)

    def api(self, dasquery):
        """
        Data service api method, which can be defined by a data-service class.
        It parses the input query and invokes the appropriate data-service API
        call. All results are stored into the DAS cache, while the api
        call itself is inserted into the Analytics DB.
        """
        self.logger.info(dasquery)
        genrows = self.apimap(dasquery)
        if not genrows:
            return
        jobs = []
        for url, api, args, dformat, expire in genrows:
            # insert DAS query record for given API
            header = dasheader(self.name, dasquery, expire, api, url)
            self.localcache.insert_query_record(dasquery, header)
            # fetch DAS data records
            if self.multitask:
                jobs.append(self.taskmgr.spawn(self.apicall, \
                            dasquery, url, api, args, dformat, expire))
            else:
                self.apicall(dasquery, url, api, args, dformat, expire)
        if self.multitask:
            self.taskmgr.joinall(jobs)

    def apicall(self, dasquery, url, api, args, dformat, expire):
        """
        Data service api method, which can be defined by a data-service class.
        It parses the input query and invokes the appropriate data-service API
        call. All results are stored into the DAS cache, while the api
        call itself is inserted into the Analytics DB.

        We explicitly invoke the close call on our datastream instead
        of using a context manager, since this method, as well as
        getdata/parser, can be overridden by child classes.
        """
        datastream = None
        try:
            args = self.inspect_params(api, args)
            time0 = time.time()
            headers = make_headers(dformat)
            datastream, expire = self.getdata(url, args, expire, headers)
            self.logger.info("%s expire %s" % (api, expire))
            rawrows = self.parser(dasquery, dformat, datastream, api)
            dasrows = self.translator(api, rawrows)
            ctime = time.time() - time0
            self.write_to_cache(dasquery, expire, url, api, args, dasrows,
                                ctime)
        except Exception as exc:
            msg  = 'Fail to process: url=%s, api=%s, args=%s' \
                    % (url, api, args)
            print(msg)
            print_exc(exc)
        close(datastream)

    def url_instance(self, url, _instance):
        """
        Virtual method to adjust URL for a given instance,
        must be implemented in service classes
        """
        return url

    def adjust_url(self, url, instance):
        """
        Adjust data-service URL wrt provided instance, e.g.
        DBS carries several instances
        """
        if instance:
            url = self.url_instance(url, instance)
        return url

    def apimap(self, dasquery):
        """
        Analyze input query and yield url, api, args, format, expire
        for further processing.
        """
        srv = self.name  # get local copy to avoid threading issues
        cond = getarg(dasquery.mongo_query, 'spec', {})
        instance = dasquery.mongo_query.get('instance', self.dbs_global)
        skeys = getarg(dasquery.mongo_query, 'fields', [])
        if not skeys:
            skeys = []
        self.logger.info("\n")
        for api, value in self.map.items():
            expire = value['expire']
            iformat = value['format']
            url = self.adjust_url(value['url'], instance)
            if not url:
                msg = '--- rejects API %s, no URL' % api
                self.logger.info(msg)
                continue
            args = dict(value['params'])  # make new copy, since we'll adjust
            wild = value.get('wild_card', '*')
            found = 0
            # check if input parameters are covered by API
            if not self.dasmapping.check_api_match(srv, api, cond):
                msg = '--- rejects API %s, does not cover input condition keys' \
                        % api
                self.logger.info(msg)
                continue
            # once we know that the API covers the input set of parameters we check
            # every input parameter for pattern matching
            for key, val in cond.items():
                # check if keys from conditions are accepted by API
                # need to convert key (which is daskeys.map) into
                # input api parameter
                for apiparam in self.dasmapping.das2api(srv, api, key, val):
                    if apiparam in args:
                        args[apiparam] = val
                        found += 1
            # VK 20160708, wrong statement, it caused to pass
            # datasets API for query dataset in [path1, path2]
            # I'll leave block here until I test and verify that
            # commented out block will not cause other issues
            #
            # check the case when we only have single condition key
            # and it is the key we look-up
#             if  not found and skeys == [k.split('.')[0] for k in cond.keys()]:
#                 found = 1
            # check if the number of keys in cond and args is the same
            if len(cond.keys()) != found:
                msg = "--- reject API %s, not all condition keys are covered" \
                        % api
                self.logger.info(msg)
                msg = 'args=%s' % args
                self.logger.debug(msg)
                continue
            if not found:
                msg = "--- rejects API %s, parameters don't match" % api
                self.logger.info(msg)
                msg = 'args=%s' % args
                self.logger.debug(msg)
                continue
            self.adjust_params(api, args, instance)
            # delete args keys whose value is optional
            delete_keys(args, 'optional')
            # check that there is no "required" parameter left in args,
            # since such api will not work
            if 'required' in args.values():
                msg = '--- rejects API %s, parameter is required' % api
                self.logger.info(msg)
                msg = 'args=%s' % args
                self.logger.debug(msg)
                continue
            # adjust pattern symbols in arguments
            if wild != '*':
                for key, val in args.items():
                    if isinstance(val, str) or isinstance(val, unicode):
                        val = val.replace('*', wild)
                    args[key] = val

            # compare query selection keys with API look-up keys
            api_lkeys = self.dasmapping.api_lkeys(srv, api)
            if set(api_lkeys) != set(skeys):
                msg = "--- rejects API %s, api_lkeys(%s)!=skeys(%s)"\
                        % (api, api_lkeys, skeys)
                self.logger.info(msg)
                continue

            msg = '+++ %s passes API %s' % (srv, api)
            self.logger.info(msg)
            msg = 'args=%s' % args
            self.logger.debug(msg)

            msg = "yield "
            msg += "system ***%s***, url=%s, api=%s, args=%s, format=%s, " \
                % (srv, url, api, args, iformat)
            msg += "expire=%s, wild_card=%s" \
                % (expire, wild)
            self.logger.debug(msg)

            yield url, api, args, iformat, expire
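
For reference, here is a minimal standalone sketch of the parameter-matching idea used in apimap above. The names covers_conditions, api_params and das2api_map are hypothetical, and the DAS mapping look-up is replaced by a plain dict; it illustrates the "reject unless every condition key is covered" rule, not the actual DAS implementation.

def covers_conditions(api_params, cond, das2api_map):
    """Return adjusted args if every condition key maps onto an API
    parameter, otherwise None (mirrors the reject logic sketched above)."""
    args = dict(api_params)          # copy of default API parameters
    found = 0
    for key, val in cond.items():
        for apiparam in das2api_map.get(key, []):
            if apiparam in args:
                args[apiparam] = val
                found += 1
    if found != len(cond):           # not all condition keys are covered
        return None
    return args

# hypothetical inputs
cond = {'dataset.name': '/A/B/C'}
das2api_map = {'dataset.name': ['dataset']}
print(covers_conditions({'dataset': 'required'}, cond, das2api_map))   # {'dataset': '/A/B/C'}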
Beispiel #18
0
class ValueHotspot(HotspotBase):
    """
    This analyzer identifies all arguments to a given primary key,
    selects the top _fraction_ keys. If mode is "calls", the selected
    keys are the keys containing the top fraction of all calls,
    whereas if the mode is "keys", the selection is just the top
    fraction of all keys sorted by number of calls. Finally, you
    can use mode "fixed", in which case the _fraction_ kwarg should
    be a number >= 1 and that many keys are selected (provided they
    exist).
    
    The calls to be considered are defined by _period_ (default 1month).
    
    The optional argument 'allowed_gap' is the maximum gap in
    the summary record we are happy to ignore (default 1h).
    
    The summary identifier is "valuehotspot-key"
    It spawns querymaintainer jobs of name "valuehotspot-foundkey" which
    try and sensibly maintain the given query in cache.
    
    If the option "allow_wildcarding" is given, queries containing
    a wildcard will be considered. Otherwise, these will be ignored.
    
    If the option "find_supersets" is given, it will try and find
    superset queries (already in the cache) answering multiple hot keys
    and have them maintained instead of the specific one.
    
    The option "preempt" controls how long before data expiry the 
    re-fetch is scheduled. By default this is 1 minute (which may 
    be inappropriate for some fast-flux data).
    
    """
    task_options = [{'name':'key', 'type':'string', 'default':None,
                   'help':'DAS primary key to work with'},
                  {'name':'fraction', 'type':'float', 'default':0.15,
                   'help':'Fraction of queries to satisfy (in calls mode) or fraction of all keys (in keys mode) or absolute number of keys (in fixed mode)'},
                  {'name':'mode', 'type':'string', 'default':'calls',
                   'help':'How to interpret the _fraction_ argument'},
                  {'name':'period', 'type':'int', 'default':30*24*60*60,
                   'help':'Duration over which the moving average is performed (in sec)'},
                  {'name':'allowed_gap', 'type':'int', 'default':60*60,
                   'help':'Ignore gaps in summary records of up to this many seconds (<< _period_)'},
                  {'name':'allow_wildcarding', 'type':'bool', 'default':'true',
                   'help':'Whether to consider wildcard queries. Probably best with _find_supersets_'},
                  {'name':'find_supersets', 'type':'bool', 'default':'true',
                   'help':'If wildcard queries are allowed, identify and remove redundant wildcard queries'},
                  {'name':'preempt', 'type':'int', 'default':60,
                   'help':'Cache filling jobs are spawned to refresh data this many seconds before expiry'},
                  {'name':'fields', 'type':'list', 'default':None,
                   'help':'Fields that should be queried, each resulting in "field key=<value>". An attempt will be made to determine them from the mapping if unspecified.'},
                  {'name':'instance', 'type':'string', 'default':'cms_dbs_prod_global',
                   'help':'DBS instance to include in queries'}]
    
    def __init__(self, **kwargs):
        self.key = kwargs['key']
        self.logger = PrintManager('ValueHotspot', kwargs.get('verbose', 0))
        self.allow_wildcarding = kwargs.get('allow_wildcarding', False)
        self.find_supersets = kwargs.get('find_supersets', False)
        self.preempt = int(kwargs.get('preempt', 60))
        self.fields = kwargs.get('fields', None)
        self.instance = kwargs.get('instance', 'cms_dbs_prod_global')
        
        HotspotBase.__init__(self,
                             identifier="valuehotspot-%s" % \
                             (self.key.replace('.','-')),
                             **kwargs)
        
        # set fields if look-up key is present
        if  not self.fields and self.key:
            self.fields = [self.key.split('.')[0]]

        # finally if fields is not yet set, look-up all DAS keys allowed
        # for given query
        if  not self.fields:
            try:
                self.fields = set()
                self.das.mapping.init_presentationcache()
                plist = self.das.mapping.presentation(self.key.split('.', 1)[0])
                for item in plist:
                    if 'link' in item:
                        for link in item['link']:
                            if len(link['query'].split(' ')) == 2:
                                self.fields.add(link['query'].split(' ')[0])
                self.fields.add(self.key.split('.', 1)[0])
                self.fields = list(self.fields)
            except:
                self.fields = []
                    
    
    def generate_task(self, item, count, epoch_start, epoch_end):
        """
        Generate task callback function. It loops over internal fields, e.g.
        dataset, file, etc., builds a DAS query and requests its expiration
        timestamp. For scheduled queries it yields a QueryMaintainer task.
        """
        only_before = epoch_end + self.interval
        for field in self.fields:
            query = {'fields': [field],
                     'spec':[{'key':self.key, 'value': item}], 
                     'instance':self.instance}
            dasquery = DASQuery(query)
            expiry = self.get_query_expiry(dasquery)
            schedule = expiry - self.preempt
            if  schedule < time.time() + 60:
                schedule = time.time() + 60
            interval = schedule - time.time()
            itemname = item.replace('"','')
            if schedule < only_before:
                yield {'classname': 'QueryMaintainer',
                        'name': '%s-%s-%s' % (self.identifier, itemname, field),
                        'only_before': only_before,
                        'interval': interval,
                        'kwargs':{'dasquery':dasquery.storage_query,
                                  'preempt':self.preempt}}
    
    def preselect_items(self, items):
        """
        Select items for task generation.

        TODO: it would be nice to implement clustering algorithm
        which will pass only items with higher weight.
        """
        if not self.allow_wildcarding:
            for key in list(items.keys()):  # copy keys, items is modified below
                if '*' in key:
                    del items[key]
        return items
    
    def mutate_items(self, items):
        """
        Mutate items for task generation. This is a last call in
        selection chain.
        """
        if self.find_supersets:
            new_keys = self.get_superset_keys(items.keys())
            return dict([(k, items.get(k, 0)) for k in new_keys])
        else:
            return items
    
    def get_superset_keys(self, keys):
        """
        For multiple keys, try to identify existing queries for
        wildcard supersets of the keys, and reduce the keylist appropriately.
        Note that this only uses existing queries (it won't try
        to make new ones).
        """
        superset_cache = {}
        
        keys = set(keys)
        change_made = True
        while change_made:
            change_made = False
            for key in list(keys):
                if key in superset_cache:
                    superset_keys = superset_cache[key]
                else:
                    try:
                        superset_keys = \
                            sorted([k for k in \
                            self.das.rawcache.get_superset_keys(self.key, key)],\
                            key=len)
                    except:
                        superset_keys = []
                if superset_keys:
                    super_key = superset_keys[0]
                    for key in list(keys):  # iterate a copy, keys is modified below
                        if fnmatch.fnmatch(super_key, key):
                            keys.remove(key)
                            keys.add(super_key)
                            change_made = True
                    if change_made:
                        break
        return keys
    
    def get_query_expiry(self, dasquery):
        """
        Extract the expire timestamp for the given query from the analytics
        apicall records. If the query is not found in the DAS cache, it
        invokes a DAS call first.
        """
        err_return = time.time() + (2*self.preempt)
        try:
            if not self.das.rawcache.incache(dasquery):
                try:
                    self.das.call(dasquery, add_to_analytics=False)
                except Exception as err:
                    print "\n### FAIL input query=%s, err=%s" \
                            % (dasquery, str(err))
                    raise err
            expiries = [result.get('apicall', {}).get('expire', 0) for result in \
                            self.das.analytics.list_apicalls(qhash=dasquery.qhash)]
            if  not expiries:
                return err_return
            return min(expiries)
        except:
            return err_return
              
    def make_one_summary(self, start, finish):
        "Actually make the summary"
        keys = collections.defaultdict(int)
        try:
            queries = self.das.analytics.list_queries(key=self.key,
                                                      after=start,
                                                      before=finish)
        except:
            queries = []
        for query in queries:
            count = len([t for t in query['times']
                         if start <= t <= finish])
            for spec in query['mongoquery']['spec']:
                if spec['key'] == self.key:
                    keys[spec['value']] += count
        if keys:
            self.logger.info("Found %s queries in %s->%s" \
                             % (len(keys), start, finish))
        else:
            self.logger.info("Found no queries in %s->%s" \
                             % (start, finish))
        return keys
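
ValueHotspot above relies on HotspotBase (not shown in this example) for the actual key selection. As a rough sketch of the calls/keys/fixed modes described in its docstring, the following standalone function is an assumption for illustration only; select_hot_keys and its inputs are not part of the DAS code.

def select_hot_keys(counts, fraction, mode='calls'):
    """Sketch of the selection modes described in the ValueHotspot docstring;
    counts maps a key value to its number of calls."""
    ranked = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
    if mode == 'fixed':                     # absolute number of keys
        return [k for k, _ in ranked[:int(fraction)]]
    if mode == 'keys':                      # top fraction of all keys
        cut = max(1, int(len(ranked) * fraction))
        return [k for k, _ in ranked[:cut]]
    # mode == 'calls': take keys until they cover `fraction` of all calls
    total = float(sum(counts.values())) or 1.0
    selected, covered = [], 0
    for key, num in ranked:
        selected.append(key)
        covered += num
        if covered / total >= fraction:
            break
    return selected

print(select_hot_keys({'/A/B/C': 70, '/D/E/F': 20, '/G/H/I': 10}, 0.5))   # ['/A/B/C']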
Beispiel #19
0
class DASKeyLearning(object):
    """
    This class manages DAS key-learning DB.

    Key-learning is an intermittent process (triggered infrequently
    by a task running in the analytics framework), which involves
    searching through the raw cache for all output documents (or a
    subset of them with maximum primary-key coverage), generating
    the set of all data members (in a dotted-dict fashion) and storing
    those as primary-key:data-member records (with an associated
    last-updated-time).

    """
    def __init__(self, config):
        self.verbose = config['verbose']
        self.logger = PrintManager('DASKeyLearning', self.verbose)
        self.services = config['services']
        self.dburi = config['mongodb']['dburi']
        self.dbname = config['keylearningdb']['dbname']
        self.colname = config['keylearningdb']['collname']

        self.mapping = config['dasmapping']

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        self.das_son_manipulator = DAS_SONManipulator()
        index_list = [('system', ASCENDING), ('urn', ASCENDING), \
                ('members', ASCENDING), ('stems', ASCENDING)]
        create_indexes(self.col, index_list)

    @property
    def col(self):
        "col property provides access to DAS keylearning collection"
        conn = db_connection(self.dburi)
        mdb = conn[self.dbname]
        colnames = mdb.collection_names()
        if not colnames or self.colname not in colnames:
            try:
                mdb.create_collection(self.colname)
            except OperationFailure:
                pass
        mdb.add_son_manipulator(self.das_son_manipulator)
        return mdb[self.colname]

    def add_record(self, dasquery, rec):
        """
        Add/update to keylearning DB keys/attributes from given record.
        To do so, we parse it and call add_members method.
        """
        if not ('das' in rec and 'system' in rec['das']):
            return
        das = rec['das']
        if 'system' not in das or 'api' not in das or 'primary_key' not in das:
            return
        systems = das['system']
        apis = das['api']
        pkey = das['primary_key'].split('.')[0]
        data = rec.get(pkey, [])
        members = dict_members(data, pkey)
        for srv, api in zip(systems, apis):
            self.add_members(srv, api, members)
        # insert new record for query pattern
        fields = dasquery.mongo_query.get('fields', [])
        if fields:
            for field in fields:
                if field in DAS_RECORD_KEYS:
                    continue
                new_members = [m for m in dict_members(rec[field], field) if m]
                members += new_members
        for attr in members:
            spec = {'member': attr}
            doc = {'query_pat': dasquery.query_pat}
            self.col.update(spec, {'$addToSet': doc}, upsert=True)

    def add_members(self, system, urn, members):
        """
        Add a list of data members for a given API (system, urn, url)
        and generate their stems, which are stored as separate records.
        """
        msg = "system=%s, urn=%s, members=%s)" % (system, urn, members)
        self.logger.info(msg)

        result = self.col.find_one({'system': system, 'urn': urn})
        if result:
            self.col.update({'_id': ObjectId(result['_id'])},
                            {'$addToSet': {
                                'members': {
                                    '$each': members
                                }
                            }})
        else:
            keys = self.mapping.api2daskey(system, urn)
            self.col.insert({
                'system': system,
                'urn': urn,
                'keys': keys,
                'members': members
            })

        for member in members:
            if not self.col.find_one({'member': member}):
                self.col.insert({'member': member, 'stems': stem(member)})

    def text_search(self, text):
        """
        Perform a text search for data members matching a string. The input is
        split if it already includes dotted elements (in which case we need to
        find a member matching all the split elements), otherwise we look for
        any member whose stem list contains the text.
        """
        text = text.lower()
        if '.' in text:
            possible_members = self.col.find(\
                    {'stems': {'$all': text.split('.')}}, fields=['member'])
        else:
            possible_members = self.col.find({'stems': text},\
                                             fields=['member'])
        return [doc['member'] for doc in possible_members]

    def attributes(self):
        """
        Return full list of keyword attributes known in DAS.
        """
        spec = {'member': {'$exists': True}}
        return self.col.find(spec)

    def member_info(self, member):
        """
        Once the text search has identified a member that might be a match,
        return which systems, APIs and hence DAS keys this points to.
        """
        result = []
        for doc in self.col.find({'members': member},
                                 fields=['system', 'urn', 'keys']):

            result.append({
                'system': doc['system'],
                'urn': doc['urn'],
                'keys': doc['keys']
            })
        return result

    def key_search(self, text, limitkey=None):
        """
        Try to find suggested DAS keys by performing a member search and then
        mapping the matches back to the DAS keys that produce them.
        """
        text = text.lower()
        result = collections.defaultdict(set)
        for member in self.text_search(text):
            for info in self.member_info(member):
                result[tuple(info['keys'])].add(member)
        if limitkey:
            for key in list(result):  # copy keys, entries are deleted below
                if not limitkey in key:
                    del result[key]
        return result

    def members_for_keys(self, keys):
        """
        Return all the members that exactly match the set of keys
        """
        result = []
        for doc in self.col.find({'keys': {
                '$all': keys,
                '$size': len(keys)
        }},
                                 fields=['members']):
            result += doc['members']
        return result

    def has_member(self, member):
        """
        Return true if we know anything about the given member.
        """
        if self.col.find_one({'member': member}):
            return True
        else:
            return False

    def list_members(self):
        "Return list of members in keylearning collection"
        return self.col.find({
            'members': {
                '$exists': 'True'
            },
            'system': {
                '$exists': 'True'
            },
            'urn': {
                '$exists': 'True'
            }
        })
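
As a toy illustration of how key_search maps text matches back to DAS keys, here is an in-memory sketch in which the MongoDB collection is replaced by a plain list of documents; docs, member_info and key_search below are stand-ins for illustration, not the class above.

import collections

# in-memory stand-in for the keylearning collection (illustration only)
docs = [
    {'system': 'dbs3', 'urn': 'datasets', 'keys': ['dataset'],
     'members': ['dataset.name', 'dataset.nevents']},
    {'system': 'phedex', 'urn': 'blockReplicas', 'keys': ['block', 'site'],
     'members': ['block.name', 'block.replica.site']},
]

def member_info(member):
    "Mimic member_info against the in-memory docs"
    return [{'system': d['system'], 'urn': d['urn'], 'keys': d['keys']}
            for d in docs if member in d['members']]

def key_search(text):
    "Mimic key_search: map matching members back to the DAS keys they belong to"
    result = collections.defaultdict(set)
    for doc in docs:
        for member in doc['members']:
            if text.lower() in member.lower().split('.'):
                result[tuple(doc['keys'])].add(member)
    return dict(result)

print(member_info('block.name'))
print(key_search('name'))
# {('dataset',): {'dataset.name'}, ('block', 'site'): {'block.name'}}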
Beispiel #20
0
class DASCore(object):
    """
    DAS core class.
    """
    def __init__(self,
                 config=None,
                 debug=0,
                 nores=False,
                 logger=None,
                 engine=None,
                 multitask=True):
        if config:
            dasconfig = config
        else:
            dasconfig = das_readconfig()
        verbose = dasconfig['verbose']
        self.stdout = debug
        if isinstance(debug, int) and debug:
            self.verbose = debug
            dasconfig['verbose'] = debug
        else:
            self.verbose = verbose
        das_timer('DASCore::init', self.verbose)
        self.operators = das_operators()
        self.collect_wait_time = dasconfig['das'].get('collect_wait_time', 120)

        # set noresults option
        self.noresults = False
        if nores:
            dasconfig['write_cache'] = True
            self.noresults = nores

        self.init_expire = dasconfig['das'].get('init_expire', 5 * 60)
        self.multitask = dasconfig['das'].get('multitask', True)
        if debug or self.verbose:
            self.multitask = False  # in verbose mode do not use multitask
            dasconfig['das']['multitask'] = False
        if not multitask:  # explicitly call DASCore ctor
            self.multitask = False
            dasconfig['das']['multitask'] = False
        dasconfig['engine'] = engine
        if self.multitask:
            nworkers = dasconfig['das'].get('core_workers', 5)
            #             if  engine:
            #                 thr_name = 'DASCore:PluginTaskManager'
            #                 self.taskmgr = PluginTaskManager(\
            #                         engine, nworkers=nworkers, name=thr_name)
            #                 self.taskmgr.subscribe()
            #             else:
            #                 thr_name = 'DASCore:TaskManager'
            #                 self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
            thr_name = 'DASCore:TaskManager'
            self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        if logger:
            self.logger = logger
        else:
            self.logger = PrintManager('DASCore', self.verbose)

        # define Mapping/Analytics/Parser in this order since Parser depends
        # on first two
        dasmapping = DASMapping(dasconfig)
        dasconfig['dasmapping'] = dasmapping
        self.mapping = dasmapping

        self.keylearning = DASKeyLearning(dasconfig)
        dasconfig['keylearning'] = self.keylearning

        # init DAS cache
        self.rawcache = DASMongocache(dasconfig)
        dasconfig['rawcache'] = self.rawcache

        # plug-in architecture: loop over registered data-services in
        # dasconfig; load appropriate module/class; register data
        # service with DASCore.
        self.systems = dasmapping.list_systems()
        # pointer to the DAS top level directory
        dasroot = '/'.join(__file__.split('/')[:-3])
        for name in self.systems:
            try:
                klass  = 'DAS/services/%s/%s_service.py' \
                    % (name, name)
                srvfile = os.path.join(dasroot, klass)
                with open(srvfile) as srvclass:
                    for line in srvclass:
                        if line.find('(DASAbstractService)') != -1:
                            klass = line.split('(DASAbstractService)')[0]
                            klass = klass.split('class ')[-1]
                            break
                mname = 'DAS.services.%s.%s_service' % (name, name)
                module = __import__(mname, fromlist=[klass])
                obj = getattr(module, klass)(dasconfig)
                setattr(self, name, obj)
            except IOError as err:
                if debug > 1:
                    # we have virtual services, so IOError can be correct
                    print_exc(err)
                try:
                    mname = 'DAS.services.generic_service'
                    module = __import__(mname, fromlist=['GenericService'])
                    obj = module.GenericService(name, dasconfig)
                    setattr(self, name, obj)
                except Exception as exc:
                    print_exc(exc)
                    msg = "Unable to load %s data-service plugin" % name
                    raise Exception(msg)
            except Exception as exc:
                print_exc(exc)
                msg = "Unable to load %s data-service plugin" % name
                raise Exception(msg)

        # loop over systems and get system keys, add mapping keys to final list
        self.service_keys = {}
        self.service_parameters = {}
        for name in self.systems:
            skeys = list(getattr(self, name).keys())
            self.service_keys[getattr(self, name).name] = skeys
            sparams = getattr(self, name).parameters()
            self.service_parameters[getattr(self, name).name] = sparams

        self.service_keys['special'] = das_special_keys()
        self.dasconfig = dasconfig
        das_timer('DASCore::init', self.verbose)

    def keys(self):
        """
        Return map of data service keys
        """
        return self.service_keys

    def das_keys(self):
        """
        Return map of data service keys
        """
        _keys = ['records']
        for values in self.service_keys.values():
            for key in values:
                if key not in _keys:
                    _keys.append(key)
        return _keys

    def result(self, query, idx=0, limit=None):
        """
        Get results either from cache or from explicit call
        """
        self.logger.info('input query=%s' % query)
        results = []
        dasquery = DASQuery(query)
        query = dasquery.mongo_query
        # check if we have any service which covers the query
        # otherwise decompose it into list of queries
        service_map = dasquery.service_apis_map()
        if not service_map:
            msg = 'no APIs found to answer input query, will decompose it'
            self.logger.info(msg)
            skeys = query['fields']
            if not skeys:
                skeys = []
            for key in skeys:
                newquery = DASQuery(dict(fields=[key], spec=query['spec']))
                self.call(newquery)  # process query
        else:
            self.call(dasquery)  # process query

        # lookup provided query in a cache
        if not self.noresults:
            results = self.get_from_cache(dasquery, idx, limit)
        return results

    def remove_from_cache(self, dasquery):
        """
        Delete cache entries for the given input query
        """
        self.rawcache.remove_from_cache(dasquery)

    def get_status(self, dasquery):
        """
        Look-up status of provided query in a cache.
        Return status of the query request and its hash.
        """
        status = None
        error = None
        reason = None
        if dasquery and 'fields' in dasquery.mongo_query:
            fields = dasquery.mongo_query['fields']
            if fields and isinstance(fields, list) and 'queries' in fields:
                return 'ok', error, reason
        record = self.rawcache.find(dasquery)
        error, reason = self.rawcache.is_error_in_records(dasquery)
        try:
            if record and 'das' in record and 'status' in record['das']:
                status = record['das']['status']
                if not error:
                    error = record['das'].get('error', error)
                if not reason:
                    reason = record['das'].get('reason', reason)
                return status, error, reason
        except Exception as exc:
            print_exc(exc)
            status = error = reason = None
            self.rawcache.remove_from_cache(dasquery)
        return status, error, reason

    def status(self):
        "Return status of given service"
        sdict = {'das': self.taskmgr.status()}
        for srv in sorted(self.systems):
            sdict[srv] = getattr(getattr(self, srv), 'status')()
        return sdict

    def worker(self, srv, dasquery):
        """Main worker function which calls data-srv call function"""
        self.logger.info('##### %s ######\n' % srv)
        das_timer(srv, self.verbose)
        getattr(getattr(self, srv), 'call')(dasquery)
        das_timer(srv, self.verbose)

    def insert_query_records(self, dasquery):
        """
        Insert DAS query records into DAS cache and return list of services
        which will answer this query
        """
        services = dasquery.services
        self.logger.info('Potential services = %s' % services)
        if not services:
            msg = 'No data-services for query %s, ' % dasquery
            msg += 'mongo_query: %s, ' % dasquery.mongo_query
            msg += 'params: %s' % dasquery.params()
            print(dastimestamp('DAS WARNING '), msg)

        # get list of URI which can answer this query
        ack_services = []
        for srv in services:
            gen = [t for t in getattr(getattr(self, srv), 'apimap')(dasquery)]
            for url, api, args, iformat, expire in gen:
                header = dasheader(srv, dasquery, expire, api, url, ctime=0)
                self.rawcache.insert_query_record(dasquery, header)
                if srv not in ack_services:
                    ack_services.append(srv)
        if not ack_services:
            ack_services = services
        if dasquery.query.find('records ') != -1:
            srv_status = True  # skip DAS queries w/ records request
        # create das record with initial expire tstamp
        expire = time.time() + self.init_expire
        header = dasheader("das",
                           dasquery,
                           expire,
                           api='das_core',
                           services=dict(das=ack_services))
        header['lookup_keys'] = []
        self.rawcache.insert_query_record(dasquery, header)
        das_timer('das_record', self.verbose)
        return ack_services

    def call(self, query, **kwds):
        """
        Top level DAS api which executes a given query using the underlying
        data-services. It performs the following steps:

            - parse input query
            - identify data-services based on selection keys
              and where clause conditions
            - construct DAS workflow and execute data-service 
              API calls. At this step individual 
              data-services store results into DAS cache.

        Return status 0/1 depending on success of the calls, can be
        used by workers on cache server.

        kwds is provided for compatibility with web layer, e.g. it
        may invoke this method with additional pid parameter.
        """
        def update_das_query(dasquery, status, reason=None):
            "Update DAS query record with given status and reason"
            self.rawcache.update_query_record(dasquery, status, reason=reason)
            self.rawcache.add_to_record(\
                    dasquery, {'das.timer': get_das_timer()}, system='das')

        self.logger.info('input query=%s' % query)
        das_timer('DASCore::call', self.verbose)
        if  isinstance(query, object) and hasattr(query, '__class__')\
            and query.__class__.__name__ == 'DASQuery':
            dasquery = query
        else:
            dasquery = DASQuery(query)
        for col in ['merge', 'cache']:
            self.rawcache.remove_expired(dasquery, col)
        query = dasquery.mongo_query
        spec = query.get('spec')
        fields = query.get('fields')
        if fields == ['records']:
            msg = 'look-up all records in cache'
            self.logger.info(msg)
            return 'in cache'
        if spec == dict(records='*'):
            self.logger.info("look-up everything in cache")
            return 'in cache'
        for record in self.rawcache.find_specs(dasquery):
            status = record['das']['status']
            msg = 'found query %s in cache, status=%s\n' \
                        % (record['query'], status)
            self.logger.info(msg)
            print(dastimestamp('DAS INFO'), msg)
            return status

        self.logger.info(dasquery)
        das_timer('das_record', self.verbose)
        services = self.insert_query_records(dasquery)
        if not services:
            msg = 'unable to locate data-services to fulfill this request'
            msg += ', will iterate over all registered services'
            print(dastimestamp('DAS WARNING '), dasquery, msg)
            services = dasquery.services if dasquery.services else self.systems
        try:
            if self.multitask:
                jobs = []
                for srv in sorted(services):
                    jobs.append(self.taskmgr.spawn(self.worker, srv, dasquery))
                self.taskmgr.joinall(jobs)
            else:
                for srv in services:
                    self.worker(srv, dasquery)
        except Exception as exc:
            print_exc(exc)
            return 'fail'
        self.logger.info('\n##### merging ######\n')
        update_das_query(dasquery, 'merging')
        das_timer('merge', self.verbose)

        # check that all query record statuses are ok, i.e. we did insert records
        # this status is set by self.rawcache.update_cache
        for idx in range(self.collect_wait_time):
            records = self.rawcache.find_query_record(dasquery)
            statuses = []
            for row in records:
                system = row['das']['system']
                status = row['das']['status']
                self.logger.info("### query record status %s %s %s" %
                                 (dasquery.qhash, system, status))
                statuses.append(status)
            all_statuses = sorted(list(set(statuses)))
            # at this point we expect all services to report 'ok' while the das record status is 'merging'
            if len(all_statuses) == 2 and all_statuses == ['merging', 'ok']:
                break
            time.sleep(1)

        # now we can merge records
        status = self.rawcache.merge_records(dasquery)
        das_timer('merge', self.verbose)
        # check if we have service records and properly setup status
        self.logger.info('\n##### check services ######\n')
        das_services = self.rawcache.check_services(dasquery)
        reason = ''
        status = 'ok'
        if not das_services:
            if 'records' in dasquery.query:
                status = 'ok'  # keep status ok for 'records' queries
            else:
                reason = 'no data records found in DAS cache'
                status = 'fail'
                print(dastimestamp('DAS ERROR '), dasquery, reason)
        update_das_query(dasquery, status, reason)
        das_timer('DASCore::call', self.verbose)
        return status

    def processing_time(self, dasquery):
        "Look-up and return DAS query processing time"
        query_record = self.rawcache.find(dasquery)
        if query_record:
            das = query_record.get('das', None)
            if isinstance(das, dict):
                ctime = das.get('ctime', [])
                if ctime:
                    return ctime[-1] - ctime[0]
        return None

    def nresults(self, dasquery, coll='merge'):
        """
        Return the total number of results (count) for the provided query.
        The code should match the body of the get_from_cache method.
        """
        fields = dasquery.mongo_query.get('fields', None)
        if dasquery.mapreduce:
            result = self.rawcache.map_reduce(dasquery.mapreduce, dasquery)
            return len([1 for _ in result])
        elif dasquery.aggregators:
            return len(dasquery.aggregators)
        return self.rawcache.nresults(dasquery, coll)

    def apilist(self, dasquery):
        "Return list of APIs answer given das query"
        return self.rawcache.apilist(dasquery)

    def incache(self, dasquery, coll='merge'):
        """
        Answer whether the given query is in the DAS cache or not
        """
        return self.rawcache.incache(dasquery, collection=coll)

    def get_from_cache(self, dasquery, idx=0, limit=0, collection='merge'):
        """
        Look-up results from the merge cache and yield them for
        further processing.
        """
        das_timer('DASCore::get_from_cache', self.verbose)
        msg = 'col=%s, query=%s, idx=%s, limit=%s'\
                % (collection, dasquery, idx, limit)
        self.logger.info(msg)

        fields = dasquery.mongo_query.get('fields', None)

        if dasquery.mapreduce:
            res = self.rawcache.map_reduce(dasquery.mapreduce, dasquery)
        elif dasquery.aggregators:
            # extract das information from rawcache
            rows  = self.rawcache.get_from_cache(\
                    dasquery, collection=collection)
            first = next(rows)
            sinfo = das_sinfo(first)
            # to perform aggregation we need:
            # - loop over all aggregator functions
            # - loop over all data-services
            # - loop over all APIs within a data-services
            # the code below does that, it applies aggregator
            # to selected (based on key/srv/api) records
            res = []
            _id = 0
            time0 = time.time()
            expire = 300  # min expire
            for func, key in dasquery.aggregators:
                afunc = getattr(das_aggregator, 'das_%s' % func)
                found = False
                for srv, apis in sinfo.items():
                    for api in apis:
                        rows  = self.rawcache.get_from_cache(\
                                dasquery, collection=collection)
                        gen = api_rows(rows, api)
                        data = afunc(key, gen)
                        ctime = time.time() - time0
                        das = dasheader(srv,
                                        dasquery,
                                        expire,
                                        api=api,
                                        ctime=ctime)
                        if isinstance(data, dict) and data['value'] != 'N/A':
                            aggr = {
                                '_id': _id,
                                'function': func,
                                'key': key,
                                'result': data
                            }
                            aggr.update(das)
                            res.append(aggr)
                            _id += 1
                            found = True
                if not found:  # when we got nothing add empty result record
                    empty = {'value': 'N/A'}
                    ctime = time.time() - time0
                    das = dasheader('das',
                                    dasquery,
                                    expire,
                                    api='das_core',
                                    ctime=ctime)
                    rec = {
                        '_id': 0,
                        'function': func,
                        'key': key,
                        'result': empty
                    }
                    rec.update(das)
                    res.append(rec)
        else:
            res = self.rawcache.get_from_cache(dasquery, idx, limit, \
                    collection=collection)
        # we assume that all records from single query will have
        # identical structure, therefore it will be sufficient to update
        # keylearning DB only with first record
        count = 0
        for row in res:
            if not count:
                self.keylearning.add_record(dasquery, row)
            fix_times(row)
            yield row
            count += 1
        das_timer('DASCore::get_from_cache', self.verbose)
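
Running DASCore end-to-end requires a full DAS configuration and MongoDB, so no usage example is given here. The polling step inside call(), however, can be sketched in isolation: the snippet below illustrates the collect-wait idea with the raw-cache look-up replaced by a hypothetical status_provider callable; it is not the DAS implementation.

import time

def wait_for_services(status_provider, wait_time=120, pause=1.0):
    """Poll until every data-service record reports 'ok' while the das
    record is still 'merging', or give up after wait_time iterations."""
    for _ in range(wait_time):
        statuses = sorted(set(status_provider()))
        if statuses == ['merging', 'ok']:
            return True
        time.sleep(pause)
    return False

# toy provider: services finish after a few polls
state = {'polls': 0}
def fake_statuses():
    state['polls'] += 1
    return ['merging', 'ok'] if state['polls'] > 2 else ['merging', 'requested']

print(wait_for_services(fake_statuses, wait_time=10, pause=0.01))   # True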
Beispiel #21
0
class DASKeyLearning(object):
    """
    This class manages DAS key-learning DB.
    
    Key-learning is an intermittent process (triggered infrequently
    by a task running in the analytics framework), which involves
    searching through the raw cache for all output documents (or a
    subset of them with maximum primary-key coverage), generating
    the set of all data members (in a dotted-dict fashion) and storing
    those as primary-key:data-member records (with an associated
    last-updated-time).
    
    """
    def __init__(self, config):
        self.verbose  = config['verbose']
        self.logger   = PrintManager('DASKeyLearning', self.verbose)
        self.services = config['services']
        self.dburi    = config['mongodb']['dburi']
        self.dbname   = config['keylearningdb']['dbname']
        self.colname  = config['keylearningdb']['collname']
        
        self.mapping  = config['dasmapping']

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)
        
        self.col = None
        self.create_db()
        
        

    def create_db(self):
        """
        Establish connection to MongoDB back-end and create DB.
        """
        self.col = db_connection(self.dburi)[self.dbname][self.colname]
        
    def add_members(self, system, urn, members):
        """
        Add a list of data members for a given API (system, urn, url)
        and generate their stems, which are stored as separate records.
        """
        msg = "system=%s, urn=%s, members=%s)" % (system, urn, members)
        self.logger.info(msg)
        
        result = self.col.find_one({'system': system, 'urn': urn})
        if result:       
            self.col.update({'_id': result['_id']},
                            {'$addToSet': {'members': {'$each': members}}})
        else:
            keys = self.mapping.api2daskey(system, urn)
            self.col.insert({'system': system,
                             'urn': urn,
                             'keys': keys,
                             'members': members})
                
        for member in members:
            if not self.col.find_one({'member': member}):
                self.col.insert({'member': member,
                                 'stems': self.stem(member)})
                
        index_list = [('system', 1), ('urn', 1), ('members', 1), ('stems', 1)]
        create_indexes(self.col, index_list)
        
    def stem(self, member):
        """
        Produce an extended set of strings which can be used for text-search.
        TODO: Use PyStemmer or something more sophisticated here.
        """
        
        return member.lower().split('.')
    
    def text_search(self, text):
        """
        Perform a text search for data members matching a string. The input is
        split if it already includes dotted elements (in which case we need to find
        a member matching all the split elements), otherwise we look for any member
        whose stem list contains the text.
        """
        text = text.lower()
        if '.' in text:
            possible_members = self.col.find({'stems': {'$all': text.split('.')}}, 
                                             fields=['member'])
        else:
            possible_members = self.col.find({'stems': text}, 
                                             fields=['member'])
        
        return [doc['member'] for doc in possible_members]
        
    
    def member_info(self, member):
        """
        Once the text search has identified a member that might be a match,
        return which systems, APIs and hence DAS keys this points to.
        """
        result = []
        for doc in self.col.find({'members': member}, 
                                 fields=['system', 'urn', 'keys']):
            
            result.append({'system': doc['system'],
                           'urn': doc['urn'],
                           'keys': doc['keys']})
        return result
    
    def key_search(self, text, limitkey=None):
        """
        Try to find suggested DAS keys by performing a member search and then
        mapping the matches back to the DAS keys that produce them.
        """
        text = text.lower()
        result = collections.defaultdict(set)
        for member in self.text_search(text):
            for info in self.member_info(member):
                result[tuple(info['keys'])].add(member)
        if limitkey:
            for key in list(result):  # copy keys, entries are deleted below
                if not limitkey in key:
                    del result[key]
        return result
    
    def members_for_keys(self, keys):
        """
        Return all the members that exactly match the set of keys
        """
        result = []
        for doc in self.col.find({'keys': {'$all': keys, '$size': len(keys)}},
                                 fields=['members']):
            result += doc['members']
        return result
         
    
    def has_member(self, member):
        """
        Return true if we know anything about the given member.
        """
        if self.col.find_one({'member': member}):
            return True
        else:
            return False
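
An in-memory sketch of the stem-based text search used by this class: stem follows the same lowercase-and-split rule as above, while the MongoDB query is replaced by plain list filtering, so this is an illustration rather than the stored-collection behaviour.

def stem(member):
    "Same stemming rule as DASKeyLearning.stem: lowercase and split on dots"
    return member.lower().split('.')

def text_search(members, text):
    """A dotted query must match all of its pieces, a plain word must
    appear among the stems (mirrors the two branches of text_search)."""
    text = text.lower()
    if '.' in text:
        wanted = set(text.split('.'))
        return [m for m in members if wanted.issubset(set(stem(m)))]
    return [m for m in members if text in stem(m)]

members = ['dataset.name', 'block.replica.site', 'file.size']
print(text_search(members, 'replica.site'))   # ['block.replica.site']
print(text_search(members, 'size'))           # ['file.size']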
Beispiel #22
0
class KeyLearning(object):
    """
    This is the asynchronous part of the key-learning system, intended
    to run probably no more than daily once the key-learning DB is
    filled.

    This searches through the DAS raw cache for all API output records,
    recording up to `redundancy` das_ids for each primary_key found.
    
    These das_ids are then used to fetch the query record, which records
    the API system and urn of each of the records in question.
    
    These documents are then processed to extract all the unique member
    names they contained, which are then injected into the DAS keylearning
    system.
    """
    task_options = [{'name':'redundancy', 'type':'int', 'default':2,
                     'help':'Number of records to examine per DAS primary key'}]
    def __init__(self, **kwargs):
        self.logger = PrintManager('KeyLearning', kwargs.get('verbose', 0))
        self.das = kwargs['DAS']
        self.redundancy = kwargs.get('redundancy', 2)
        
        
    def __call__(self):
        "__call__ implementation"
        self.das.rawcache.remove_expired("cache")
        
        autodeque = lambda: collections.deque(maxlen=self.redundancy)
        found_ids = collections.defaultdict(autodeque)
        
        self.logger.info("finding das_ids")
        for doc in self.das.rawcache.col.find(\
            {'das.empty_record': 0, 'das.primary_key': {'$exists': True}},
            fields=['das.primary_key', 'das_id']):
            found_ids[doc['das']['primary_key']].append(doc['das_id'])
        
        hit_ids = set()
        
        self.logger.info("found %s primary_keys" % len(found_ids))
        
        for key in found_ids:
            self.logger.info("primary_key=%s" % key)
            for das_id in found_ids[key]:
                if not das_id in hit_ids:
                    self.logger.info("das_id=%s" % das_id)
                    hit_ids.add(das_id)
                    doc = self.das.rawcache.col.find_one(\
                        {'_id': ObjectId(das_id)})
                    if doc:
                        self.process_query_record(doc)
                    else:
                        self.logger.warning(\
                        "no record found for das_id=%s" % das_id)
        return {}
    
    def process_query_record(self, doc):
        """
        Process a rawcache document, extracting the called
        system, urn and url, then looking up the individual data records.
        """
        das_id = str(doc['_id'])
        systems = doc['das']['system']
        urns = doc['das']['urn']
        
        result = self.das.rawcache.find_records(das_id)        
        
        if len(systems)==len(urns) and len(systems)==result.count():
            for i, record in enumerate(result):
                self.process_document(systems[i], urns[i], record)
        else:
            self.logger.warning("got inconsistent system/urn/das_id length")
            
            
    def process_document(self, system, urn, doc):
        """
        Process a rawcache document record, finding all the unique
        data members and inserting them into the cache.
        """
        
        self.logger.info("%s::%s" % (system, urn))
        members = set()
        for key in doc.keys():
            if not key in ('das', '_id', 'das_id'):
                members |= self._process_document_recursor(doc[key], key)
        
        self.das.keylearning.add_members(system, urn, list(members))
        
    def _process_document_recursor(self, doc, prefix):
        """
        Recurse through a nested data structure, finding all
        the unique endpoint names. Lists are iterated over but do
        not add anything to the prefix, eg
        
        a: {b: 1, c: {d: 1, e: 1}, f: [{g: 1}, {h: 1}]} ->
        a.b, a.c.d, a.c.e, a.f.g, a.f.h
        
        (although normally we would expect each member of a list to
        have the same structure)
        """
        result = set()
        if isinstance(doc, dict):
            for key in doc.keys():
                result |= self._process_document_recursor(doc[key], 
                                                          prefix+'.'+key)
        elif isinstance(doc, list):
            for item in doc:
                result |= self._process_document_recursor(item, prefix)
        else:
            result.add(prefix)
        return result
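
A standalone version of the member-extraction walk described in the docstring of _process_document_recursor, reproducing its a.b/a.c.d/... example; collect_members is a hypothetical name used here for illustration.

def collect_members(doc, prefix):
    """Walk a nested structure and collect dotted endpoint names;
    lists are iterated over but add nothing to the prefix."""
    result = set()
    if isinstance(doc, dict):
        for key, val in doc.items():
            result |= collect_members(val, prefix + '.' + key)
    elif isinstance(doc, list):
        for item in doc:
            result |= collect_members(item, prefix)
    else:
        result.add(prefix)
    return result

record = {'b': 1, 'c': {'d': 1, 'e': 1}, 'f': [{'g': 1}, {'h': 1}]}
print(sorted(collect_members(record, 'a')))
# ['a.b', 'a.c.d', 'a.c.e', 'a.f.g', 'a.f.h']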
Beispiel #23
0
class DASAnalytics(object):
    """
    DAS analytics DB manager.
    """
    def __init__(self, config):
        self.verbose = config['verbose']
        self.logger  = PrintManager('DASAnalytics', self.verbose)
        self.dburi   = config['mongodb']['dburi']
        self.dbname  = config['analyticsdb']['dbname']        
        self.colname = config['analyticsdb']['collname']
        self.history = config['analyticsdb']['history']
        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)
        self.create_db()

    def create_db(self):
        """
        Create analytics DB in MongoDB back-end.
        """
        self.conn = db_connection(self.dburi)
        database  = self.conn[self.dbname]
        das_son_manipulator = DAS_SONManipulator()
        database.add_son_manipulator(das_son_manipulator)
        self.col  = database[self.colname]
#        if  self.dbname not in self.conn.database_names():
#            capped_size = 104857600
#            options   = {'capped':True, 'size': capped_size}
#            database  = self.conn[self.dbname]
#            database.create_collection('self.colname', **options)
#            print "####CREATE CAPPED ANALYTICS"
#        self.col  = self.conn[self.dbname][self.colname] 

    def delete_db(self):
        """
        Delete analytics DB in MongoDB back-end.
        """
        self.conn.drop_database(self.dbname)

    def delete_db_collection(self):
        """
        Delete analytics DB collection in MongoDB.
        """
        self.conn.drop_collection(self.colname)

    def add_query(self, query, mongoquery):
        """
        Add DAS-QL/MongoDB-QL queries into analytics.
        
        A unique record is kept for each (qhash, dhash) pair; each record
        contains an array of call times.
        """
        if  isinstance(mongoquery, dict):
            mongoquery = encode_mongo_query(mongoquery)
        msg = 'query=%s, mongoquery=%s' % (query, mongoquery)
        self.logger.debug(msg)
        dhash = genkey(query)
        qhash = genkey(mongoquery)

        now = time.time()

        existing = self.col.find_one({'qhash': qhash, 'dhash': dhash})
        if existing:
            # check if times contains very old timestamps
            rec = self.col.find({'_id': ObjectId(existing['_id']), 
                                 'times':{'$lt' : now - self.history}})
            if  rec:
                self.col.update({'_id': ObjectId(existing['_id'])},
                    {'$pull': {'times': {'$lt' : now - self.history}}})
            # update times array with new timestamp
            self.col.update({'_id': ObjectId(existing['_id'])},
                            {'$push': {'times': now}})
        else:
            record = dict(query=query, mongoquery=mongoquery,
                        qhash=qhash, dhash=dhash, times=[now])
            self.col.insert(record)

        index = [('qhash', DESCENDING),
                 ('dhash', DESCENDING)]
        create_indexes(self.col, index)
        
    def clean_queries(self):
        """
        Standalone method to clean up expired call-times from query records,
        since otherwise only the active record is cleaned.
        
        This is too expensive to do with every operation, and MongoDB
        does not allow multiple modifications to a single field in a single
        update operation (i.e., we can't do $push and $pull in one update),
        so it should probably be done asynchronously at fixed intervals.
        """
        
        self.logger.debug('')
        
        now = time.time()
        
        #clean out the times array
        self.col.update({'times': {'$exists': True}},
                        {'$pull': {'times': {'$lt': now - self.history}}})
        #now delete any with no times
        self.col.remove({'times': {'$size': 0}})
        #and should maybe delete anything with the same qhash here?

    def remove_expired(self):
        "Moved from AbstractService -  remove old apicall records"
        spec = {'apicall.expire':{'$lt' : int(time.time())}}
        self.col.remove(spec)

    def add_summary(self, identifier, start, finish, **payload):
        """
        Add an analyzer summary, with given analyzer identifier,
        start and finish times and payload.
        
        It is intended that a summary document is deposited on
        each run of an analyzer (if desirable) and is thereafter
        immutable.
        """
        msg = '(%s, %s->%s, %s)' % (identifier, start, finish, payload)
        self.logger.debug(msg)
        
        # clean-up analyzer records whose start timestamp is too old
        spec = {'start':{'$lt':time.time()-self.history},
                'analyzer': {'$exists': True}}
        self.col.remove(spec)

        # insert new analyzer record
        record = {'analyzer':identifier,
                  'start': start,
                  'finish': finish}
        payload.update(record) #ensure key fields are set correctly
        self.col.insert(payload)
        # ensure summary items are indexed for quick extract
        create_indexes(self.col, [('analyzer', DESCENDING), ('start', ASCENDING)])

    def get_summary(self, identifier, after=None, before=None, **query):
        """
        Retrieve a summary document for a given analyzer-identifier,
        optionally specifying a time range.
        """
        cond = {'analyzer': identifier}
        if after:
            cond['start'] = {'$gte': after}
        if before:
            cond['finish'] = {'$lte': before}
        if query:
            cond.update(query)
        return list(self.col.find(cond))

    def add_api(self, system, query, api, args):
        """
        Add API info to analytics DB. 
        Here args is a dict of API parameters.
        """
        orig_query = query
        if  isinstance(query, dict):
            query = encode_mongo_query(query)
        msg = '(%s, %s, %s, %s)' % (system, query, api, args)
        self.logger.debug(msg)
        # find query record
        qhash = genkey(query)
        record = self.col.find_one({'qhash':qhash}, fields=['dasquery'])
        if  not record:
            self.add_query("", orig_query)
        # find api record
        record = self.col.find_one({'qhash':qhash, 'system':system,
                        'api.name':api, 'api.params':args}) 
        apidict = dict(name=api, params=args)
        if  record:
            self.col.update({'_id':record['_id']}, {'$inc':{'counter':1}})
        else:
            record = dict(system=system, api=apidict, qhash=qhash, counter=1)
            self.col.insert(record)
        index = [('system', DESCENDING), ('dasquery', DESCENDING),
                 ('api.name', DESCENDING), ('qhash', DESCENDING) ]
        create_indexes(self.col, index)
        
    def insert_apicall(self, system, query, url, api, api_params, expire):
        """
        Remove obsolete apicall records and
        insert into Analytics DB provided information about API call.
        Moved from AbstractService.
        
        Updated so that we do not create multiple records when performing
        forced updates (i.e. the old record has not yet expired): we now
        look for an existing record with the same parameters (the fact
        that some of the fields are indexed should keep this fast even
        though not all of them are) and, if one exists, just update
        its expiry. Otherwise insert a new record.
        """
        msg = 'query=%s, url=%s, ' % (query, url)
        msg += 'api=%s, args=%s, expire=%s' % (api, api_params, expire)
        self.logger.debug(msg)
        expire = expire_timestamp(expire)
        query = encode_mongo_query(query)
        qhash = genkey(query)
        self.remove_expired()
        existing = self.col.find_one({'apicall.system':     system,
                                      'apicall.url':        url,
                                      'apicall.api':        api,
                                      'apicall.api_params': api_params,
                                      'apicall.qhash':      qhash})
        if existing:
            self.logger.debug("updating")
            self.col.update({'_id': existing['_id']},
                            {'$set':{'apicall.expire': expire}})
        else:
            self.col.insert({'apicall':{'api_params':   api_params,
                                        'url':          url,
                                        'api':          api,
                                        'system':       system,
                                        'expire':       expire,
                                        'qhash':        qhash}})
        index_list = [('apicall.url', DESCENDING),
                      ('apicall.api', DESCENDING),
                      ('qhash', DESCENDING)]
        create_indexes(self.col, index_list)
        
    def update_apicall(self, query, das_dict):
        """
        Update apicall record with provided DAS dict.
        Moved from AbstractService
        """
        msg = 'DASAnalytics::update_apicall, query=%s, das_dict=%s'\
                % (query, das_dict)
        self.logger.debug(msg)
        spec = {'apicall.qhash':genkey(encode_mongo_query(query))} 
        record = self.col.find_one(spec)
        self.col.update({'_id':ObjectId(record['_id'])},
            {'$set':{'dasapi':das_dict,
                     'apicall.expire':das_dict['response_expires']}})

    def update(self, system, query):
        """
        Update records for given system/query.
        """
        if  isinstance(query, dict):
            query = encode_mongo_query(query)
        msg = 'system=%s, query=%s' % (system, query)
        self.logger.debug(msg)
        qhash = genkey(query)
        if  system:
            cond = {'qhash':qhash, 'system':system}
        else:
            cond = {'qhash':qhash}
        self.col.update(cond, {'$inc' : {'counter':1}}, multi=True)

    def list_systems(self):
        """
        List all DAS systems.
        """
        cond = { 'system' : { '$ne' : None } }
        gen = (row['system'] for row in self.col.find(cond, ['system']))
        return gen2list(gen)

    def list_queries(self, qhash=None, dhash=None, query_regex=None,
                     key=None, after=None, before=None):
        """
        List inserted queries based on many criteria.
        """
        cond = {'mongoquery': {'$exists': True}}
        if qhash:
            cond['qhash'] = qhash
        if dhash:
            cond['dhash'] = dhash
        if query_regex:
            cond['dasquery'] = {'$regex':query_regex}
        if key:
            cond['mongoquery.spec.key'] = key
        # in this case we need a specific element to be within the range,
        # so we need to use elemMatch
        if before and after:
            cond['times'] = {'$elemMatch': {'$gt': after, '$lt': before}}
        # in these cases we only need to match any element
        elif after:
            cond['times'] = {'$gt': after}
        elif before:
            cond['times'] = {'$lt': before}
        
        return self.col.find(cond)
            
    def get_popular_queries(self, spec):
        """
        Get popular queries based on the provided spec, which can be
        in the form of a time stamp range, etc.
        """
        cond = {'counter':{'$exists':True}}
        for row in self.col.find(fields=['qhash'], spec=cond).\
                sort('counter', DESCENDING):
            spec = {'qhash': row['qhash'], 'counter':{'$exists': False}}
            for res in self.col.find(spec=spec):
                yield res

    def list_apis(self, system=None):
        """
        List all APIs.
        """
        cond = { 'api.name' : { '$ne' : None } }
        if  system:
            cond['system'] = system
        gen = (row['api']['name'] for row in \
                self.col.find(cond, ['api.name']))
        return gen2list(gen)
    
    def list_apicalls(self, qhash=None, api=None, url=None):
        "Replace ad-hoc calls in AbstractService"
        cond = {}
        if qhash:
            cond['apicall.qhash'] = qhash
        if api:
            cond['apicall.api'] = api
        if url:
            cond['apicall.url'] = url
        
        return list(self.col.find(cond))

    def api_params(self, api):
        """
        Retrieve API parameters from analytics DB
        """
        cond = {'api.name':api}
        gen = (row['api']['params'] for row in \
                self.col.find(cond, ['api.params']))
        return gen2list(gen)

    def api_counter(self, api, args=None):
        """
        Retrieve API counter from analytics DB. User must supply
        API name and optional dict of parameters.
        """
        cond = {'api.name': api}
        if  args:
            for key, val in args.iteritems():
                cond[key] = val
        return self.col.find_one(cond, ['counter'])['counter']
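A hedged usage sketch for DASAnalytics follows. It assumes a MongoDB instance reachable at the dburi shown and a config dict shaped like the keys read in __init__; the database/collection names, the example query and the 'dbs'/'listDatasets' system/API names are illustrative only.

# Hedged sketch: the config keys mirror those accessed in
# DASAnalytics.__init__; dburi, db/collection names, the query and the
# 'dbs'/'listDatasets' system/API names are illustrative assumptions.
config = {
    'verbose': 0,
    'mongodb': {'dburi': 'mongodb://localhost:27017'},
    'analyticsdb': {'dbname': 'analytics', 'collname': 'db',
                    'history': 30 * 24 * 60 * 60},  # keep ~30 days of call times
}
analytics = DASAnalytics(config)

# record a DAS query together with its MongoDB translation
mongoquery = {'fields': ['dataset'], 'spec': {'dataset.name': '/A/B/C'}}
analytics.add_query('dataset=/A/B/C', mongoquery)

# record an API call made on behalf of that query
analytics.add_api('dbs', mongoquery, 'listDatasets', {'dataset': '/A/B/C'})

# inspect what was stored
print(analytics.list_systems())
for row in analytics.list_queries():
    print('%s called %s time(s)' % (row['qhash'], len(row['times'])))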
Beispiel #24
0
class DASParserDB(object):
    """
    Caching layer for the PLY parser.
    """
    def __init__(self, config):
        self.verbose = config['verbose']
        self.logger = PrintManager('DASParserDB', self.verbose)
        self.dburi = config['mongodb']['dburi']
        self.dbname = config['parserdb']['dbname']
        self.sizecap = config['parserdb'].get('sizecap', 5 * 1024 * 1024)
        self.colname = config['parserdb']['collname']
        msg = "DASParserCache::__init__ %s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)
        self.create_db()

    def create_db(self):
        """
        Create db collection
        """
        conn = db_connection(self.dburi)
        dbn = conn[self.dbname]
        if self.colname not in dbn.collection_names():
            dbn.create_collection(self.colname, capped=True, size=self.sizecap)
        col = dbn[self.colname]
        index_list = [('qhash', DESCENDING)]
        create_indexes(col, index_list)

    @property
    def col(self):
        "Collection object to MongoDB"
        conn = db_connection(self.dburi)
        dbn = conn[self.dbname]
        col = dbn[self.colname]
        return col

    def lookup_query(self, rawtext):
        """
        Check the parser cache for a given rawtext query.
        Search is done with the qhash of this string.
        Returns a tuple (status, value) for the cases
        (PARSERCACHE_VALID, mongo_query) - valid query found
        (PARSERCACHE_INVALID, error) - error message for invalid query
        (PARSERCACHE_NOTFOUND, None) - not in the cache
        """
        result = find_one(self.col, {'qhash':genkey(rawtext)}, \
                        fields=['query', 'error'])

        if result and result['query']:
            if self.verbose:
                self.logger.debug("DASParserCache: found valid %s->%s" %\
                                  (rawtext, result['query']))
            query = decode_mongo_query(result['query'])
            return (PARSERCACHE_VALID, query)
        elif result and result['error']:
            if self.verbose:
                self.logger.debug("DASParserCache: found invalid %s->%s" %\
                                  (rawtext, result['error']))
            return (PARSERCACHE_INVALID, result['error'])
        else:
            if self.verbose:
                self.logger.debug("DASParserCache: not found %s" %\
                                  (rawtext))
            return (PARSERCACHE_NOTFOUND, None)

    def insert_valid_query(self, rawtext, query):
        "Insert a query that was successfully transformed"
        self._insert_query(rawtext, query, None)

    def insert_invalid_query(self, rawtext, error):
        "Insert the error message for an invalid query"
        self._insert_query(rawtext, None, error)

    def _insert_query(self, rawtext, query, error):
        """Internal method to insert a query"""
        if self.verbose:
            self.logger.debug("DASParserCache: insert %s->%s/%s" %\
                           (rawtext, query, error))
        # since MongoDB does not support insertion of the $ sign in queries,
        # we need to encode the inserted query
        if query:
            encquery = encode_mongo_query(query)
        else:
            encquery = ""
        self.col.insert({
            'raw': rawtext,
            'qhash': genkey(rawtext),
            'query': encquery,
            'error': str(error)
        })
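The parser cache is used as a simple lookup/insert round trip keyed on the qhash of the raw query text. A minimal sketch follows, assuming a running MongoDB, a config dict with the keys read in __init__, and that the PARSERCACHE_* constants are imported from the same module; all concrete values are illustrative.

# Hedged sketch: config keys mirror DASParserDB.__init__; the dburi,
# db/collection names and the example query are illustrative assumptions.
config = {
    'verbose': 0,
    'mongodb': {'dburi': 'mongodb://localhost:27017'},
    'parserdb': {'dbname': 'parser', 'collname': 'db',
                 'sizecap': 5 * 1024 * 1024},
}
pdb = DASParserDB(config)

rawtext = 'dataset=/A/B/C'
status, value = pdb.lookup_query(rawtext)
if status == PARSERCACHE_NOTFOUND:
    # normally the PLY parser produces this mongo query; here it is made up
    pdb.insert_valid_query(rawtext,
                           {'fields': ['dataset'],
                            'spec': {'dataset.name': '/A/B/C'}})
elif status == PARSERCACHE_INVALID:
    print('previous parse error: %s' % value)
else:
    print('cached mongo query: %s' % value)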
Beispiel #25
0
class DASCore(object):
    """
    DAS core class.
    """
    def __init__(self, config=None, debug=0,
                nores=False, logger=None, engine=None, multitask=True):
        if  config:
            dasconfig = config
        else:
            dasconfig = das_readconfig()
        verbose       = dasconfig['verbose']
        self.stdout   = debug
        if  isinstance(debug, int):
            self.verbose = debug
            dasconfig['verbose'] = debug
        else:
            self.verbose = verbose
        das_timer('DASCore::init', self.verbose)
        self.operators = das_operators()

        # set noresults option
        self.noresults = False
        if  nores:
            dasconfig['write_cache'] = True
            self.noresults = nores

        self.multitask = dasconfig['das'].get('multitask', True)
        if  debug or self.verbose:
            self.multitask = False # in verbose mode do not use multitask
            dasconfig['das']['multitask'] = False
        if  not multitask: # explicitly call DASCore ctor, e.g. in analytics
            self.multitask = False
            dasconfig['das']['multitask'] = False
        dasconfig['engine'] = engine
        if  self.multitask:
            nworkers = dasconfig['das'].get('core_workers', 5)
            if  engine:
                thr_name = 'DASCore:PluginTaskManager'
                self.taskmgr = PluginTaskManager(\
                        engine, nworkers=nworkers, name=thr_name)
                self.taskmgr.subscribe()
            else:
                thr_name = 'DASCore:TaskManager'
                self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        if  logger:
            self.logger = logger
        else:
            self.logger = PrintManager('DASCore', self.verbose)

        # define Mapping/Analytics/Parser in this order since Parser depends
        # on first two
        dasmapping = DASMapping(dasconfig)
        dasconfig['dasmapping'] = dasmapping
        self.mapping = dasmapping

        self.analytics = DASAnalytics(dasconfig)
        dasconfig['dasanalytics'] = self.analytics

        self.keylearning = DASKeyLearning(dasconfig)
        dasconfig['keylearning'] = self.keylearning

        # init DAS cache
        self.rawcache = DASMongocache(dasconfig)
        dasconfig['rawcache'] = self.rawcache

        # plug-in architecture: loop over registered data-services in
        # dasconfig; load appropriate module/class; register data
        # service with DASCore.
        self.systems = dasmapping.list_systems()
        # pointer to the DAS top level directory
        dasroot = '/'.join(__file__.split('/')[:-3])
        for name in self.systems:
            try:
                klass  = 'DAS/services/%s/%s_service.py' \
                    % (name, name)
                srvfile = os.path.join(dasroot, klass)
                with file(srvfile) as srvclass:
                    for line in srvclass:
                        if  line.find('(DASAbstractService)') != -1:
                            klass = line.split('(DASAbstractService)')[0]
                            klass = klass.split('class ')[-1] 
                            break
                mname  = 'DAS.services.%s.%s_service' % (name, name)
                module = __import__(mname, fromlist=[klass])
                obj = getattr(module, klass)(dasconfig)
                setattr(self, name, obj)
                SERVICES[name] = obj
            except IOError as err:
                if  debug > 1:
                    # we have virtual services, so IOError can be correct
                    print_exc(err)
                try:
                    mname  = 'DAS.services.generic_service'
                    module = __import__(mname, fromlist=['GenericService'])
                    obj    = module.GenericService(name, dasconfig)
                    setattr(self, name, obj)
                except Exception as exc:
                    print_exc(exc)
                    msg = "Unable to load %s data-service plugin" % name
                    raise Exception(msg)
            except Exception as exc:
                print_exc(exc)
                msg = "Unable to load %s data-service plugin" % name
                raise Exception(msg)

        # loop over systems and get system keys, add mapping keys to final list
        self.service_keys = {}
        self.service_parameters = {}
        for name in self.systems: 
            skeys = getattr(self, name).keys()
            self.service_keys[getattr(self, name).name] = skeys
            sparams = getattr(self, name).parameters()
            self.service_parameters[getattr(self, name).name] = sparams

        self.service_keys['special'] = das_special_keys()
        self.dasconfig = dasconfig
        das_timer('DASCore::init', self.verbose)

    def keys(self):
        """
        Return map of data service keys
        """
        return self.service_keys

    def das_keys(self):
        """
        Return map of data service keys
        """
        _keys = ['records']
        for values in self.service_keys.values():
            for key in values:
                if  key not in _keys:
                    _keys.append(key)
        return _keys

    def result(self, query, idx=0, limit=None):
        """
        Get results either from cache or from explicit call
        """
        self.logger.info('input query=%s' % query)
        results = []
        dasquery = DASQuery(query)
        dasquery.add_to_analytics()
        query    = dasquery.mongo_query
        # check if we have any service which cover the query
        # otherwise decompose it into list of queries
        service_map = dasquery.service_apis_map()
        if  not service_map:
            msg  = 'no APIs found to answer input query, will decompose it'
            self.logger.info(msg)
            skeys = query['fields']
            if  not skeys:
                skeys = []
            for key in skeys:
                newquery = DASQuery(dict(fields=[key], spec=query['spec']))
                self.call(newquery) # process query
        else:
            self.call(dasquery) # process query

        # lookup provided query in a cache
        if  not self.noresults:
            results = self.get_from_cache(dasquery, idx, limit)
        return results

    def remove_from_cache(self, dasquery):
        """
        Delete in cache entries about input query
        """
        self.rawcache.remove_from_cache(dasquery)

    def get_status(self, dasquery):
        """
        Look-up status of provided query in a cache.
        Return status of the query request and its hash.
        """
        status = None
        error  = None
        reason = None
        for col in ['merge', 'cache']:
            self.rawcache.remove_expired(dasquery, col)
        if  dasquery and 'fields' in dasquery.mongo_query:
            fields = dasquery.mongo_query['fields']
            if  fields and isinstance(fields, list) and 'queries' in fields:
                return 'ok', error, reason
        record = self.rawcache.find(dasquery)
        error, reason = self.rawcache.is_error_in_records(dasquery)
        try:
            if  record and 'das' in record and 'status' in record['das']:
                status = record['das']['status']
                if  not error:
                    error = record['das'].get('error', error)
                if  not reason:
                    reason = record['das'].get('reason', reason)
                return status, error, reason
        except Exception as exc:
            print_exc(exc)
            status = error = reason = None
            self.rawcache.remove_from_cache(dasquery)
        return status, error, reason

    def worker(self, srv, dasquery):
        """Main worker function which calls data-srv call function"""
        self.logger.info('##### %s ######\n' % srv)
        das_timer(srv, self.verbose)
        getattr(getattr(self, srv), 'call')(dasquery)
        das_timer(srv, self.verbose)

    def insert_query_records(self, dasquery):
        """
        Insert DAS query records into DAS cache and return list of services
        which will answer this query
        """
        services = dasquery.services
        self.logger.info('Potential services = %s' % services)
        if  not services:
            msg  = 'No data-services for query %s' % dasquery
            msg += ', mongo_query: %s' % dasquery.mongo_query
            msg += ', params: %s' % dasquery.params()
            print dastimestamp('DAS WARNING '), msg

        # get list of URI which can answer this query
        ack_services = []
        for srv in services:
            gen = [t for t in getattr(getattr(self, srv), 'apimap')(dasquery)]
            for url, api, args, iformat, expire in gen:
                header = dasheader(srv, dasquery, expire, api, url, ctime=0)
                self.rawcache.insert_query_record(dasquery, header)
                if  srv not in ack_services:
                    ack_services.append(srv)
        if  not ack_services:
            ack_services = services
        if  dasquery.query.find('records ') != -1:
            srv_status = True # skip DAS queries w/ records request
        expire = 2*60 # 2 minutes; it should be overwritten by data-srv
        header = dasheader("das", dasquery, expire, api='das_core',
                services=dict(das=ack_services))
        header['lookup_keys'] = []
        self.rawcache.insert_query_record(dasquery, header)
        das_timer('das_record', self.verbose)
        return ack_services

    def call(self, query, add_to_analytics=True, **kwds):
        """
        Top level DAS api which execute a given query using underlying
        data-services. It follows the following steps:

            - parse input query
            - identify data-services based on selection keys
              and where clause conditions
            - construct DAS workflow and execute data-service 
              API calls. At this step individual 
              data-services store results into DAS cache.

        Return status 0/1 depending on success of the calls, can be
        used by workers on cache server.

        kwds is provided for compatibility with web layer, e.g. it
        may invoke this method with additional pid parameter.
        """
        def update_das_query(dasquery, status, reason=None):
            "Update DAS query record with given status and reason"
            self.rawcache.update_query_record(dasquery, status, reason=reason)
            self.rawcache.add_to_record(\
                    dasquery, {'das.timer': get_das_timer()}, system='das')
            # make sure that the das record is updated; we use several
            # iterations whose sleep times add up to more than a minute to
            # cover the default syncdelay value of the mongo server (in the
            # future it would be better to find this syncdelay value
            # programmatically, but it seems the pymongo driver does not
            # provide any API for it).
            for idx in xrange(1, 7):
                spec = {'qhash':dasquery.qhash, 'das.system':['das']}
                res = self.rawcache.col.find_one(spec)
                if  res:
                    dbstatus = res.get('das', {}).get('status', None)
                    if  dbstatus == status:
                        break
                    msg = 'qhash %s, das.status=%s, status=%s, wait for update' \
                            % (dasquery.qhash, dbstatus, status)
                    print dastimestamp('DAS WARNING'), msg
                time.sleep(idx*idx)
                self.rawcache.update_query_record(dasquery, status, reason=reason)

        self.logger.info('input query=%s' % query)
        das_timer('DASCore::call', self.verbose)
        if  isinstance(query, object) and hasattr(query, '__class__')\
            and query.__class__.__name__ == 'DASQuery':
            dasquery = query
        else:
            dasquery = DASQuery(query)
        for col in ['merge', 'cache']:
            self.rawcache.remove_expired(dasquery, col)
        if  add_to_analytics:
            dasquery.add_to_analytics()
        query  = dasquery.mongo_query
        spec   = query.get('spec')
        fields = query.get('fields')
        if  fields == ['records']:
            msg = 'look-up all records in cache'
            self.logger.info(msg)
            return 'in cache'
        if  spec == dict(records='*'):
            self.logger.info("look-up everything in cache")
            return 'in cache'
        for record in self.rawcache.find_specs(dasquery):
            status = record['das']['status']
            msg = 'found query %s in cache, status=%s\n' \
                        % (record['query'], status)
            self.logger.info(msg)
            print dastimestamp('DAS INFO'), msg
            return status

        self.logger.info(dasquery)
        das_timer('das_record', self.verbose)
        services = self.insert_query_records(dasquery)
        if  not services:
            msg = 'unable to locate data-services to fulfill this request'
            msg += ', will iterate over all registered services'
            print dastimestamp('DAS WARNING '), dasquery, msg
            services = dasquery.services if dasquery.services else self.systems
        try:
            if  self.multitask:
                jobs = []
                for srv in sorted(services):
                    jobs.append(self.taskmgr.spawn(self.worker, srv, dasquery))
                self.taskmgr.joinall(jobs)
            else:
                for srv in services:
                    self.worker(srv, dasquery)
        except Exception as exc:
            print_exc(exc)
            return 'fail'
        self.logger.info('\n##### merging ######\n')
        update_das_query(dasquery, 'merging')
        das_timer('merge', self.verbose)
        self.rawcache.merge_records(dasquery)
        das_timer('merge', self.verbose)
        # check if we have service records and properly setup status
        self.logger.info('\n##### check services ######\n')
        das_services = self.rawcache.check_services(dasquery)
        reason = ''
        status = 'ok'
        if  not das_services:
            if  'records' in dasquery.query:
                status = 'ok' # keep status ok for 'records' queries
            else:
                reason = 'no data records found in DAS cache'
                status = 'fail'
                print dastimestamp('DAS ERROR '), dasquery, reason
        update_das_query(dasquery, status, reason)
        das_timer('DASCore::call', self.verbose)
        return status

    def processing_time(self, dasquery):
        "Look-up and return DAS query processing time"
        query_record = self.rawcache.find(dasquery)
        if  query_record:
            das = query_record.get('das', None)
            if  isinstance(das, dict):
                ctime = das.get('ctime', [])
                if  ctime:
                    return ctime[-1]-ctime[0]
        return None

    def nresults(self, dasquery, coll='merge'):
        """
        Return total number of results (count) for provided query
        Code should match body of get_from_cache method.
        """
        fields = dasquery.mongo_query.get('fields', None)
        if  dasquery.mapreduce:
            result = self.rawcache.map_reduce(dasquery.mapreduce, dasquery)
            return len([1 for _ in result])
        elif dasquery.aggregators:
            return len(dasquery.aggregators)
        elif isinstance(fields, list) and 'queries' in fields:
            return len([1 for _ in self.get_queries(dasquery)])
        return self.rawcache.nresults(dasquery, coll)

    def apilist(self, dasquery):
        "Return list of APIs answer given das query"
        return self.rawcache.apilist(dasquery)

    def incache(self, dasquery, coll='merge'):
        """
        Answer the question if given query in DAS cache or not
        """
        return self.rawcache.incache(dasquery, collection=coll)

    def get_from_cache(self, dasquery, idx=0, limit=0, collection='merge'):
        """
        Look-up results from the merge cache and yield them for
        further processing.
        """
        das_timer('DASCore::get_from_cache', self.verbose)
        msg = 'col=%s, query=%s, idx=%s, limit=%s'\
                % (collection, dasquery, idx, limit)
        self.logger.info(msg)

        fields  = dasquery.mongo_query.get('fields', None)

        if  dasquery.mapreduce:
            res = self.rawcache.map_reduce(dasquery.mapreduce, dasquery)
        elif dasquery.aggregators:
            # extract das information from rawcache
            rows  = self.rawcache.get_from_cache(\
                    dasquery, collection=collection)
            first = rows.next()
            sinfo = das_sinfo(first)
            # to perform aggregation we need:
            # - loop over all aggregator functions
            # - loop over all data-services
            # - loop over all APIs within a data-services
            # the code below does that, it applies aggregator
            # to selected (based on key/srv/api) records
            res = []
            _id = 0
            time0  = time.time()
            expire = 300 # min expire
            for func, key in dasquery.aggregators:
                afunc = getattr(das_aggregator, 'das_%s' % func)
                found = False
                for srv, apis in sinfo.items():
                    for api in apis:
                        rows  = self.rawcache.get_from_cache(\
                                dasquery, collection=collection)
                        gen   = api_rows(rows, api)
                        data  = afunc(key, gen)
                        ctime = time.time() - time0
                        das   = dasheader(srv, dasquery, expire, api=api,
                                ctime=ctime)
                        if  isinstance(data, dict) and data['value'] != 'N/A':
                            aggr = {'_id':_id, 'function': func,
                                    'key': key, 'result': data}
                            aggr.update(das)
                            res.append(aggr)
                            _id += 1
                            found = True
                if  not found: # when we got nothing add empty result record
                    empty = {'value':'N/A'}
                    ctime = time.time() - time0
                    das = dasheader('das', dasquery, expire, api='das_core',
                            ctime=ctime)
                    rec = {'_id':0, 'function':func, 'key':key, 'result':empty}
                    rec.update(das)
                    res.append(rec)
        elif isinstance(fields, list) and 'queries' in fields:
            res = itertools.islice(self.get_queries(dasquery), idx, idx+limit)
        else:
            res = self.rawcache.get_from_cache(dasquery, idx, limit, \
                    collection=collection)
        for row in res:
            fix_times(row)
            yield row
        das_timer('DASCore::get_from_cache', self.verbose)

    def get_queries(self, dasquery):
        """
        Look-up (popular) queries in DAS analytics/logging db
        """
        das_timer('DASCore::get_queries', self.verbose)
        fields = dasquery.mongo_query.get('fields')
        spec   = dasquery.mongo_query.get('spec')
        if  'popular' in fields:
            res = self.analytics.get_popular_queries(spec)
        else:
            datestamp = spec.get('date')
            if  isinstance(datestamp, dict):
                value = datestamp.get('$in')
                res = \
                self.analytics.list_queries(after=value[0], before=value[1])
            elif isinstance(datestamp, int):
                res = self.analytics.list_queries(after=datestamp)
            elif not datestamp:
                res = self.analytics.list_queries()
            else:
                msg = 'Unsupported date value: %s' % datestamp
                raise Exception(msg)
        for row in res:
            rid = row.pop('_id')
            yield dict(das_query=row, _id=rid)
        das_timer('DASCore::get_queries', self.verbose)
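Putting DASCore together end to end: a hedged sketch of the call/fetch cycle, assuming a full DAS configuration is readable via das_readconfig() and that the backing MongoDB and data-services are reachable; the dataset path and limit are illustrative.

# Hedged sketch of the DASCore workflow: call() runs the data-service APIs
# and merges their output, get_from_cache() reads the merged records back.
# The dataset path and the limit are illustrative values.
core = DASCore(debug=0, multitask=False)
dasquery = DASQuery('dataset=/A/B/C')

status = core.call(dasquery)
print('call status: %s' % status)

if status == 'ok':
    print('total records: %s' % core.nresults(dasquery))
    for row in core.get_from_cache(dasquery, idx=0, limit=10):
        print(row)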
Beispiel #26
0
class DASMongocache(object):
    """
    DAS cache based MongoDB.
    """
    def __init__(self, config):
        self.config = config
        self.emptyset_expire = \
                expire_timestamp(config['das'].get('emptyset_expire', 5))
        self.dburi = config['mongodb']['dburi']
        self.cache_size = config['mongodb']['bulkupdate_size']
        self.dbname = config['dasdb']['dbname']
        self.verbose = config['verbose']
        self.logger = PrintManager('DASMongocache', self.verbose)
        self.mapping = config['dasmapping']
        self.logging = config['dasdb'].get('logging', False)
        self.rec_ttl = config['dasdb'].get('record_ttl', 24 * 60 * 60)
        self.del_ttl = config['dasdb'].get('delta_ttl', 60)
        self.cleanup_del_ttl = config['dasdb'].get('cleanup_delta_ttl', 3600)
        self.retry = config['dasdb'].get('retry', 3)
        self.das_son_manipulator = DAS_SONManipulator()

        # Initialize MongoDB connection
        self.col_ = self.config['dasdb']['cachecollection']
        self.mrcol_ = self.config['dasdb']['mrcollection']
        self.merge_ = self.config['dasdb']['mergecollection']
        self.gfs = db_gridfs(self.dburi)

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        # ensure that we have the following indexes
        common_idx = [
            ('file.name', DESCENDING),
            ('dataset.name', DESCENDING),
            ('block.name', DESCENDING),
            ('run.run_number', DESCENDING),
        ]
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('das.system', ASCENDING), ('qhash', DESCENDING),
                      ('das.record', ASCENDING)]
        create_indexes(self.col, index_list + common_idx)
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('qhash', DESCENDING), ('das.record', ASCENDING),
                      ('das.ts', ASCENDING)]
        create_indexes(self.merge, index_list)
        # NOTE: I found that creating an index on the merge collection leads
        # to a MongoDB error when records contain multiple arrays on indexed
        # keys. For example, when we query file,run,lumi both file and run
        # are arrays in MongoDB. In this case the final sort in MongoDB
        # fails with the following message:
        # cannot sort with keys that are parallel arrays
        # It looks like there is no fix for that yet, see
        # http://stackoverflow.com/questions/6516725/how-do-i-index-two-arrays-in-mongodb
        # Therefore I temporarily disabled the create_indexes call on the
        # merge collection which was used to ease the final sort,
        # especially when a lot of records correspond to the initial
        # query, e.g. file records.
        # On the other hand, the most common use case where the sort fails
        # is getting file records, so I can add one compound key to ease
        # the sort, but I can't add another compound key on an array
        # field, e.g. run.
        common_idx = [[('qhash', DESCENDING), ('file.name', DESCENDING)]]
        create_indexes(self.merge, index_list + common_idx)

        # thread which cleans up DAS collections
        thname = 'mongocache_cleanup'
        cols = [
            config['dasdb']['cachecollection'],
            config['dasdb']['mrcollection'], config['dasdb']['mergecollection']
        ]

    @property
    def col(self):
        "col property provides access to DAS cache collection"
        conn = db_connection(self.dburi)
        mdb = conn[self.dbname]
        colnames = mdb.collection_names()
        if not colnames or self.col_ not in colnames:
            try:
                mdb.create_collection(self.col_)
            except OperationFailure:
                pass
        mdb.add_son_manipulator(self.das_son_manipulator)
        return mdb[self.col_]

    @property
    def merge(self):
        "merge property provides access to DAS merge collection"
        conn = db_connection(self.dburi)
        mdb = conn[self.dbname]
        colnames = mdb.collection_names()
        if not colnames or self.merge_ not in colnames:
            try:
                mdb.create_collection(self.merge_)
            except OperationFailure:
                pass
        mdb.add_son_manipulator(self.das_son_manipulator)
        return mdb[self.merge_]

    @property
    def mrcol(self):
        "mrcol property provides access to DAS map-reduce collection"
        conn = db_connection(self.dburi)
        mdb = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        return mdb[self.mrcol_]

    def get_dataset_hashes(self, dasquery):
        "Get dataset hashes from DBS database"
        spec = dasquery.mongo_query.get('spec', {})
        inst = dasquery.instance
        conn = db_connection(self.dburi)
        if spec and inst:
            dataset = spec.get('dataset.name', None)
            if dataset:
                if dataset.find('*') != -1:
                    cond = {'dataset': re.compile(dataset.replace('*', '.*'))}
                else:
                    cond = {'dataset': dataset}
                for row in conn['dbs'][inst].find(cond):
                    if 'qhash' in row:
                        yield row['qhash']

    def check_datasets(self, dasquery):
        "Check dataset presence in DAS cache for given das query"
        hashes = [r for r in self.get_dataset_hashes(dasquery)]
        if hashes:
            spec = {'qhash': {'$in': hashes}}
            if len(hashes) == self.merge.find(spec, **PYMONGO_OPTS).count():
                dasquery._hashes = hashes

    def get_superset_keys(self, key, value):
        """
        This is a special-case version of similar_keys,
        intended for analysers that want to quickly
        find possible superset queries of a simple
        query of the form key=value.
        """

        msg = "%s=%s" % (key, value)
        self.logger.debug(msg)
        cond = {'query.spec.key': key}
        for row in self.col.find(cond, **PYMONGO_OPTS):
            mongo_query = decode_mongo_query(row['query'])
            for thiskey, thisvalue in mongo_query.items():
                if thiskey == key:
                    if fnmatch.fnmatch(value, thisvalue):
                        yield thisvalue

    def get_fields(self, dasquery):
        "Prepare fields to extract from MongoDB"
        fields = dasquery.mongo_query.get('fields', [])
        if fields and 'records' in fields:
            fields = None  # look-up all records
        filters = dasquery.filters
        cond = {}
        if filters:
            new_fields = []
            for dasfilter in filters:
                if dasfilter == 'unique':
                    continue
                if  fields and dasfilter not in fields and \
                    dasfilter not in new_fields:
                    if  dasfilter.find('=') == -1 and dasfilter.find('<') == -1\
                    and dasfilter.find('>') == -1:
                        new_fields.append(dasfilter)
                    else:
                        cond = parse_filters(dasquery.mongo_query)
            if not new_fields and fields:
                new_fields = list(fields)
            return new_fields, cond
        return fields, cond

    def remove_expired(self, dasquery, collection):
        """
        Remove expired records from the DAS cache. We need to perform this
        operation very carefully since we don't use transactions and on-going
        commits can invoke this method (see das_core.py). Therefore we use
        the MongoDB $or operator to wipe out queries which match the DASQuery
        hash and have already expired, or queries which lived in the cache
        longer than the rec_ttl config parameter. The latter operation just
        prevents the DAS cache from growing.
        """
        conn = db_connection(self.dburi)
        mdb = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        col = mdb[collection]
        # use an additional delta to check data record expiration;
        # we add this delta to ensure that there are no records close to
        # the current timestamp which may expire during request processing
        spec = {
            'qhash': dasquery.qhash,
            'das.expire': {
                '$lt': time.time() + self.del_ttl
            }
        }
        col.delete_many(spec)

    def check_services(self, dasquery):
        """
        Check if DAS cache contains DAS records with service response for
        given query.
        """
        das_rec = self.find(dasquery)
        if not das_rec:
            return False
        if 'das' not in das_rec:
            return False
        if 'services' not in das_rec['das']:
            return False
        spec = {
            'qhash': dasquery.qhash,
            'das.system': {
                '$ne': 'das'
            },
            'das.expire': {
                '$gt': time.time()
            }
        }
        nres = self.col.find(spec, **PYMONGO_OPTS).count()
        if nres:
            return True
        return False

    def find(self, dasquery):
        """
        Find provided query in DAS cache.
        """
        cond = {
            'qhash': dasquery.qhash,
            'das.system': 'das',
            'das.expire': {
                '$gt': time.time()
            }
        }
        return find_one(self.col, cond)

    def find_specs(self, dasquery, system='das'):
        """
        Check if cache has query whose specs are identical to provided query.
        Return all matches.
        """
        if dasquery.hashes:
            cond = {'qhash': {'$in': dasquery.hashes}}
        else:
            cond = {'qhash': dasquery.qhash}
        if system:
            cond.update({'das.system': system})
        cond.update({'das.expire': {'$gt': time.time()}})
        return self.col.find(cond, **PYMONGO_OPTS)

    def get_das_ids(self, dasquery):
        """
        Return list of DAS ids associated with given query
        """
        das_ids = []
        try:
            das_ids = \
                [r['_id'] for r in self.find_specs(dasquery, system='')]
        except Exception:
            pass
        return das_ids

    def update_das_expire(self, dasquery, timestamp):
        "Update timestamp of all DAS data records for given query"
        nval = {'$set': {'das.expire': timestamp}}
        spec = {'qhash': dasquery.qhash}
        self.col.update_many(spec, nval)
        self.merge.update_many(spec, nval)

    def das_record(self, dasquery):
        "Retrieve DAS record for given query"
        cond = {'qhash': dasquery.qhash, 'das.expire': {'$gt': time.time()}}
        return find_one(self.col, cond)

    def find_records(self, das_id):
        " Return all the records matching a given das_id"
        return self.col.find({'das_id': das_id}, **PYMONGO_OPTS)

    def is_error_in_records(self, dasquery, collection='cache'):
        "Scan DAS cache for error records and return true or not"
        if collection == 'cache':
            results = self.col.find({'qhash': dasquery.qhash}, **PYMONGO_OPTS)
        else:
            results = self.merge.find({'qhash': dasquery.qhash},
                                      **PYMONGO_OPTS)
        error = None
        reason = None
        for row in results:
            if 'error' in row:
                error = row.get('error')
                reason = row.get('reason', '')
                break
        return error, reason

    def add_to_record(self, dasquery, info, system=None):
        "Add to existing DAS record provided info"
        if system:
            self.col.update_one(
                {
                    'query': dasquery.storage_query,
                    'das.system': system
                }, {'$set': info},
                upsert=True)
        else:
            self.col.update_one({'query': dasquery.storage_query},
                                {'$set': info},
                                upsert=True)

    def find_min_expire(self, dasquery):
        """Find minimal expire timestamp across all records for given DAS query"""
        spec = {'qhash': dasquery.qhash}
        min_expire = 2 * time.time()  # upper bound, will update
        for rec in self.col.find(spec, **PYMONGO_OPTS):
            if 'das' in rec and 'expire' in rec['das']:
                estamp = rec['das']['expire']
                if min_expire > estamp:
                    min_expire = estamp
        return long(min_expire)

    def find_query_record(self, dasquery):
        "Find DAS query records and return them to the caller"
        spec = {
            'qhash': dasquery.qhash,
            'das.record': record_codes('query_record')
        }
        return self.col.find(spec, **PYMONGO_OPTS)

    def update_query_record(self, dasquery, status, header=None, reason=None):
        "Update DAS record for provided query"
        ctime = time.time()
        das_spec = {'qhash': dasquery.qhash, 'das.system': 'das'}
        min_expire = self.find_min_expire(dasquery)
        if header:
            system = header['das']['system']
            sts = header['das']['status']
            expire = header['das']['expire']
            spec = {'qhash': dasquery.qhash, 'das.system': system}
            new_expire = None
            for rec in self.col.find(spec, **PYMONGO_OPTS):
                if 'das' in rec and 'expire' in rec['das']:
                    if rec['das']['expire'] > expire:
                        new_expire = expire
                        ndict = {'das.expire': expire, 'das.status': status}
                        cdict = {'das.ctime': ctime}
                        udict = {'$set': ndict, '$push': cdict}
                        oid = ObjectId(rec['_id'])
                        self.col.update_one({'_id': oid}, udict)
            if new_expire:
                udict = {
                    '$set': {
                        'das.expire': new_expire
                    },
                    '$push': {
                        'das.ctime': ctime
                    }
                }
                self.col.update_one(das_spec, udict)
        else:
            udict = {
                '$set': {
                    'das.status': status,
                    'das.expire': min_expire
                },
                '$push': {
                    'das.ctime': ctime
                }
            }
            self.col.update_one(das_spec, udict)
        if reason:
            udict = {'$set': {'das.reason': reason}}
            self.col.update_one(das_spec, udict)
        # align all expire timestamps when we recieve ok status
        if status == 'ok':
            udict = {'$set': {'das.expire': min_expire}}
            self.col.update_one(das_spec, udict)

    def apilist(self, dasquery):
        "Return list of apis for given dasquery"
        spec = {
            'qhash': dasquery.qhash,
            'das.record': record_codes('query_record')
        }
        apis = []
        for row in self.col.find(spec, ['das.api'], **PYMONGO_OPTS):
            try:
                apis += row['das']['api']
            except Exception as _err:
                pass
        return apis

    def incache(self,
                dasquery,
                collection='merge',
                system=None,
                api=None,
                query_record=False):
        """
        Check whether we have query results in the cache; return True/False.
        Please note: the underlying spec is a MongoDB query, please consult
        the MongoDB API for more details,
        http://api.mongodb.org/python/
        """
        if query_record:
            record = record_codes('query_record')
        else:
            record = spec4data_records()
        spec = {
            'qhash': dasquery.qhash,
            'das.record': record,
            'das.expire': {
                '$gt': time.time()
            }
        }
        if system:
            spec.update({'das.system': system})
        if api:
            spec.update({'das.api': api})
        conn = db_connection(self.dburi)
        mdb = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        col = mdb[collection]
        res = col.find(spec, **PYMONGO_OPTS).count()
        msg = "(%s, coll=%s) found %s results" % (dasquery, collection, res)
        self.logger.info(msg)
        if res:
            return True
        return False

    def nresults(self, dasquery, collection='merge'):
        """Return number of results for given query."""
        if dasquery.aggregators:
            return len(dasquery.aggregators)
        # Distinguish two use cases: the unique filter and a general query.
        # In the first we should count only unique records; in the latter
        # we can rely on the DB count() method. Please keep in mind that
        # usage of fields in find() does not affect counting, since it
        # is a view over records found with spec, so we don't need to use it.
        fields, filter_cond = self.get_fields(dasquery)
        if not fields:
            spec = dasquery.mongo_query.get('spec', {})
        elif dasquery.hashes:
            spec = {
                'qhash': {
                    '$in': dasquery.hashes
                },
                'das.record': spec4data_records()
            }
        else:
            spec = {'qhash': dasquery.qhash, 'das.record': spec4data_records()}
        if filter_cond:
            spec.update(filter_cond)
        conn = db_connection(self.dburi)
        mdb = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        col = mdb[collection]
        if dasquery.unique_filter:
            skeys = self.mongo_sort_keys(collection, dasquery)
            if skeys:
                gen = col.find(spec, **PYMONGO_OPTS).sort(skeys)
            else:
                gen = col.find(spec, **PYMONGO_OPTS)
            res = len([r for r in unique_filter(gen)])
        else:
            res = col.find(spec, **PYMONGO_OPTS).count()
            if not res:  # double check that this is really the case
                time.sleep(1)
                res = col.find(spec, **PYMONGO_OPTS).count()
        msg = "%s" % res
        self.logger.info(msg)
        return res

    def mongo_sort_keys(self, collection, dasquery):
        """
        Find list of sort keys for a given DAS query. Check existing
        indexes and either use fields or spec keys to find them out.
        Return list of mongo sort keys in a form of (key, order).
        """
        # try to get sort keys all the time to get ordered list of
        # docs which allow unique_filter to apply afterwards
        fields = dasquery.mongo_query.get('fields')
        spec = dasquery.mongo_query.get('spec')
        skeys = dasquery.sortkeys
        mongo_skeys = []
        if skeys:
            for key in skeys:
                if key.find('-') != -1:  # reverse order, e.g. desc
                    mongo_skeys.append((key.replace('-', ''), DESCENDING))
                else:
                    mongo_skeys.append((key, ASCENDING))
        else:
            existing_idx = [i for i in self.existing_indexes(collection)]
            if fields:
                lkeys = []
                for key in fields:
                    for mkey in self.mapping.mapkeys(key):
                        if mkey not in lkeys:
                            lkeys.append(mkey)
            else:
                lkeys = list(spec.keys())
            keys = [k for k in lkeys \
                if k.find('das') == -1 and k.find('_id') == -1 and \
                        k in existing_idx]
            mongo_skeys = [(k, ASCENDING) for k in keys]
        return mongo_skeys

    def existing_indexes(self, collection='merge'):
        """
        Get list of existing indexes in DB. They are returned by
        index_information API in the following form:

        .. doctest::

            {u'_id_': {u'key': [(u'_id', 1)], u'v': 0},
             u'das.expire_1': {u'key': [(u'das.expire', 1)], u'v': 0},
             ...
             u'tier.name_-1': {u'key': [(u'tier.name', -1)], u'v': 0}}
        """
        conn = db_connection(self.dburi)
        mdb = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        col = mdb[collection]
        for val in col.index_information().values():
            for idx in val['key']:
                yield idx[0]  # index name

    def get_records(self, coll, spec, fields, skeys, idx, limit, unique=False):
        "Generator to get records from MongoDB."
        try:
            conn = db_connection(self.dburi)
            mdb = conn[self.dbname]
            mdb.add_son_manipulator(self.das_son_manipulator)
            col = mdb[coll]
            nres = col.find(spec, **PYMONGO_OPTS).count()
            if nres == 1 or nres <= limit:
                limit = 0
            if limit:
                res = col.find(spec, fields, sort=skeys, skip=idx, limit=limit)
            else:
                res = col.find(spec, fields, sort=skeys, **PYMONGO_OPTS)
            if unique:
                res = unique_filter(res)
            for row in res:
                yield row
        except Exception as exp:
            print_exc(exp)
            row = {'exception': str(exp)}
            yield row
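
    # NOTE: a sketch of a typical call, with hypothetical values and assuming
    # 'dataset.name' is an indexed key in the 'merge' collection:
    #
    #     spec  = {'qhash': 'abc123', 'dataset.name': '/A/B/RAW'}
    #     skeys = [('dataset.name', ASCENDING)]
    #     for row in self.get_records('merge', spec, None, skeys,
    #                                 idx=0, limit=10, unique=False):
    #         ...  # each row is a plain MongoDB document
    #
    # On failure the generator yields a single {'exception': ...} document
    # instead of raising.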

    def get_from_cache(self, dasquery, idx=0, limit=0, collection='merge'):
        "Generator which retrieves results from the cache"
        if dasquery.service_apis_map():  # valid DAS query
            result = self.get_das_records(dasquery, idx, limit, collection)
            for row in result:
                yield row
        else:  # pure MongoDB query
            fields = dasquery.mongo_query.get('fields', [])
            if fields is None:
                fields = []
            spec = dasquery.mongo_query.get('spec', {})
            if dasquery.filters:
                if not fields:
                    fields = []
                fields += dasquery.filters
                pkeys = [k.split('.')[0] for k in fields]
            fields += das_record_keys()
            if 'records' in dasquery.query:
                fields = None  # special case for DAS 'records' keyword
            skeys = self.mongo_sort_keys(collection, dasquery)
            result  = self.get_records(collection, spec, fields, skeys, \
                            idx, limit, dasquery.unique_filter)
            for row in result:
                if dasquery.filters:
                    if pkeys and set(pkeys) & set(row.keys()):
                        yield row
                else:
                    yield row
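
    # NOTE (sketch, hypothetical query): for a filtered query such as
    # "file dataset=/A/B/RAW | grep file.size", dasquery.filters would hold
    # ['file.size'], pkeys reduces to the top-level key 'file', and only rows
    # that actually contain a 'file' key are yielded; rows carrying nothing
    # but DAS bookkeeping data are skipped.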

    def get_das_records(self, dasquery, idx=0, limit=0, collection='merge'):
        "Generator which retrieves DAS records from the cache"
        msg = "(%s, %s, %s, coll=%s)" % (dasquery, idx, limit, collection)
        self.logger.info(msg)

        idx = int(idx)
        fields, filter_cond = self.get_fields(dasquery)
        if fields is None:
            fields = []
        if not fields:
            spec = dasquery.mongo_query.get('spec', {})
        elif dasquery.hashes:
            spec = {
                'qhash': {
                    '$in': dasquery.hashes
                },
                'das.record': spec4data_records()
            }
        else:
            spec = {'qhash': dasquery.qhash, 'das.record': spec4data_records()}
        if filter_cond:
            spec.update(filter_cond)
        if 'records' in dasquery.query:
            fields = None  # retrieve all fields for records DAS query
        else:
            # be sure to extract das internal keys
            fields += das_record_keys()
        # try to get sort keys all the time to get ordered list of
        # docs which allow unique_filter to apply afterwards
        skeys = self.mongo_sort_keys(collection, dasquery)
        res     = self.get_records(collection, spec, fields, skeys, \
                        idx, limit, dasquery.unique_filter)
        counter = 0
        for row in res:
            counter += 1
            yield row
        msg = 'qhash %s, found %s record(s) in %s collection' \
                % (dasquery.qhash, counter, collection)
        print(dastimestamp('DAS INFO '), msg)

        if counter:
            msg = "yield %s record(s)" % counter
            self.logger.info(msg)

        # if no raw records were yielded we look up possible error records
        # and reset timestamp for record with system:['das']
        if not counter:
            spec = {'qhash': dasquery.qhash}
            nrec = self.col.find(spec, **PYMONGO_OPTS).count()
            if nrec:
                msg = "for query %s, found %s non-result record(s)" \
                        % (dasquery, nrec)
                print(dastimestamp('DAS WARNING'), msg)
                for rec in self.col.find(spec, **PYMONGO_OPTS):
                    if 'query' in rec:
                        print(dastimestamp('DAS das record'), rec)
            self.update_das_expire(dasquery, etstamp())
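
    # NOTE (sketch, hypothetical values): the spec built above is keyed on
    # the query hash, e.g.
    #
    #     {'qhash': '3a7e...', 'das.record': spec4data_records()}
    #
    # optionally extended with filter conditions, so only data records
    # belonging to this exact DAS query are streamed out of the collection.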

    def map_reduce(self, mr_input, dasquery, collection='merge'):
        """
        Wrapper around _map_reduce to allow sequential map/reduce
        operations, e.g. map/reduce out of map/reduce.

        mr_input is either an alias name or a list of alias names of
        map/reduce functions.

        The input dasquery is applied to the first iteration of the
        map/reduce functions.
        """
        # NOTE: I need to revisit mapreduce.
        spec = dasquery.mongo_query['spec']
        if not isinstance(mr_input, list):
            mrlist = [mr_input]
        else:
            mrlist = mr_input
        conn = db_connection(self.dburi)
        mdb = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        coll = mdb[collection]
        for mapreduce in mrlist:
            if mapreduce == mrlist[0]:
                cond = spec
            else:
                cond = None
            coll = self._map_reduce(coll, mapreduce, cond)
        for row in coll.find():
            yield row
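
    # NOTE: a usage sketch; 'count_records' and 'summary' are hypothetical
    # names of stored map/reduce functions. Passing a list chains the
    # operations: the first one is constrained by the query spec, each
    # following one runs over the previous output collection:
    #
    #     for row in self.map_reduce(['count_records', 'summary'], dasquery):
    #         ...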

    def _map_reduce(self, coll, mapreduce, spec=None):
        """
        Perform map/reduce operation over DAS cache using provided
        collection, mapreduce name and optional conditions.
        """
        self.logger.debug("(%s, %s)" % (mapreduce, spec))
        record = find_one(self.mrcol, {'name': mapreduce})
        if not record:
            raise Exception("Map/reduce function '%s' not found" % mapreduce)
        fmap = record['map']
        freduce = record['reduce']
        if spec:
            result = coll.map_reduce(Code(fmap), Code(freduce), query=spec)
        else:
            result = coll.map_reduce(Code(fmap), Code(freduce))
        msg = "found %s records in %s" % (result.count(), result.name)
        self.logger.info(msg)
        self.logger.debug(fmap)
        self.logger.debug(freduce)
        return result
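
    # NOTE (sketch): map/reduce definitions are expected to live in
    # self.mrcol as documents of roughly this shape (hypothetical example,
    # the JavaScript bodies are illustrative only):
    #
    #     {'name': 'count_records',
    #      'map': 'function() { emit(this.qhash, 1); }',
    #      'reduce': 'function(key, vals) { return Array.sum(vals); }'}
    #
    # The stored strings are wrapped into bson Code objects before being
    # passed to pymongo's map_reduce call above.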

    def get_map_reduce(self, name=None):
        """
        Return the definition of the map/reduce function with the provided
        name, or the full list of definitions if no name is given.
        """
        spec = {}
        if name:
            spec = {'name': name}
        result = self.mrcol.find(spec, **PYMONGO_OPTS)
        for row in result:
            yield row

    def merge_records(self, dasquery, attempt=0):
        """
        Merge DAS records for the provided query. We perform the following
        steps:
        1. get all records from das.cache, ordering them by primary key
        2. run the aggregator function to merge neighboring records
        3. insert the merged records into das.merge
        """
        ### TMP for asyncio
        #         time.sleep(attempt+3) # pymongo 3.2 doesn't flush in time yet

        # remove any entries in merge collection for this query
        self.merge.delete_many({'qhash': dasquery.qhash})
        # proceed
        self.logger.debug(dasquery)
        id_list = []
        expire = 9999999999  # future
        # get all API records for given DAS query
        spec = {
            'qhash': dasquery.qhash,
            'das.expire': {
                '$gt': time.time()
            },
            'das.record': record_codes('query_record')
        }
        records = self.col.find(spec, **PYMONGO_OPTS)
        for row in records:
            # find smallest expire timestamp to be used by aggregator
            rexpire = row.get('das', {}).get('expire', expire)
            if rexpire < expire:
                expire = rexpire
            if row['_id'] not in id_list:
                id_list.append(row['_id'])
        inserted = 0
        lookup_keys = set()
        fields = dasquery.mongo_query.get('fields')
        if not fields:  # Mongo
            fields = []
        for key in fields:
            for pkey in self.mapping.mapkeys(key):
                lookup_keys.add(pkey)
        for pkey in lookup_keys:
            skey = [(pkey, DESCENDING)]
            # lookup all service records
            spec = {'das_id': {'$in': id_list}, 'das.primary_key': pkey}
            if self.verbose:
                nrec = self.col.find(spec, **PYMONGO_OPTS).sort(skey).count()
                msg = "merging %s records, for %s key" % (nrec, pkey)
            else:
                msg = "merging records, for %s key" % pkey
            self.logger.debug(msg)
            # use exhaust=False since we process all records in the aggregator
            # and there can be a delay in processing
            records = self.col.find(spec, **PYMONGO_NOEXHAUST).sort(skey)
            # aggregate all records
            agen = aggregator(dasquery, records, expire)
            # diff aggregated records
            gen = das_diff(agen, self.mapping.diff_keys(pkey.split('.')[0]))
            # insert all records into das.merge using bulk insert
            size = self.cache_size
            try:
                res = self.merge.insert_many(gen)
                inserted += len(res.inserted_ids)
            except InvalidDocument as exp:
                print(dastimestamp('DAS WARNING'),
                      'InvalidDocument during merge', str(exp))
                msg = "Caught bson error: " + str(exp)
                self.logger.info(msg)
                records = self.col.find(spec, **PYMONGO_OPTS).sort(skey)
                gen = aggregator(dasquery, records, expire)
                genrows = parse2gridfs(self.gfs, pkey, gen, self.logger)
                das_dict = {
                    'das': {
                        'expire': expire,
                        'das.record': record_codes('gridfs_record'),
                        'primary_key': [k for k in lookup_keys],
                        'system': ['gridfs']
                    },
                    'qhash': dasquery.qhash,
                    'cache_id': [],
                    'das_id': id_list
                }
                for row in genrows:
                    row.update(das_dict)
                    self.merge.insert(row)
            except InvalidOperation:
                # raised when the generator yields no documents, so there is
                # nothing to insert
                pass
            except DuplicateKeyError as err:
                print(dastimestamp('DAS WARNING'),
                      'DuplicateKeyError during merge')
                if not isinstance(gen, list):
                    raise err
        status = 'fail'
        if inserted:
            status = 'ok'
        elif not lookup_keys:  # we got a query w/o fields
            msg = 'qhash %s, no lookup_keys' % dasquery.qhash
            print(dastimestamp('DAS WARNING'), msg)
            status = 'ok'
        else:  # we didn't merge anything, it is a DB look-up failure
            msg  = 'qhash %s, did not insert into das.merge, attempt %s' \
                    % (dasquery.qhash, attempt)
            print(dastimestamp('DAS WARNING'), msg)
            empty_expire = etstamp()
            lkeys = list(lookup_keys)
            das = dict(expire=empty_expire,
                       primary_key=lkeys[0],
                       condition_keys=lkeys,
                       instance=dasquery.instance,
                       system=['das'],
                       services=dasquery.services,
                       record=record_codes('empty_record'),
                       ts=time.time(),
                       api=[])
            empty_record = {
                'das': das,
                'qhash': dasquery.qhash,
                'cache_id': [],
                'das_id': id_list
            }
            for key in lkeys:
                empty_record.update({key.split('.')[0]: []})
            for key, val in dasquery.mongo_query['spec'].items():
                if key.find('.') == -1:
                    empty_record[key] = []
                else:  # it is compound key, e.g. site.name
                    newkey, newval = convert_dot_notation(key, val)
                    empty_record[newkey] = adjust_mongo_keyvalue(newval)
            self.merge.insert(empty_record)
            # update DAS records (both meta and data ones, by using qhash)
            nval = {'$set': {'das.expire': empty_expire}}
            spec = {'qhash': dasquery.qhash}
            self.col.update_many(spec, nval)
        return status
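
    # NOTE (sketch, hypothetical value): convert_dot_notation above turns a
    # compound spec key into a nested pair, e.g. the condition
    # {'site.name': 'T1_CH_CERN'} becomes ('site', {'name': 'T1_CH_CERN'}),
    # so the empty record carries the looked-up condition under its natural
    # top-level key.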

    def update_cache(self, dasquery, results, header, system, api):
        """
        Insert results into the cache. Use bulk insert controlled by
        self.cache_size. Upon completion ensure indexes.
        """
        # update results records in DAS cache
        gen = self.generate_records(dasquery, results, header)
        inserted = 0
        # bulk insert
        try:
            res = self.col.insert_many(gen,
                                       ordered=False,
                                       bypass_document_validation=True)
            inserted += len(res.inserted_ids)
        except InvalidOperation:
            # raised when the generator yields no documents, nothing to insert
            pass

        # update query record for this sub-system
        self.update_query_record_system(dasquery, system, api, 'ok')

        if dasquery.qcache:  # custom DASQuery cache
            self.update_das_expire(dasquery, expire_timestamp(dasquery.qcache))
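
    # NOTE: a sketch of the expected inputs, with hypothetical values.
    # 'results' is a list or generator of raw data records and 'header'
    # carries the DAS metadata produced by the data-service handler, e.g.
    #
    #     header = {'das': {'system': ['dbs'], 'api': ['datasets'],
    #                       'expire': 3600, 'services': ['dbs']},
    #               'lookup_keys': [{'datasets': ['dataset.name']}]}
    #
    # generate_records() consumes both and yields the documents that are
    # bulk-inserted above.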

    def update_query_record_system(self, dasquery, system, api, status):
        "Update system status of dasquery in das.cache collection"
        spec = {
            'qhash': dasquery.qhash,
            'das.system': system,
            'das.api': api,
            'das.record': record_codes('query_record')
        }
        udict = {'$set': {'das.status': status}}
        #         print("### update_query_record", spec)
        doc = self.col.find_one_and_update(
            spec, udict, return_document=ReturnDocument.AFTER)
#         print(doc)

    def insert_query_record(self, dasquery, header):
        """
        Insert query record into DAS cache.
        """
        # check presence of API record in a cache
        dasheader = header['das']
        system = dasheader['system']
        api = dasheader['api']
        collection = 'cache'
        check_query = True
        expire = dasheader.get('expire', None)
        if expire:
            dasheader['expire'] = adjust_expire(expire)
        if not self.incache(dasquery, collection, system, api, check_query):
            msg = "query=%s, header=%s" % (dasquery, header)
            self.logger.debug(msg)
            q_record = dict(das=dasheader, query=dasquery.storage_query)
            q_record['das']['record'] = record_codes('query_record')
            q_record['das']['status'] = "requested"
            q_record['qhash'] = dasquery.qhash
            q_record['das']['ctime'] = [time.time()]
            res = self.col.insert_one(q_record)
            if not res:
                msg = 'unable to insert query record'
                print(dastimestamp('DAS ERROR '), dasquery, msg,
                      ', will retry')
                time.sleep(1)
                res = self.col.insert(q_record)
                if not res:
                    print(dastimestamp('DAS ERROR '), dasquery, msg)
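
    # NOTE (sketch, hypothetical values): the stored query record looks
    # roughly like
    #
    #     {'qhash': '3a7e...', 'query': dasquery.storage_query,
    #      'das': {'system': ['dbs'], 'api': ['datasets'], 'expire': 1700000000,
    #              'record': record_codes('query_record'),
    #              'status': 'requested', 'ctime': [1699999940.0]}}
    #
    # The incache() check above prevents inserting a duplicate record for the
    # same query hash, system and API.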

    def generate_records(self, dasquery, results, header):
        """
        Iterate over provided results, update records and yield them
        to the next level (update_cache)
        """
        self.logger.debug("(%s) store to cache" % dasquery)
        if not results:
            return

        dasheader = header['das']
        expire = adjust_expire(dasheader['expire'])
        system = dasheader['system']  # DAS service names, e.g. combined
        services = dasheader['services']  # CMS services used to get data
        api = dasheader['api']
        prim_key = header.get('prim_key', None)
        if not prim_key:
            # get primary key from a list of lookup keys which has the
            # following structure [{'api':[keys]}, {...}]
            lup_keys = header['lookup_keys']
            lkeys = [l for i in lup_keys for k in i.values() for l in k]
            prim_key = lkeys[0] if 'summary' not in lkeys else 'summary'
        cond_keys = list(dasquery.mongo_query['spec'].keys())
        # get API record id
        spec = {
            'qhash': dasquery.qhash,
            'das.system': system,
            'das.expire': {
                '$gt': time.time()
            },
            'das.record': record_codes('query_record')
        }
        counter = 0
        rids = [str(r['_id']) for r in \
                self.col.find(spec, ['_id'], **PYMONGO_OPTS)]
        if rids:
            if isinstance(results, (list, GeneratorType)):
                for item in results:
                    counter += 1
                    if 'das' in item:
                        expire = item.get('das').get('expire', expire)
                        dasheader['expire'] = expire
                    item['das'] = dict(expire=expire,
                                       primary_key=prim_key,
                                       condition_keys=cond_keys,
                                       instance=dasquery.instance,
                                       system=system,
                                       services=services,
                                       record=record_codes('data_record'),
                                       ts=time.time(),
                                       api=api)
                    item['das_id'] = rids
                    item['qhash'] = dasquery.qhash
                    yield item
            else:
                print("\n\n ### results = ", str(results))
                raise Exception(
                    'Provided results is not a list/generator type')
        if expire != dasheader['expire']:  # update DAS records
            header['das']['expire'] = expire
        # update das record with new status
        status = 'Update DAS cache, %s API' % header['das']['api'][0]
        self.update_query_record(dasquery, status, header)

        msg = "\n%s yield %s rows" % (dasheader['system'], counter)
        self.logger.info(msg)
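
    # NOTE (sketch, hypothetical values): every yielded data record ends up
    # carrying a 'das' sub-document plus back-references, roughly
    #
    #     {'dataset': [{'name': '/A/B/RAW'}],
    #      'das': {'expire': ..., 'primary_key': 'dataset.name',
    #              'record': record_codes('data_record'), ...},
    #      'das_id': ['5f1e...'], 'qhash': '3a7e...'}
    #
    # which is what merge_records() later groups by das.primary_key.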

    def remove_from_cache(self, dasquery):
        """
        Remove a query from the DAS cache. To do so, we retrieve the API
        records and remove all associated data records from das.cache and
        das.merge.
        """
        records = self.col.find({'qhash': dasquery.qhash}, **PYMONGO_OPTS)
        id_list = []
        for row in records:
            if row['_id'] not in id_list:
                id_list.append(row['_id'])
        spec = {'das_id': {'$in': id_list}}
        self.merge.remove(spec)
        self.merge.remove({'qhash': dasquery.qhash})
        self.col.remove(spec)
        self.col.remove({'qhash': dasquery.qhash})

    def clean_cache(self, collection=None):
        """
        Clean expired docs in das.cache and das.merge.
        """
        current_time = time.time()
        query = {'das.expire': {'$lt': current_time}}
        if not collection or collection == 'merge':
            self.merge.remove(query)
        if not collection or collection == 'cache':
            self.col.remove(query)
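
    # NOTE: the expiry check above relies on every record carrying a
    # 'das.expire' timestamp (seconds since the epoch); a periodic
    # self.clean_cache() call purges stale documents from both collections,
    # while passing collection='merge' or collection='cache' restricts the
    # purge to one of them.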

    def delete_cache(self):
        """
        Delete all results in DAS cache/merge collection, including
        internal indexes.
        """
        self.col.remove({})
        try:
            self.col.drop_indexes()
        except Exception:  # ignore failures, e.g. a missing collection
            pass
        self.merge.remove({})
        try:
            self.merge.drop_indexes()
        except Exception:  # ignore failures, e.g. a missing collection
            pass