Example #1
File: logger_t.py Project: ktf/DAS
 def test_debug(self):
     "Test logger debug method"
     old_stdout = sys.stdout
     logger = PrintManager(self.name, verbose=2)
     sys.stdout = StringIO()
     logger.debug('test')
     result = sys.stdout.getvalue()
     expect = 'DEBUG %s:%s test\n' % (self.name, funcname())
     self.assertEqual(expect, result)
     sys.stdout = old_stdout
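Note: the test above saves and restores sys.stdout by hand around the call to logger.debug. A minimal sketch of the same capture written with contextlib.redirect_stdout (assuming Python 3; PrintManager and funcname are the DAS helpers used above, and capture_stdout is a hypothetical helper, not part of DAS):

from io import StringIO
from contextlib import redirect_stdout

def capture_stdout(func, *args):
    """Call func(*args) and return whatever it printed to stdout."""
    buf = StringIO()
    with redirect_stdout(buf):   # sys.stdout is restored automatically
        func(*args)
    return buf.getvalue()

# e.g. result = capture_stdout(logger.debug, 'test')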
Example #2
 def test_debug(self):
     "Test logger debug method"
     old_stdout = sys.stdout
     logger = PrintManager(self.name, verbose=2)
     sys.stdout = StringIO.StringIO()
     logger.debug('test')
     result = sys.stdout.getvalue()
     expect = 'DEBUG %s:%s test\n' % (self.name, funcname())
     self.assertEqual(expect, result)
     sys.stdout = old_stdout
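Example #2 is the Python 2 variant of Example #1: it builds the buffer with StringIO.StringIO() from the old StringIO module instead of io.StringIO. A small compatibility import (a sketch, assuming nothing else differs between the two test bodies) lets one spelling cover both versions:

try:
    from StringIO import StringIO   # Python 2
except ImportError:
    from io import StringIO         # Python 3

# ... then both tests can simply use: sys.stdout = StringIO()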
Example #3
class DASAbstractService(object):
    """
    Abstract class describing a DAS service. It is initialized with a name
    which is used to identify service parameters in the DAS configuration
    file. Those parameters are the service keys, verbosity level, and URL of
    the data-service.
    """
    def __init__(self, name, config):
        self.name = name
        try:
            self.verbose      = config['verbose']
            title             = 'DASAbstractService_%s' % self.name
            self.logger       = PrintManager(title, self.verbose)
            self.dasmapping   = config['dasmapping']
            self.write2cache  = config.get('write_cache', True)
            self.multitask    = config['das'].get('multitask', True)
            self.error_expire = config['das'].get('error_expire', 300) 
            self.dbs_global   = None # to be configured at run time
            self.dburi        = config['mongodb']['dburi']
            engine            = config.get('engine', None)
            self.gfs          = db_gridfs(self.dburi)
        except Exception as exc:
            print_exc(exc)
            raise Exception('failed to parse DAS config')

        # read key/cert info
        try:
            self.ckey, self.cert = get_key_cert()
        except Exception as exc:
            print_exc(exc)
            self.ckey = None
            self.cert = None

        if  self.multitask:
            nworkers = config['das'].get('api_workers', 3)
            thr_weights = config['das'].get('thread_weights', [])
            for system_weight in thr_weights:
                system, weight = system_weight.split(':')
                if  system == self.name:
                    nworkers *= int(weight)
            if  engine:
                thr_name = 'DASAbstractService:%s:PluginTaskManager' % self.name
                self.taskmgr = PluginTaskManager(\
                        engine, nworkers=nworkers, name=thr_name)
                self.taskmgr.subscribe()
            else:
                thr_name = 'DASAbstractService:%s:TaskManager' % self.name
                self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        self.map        = {}   # to be defined by data-service implementation
        self._keys      = None # to be defined at run-time in self.keys
        self._params    = None # to be defined at run-time in self.parameters
        self._notations = {}   # to be defined at run-time in self.notations

        self.logger.info('initialized')
        # define internal cache manager to put 'raw' results into cache
        if  'rawcache' in config and config['rawcache']:
            self.localcache   = config['rawcache']
        else:
            msg = 'Undefined rawcache, please check your configuration'
            raise Exception(msg)

    def services(self):
        """
        Return sub-systems used to retrieve data records. It is used
        in the dasheader call to set up the das.services field. This method
        can be overridden in sub-classes; otherwise it returns a dict of the
        service name and the CMS systems used to retrieve data records.
        """
        return {self.name:[self.name]}

    def version(self):
        """Return data-services version, should be implemented in sub-classes"""
        return ''

    def keys(self):
        """
        Return service keys
        """
        if  self._keys:
            return self._keys
        srv_keys = []
        for _api, params in self.map.items():
            for key in params['keys']:
                if  not key in srv_keys:
                    srv_keys.append(key)
        self._keys = srv_keys
        return srv_keys

    def parameters(self):
        """
        Return mapped service parameters
        """
        if  self._params:
            return self._params
        srv_params = []
        for _api, params in self.map.items():
            for key in params['params']:
                param_list = self.dasmapping.api2das(self.name, key)
                for par in param_list:
                    if  not par in srv_params:
                        srv_params.append(par)
        self._params = srv_params
        return srv_params

    def notations(self):
        """
        Return a map of system notations.
        """
        if  self._notations:
            return self._notations
        for _, rows in self.dasmapping.notations(self.name).items():
            for row in rows:
                api  = row['api']
                nmap = row['rec_key']
                notation = row['api_output']
                if  api in self._notations:
                    self._notations[api].update({notation:nmap})
                else:
                    self._notations[api] = {notation:nmap}
        return self._notations

    def getdata(self, url, params, expire, headers=None, post=None):
        """URL call wrapper"""
        if  url.find('https:') != -1:
            return getdata(url, params, headers, expire, post,
                self.error_expire, self.verbose, self.ckey, self.cert,
                system=self.name)
        else:
            return getdata(url, params, headers, expire, post,
                self.error_expire, self.verbose, system=self.name)

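    # Editorial note (sketch of the flow implemented by the methods below):
    # call() first checks the raw cache via localcache.incache(); on a miss it
    # delegates to api(), which expands the query through apimap() into
    # (url, api, args, format, expire) tuples and runs apicall() for each,
    # optionally through the TaskManager. apicall() chains
    # getdata() -> parser() -> translator() and ends with write_to_cache().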
    def call(self, dasquery):
        """
        Invoke service API to execute given query.
        Results are stored in the local DAS cache.
        """
        self.logger.info(dasquery)
        # check the cache for records with given query/system
        res = self.localcache.incache(dasquery,
                                      collection='cache',
                                      system=self.name)
        if  res:
            msg  = "found records in local cache"
            self.logger.info(msg)
            return
        # ask data-service api to get results; they'll be stored in the
        # cache, so at the end we return what we have in cache.
        self.api(dasquery)

    def write_to_cache(self, dasquery, expire, url, api, args, gen, ctime):
        """
        Write provided result set into DAS cache.
        """
        if  not self.write2cache:
            return

        # before going to cache we should check/set possible misses, e.g.
        # primary key when error is thrown
        result = self.set_misses(dasquery, api, gen)

        # update the cache
        header = dasheader(self.name, dasquery, expire, api, url,
                services=self.services())
        header['lookup_keys'] = self.lookup_keys(api)
        header['prim_key'] = self.dasmapping.primary_mapkey(self.name, api)
        header['ctime'] = ctime
        self.localcache.update_cache(dasquery, result, header)

        msg  = 'cache has been updated,\n'
        self.logger.debug(msg)

    def adjust_params(self, api, kwds, instance=None):
        """
        Data-service specific parser to adjust parameters according to
        its specifications. For example, the DQ service accepts a string
        of parameters rather than a parameter set, while DBS2 can reuse
        some parameters for different APIs, e.g. the dataset path can be
        passed to listPrimaryDatasets as a primary_dataset pattern.
        """
        pass

    def lookup_keys(self, api):
        """
        Return look-up keys of data output for given data-service API.
        """
        lkeys = self.dasmapping.lookup_keys(self.name, api)
        return [{api:lkeys}]

    def inspect_params(self, api, args):
        """
        Perform API parameter inspection. Check if the API accepts a range
        of parameters, etc.
        """
        for key, value in args.items():
            if  isinstance(value, dict):
                minval = None
                maxval = None
                for oper, val in value.items():
                    if  oper == '$in':
                        minval = int(val[0])
                        maxval = int(val[-1])
                        args[key] = range(minval, maxval)
                    elif oper == '$lt':
                        maxval = int(val)
                        args[key] = maxval
                    elif oper == '$lte':
                        maxval = int(val)
                        args[key] = maxval
                    elif oper == '$gt':
                        minval = int(val)
                        args[key] = minval
                    elif oper == '$gte':
                        minval = int(val)
                        args[key] = minval
                    else:
                        msg = '%s does not support operator %s' % (api, oper)
                        raise Exception(msg)
        return args

    def get_notations(self, api):
        """Return notations used for given API"""
        notationmap = self.notations()
        if  not notationmap:
            return {}
        notations = {}
        if  '' in notationmap:
            notations = dict(notationmap['']) # notations applied to all APIs
            if  api in notationmap: # overwrite the one for provided API
                notations.update(notationmap[api])
        return notations

    def parser(self, dasquery, dformat, data, api):
        """
        DAS data parser. Input parameters:

        - *query* input DAS query
        - *dformat* is a data format, e.g. XML, JSON
        - *data* is a data source, either file-like object or
          actual data
        - *api* is API name
        """
        prim_key  = self.dasmapping.primary_key(self.name, api)
        counter   = 0
        if  dformat.lower() == 'xml':
            tags = self.dasmapping.api2daskey(self.name, api)
            gen  = xml_parser(data, prim_key, tags)
            for row in gen:
                counter += 1
                yield row
        elif dformat.lower() == 'json' or dformat.lower() == 'dasjson':
            gen  = json_parser(data, self.logger)
            das_dict = {}
            for row in gen:
                if  dformat.lower() == 'dasjson':
                    for key, val in row.items():
                        if  key != 'results':
                            das_dict[key] = val
                    row = row['results']
                if  isinstance(row, list):
                    for item in row:
                        if  item:
                            if  prim_key in item:
                                counter += 1
                                yield item
                            else:
                                counter += 1
                                yield {prim_key:item}
                else:
                    if  prim_key in row:
                        counter += 1
                        yield row
                    else:
                        counter += 1
                        yield {prim_key:row}
        else:
            msg = 'Unsupported data format="%s", API="%s"' % (dformat, api)
            raise Exception(msg)
        msg  = "api=%s, format=%s " % (api, dformat)
        msg += "prim_key=%s yield %s rows" % (prim_key, counter)
        self.logger.info(msg)

    def translator(self, api, genrows):
        """
        Convert raw results into DAS records. 
        """
        prim_key  = self.dasmapping.primary_key(self.name, api)
        count = 0
        for row in genrows:
            row2das(self.dasmapping.notation2das, self.name, api, row)
            count += 1
            # check for primary key existence, since it can be overridden
            # by row2das. For example, DBS3 uses a flat namespace, so we
            # override dataset=>name, while dataset is still the primary key
            if  isinstance(row, list):
                yield {prim_key:row}
            elif  prim_key in row:
                if  prim_key in row[prim_key]:
                    yield row[prim_key] # remapping may create nested dict
                else:
                    yield row
            else:
                yield {prim_key:row}
        msg = "yield %s rows" % count
        self.logger.debug(msg)

    def set_misses(self, dasquery, api, genrows):
        """
        Check and adjust DAS records wrt the input query. If some of the DAS
        keys are missing, add them with their values to the DAS record.
        """
        # look-up primary key
        prim_key  = self.dasmapping.primary_key(self.name, api)

        # Scan all docs and store those whose size is above the MongoDB limit
        # into GridFS
        map_key = self.dasmapping.primary_mapkey(self.name, api)
        genrows = parse2gridfs(self.gfs, map_key, genrows, self.logger)

        spec  = dasquery.mongo_query['spec']
        row   = next(genrows)
        ddict = DotDict(row)
        keys2adjust = []
        for key in spec.keys():
            val = ddict.get(key)
            if  spec[key] != val and key not in keys2adjust:
                keys2adjust.append(key)
        msg   = "adjust keys %s" % keys2adjust
        self.logger.debug(msg)
        count = 0
        if  keys2adjust:
            # adjust the rows
            for row in yield_rows(row, genrows):
                ddict = DotDict(row)
                pval  = ddict.get(map_key)
                if  isinstance(pval, dict) and 'error' in pval:
                    ddict[map_key] = ''
                    ddict.update({prim_key: pval})
                for key in keys2adjust:
                    value = spec[key]
                    existing_value = ddict.get(key)
                    # the way to deal with proximity/pattern/condition results
                    if  (isinstance(value, str) or isinstance(value, unicode))\
                        and value.find('*') != -1: # we got pattern
                        if  existing_value:
                            value = existing_value
                    elif isinstance(value, dict) or \
                        isinstance(value, list): # we got condition
                        if  existing_value:
                            value = existing_value
                        elif isinstance(value, dict) and \
                        '$in' in value: # we got a range {'$in': []}
                            value = value['$in']
                        elif isinstance(value, dict) and \
                        '$lte' in value and '$gte' in value:
                            # we got a between range
                            value = [value['$gte'], value['$lte']]
                        else: 
                            value = json.dumps(value) 
                    elif existing_value and value != existing_value:
                        # we got proximity results
                        if  'proximity' in ddict:
                            proximity = DotDict({key:existing_value})
                            ddict['proximity'].update(proximity)
                        else:
                            proximity = DotDict({})
                            proximity[key] = existing_value
                            ddict['proximity'] = proximity
                    else:
                        if  existing_value:
                            value = existing_value
                    ddict[key] = value
                yield ddict
                count += 1
        else:
            yield row
            for row in genrows:
                yield row
                count += 1
        msg   = "yield %s rows" % count
        self.logger.debug(msg)
            
    def api(self, dasquery):
        """
        Data-service api method; it can be redefined by the data-service
        class. It parses the input query and invokes the appropriate
        data-service API call. All results are stored into the DAS cache,
        and the api call is inserted into the Analytics DB.
        """
        self.logger.info(dasquery)
        genrows = self.apimap(dasquery)
        if  not genrows:
            return
        jobs = []
        for url, api, args, dformat, expire in genrows:
            # insert DAS query record for given API
            header = dasheader(self.name, dasquery, expire, api, url)
            self.localcache.insert_query_record(dasquery, header)
            # fetch DAS data records
            if  self.multitask:
                jobs.append(self.taskmgr.spawn(self.apicall, \
                            dasquery, url, api, args, dformat, expire))
            else:
                self.apicall(dasquery, url, api, args, dformat, expire)
        if  self.multitask:
            self.taskmgr.joinall(jobs)

    def apicall(self, dasquery, url, api, args, dformat, expire):
        """
        Data-service api method; it can be redefined by the data-service
        class. It parses the input query and invokes the appropriate
        data-service API call. All results are stored into the DAS cache,
        and the api call is inserted into the Analytics DB.

        We explicitly invoke the close call for our datastream instead
        of using a context manager, since this method, as well as
        getdata/parser, can be overridden by child classes.
        """
        datastream  = None
        try:
            args    = self.inspect_params(api, args)
            time0   = time.time()
            headers = make_headers(dformat)
            datastream, expire = self.getdata(url, args, expire, headers)
            self.logger.info("%s expire %s" % (api, expire))
            rawrows = self.parser(dasquery, dformat, datastream, api)
            dasrows = self.translator(api, rawrows)
            ctime   = time.time() - time0
            self.write_to_cache(dasquery, expire, url, api, args,
                    dasrows, ctime)
        except Exception as exc:
            msg  = 'Failed to process: url=%s, api=%s, args=%s' \
                    % (url, api, args)
            print(msg)
            print_exc(exc)
        close(datastream)

    def url_instance(self, url, _instance):
        """
        Virtual method to adjust URL for a given instance,
        must be implemented in service classes
        """
        return url

    def adjust_url(self, url, instance):
        """
        Adjust data-service URL wrt the provided instance, e.g.
        DBS carries several instances
        """
        if  instance:
            url = self.url_instance(url, instance)
        return url

    def apimap(self, dasquery):
        """
        Analyze input query and yield url, api, args, format, expire
        for further processing.
        """
        srv   = self.name # get local copy to avoid threading issues
        cond  = getarg(dasquery.mongo_query, 'spec', {})
        instance = dasquery.mongo_query.get('instance', self.dbs_global)
        skeys = getarg(dasquery.mongo_query, 'fields', [])
        if  not skeys:
            skeys = []
        self.logger.info("\n")
        for api, value in self.map.items():
            expire = value['expire']
            iformat = value['format']
            url    = self.adjust_url(value['url'], instance)
            if  not url:
                msg = '--- rejects API %s, no URL' % api
                self.logger.info(msg)
                continue
            args   = dict(value['params']) # make new copy, since we'll adjust
            wild   = value.get('wild_card', '*')
            found  = 0
            # check if input parameters are covered by API
            if  not self.dasmapping.check_api_match(srv, api, cond):
                msg = '--- rejects API %s, does not cover input condition keys' \
                        % api
                self.logger.info(msg)
                continue
            # once we know that the API covers the input set of parameters we
            # check every input parameter for pattern matching
            for key, val in cond.items():
                # check if keys from conditions are accepted by API
                # need to convert key (which is daskeys.map) into
                # input api parameter
                for apiparam in self.dasmapping.das2api(srv, api, key, val):
                    if  apiparam in args:
                        args[apiparam] = val
                        found += 1
            # VK 20160708: wrong statement, it caused the datasets API to pass
            # for the query "dataset in [path1, path2]".
            # I'll leave this block here until I test and verify that the
            # commented-out block will not cause other issues
            #
            # check the case when we only have a single condition key
            # and it is the key we look up
#             if  not found and skeys == [k.split('.')[0] for k in cond.keys()]:
#                 found = 1
            # check if number of keys on cond and args are the same
            if  len(cond.keys()) != found:
                msg = "--- reject API %s, not all condition keys are covered" \
                        % api
                self.logger.info(msg)
                msg = 'args=%s' % args
                self.logger.debug(msg)
                continue
            if  not found:
                msg = "--- rejects API %s, parameters don't match" % api
                self.logger.info(msg)
                msg = 'args=%s' % args
                self.logger.debug(msg)
                continue
            self.adjust_params(api, args, instance)
            # delete args keys whose value is optional
            delete_keys(args, 'optional')
            # check that there is no "required" parameter left in args,
            # since such an api call will not work
            if 'required' in args.values():
                msg = '--- rejects API %s, parameter is required' % api
                self.logger.info(msg)
                msg = 'args=%s' % args
                self.logger.debug(msg)
                continue
            # adjust pattern symbols in arguments
            if  wild != '*':
                for key, val in args.items():
                    if  isinstance(val, str) or isinstance(val, unicode):
                        val   = val.replace('*', wild)
                    args[key] = val

            # compare query selection keys with API look-up keys
            api_lkeys = self.dasmapping.api_lkeys(srv, api)
            if  set(api_lkeys) != set(skeys):
                msg = "--- rejects API %s, api_lkeys(%s)!=skeys(%s)"\
                        % (api, api_lkeys, skeys)
                self.logger.info(msg)
                continue

            msg = '+++ %s passes API %s' % (srv, api)
            self.logger.info(msg)
            msg = 'args=%s' % args
            self.logger.debug(msg)

            msg  = "yield "
            msg += "system ***%s***, url=%s, api=%s, args=%s, format=%s, " \
                % (srv, url, api, args, iformat)
            msg += "expire=%s, wild_card=%s" \
                % (expire, wild)
            self.logger.debug(msg)

            yield url, api, args, iformat, expire
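The inspect_params method in the class above flattens Mongo-style operators ($in, $lt, $lte, $gt, $gte) into plain API arguments before the call is made. A standalone sketch of that reduction (flatten_operators is a hypothetical helper, not part of DAS; it mirrors the branches shown above):

def flatten_operators(api, args):
    """Reduce operator dicts in args to plain values, as inspect_params does."""
    for key, value in args.items():
        if isinstance(value, dict):
            for oper, val in value.items():
                if oper == '$in':        # range given as [min, ..., max]
                    args[key] = list(range(int(val[0]), int(val[-1])))
                elif oper in ('$lt', '$lte', '$gt', '$gte'):
                    args[key] = int(val)
                else:
                    raise Exception('%s does not support operator %s' % (api, oper))
    return args

# flatten_operators('runs', {'run': {'$in': [1, 5]}})  ->  {'run': [1, 2, 3, 4]}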
Example #4
class DASAnalytics(object):
    """
    DAS analytics DB manager.
    """
    def __init__(self, config):
        self.verbose = config['verbose']
        self.logger  = PrintManager('DASAnalytics', self.verbose)
        self.dburi   = config['mongodb']['dburi']
        self.dbname  = config['analyticsdb']['dbname']        
        self.colname = config['analyticsdb']['collname']
        self.history = config['analyticsdb']['history']
        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)
        self.create_db()

    def create_db(self):
        """
        Create analytics DB in MongoDB back-end.
        """
        self.conn = db_connection(self.dburi)
        database  = self.conn[self.dbname]
        das_son_manipulator = DAS_SONManipulator()
        database.add_son_manipulator(das_son_manipulator)
        self.col  = database[self.colname]
#        if  self.dbname not in self.conn.database_names():
#            capped_size = 104857600
#            options   = {'capped':True, 'size': capped_size}
#            database  = self.conn[self.dbname]
#            database.create_collection('self.colname', **options)
#            print "####CREATE CAPPED ANALYTICS"
#        self.col  = self.conn[self.dbname][self.colname] 

    def delete_db(self):
        """
        Delete analytics DB in MongoDB back-end.
        """
        self.conn.drop_database(self.dbname)

    def delete_db_collection(self):
        """
        Delete analytics DB collection in MongoDB.
        """
        self.conn.drop_collection(self.colname)

    def add_query(self, query, mongoquery):
        """
        Add DAS-QL/MongoDB-QL queries into analytics.
        
        A unique record is kept for each (qhash, dhash) pair.
        Each record contains an array of call-times.
        """
        if  isinstance(mongoquery, dict):
            mongoquery = encode_mongo_query(mongoquery)
        msg = 'query=%s, mongoquery=%s' % (query, mongoquery)
        self.logger.debug(msg)
        dhash = genkey(query)
        qhash = genkey(mongoquery)

        now = time.time()

        existing = self.col.find_one({'qhash': qhash, 'dhash': dhash})
        if existing:
            # check if times contains very old timestamps
            rec = self.col.find({'_id': ObjectId(existing['_id']), 
                                 'times':{'$lt' : now - self.history}})
            if  rec:
                self.col.update({'_id': ObjectId(existing['_id'])},
                    {'$pull': {'times': {'$lt' : now - self.history}}})
            # update times array with new timestamp
            self.col.update({'_id': ObjectId(existing['_id'])},
                            {'$push': {'times': now}})
        else:
            record = dict(query=query, mongoquery=mongoquery,
                        qhash=qhash, dhash=dhash, times=[now])
            self.col.insert(record)

        index = [('qhash', DESCENDING),
                 ('dhash', DESCENDING)]
        create_indexes(self.col, index)
        
    def clean_queries(self):
        """
        Standalone method to clean up expired call-times from query records,
        since otherwise only the active record is cleaned.
        
        This is too expensive to do with every operation, and MongoDB
        does not allow multiple modifications to a single field in a single
        update operation (i.e., we can't do $push and $pull in one update),
        so it should probably be done asynchronously at fixed intervals.
        """
        
        self.logger.debug('')
        
        now = time.time()
        
        #clean out the times array
        self.col.update({'times': {'$exists': True}},
                        {'$pull': {'times': {'$lt': now - self.history}}})
        #now delete any with no times
        self.col.remove({'times': {'$size': 0}})
        #and should maybe delete anything with the same qhash here?

    def remove_expired(self):
        "Moved from AbstractService -  remove old apicall records"
        spec = {'apicall.expire':{'$lt' : int(time.time())}}
        self.col.remove(spec)

    def add_summary(self, identifier, start, finish, **payload):
        """
        Add an analyzer summary, with given analyzer identifier,
        start and finish times and payload.
        
        It is intended that a summary document is deposited on
        each run of an analyzer (if desirable) and is thereafter
        immutable.
        """
        msg = '(%s, %s->%s, %s)' % (identifier, start, finish, payload)
        self.logger.debug(msg)
        
        # clean-up analyzer records whose start timestamp is too old
        spec = {'start':{'$lt':time.time()-self.history},
                'analyzer': {'$exists': True}}
        self.col.remove(spec)

        # insert new analyzer record
        record = {'analyzer':identifier,
                  'start': start,
                  'finish': finish}
        payload.update(record) #ensure key fields are set correctly
        self.col.insert(payload)
        # ensure summary items are indexed for quick extract
        create_indexes(self.col, [('analyzer', DESCENDING), ('start', ASCENDING)])

    def get_summary(self, identifier, after=None, before=None, **query):
        """
        Retrieve a summary document for a given analyzer-identifier,
        optionally specifying a time range.
        """
        cond = {'analyzer': identifier}
        if after:
            cond['start'] = {'$gte': after}
        if before:
            cond['finish'] = {'$lte': before}
        if query:
            cond.update(query)
        return list(self.col.find(cond))

    def add_api(self, system, query, api, args):
        """
        Add API info to analytics DB. 
        Here args is a dict of API parameters.
        """
        orig_query = query
        if  isinstance(query, dict):
            query = encode_mongo_query(query)
        msg = '(%s, %s, %s, %s)' % (system, query, api, args)
        self.logger.debug(msg)
        # find query record
        qhash = genkey(query)
        record = self.col.find_one({'qhash':qhash}, fields=['dasquery'])
        if  not record:
            self.add_query("", orig_query)
        # find api record
        record = self.col.find_one({'qhash':qhash, 'system':system,
                        'api.name':api, 'api.params':args}) 
        apidict = dict(name=api, params=args)
        if  record:
            self.col.update({'_id':record['_id']}, {'$inc':{'counter':1}})
        else:
            record = dict(system=system, api=apidict, qhash=qhash, counter=1)
            self.col.insert(record)
        index = [('system', DESCENDING), ('dasquery', DESCENDING),
                 ('api.name', DESCENDING), ('qhash', DESCENDING) ]
        create_indexes(self.col, index)
        
    def insert_apicall(self, system, query, url, api, api_params, expire):
        """
        Remove obsolete apicall records and
        insert into Analytics DB provided information about API call.
        Moved from AbstractService.
        
        Updated so that we do not have multiple records when performing
        forced updates (i.e., the old record is not yet expired): we now
        look for an existing record with the same parameters (I'm hoping
        the fact that some of the variables are indexed will make this
        fast even though not all are), and if it exists just update
        the expiry; otherwise insert a new record.
        """
        msg = 'query=%s, url=%s,' % (query, url)
        msg += 'api=%s, args=%s, expire=%s' % (api, api_params, expire)
        self.logger.debug(msg)
        expire = expire_timestamp(expire)
        query = encode_mongo_query(query)
        qhash = genkey(query)
        self.remove_expired()
        existing = self.col.find_one({'apicall.system':     system,
                                      'apicall.url':        url,
                                      'apicall.api':        api,
                                      'apicall.api_params': api_params,
                                      'apicall.qhash':      qhash})
        if existing:
            self.logger.debug("updating")
            self.col.update({'_id': existing['_id']},
                            {'$set':{'apicall.expire': expire}})
        else:
            self.col.insert({'apicall':{'api_params':   api_params,
                                        'url':          url,
                                        'api':          api,
                                        'system':       system,
                                        'expire':       expire,
                                        'qhash':        qhash}})
        index_list = [('apicall.url', DESCENDING),
                      ('apicall.api', DESCENDING),
                      ('qhash', DESCENDING)]
        create_indexes(self.col, index_list)
        
    def update_apicall(self, query, das_dict):
        """
        Update apicall record with provided DAS dict.
        Moved from AbstractService
        """
        msg = 'DASAnalytics::update_apicall, query=%s, das_dict=%s'\
                % (query, das_dict)
        self.logger.debug(msg)
        spec = {'apicall.qhash':genkey(encode_mongo_query(query))} 
        record = self.col.find_one(spec)
        self.col.update({'_id':ObjectId(record['_id'])},
            {'$set':{'dasapi':das_dict,
                     'apicall.expire':das_dict['response_expires']}})

    def update(self, system, query):
        """
        Update records for given system/query.
        """
        if  isinstance(query, dict):
            query = encode_mongo_query(query)
        msg = 'system=%s, query=%s' % (system, query)
        self.logger.debug(msg)
        qhash = genkey(query)
        if  system:
            cond = {'qhash':qhash, 'system':system}
        else:
            cond = {'qhash':qhash}
        self.col.update(cond, {'$inc' : {'counter':1}}, multi=True)

    def list_systems(self):
        """
        List all DAS systems.
        """
        cond = { 'system' : { '$ne' : None } }
        gen = (row['system'] for row in self.col.find(cond, ['system']))
        return gen2list(gen)

    def list_queries(self, qhash=None, dhash=None, query_regex=None,
                     key=None, after=None, before=None):
        """
        List inserted queries based on many criteria.
        """
        cond = {'mongoquery': {'$exists': True}}
        if qhash:
            cond['qhash'] = qhash
        if dhash:
            cond['dhash'] = dhash
        if query_regex:
            cond['dasquery'] = {'$regex':query_regex}
        if key:
            cond['mongoquery.spec.key'] = key
        # in this case we need a specific element to be within the range,
        # so we need to use elemMatch
        if before and after:
            cond['times'] = {'$gt': after, '$lt': before}
        # in these cases we only need to match any element
        elif after:
            cond['times'] = {'$gt': after}
        elif before:
            cond['times'] = {'$lt': before}
        
        return self.col.find(cond)
            
    def get_popular_queries(self, spec):
        """
        Get popular queries based on provided spec, which can be
        in a form of time stamp range, etc.
        """
        cond = {'counter':{'$exists':True}}
        for row in self.col.find(fields=['qhash'], spec=cond).\
                sort('counter', DESCENDING):
            spec = {'qhash': row['qhash'], 'counter':{'$exists': False}}
            for res in self.col.find(spec=spec):
                yield res

    def list_apis(self, system=None):
        """
        List all APIs.
        """
        cond = { 'api.name' : { '$ne' : None } }
        if  system:
            cond['system'] = system
        gen = (row['api']['name'] for row in \
                self.col.find(cond, ['api.name']))
        return gen2list(gen)
    
    def list_apicalls(self, qhash=None, api=None, url=None):
        "Replace ad-hoc calls in AbstractService"
        cond = {}
        if qhash:
            cond['apicall.qhash'] = qhash
        if api:
            cond['apicall.api'] = api
        if url:
            cond['apicall.url'] = url
        
        return list(self.col.find(cond))

    def api_params(self, api):
        """
        Retrieve API parameters from analytics DB
        """
        cond = {'api.name':api}
        gen = (row['api']['params'] for row in \
                self.col.find(cond, ['api.params']))
        return gen2list(gen)

    def api_counter(self, api, args=None):
        """
        Retrieve API counter from analytics DB. User must supply
        API name and optional dict of parameters.
        """
        cond = {'api.name': api}
        if  args:
            for key, val in args.iteritems():
                cond[key] = val
        return self.col.find_one(cond, ['counter'])['counter']
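DASAnalytics only reads a handful of configuration keys in its constructor. A minimal sketch of such a config dict (values are illustrative, not project defaults; a running MongoDB instance is required to actually construct the object):

config = {
    'verbose': 0,
    'mongodb': {'dburi': 'mongodb://localhost:27017'},
    'analyticsdb': {
        'dbname': 'analytics',
        'collname': 'db',
        'history': 30 * 24 * 3600,   # keep call-times for ~30 days
    },
}
# analytics = DASAnalytics(config)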
Example #5
class DASMapping(object):
    """
    This class manages DAS mapping DB.
    """

    def __init__(self, config):
        self.verbose = config["verbose"]
        self.logger = PrintManager("DASMapping", self.verbose)
        self.services = config["services"]
        self.dburi = config["mongodb"]["dburi"]
        self.dbname = config["mappingdb"]["dbname"]
        self.colname = config["mappingdb"]["collname"]

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        self.create_db()

        self.keymap = {}  # to be filled at run time
        self.presentationcache = {}  # to be filled at run time
        self.reverse_presentation = {}  # to be filled at run time
        self.notationcache = {}  # to be filled at run time
        self.diffkeycache = {}  # to be filled at run time
        self.apicache = {}  # to be filled at run time
        self.apiinfocache = {}  # to be filled at run time
        self.init_notationcache()
        self.init_presentationcache()

    # ===============
    # Management APIs
    # ===============
    def init_notationcache(self):
        """
        Initialize notation cache by reading notations.
        """
        for system, notations in self.notations().iteritems():
            for row in notations:
                key = system, row["notation"]
                if self.notationcache.has_key(key):
                    self.notationcache[key] += [(row["api"], row["map"])]
                else:
                    self.notationcache[key] = [(row["api"], row["map"])]

    def init_presentationcache(self):
        """
        Initialize presentation cache by reading presentation map.
        """
        query = {"presentation": {"$ne": None}}
        data = self.col.find_one(query)
        if data:
            self.presentationcache = data["presentation"]
            for daskey, uilist in self.presentationcache.iteritems():
                for row in uilist:
                    link = None
                    if row.has_key("link"):
                        link = row["link"]
                    if row.has_key("diff"):
                        self.diffkeycache[daskey] = row["diff"]
                    tdict = {daskey: {"mapkey": row["das"], "link": link}}
                    if self.reverse_presentation.has_key(row["ui"]):
                        self.reverse_presentation[row["ui"]].update(tdict)
                    else:
                        self.reverse_presentation[row["ui"]] = {daskey: {"mapkey": row["das"], "link": link}}

    def create_db(self):
        """
        Establish connection to MongoDB back-end and create DB.
        """
        self.conn = db_connection(self.dburi)
        self.db = self.conn[self.dbname]
        self.col = self.db[self.colname]

    def delete_db(self):
        """
        Delete mapping DB in MongoDB back-end.
        """
        self.conn.drop_database(self.dbname)

    def delete_db_collection(self):
        """
        Delete mapping DB collection in MongoDB.
        """
        self.db.drop_collection(self.colname)

    def check_maps(self):
        """
        Check if there are records in Mapping DB
        """
        return self.col.count()

    def remove(self, spec):
        """
        Remove record in DAS Mapping DB for provided Mongo spec.
        """
        self.col.remove(spec)

    def add(self, record):
        """
        Add a new record into the mapping DB. Example of a URI record:

        .. doctest::

            {
             "system": "dbs",
             "urn": "listBlocks",
             "url": "http://a.b.com/api",
             "params": [
                 {"apiversion": "1_2_2", "test": "*"}
             ],
             "daskeys": [
                 {"key": "block", "map": "block.name", "pattern": ""}
             ],
             "das2api": [
                 {"das_key": "site", "api_param": "se",
                  "pattern": "re.compile('^T[0-3]_')"}
             ]
            }

        Example of notation record:

        .. doctest::

             "notations": [
                 {"notation": "storage_element_name", "map": "site", "api": ""}
             ]
        """
        msg = "record=%s" % record
        self.logger.debug(msg)
        self.col.insert(record)
        index = None
        if record.has_key("urn"):
            index = [("system", DESCENDING), ("daskeys", DESCENDING), ("urn", DESCENDING)]
        elif record.has_key("notations"):
            index = [("system", DESCENDING), ("notations.api_param", DESCENDING)]
        elif record.has_key("presentation"):
            index = []
        else:
            msg = "Invalid record %s" % record
            raise Exception(msg)
        if index:
            create_indexes(self.col, index)

    # ==================
    # Informational APIs
    # ==================
    def list_systems(self):
        """
        List all DAS systems.
        """
        cond = {"system": {"$ne": None}}
        gen = (row["system"] for row in self.col.find(cond, ["system"]))
        return list(set(gen2list(gen)) & set(self.services))

    def list_apis(self, system=None):
        """
        List all APIs.
        """
        if self.apicache and self.apicache.has_key(system):
            return self.apicache[system]
        cond = {"urn": {"$ne": None}}
        if system:
            cond["system"] = system
        gen = (row["urn"] for row in self.col.find(cond, ["urn"]))
        self.apicache[system] = gen2list(gen)
        return self.apicache[system]

    def api_info(self, api_name):
        """
        Return full API info record.
        """
        return self.apiinfocache.get(api_name, self.col.find_one({"urn": api_name}))

    def relational_keys(self, system1, system2):
        """
        Return a list of relational keys between provided systems
        """
        for system, keys in self.daskeys().iteritems():
            if system == system1:
                keys1 = keys
            if system == system2:
                keys2 = keys
        return list(set(keys1) & set(keys2))

    def daskeys(self, das_system=None):
        """
        Return a dict with all known DAS keys.
        """
        cond = {"system": {"$ne": None}}
        if das_system:
            cond = {"system": das_system}
        gen = (row["system"] for row in self.col.find(cond, ["system"]))
        kdict = {}
        for system in gen:
            query = {"system": system, "urn": {"$ne": None}}
            keys = []
            for row in self.col.find(query):
                for entry in row["daskeys"]:
                    if entry["key"] not in keys:
                        keys.append(entry["key"])
            kdict[system] = keys
        return kdict

    # ============
    # Look-up APIs
    # ============
    def primary_key(self, das_system, urn):
        """
        Return DAS primary key for provided system and urn
        """
        cond = {"system": das_system, "urn": urn}
        daskeys = self.col.find(cond, ["daskeys.key"])
        for row in daskeys:
            if row and row.has_key("daskeys"):
                for dkey in row["daskeys"]:
                    if dkey.has_key("key"):
                        return dkey["key"]

    def primary_mapkey(self, das_system, urn):
        """
        Return DAS primary map key for provided system and urn
        """
        cond = {"system": das_system, "urn": urn}
        mapkeys = self.col.find(cond, ["daskeys.map"])
        for row in mapkeys:
            if row and row.has_key("daskeys"):
                for mkey in row["daskeys"]:
                    if mkey.has_key("map"):
                        return mkey["map"]

    def find_daskey(self, das_system, map_key, value=None):
        """
        Find das key for given system and map key.
        """
        msg = "system=%s\n" % das_system
        cond = {"system": das_system, "daskeys.map": map_key}
        daskeys = []
        for row in self.col.find(cond, ["daskeys"]):
            if row and row.has_key("daskeys"):
                for dkey in row["daskeys"]:
                    if dkey.has_key("key"):
                        if value:
                            pval = dkey.get("pattern", "")
                            if pval:
                                pat = re.compile(pval)
                                if pat.match(str(value)):
                                    daskeys.append(dkey["key"])
                                else:
                                    msg += "-- reject key=%s, val=%s, pat=%s\n" % (map_key, value, pval)
                                    self.logger.debug(msg)
                            else:
                                daskeys.append(dkey["key"])
                        else:
                            daskeys.append(dkey["key"])
        return daskeys

    def find_mapkey(self, das_system, das_key, value=None):
        """
        Find map key for given system and das key.
        """
        msg = "system=%s\n" % das_system
        cond = {"system": das_system, "daskeys.key": das_key}
        for row in self.col.find(cond, ["daskeys", "urn"]):
            if row and row.has_key("daskeys"):
                for key in row["daskeys"]:
                    if key.has_key("map") and key["key"] == das_key:
                        if value:
                            pval = key.get("pattern", "")
                            pat = re.compile(pval)
                            if pat.match(str(value)):
                                return key["map"]
                            else:
                                msg += "-- reject key=%s, val=%s, pat=%s\n" % (das_key, value, key["pattern"])
                                self.logger.debug(msg)
                                continue
                        else:
                            return key["map"]

    def mapkeys(self, daskey):
        """
        Find map keys for a given daskey
        """
        if self.keymap.has_key(daskey):
            return self.keymap[daskey]
        spec = {"daskeys.key": daskey}
        mapkeys = []
        for row in self.col.find(spec, ["daskeys"]):
            for kmap in row["daskeys"]:
                if kmap["key"] == daskey and kmap["map"] not in mapkeys:
                    mapkeys.append(kmap["map"])
        self.keymap[daskey] = mapkeys
        return self.keymap[daskey]

    def find_apis(self, das_system, map_key):
        """
        Find list of apis which correspond to provided
        system and das map key.
        """
        cond = {"system": das_system, "daskeys.map": map_key}
        apilist = []
        for row in self.col.find(cond, ["urn"]):
            if row.has_key("urn") and row["urn"] not in apilist:
                apilist.append(row["urn"])
        return apilist

    def check_dasmap(self, system, urn, das_map, value=None):
        """
        Check if provided system/urn/das_map is a valid combination
        in mapping db. If value for das_map key is provided we verify
        it against pattern in DB.
        """
        if not value:
            cond = {"system": system, "daskeys.map": das_map, "urn": urn}
            return self.col.find(cond).count()
        cond = {"system": system, "daskeys.map": das_map, "urn": urn}
        for row in self.col.find(cond, ["daskeys.pattern"]):
            for item in row["daskeys"]:
                pat = re.compile(item["pattern"])
                if pat.match(str(value)):
                    return True
        return False

    def find_system(self, key):
        """
        Return system name for provided DAS key.
        """
        cond = {"daskeys.key": key}
        gen = (row["system"] for row in self.col.find(cond, ["system"]))
        systems = []
        for system in gen:
            if system not in systems:
                systems.append(system)
        systems.sort()
        return systems

    def lookup_keys(self, system, daskey, api=None, value=None):
        """
        Returns lookup keys for given system and provided
        selection DAS key, e.g. block => block.name
        """
        query = {"system": system, "daskeys.key": daskey}
        if api:
            query["urn"] = api
        lookupkeys = []
        for row in self.col.find(query):
            for kdict in row["daskeys"]:
                if kdict["key"] == daskey:
                    lkey = kdict["map"]
                else:
                    continue
                if value and kdict["pattern"]:
                    pat = re.compile(kdict["pattern"])
                    if pat.match(str(value)):
                        if lkey not in lookupkeys:
                            lookupkeys.append(lkey)
                else:
                    if lkey not in lookupkeys:
                        lookupkeys.append(lkey)
        if not lookupkeys:
            msg = "Unable to find look-up key for "
            msg += "system=%s, daskey=%s, api=%s, value=%s" % (system, daskey, api, value)
            raise Exception(msg)
        return lookupkeys

    def api2das(self, system, api_input_name):
        """
        Translates data-service API input parameter into DAS QL key,
        e.g. run_number => run.
        """
        query = {"system": system, "das2api.api_param": api_input_name}
        names = []
        for adas in self.col.find(query, ["das2api"]):
            for row in adas["das2api"]:
                try:
                    aparam = row["api_param"]
                    daskey = row["das_key"]
                    if aparam == api_input_name and daskey not in names:
                        names.append(daskey)
                except Exception, err:
                    print "ERROR: look-up api_param/das_key in", row
                    raise err
        return names
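The notation cache built by init_notationcache above is keyed by (system, notation) tuples, each mapping to a list of (api, map) pairs; an empty api string is used when the notation applies to every API of that system (cf. get_notations in Example #3). A toy illustration using the notation record from the add() docstring (the system name is hypothetical):

notationcache = {
    ('dbs', 'storage_element_name'): [('', 'site')],   # '' => applies to all dbs APIs
}
# look-up: notationcache[('dbs', 'storage_element_name')]  ->  [('', 'site')]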
Example #6
class DASMongocache(object):
    """
    DAS cache based on MongoDB.
    """
    def __init__(self, config):
        self.emptyset_expire = expire_timestamp(\
            config['das'].get('emptyset_expire', 5))
        self.dburi   = config['mongodb']['dburi']
        self.cache_size = config['mongodb']['bulkupdate_size']
        self.dbname  = config['dasdb']['dbname']
        self.verbose = config['verbose']
        self.logger  = PrintManager('DASMongocache', self.verbose)
        self.mapping = config['dasmapping']

        self.conn    = db_connection(self.dburi)
        self.mdb     = self.conn[self.dbname]
        self.col     = self.mdb[config['dasdb']['cachecollection']]
        self.mrcol   = self.mdb[config['dasdb']['mrcollection']]
        self.merge   = self.mdb[config['dasdb']['mergecollection']]
        self.gfs     = db_gridfs(self.dburi)

        self.logdb   = DASLogdb(config)

        self.das_internal_keys = ['das_id', 'das', 'cache_id', 'qhash']

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        self.add_manipulator()

        # ensure that we have the following indexes
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('das.system', ASCENDING),
                      ('qhash', DESCENDING),
                      ('das.empty_record', ASCENDING)]
        create_indexes(self.col, index_list)
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('qhash', DESCENDING),
                      ('das.empty_record', ASCENDING), ('das.ts', ASCENDING)]
        create_indexes(self.merge, index_list)
        
    def add_manipulator(self):
        """
        Add DAS-specific MongoDB SON manipulator to perform
        conversion of inserted data into DAS cache.
        """
        das_son_manipulator = DAS_SONManipulator()
        self.mdb.add_son_manipulator(das_son_manipulator)
        msg = "DAS_SONManipulator %s" \
        % das_son_manipulator
        self.logger.debug(msg)

    def similar_queries(self, dasquery):
        """
        Check if we have query results in cache whose conditions are a
        superset of the provided query. The method only works for a single
        key whose value is a substring of the value in the input query.
        For example, if the cache contains records about T1 sites,
        then the input query T1_CH_CERN is a subset of the results stored in cache.
        """
        spec = dasquery.mongo_query.get('spec', {})
        cond = {'query.spec.key': {'$in' : spec.keys()}, 'qhash':dasquery.qhash}
        for row in self.col.find(cond):
            found_query = DASQuery(row['query'])
            if  dasquery.qhash == found_query.qhash:
                msg = "%s similar to %s" % (dasquery, found_query)
                self.logger.info(msg)
                return found_query
        return False
    
    def get_superset_keys(self, key, value):
        """
        This is a special-case version of similar_keys,
        intended for analysers that want to quickly
        find possible superset queries of a simple
        query of the form key=value.
        """
        
        msg = "%s=%s" % (key, value)
        self.logger.debug(msg)
        cond = {'query.spec.key': key}
        for row in self.col.find(cond):
            mongo_query = decode_mongo_query(row['query'])
            for thiskey, thisvalue in mongo_query.iteritems():
                if thiskey == key:
                    if fnmatch.fnmatch(value, thisvalue):
                        yield thisvalue

    def get_fields(self, dasquery):
        "Prepare fields to extract from MongoDB"
        fields     = dasquery.mongo_query.get('fields', None)
        if  fields == ['records']:
            fields = None # look-up all records
        filters    = dasquery.filters
        cond       = {}
        if  filters:
            new_fields = []
            for dasfilter in filters:
                if  dasfilter == 'unique':
                    continue
                if  dasfilter not in fields and \
                    dasfilter not in new_fields:
                    if  dasfilter.find('=') == -1 and dasfilter.find('<') == -1\
                    and dasfilter.find('>') == -1:
                        new_fields.append(dasfilter)
                    else:
                        cond = parse_filters(dasquery.mongo_query)
            if  not new_fields and fields:
                new_fields = list(fields)
            return new_fields, cond
        return fields, cond

    def remove_expired(self, collection):
        """
        Remove expired records from DAS cache.
        """
        timestamp = int(time.time())
        col  = self.mdb[collection]
        spec = {'das.expire' : {'$lt' : timestamp}}
        if  self.verbose:
            nrec = col.find(spec).count()
            msg  = "will remove %s records" % nrec
            msg += ", localtime=%s" % timestamp
            self.logger.debug(msg)
        self.logdb.insert(collection, {'delete': self.col.find(spec).count()})
        col.remove(spec)

    def find(self, dasquery):
        """
        Find provided query in DAS cache.
        """
        cond = {'qhash': dasquery.qhash, 'das.system':'das'}
        return self.col.find_one(cond)

    def find_specs(self, dasquery, system='das'):
        """
        Check if cache has query whose specs are identical to provided query.
        Return all matches.
        """
        cond = {'qhash': dasquery.qhash}
        if  system:
            cond.update({'das.system': system})
        return self.col.find(cond)

    def get_das_ids(self, dasquery):
        """
        Return list of DAS ids associated with given query
        """
        das_ids = []
        try:
            das_ids = \
                [r['_id'] for r in self.col.find_specs(dasquery, system='')]
        except:
            pass
        return das_ids

    def update_das_expire(self, dasquery, timestamp):
        "Update timestamp of all DAS data records for given query"
        nval = {'$set': {'das.expire':timestamp}}
        spec = {'qhash' : dasquery.qhash}
        self.col.update(spec, nval, multi=True, safe=True)
        self.merge.update(spec, nval, multi=True, safe=True)

    def das_record(self, dasquery):
        "Retrieve DAS record for given query"
        return self.col.find_one({'qhash': dasquery.qhash})
    
    def find_records(self, das_id):
        " Return all the records matching a given das_id"
        return self.col.find({'das_id': das_id})

    def add_to_record(self, dasquery, info, system=None):
        "Add to existing DAS record provided info"
        if  system:
            self.col.update({'query': dasquery.storage_query,
                             'das.system':system},
                            {'$set': info}, upsert=True, safe=True)
        else:
            self.col.update({'query': dasquery.storage_query},
                            {'$set': info}, upsert=True, safe=True)

    def update_query_record(self, dasquery, status, header=None):
        "Update DAS record for provided query"
        if  header:
            system = header['das']['system']
            spec1  = {'qhash': dasquery.qhash, 'das.system': 'das'}
            dasrecord = self.col.find_one(spec1)
            spec2  = {'qhash': dasquery.qhash, 'das.system': system}
            sysrecord = self.col.find_one(spec2)
            hexpire = header['das']['expire']
            dexpire = hexpire
            if  dasrecord and dasrecord.has_key('das'):
                dexpire = dasrecord['das'].get('expire', None)
            if  dexpire and hexpire > dexpire:
                expire = dexpire
            else:
                expire = hexpire
            if  sysrecord:
                api  = header['das']['api']
                url  = header['das']['url']
                sapi = sysrecord['das'].get('api', [])
                surl = sysrecord['das'].get('url', [])
                if  set(api) & set(sapi) == set(api) and \
                    set(url) & set(surl) == set(url):
                    self.col.update({'_id':ObjectId(sysrecord['_id'])},
                        {'$set': {'das.expire':expire, 'das.status':status}},
                        safe=True)
                else:
                    self.col.update({'_id':ObjectId(sysrecord['_id'])},
                        {'$pushAll':{'das.api':header['das']['api'],
                                     'das.urn':header['das']['api'],
                                     'das.url':header['das']['url'],
                                     'das.ctime':header['das']['ctime'],
                                    },
                         '$set': {'das.expire':expire, 'das.status':status}},
                        safe=True)
            if  dasrecord:
                self.col.update({'_id':ObjectId(dasrecord['_id'])},
                     {'$set': {'das.expire':expire}}, safe=True)
        else:
            self.col.update({'qhash': dasquery.qhash,
                             'das.system':'das'},
                            {'$set': {'das.status': status}}, safe=True)
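
# A standalone sketch of the expire choice made in update_query_record above:
# the DAS record keeps the smaller of the expire proposed by the data-service
# header and the one already stored in the record (sample timestamps only).
hexpire = 1700000600   # expire from the data-service header
dexpire = 1700000300   # expire already stored in the DAS record
expire = dexpire if dexpire and hexpire > dexpire else hexpire
print(expire)  # 1700000300, the earlier of the two timestamps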

    def incache(self, dasquery, collection='merge', system=None):
        """
        Check if we have query results in cache, otherwise return null.
        Please note, input parameter query means MongoDB query, please
        consult MongoDB API for more details,
        http://api.mongodb.org/python/
        """
        self.remove_expired(collection)
        col  = self.mdb[collection]
        spec = {'qhash':dasquery.qhash}
        if  system:
            spec.update({'das.system': system})
        res  = col.find(spec=spec).count()
        msg  = "(%s, coll=%s) found %s results" % (dasquery, collection, res)
        self.logger.info(msg)
        if  res:
            return True
        return False

    def nresults(self, dasquery, collection='merge'):
        """Return number of results for given query."""
        if  dasquery.aggregators:
            return len(dasquery.aggregators)
        # Distinguish 2 use cases: unique filter and general query.
        # In the first one we should count only unique records, in the latter
        # we can rely on the DB count() method. Please keep in mind that
        # the fields argument of find() does not affect counting, since it
        # is only a projection over records found with spec, so we don't need it.
        col  = self.mdb[collection]
        fields, filter_cond = self.get_fields(dasquery)
        if  not fields:
            spec = dasquery.mongo_query.get('spec', {})
        else:
            spec = {'qhash':dasquery.qhash, 'das.empty_record':0}
        if  filter_cond:
            spec.update(filter_cond)
        if  dasquery.unique_filter:
            skeys = self.mongo_sort_keys(collection, dasquery)
            if  skeys:
                gen = col.find(spec=spec).sort(skeys)
            else:
                gen = col.find(spec=spec)
            res = len([r for r in unique_filter(gen)])
        else:
            res = col.find(spec=spec).count()
        msg = "%s" % res
        self.logger.info(msg)
        return res

    def mongo_sort_keys(self, collection, dasquery):
        """
        Find list of sort keys for a given DAS query. Check existing
        indexes and either use fields or spec keys to find them out.
        Return list of mongo sort keys in a form of (key, order).
        """
        # try to get sort keys all the time to get ordered list of
        # docs which allow unique_filter to apply afterwards
        fields = dasquery.mongo_query.get('fields')
        spec   = dasquery.mongo_query.get('spec')
        skeys  = dasquery.sortkeys
        mongo_skeys = []
        if  skeys:
            for key in skeys:
                if  key.find('-') != -1: # reverse order, e.g. desc
                    mongo_skeys.append((key.replace('-', ''), DESCENDING))
                else:
                    mongo_skeys.append((key, ASCENDING))
        else:
            existing_idx = [i for i in self.existing_indexes(collection)]
            if  fields:
                lkeys = []
                for key in fields:
                    for mkey in self.mapping.mapkeys(key):
                        if  mkey not in lkeys:
                            lkeys.append(mkey)
            else:
                lkeys = spec.keys()
            keys = [k for k in lkeys \
                if k.find('das') == -1 and k.find('_id') == -1 and \
                        k in existing_idx]
            mongo_skeys = [(k, ASCENDING) for k in keys]
        return mongo_skeys
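
# A standalone sketch of the sort-key convention handled above: a '-' in a DAS
# sort key maps to pymongo DESCENDING, otherwise ASCENDING (pymongo is assumed
# to be installed; the helper name is illustrative).
from pymongo import ASCENDING, DESCENDING

def to_mongo_sort(skeys):
    "Translate DAS sort keys, e.g. '-run.run_number' means descending order"
    out = []
    for key in skeys:
        if '-' in key:
            out.append((key.replace('-', ''), DESCENDING))
        else:
            out.append((key, ASCENDING))
    return out

print(to_mongo_sort(['-run.run_number', 'site.name']))
# [('run.run_number', -1), ('site.name', 1)]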

    def existing_indexes(self, collection='merge'):
        """
        Get list of existing indexes in the DB. They are returned by the
        index_information API in the following form:

        .. doctest::

            {u'_id_': {u'key': [(u'_id', 1)], u'v': 0},
             u'das.expire_1': {u'key': [(u'das.expire', 1)], u'v': 0},
             ...
             u'tier.name_-1': {u'key': [(u'tier.name', -1)], u'v': 0}}
        """
        col = self.mdb[collection]
        for val in col.index_information().values():
            for idx in val['key']:
                yield idx[0] # index name

    def get_records(self, col, spec, fields, skeys, idx, limit, unique=False):
        "Generator to get records from MongoDB. It correctly applies"
        if  fields:
            for key in fields: # ensure that fields keys will be presented
                if  key not in self.das_internal_keys and \
                    not spec.has_key(key):
                    spec.update({key: {'$exists':True}})
        try:
            res = col.find(spec=spec, fields=fields)
            if  skeys:
                res = res.sort(skeys)
            if  not unique:
                if  idx:
                    res = res.skip(idx)
                if  limit:
                    res = res.limit(limit)
        except Exception as exp:
            print_exc(exp)
            row = {'exception': str(exp)}
            res = []
            yield row
        if  unique:
            if  limit:
                gen = itertools.islice(unique_filter(res), idx, idx+limit)
            else:
                gen = unique_filter(res)
            for row in gen:
                yield row
        else:
            for row in res:
                yield row
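
# A standalone sketch of the pagination applied above when the unique filter is
# requested: skip/limit cannot be pushed into MongoDB because de-duplication
# happens client-side, so itertools.islice is applied to the filtered stream
# instead (toy data and a toy uniq helper stand in for unique_filter).
import itertools

def uniq(rows):
    "Toy stand-in for unique_filter: drop consecutive duplicates"
    prev = object()
    for row in rows:
        if row != prev:
            yield row
        prev = row

rows = [1, 1, 2, 3, 3, 4, 5]
idx, limit = 1, 2
print(list(itertools.islice(uniq(rows), idx, idx + limit)))  # [2, 3]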

    def get_from_cache(self, dasquery, idx=0, limit=0, collection='merge'):
        "Generator which retrieves results from the cache"
        if  dasquery.service_apis_map(): # valid DAS query
            result = self.get_das_records(dasquery, idx, limit, collection)
        else: # pure MongoDB query
            coll    = self.mdb[collection]
            fields  = dasquery.mongo_query.get('fields', None)
            spec    = dasquery.mongo_query.get('spec', {})
            if  dasquery.filters:
                if  fields == None:
                    fields = dasquery.filters
                else:
                    fields += dasquery.filters
            skeys   = self.mongo_sort_keys(collection, dasquery)
            result  = self.get_records(coll, spec, fields, skeys, \
                            idx, limit, dasquery.unique_filter)
        for row in result:
            yield row

    def get_das_records(self, dasquery, idx=0, limit=0, collection='merge'):
        "Generator which retrieves DAS records from the cache"
        col = self.mdb[collection]
        msg = "(%s, %s, %s, coll=%s)" % (dasquery, idx, limit, collection)
        self.logger.info(msg)

        idx = int(idx)
        fields, filter_cond = self.get_fields(dasquery)
        if  not fields:
            spec = dasquery.mongo_query.get('spec', {})
        else:
            spec = {'qhash':dasquery.qhash, 'das.empty_record':0}
        if  filter_cond:
            spec.update(filter_cond)
        if  fields: # be sure to extract das internal keys
            fields += self.das_internal_keys
        # try to get sort keys all the time to get ordered list of
        # docs which allow unique_filter to apply afterwards
        skeys   = self.mongo_sort_keys(collection, dasquery)
        res     = self.get_records(col, spec, fields, skeys, \
                        idx, limit, dasquery.unique_filter)
        counter = 0
        for row in res:
            counter += 1
            yield row

        if  counter:
            msg = "yield %s record(s)" % counter
            self.logger.info(msg)

        # if no raw records were yielded we look up possible error records
        if  not counter:
            nrec = self.col.find({'qhash':dasquery.qhash}).count()
            if  nrec:
                msg = "for query %s, found %s non-result record(s)" \
                        % (dasquery, nrec)
                prf = 'DAS WARNING, mongocache:get_from_cache '
                print dastimestamp(prf), msg

    def map_reduce(self, mr_input, dasquery, collection='merge'):
        """
        Wrapper around _map_reduce to allow sequential map/reduce
        operations, e.g. map/reduce out of map/reduce. 

        mr_input is either alias name or list of alias names for
        map/reduce functions.

        Input dasquery which is applied to first
        iteration of map/reduce functions.
        """
        # NOTE: I need to revisit mapreduce.
        spec = dasquery.mongo_query['spec']
        if  not isinstance(mr_input, list):
            mrlist = [mr_input]
        else:
            mrlist = mr_input
        coll = self.mdb[collection]
        for mapreduce in mrlist:
            if  mapreduce == mrlist[0]:
                cond = spec
            else:
                cond = None
            coll = self._map_reduce(coll, mapreduce, cond)
        for row in coll.find():
            yield row

    def _map_reduce(self, coll, mapreduce, spec=None):
        """
        Perform map/reduce operation over DAS cache using provided
        collection, mapreduce name and optional conditions.
        """
        self.logger.debug("(%s, %s)" % (mapreduce, spec))
        record = self.mrcol.find_one({'name':mapreduce})
        if  not record:
            raise Exception("Map/reduce function '%s' not found" % mapreduce)
        fmap = record['map']
        freduce = record['reduce']
        if  spec:
            result = coll.map_reduce(Code(fmap), Code(freduce), query=spec)
        else:
            result = coll.map_reduce(Code(fmap), Code(freduce))
        msg = "found %s records in %s" % (result.count(), result.name)
        self.logger.info(msg)
        self.logger.debug(fmap)
        self.logger.debug(freduce)
        return result

    def get_map_reduce(self, name=None):
        """
        Return definition of map/reduce functions for provided name
        or gives full list.
        """
        spec = {}
        if  name:
            spec = {'name':name}
        result = self.mrcol.find(spec)
        for row in result:
            yield row

    def merge_records(self, dasquery):
        """
        Merge DAS records for provided query. We perform the following
        steps:
        1. get all queries from das.cache by ordering them by primary key
        2. run the aggregator function to merge neighbors
        3. insert records into das.merge
        """
        self.logger.debug(dasquery)
        id_list = []
        expire  = 9999999999 # future
        # get all API records for given DAS query
        spec    = {'qhash':dasquery.qhash, 'query':{'$exists':True}}
        records = self.col.find(spec)
        for row in records:
            # find smallest expire timestamp to be used by aggregator
            if  row['das']['expire'] < expire:
                expire = row['das']['expire']
            if  row['_id'] not in id_list:
                id_list.append(row['_id'])
        inserted = 0
        lookup_keys = set()
        fields = dasquery.mongo_query.get('fields')
        if  not fields: # Mongo
            fields = []
        for key in fields:
            for pkey in self.mapping.mapkeys(key):
                lookup_keys.add(pkey)
        for pkey in lookup_keys:
            skey = [(pkey, DESCENDING)]
            # lookup all service records
            spec = {'das_id': {'$in': id_list}, 'das.primary_key': pkey}
            if  self.verbose:
                nrec = self.col.find(spec).sort(skey).count()
                msg  = "merging %s records, for %s key" % (nrec, pkey) 
            else:
                msg  = "merging records, for %s key" % pkey
            self.logger.debug(msg)
            records = self.col.find(spec).sort(skey)
            # aggregate all records
            agen = aggregator(dasquery, records, expire)
            # diff aggregated records
            gen  = das_diff(agen, self.mapping.diff_keys(pkey.split('.')[0]))
            # insert all records into das.merge using bulk insert
            size = self.cache_size
            try:
                while True:
                    nres = self.merge.insert(\
                        itertools.islice(gen, size), safe=True)
                    if  nres and isinstance(nres, list):
                        inserted += len(nres)
                    else:
                        break
            except InvalidDocument as exp:
                msg = "Caught bson error: " + str(exp)
                self.logger.info(msg)
                records = self.col.find(spec).sort(skey)
                gen = aggregator(dasquery, records, expire)
                genrows = parse2gridfs(self.gfs, pkey, gen, self.logger)
                das_dict = {'das':{'expire':expire, 'empty_record': 0,
                        'primary_key':[k for k in lookup_keys],
                        'system': ['gridfs']}, 'qhash':dasquery.qhash,
                        'cache_id':[], 'das_id': id_list}
                for row in genrows:
                    row.update(das_dict)
                    self.merge.insert(row, safe=True)
            except InvalidOperation:
                pass
        if  inserted:
            self.logdb.insert('merge', {'insert': inserted})
        elif  not lookup_keys: # we get query w/o fields
            pass
        else: # we didn't merge anything, it is DB look-up failure
            empty_expire = time.time() + 20 # secs, short enough to expire
            empty_record = {'das':{'expire':empty_expire,
                                   'primary_key':list(lookup_keys),
                                   'empty_record': 1},
                            'cache_id':[], 'das_id': id_list}
            for key, val in dasquery.mongo_query['spec'].iteritems():
                if  key.find('.') == -1:
                    empty_record[key] = []
                else: # it is compound key, e.g. site.name
                    newkey, newval = convert_dot_notation(key, val)
                    empty_record[newkey] = adjust_mongo_keyvalue(newval)
            self.merge.insert(empty_record, safe=True)
            # update DAS records (both meta and data ones, by using qhash)
            nval = {'$set': {'das.expire':empty_expire}}
            spec = {'qhash':dasquery.qhash}
            self.col.update(spec, nval, multi=True, safe=True)

    def update_cache(self, dasquery, results, header):
        """
        Insert results into cache. Use bulk inserts controlled by
        self.cache_size. Upon completion ensure indexes.
        """
        # insert/check query record in DAS cache
        self.insert_query_record(dasquery, header)

        # update results records in DAS cache
        gen  = self.generate_records(dasquery, results, header)
        inserted = 0
        # bulk insert
        try:
            while True:
                nres = self.col.insert(\
                        itertools.islice(gen, self.cache_size), safe=True)
                if  nres and isinstance(nres, list):
                    inserted += len(nres)
                else:
                    break
        except InvalidOperation:
            pass
        if  inserted:
            self.logdb.insert('cache', {'insert': inserted})
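
# A standalone sketch of the chunked bulk-insert pattern used above:
# itertools.islice consumes the record generator in slices of cache_size until
# nothing is left (a counter stands in for the real col.insert call).
import itertools

def insert_in_chunks(gen, size):
    "Toy stand-in for repeated col.insert(itertools.islice(gen, size)) calls"
    inserted = 0
    while True:
        chunk = list(itertools.islice(gen, size))
        if not chunk:
            break
        inserted += len(chunk)
    return inserted

print(insert_in_chunks(iter(range(7)), 3))  # 7, consumed as chunks of 3, 3, 1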

    def insert_query_record(self, dasquery, header):
        """
        Insert query record into DAS cache.
        """
        dasheader  = header['das']
        # check presence of API record in a cache
        system     = dasheader['system']
        if  not self.incache(dasquery, collection='cache', system=system):
            msg = "query=%s, header=%s" % (dasquery, header)
            self.logger.debug(msg)
            q_record = dict(das=dasheader, query=dasquery.storage_query)
            q_record['das']['empty_record'] = 0
            q_record['das']['status'] = "requested"
            q_record['qhash'] = dasquery.qhash
            self.col.insert(q_record, safe=True)

    def generate_records(self, dasquery, results, header):
        """
        Iterate over provided results, update records and yield them
        to the next level (update_cache)
        """
        self.logger.debug("(%s) store to cache" % dasquery)
        if  not results:
            return
        # update das record with new status
        status = 'Update DAS cache, %s API' % header['das']['api'][0]
        self.update_query_record(dasquery, status, header)

        dasheader  = header['das']
        expire     = dasheader['expire']
        system     = dasheader['system']
        rec        = [k for i in header['lookup_keys'] for k in i.values()]
        cond_keys  = dasquery.mongo_query['spec'].keys()
        # get API record id
        spec       = {'qhash':dasquery.qhash, 'das.system':system}
        record     = self.col.find_one(spec, fields=['_id'])
        counter    = 0
        prim_key   = rec[0][0]#use rec instead of lkeys[0] which re-order items
        if  record:
            objid  = record['_id']
            if  isinstance(results, list) or isinstance(results, GeneratorType):
                for item in results:
                    counter += 1
                    item['das'] = dict(expire=expire, primary_key=prim_key,
                                       condition_keys=cond_keys,
                                       instance=dasquery.instance,
                                       system=system, empty_record=0)
                    item['das_id'] = str(objid)
                    item['qhash'] = dasquery.qhash
                    yield item
            else:
                print "\n\n ### results = ", str(results)
                raise Exception('Provided results is not a list/generator type')
        self.logger.info("\n")
        msg = "%s yield %s rows" % (dasheader['system'], counter)
        self.logger.info(msg)

    def remove_from_cache(self, dasquery):
        """
        Remove query from DAS cache. To do so, we retrieve API record
        and remove all data records from das.cache and das.merge
        """
        records = self.col.find({'qhash':dasquery.qhash})
        id_list = []
        for row in records:
            if  row['_id'] not in id_list:
                id_list.append(row['_id'])
        spec = {'das_id':{'$in':id_list}}
        self.logdb.insert('merge', {'delete': self.merge.find(spec).count()})
        self.merge.remove(spec)
        self.logdb.insert('cache', {'delete': self.col.find(spec).count()})
        self.col.remove(spec)
        self.col.remove({'qhash':dasquery.qhash})

    def clean_cache(self):
        """
        Clean expired docs in das.cache and das.merge. 
        """
        current_time = time.time()
        query = {'das.expire': { '$lt':current_time} }
        self.logdb.insert('merge', {'delete': self.merge.find(query).count()})
        self.merge.remove(query)
        self.logdb.insert('cache', {'delete': self.col.find(query).count()})
        self.col.remove(query)

    def delete_cache(self):
        """
        Delete all results in DAS cache/merge collection, including
        internal indexes.
        """
        self.logdb.insert('cache', {'delete': self.col.count()})
        self.col.remove({})
        try: 
            self.col.drop_indexes()
        except:
            pass
        self.logdb.insert('merge', {'delete': self.merge.count()})
        self.merge.remove({})
        try:
            self.merge.drop_indexes()
        except:
            pass
Ejemplo n.º 7
0
class DASParserDB(object):
    """
    Caching layer for the PLY parser.
    """
    def __init__(self, config):
        self.verbose  = config['verbose']
        self.logger   = PrintManager('DASParserDB', self.verbose)
        self.dburi    = config['mongodb']['dburi']
        self.dbname   = config['parserdb']['dbname']
        self.sizecap  = config['parserdb'].get('sizecap', 5*1024*1024)
        self.colname  = config['parserdb']['collname']
        
        msg = "DASParserCache::__init__ %s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)
        
        self.col = None
        self.create_db()

    def create_db(self):
        """
        Create db collection
        """
        conn = db_connection(self.dburi)
        dbn  = conn[self.dbname]
        if  self.colname not in dbn.collection_names():
            dbn.create_collection(self.colname, capped=True, size=self.sizecap)
        self.col = dbn[self.colname]

    def lookup_query(self, rawtext):
        """
        Check the parser cache for a given rawtext query.
        Search is done with the hash of this string.
        
        Returns a tuple (status, value) for the cases
        (PARSERCACHE_VALID, mongo_query) - valid query found
        (PARSERCACHE_INVALID, error) - error message for invalid query
        (PARSERCACHE_NOTFOUND, None) - not in the cache
        """
        result = self.col.find_one({'hash':genkey(rawtext)},
                        fields=['query', 'error'])

        if result and result['query']:
            if self.verbose:
                self.logger.debug("DASParserCache: found valid %s->%s" %\
                                  (rawtext, result['query']))
            
            query = decode_mongo_query(result['query'])
            return (PARSERCACHE_VALID, query)
        elif result and result['error']:
            if self.verbose:
                self.logger.debug("DASParserCache: found invalid %s->%s" %\
                                  (rawtext, result['error']))
            return (PARSERCACHE_INVALID, result['error'])
        else:
            if self.verbose:
                self.logger.debug("DASParserCache: not found %s" %\
                                  (rawtext))
            return (PARSERCACHE_NOTFOUND, None)

    def insert_valid_query(self, rawtext, query):
        "Insert a query that was successfully transformed"	
        self._insert_query(rawtext, query, None)

    def insert_invalid_query(self, rawtext, error):
        "Insert the error message for an invalid query"
        self._insert_query(rawtext, None, error)

    def _insert_query(self, rawtext, query, error):
        """Internal method to insert a query"""
        if  self.verbose:
            self.logger.debug("DASParserCache: insert %s->%s/%s" %\
                              (rawtext, query, error))
        # since MongoDB does not support insertion of $ sign in queries
        # we need to encode inserted query
        if  query:
            encquery = encode_mongo_query(query)
        else:
            encquery = ""
        self.col.insert({'raw':rawtext, 'hash':genkey(rawtext),
                         'query':encquery, 'error':str(error)})
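
# A hedged usage sketch for the parser cache above: entries are keyed by a hash
# of the raw query text. hashlib.md5 is shown here only as a stand-in for the
# DAS genkey() helper, whose actual implementation may differ.
import hashlib

def toy_genkey(rawtext):
    "Hypothetical stand-in for genkey()"
    return hashlib.md5(rawtext.encode('utf-8')).hexdigest()

print(toy_genkey('file dataset=/a/b/c'))  # deterministic key used for look-ups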
Ejemplo n.º 8
0
class DASMongocache(object):
    """
    DAS cache based MongoDB.
    """
    def __init__(self, config):
        self.config  = config
        self.emptyset_expire = \
                expire_timestamp(config['das'].get('emptyset_expire', 5))
        self.dburi   = config['mongodb']['dburi']
        self.cache_size = config['mongodb']['bulkupdate_size']
        self.dbname  = config['dasdb']['dbname']
        self.verbose = config['verbose']
        self.logger  = PrintManager('DASMongocache', self.verbose)
        self.mapping = config['dasmapping']
        self.logging = config['dasdb'].get('logging', False)
        self.rec_ttl = config['dasdb'].get('record_ttl', 24*60*60)
        self.del_ttl = config['dasdb'].get('delta_ttl', 60)
        self.cleanup_del_ttl = config['dasdb'].get('cleanup_delta_ttl', 3600)
        self.retry   = config['dasdb'].get('retry', 3)
        self.das_son_manipulator = DAS_SONManipulator()

        # Initialize MongoDB connection
        self.col_    = self.config['dasdb']['cachecollection']
        self.mrcol_  = self.config['dasdb']['mrcollection']
        self.merge_  = self.config['dasdb']['mergecollection']
        self.gfs     = db_gridfs(self.dburi)

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        # ensure that we have the following indexes
        common_idx = [
                      ('file.name', DESCENDING),
                      ('dataset.name', DESCENDING),
                      ('block.name', DESCENDING),
                      ('run.run_number', DESCENDING),
                      ]
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('das.system', ASCENDING),
                      ('qhash', DESCENDING),
                      ('das.record', ASCENDING)]
        create_indexes(self.col, index_list + common_idx)
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('qhash', DESCENDING),
                      ('das.record', ASCENDING),
                      ('das.ts', ASCENDING)]
        create_indexes(self.merge, index_list)
        # NOTE: I found that creating an index on the merge collection leads
        # to a MongoDB error when records contain multiple arrays on indexed
        # keys. For example, when we query file,run,lumi both file and run
        # are arrays in MongoDB. In this case the final sort in MongoDB
        # barks with the following message:
        # cannot sort with keys that are parallel arrays
        # it looks like there is no fix for that yet, see
        # http://stackoverflow.com/questions/6516725/how-do-i-index-two-arrays-in-mongodb
        # Therefore I temporarily disabled the create_indexes call on the
        # merge collection, which was used to have an index to ease the final
        # sort, especially in a case when a lot of records correspond to the
        # initial query, e.g. file records.
        # On the other hand, the most common use case where the sort fails is
        # getting file records, and I can add one compound key to ease the
        # sort, but I can't add another compound key on an array field,
        # e.g. run.
        common_idx = [[('qhash', DESCENDING), ('file.name', DESCENDING)]]
        create_indexes(self.merge, index_list + common_idx)

        # thread which cleans up DAS collections
        thname = 'mongocache_cleanup'
        cols   = [config['dasdb']['cachecollection'],
                  config['dasdb']['mrcollection'],
                  config['dasdb']['mergecollection']]

    @property
    def col(self):
        "col property provides access to DAS cache collection"
        conn = db_connection(self.dburi)
        mdb  = conn[self.dbname]
        colnames = mdb.collection_names()
        if  not colnames or self.col_ not in colnames:
            try:
                mdb.create_collection(self.col_)
            except OperationFailure:
                pass
        mdb.add_son_manipulator(self.das_son_manipulator)
        return mdb[self.col_]

    @property
    def merge(self):
        "merge property provides access to DAS merge collection"
        conn = db_connection(self.dburi)
        mdb  = conn[self.dbname]
        colnames = mdb.collection_names()
        if  not colnames or self.merge_ not in colnames:
            try:
                mdb.create_collection(self.merge_)
            except OperationFailure:
                pass
        mdb.add_son_manipulator(self.das_son_manipulator)
        return mdb[self.merge_]

    @property
    def mrcol(self):
        "mrcol property provides access to DAS map-reduce collection"
        conn = db_connection(self.dburi)
        mdb  = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        return mdb[self.mrcol_]

    def get_dataset_hashes(self, dasquery):
        "Get dataset hashes from DBS database"
        spec = dasquery.mongo_query.get('spec', {})
        inst = dasquery.instance
        conn = db_connection(self.dburi)
        if  spec and inst:
            dataset = spec.get('dataset.name', None)
            if  dataset:
                if  dataset.find('*') != -1:
                    cond = {'dataset':re.compile(dataset.replace('*', '.*'))}
                else:
                    cond = {'dataset': dataset}
                for row in conn['dbs'][inst].find(cond):
                    if  'qhash' in row:
                        yield row['qhash']

    def check_datasets(self, dasquery):
        "Check dataset presence in DAS cache for given das query"
        hashes = [r for r in self.get_dataset_hashes(dasquery)]
        if  hashes:
            spec = {'qhash': {'$in': hashes}}
            if  len(hashes) == self.merge.find(spec, **PYMONGO_OPTS).count():
                dasquery._hashes = hashes
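
# A standalone sketch of the wildcard handling used in get_dataset_hashes
# above: a '*' in the dataset pattern is rewritten as '.*' and compiled into a
# regular expression for the MongoDB condition (sample pattern only).
import re

dataset = '/Zmm*/RECO'
cond = {'dataset': re.compile(dataset.replace('*', '.*'))}
print(bool(cond['dataset'].match('/ZmmRunA/RECO')))  # True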

    def get_superset_keys(self, key, value):
        """
        This is a special-case version of similar_keys,
        intended for analysers that want to quickly
        find possible superset queries of a simple
        query of the form key=value.
        """

        msg = "%s=%s" % (key, value)
        self.logger.debug(msg)
        cond = {'query.spec.key': key}
        for row in self.col.find(cond, **PYMONGO_OPTS):
            mongo_query = decode_mongo_query(row['query'])
            for thiskey, thisvalue in mongo_query.items():
                if thiskey == key:
                    if fnmatch.fnmatch(value, thisvalue):
                        yield thisvalue
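
# A minimal standalone sketch of the fnmatch check above: a cached wildcard
# value such as '/Zmm*' is reported as a possible superset of a concrete
# key=value look-up (the cached values here are hypothetical).
import fnmatch

cached_values = ['/Zmm*', '/Zee*/RAW', '*']
value = '/Zmm/RunA/RECO'
print([v for v in cached_values if fnmatch.fnmatch(value, v)])
# ['/Zmm*', '*']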

    def get_fields(self, dasquery):
        "Prepare fields to extract from MongoDB"
        fields     = dasquery.mongo_query.get('fields', [])
        if  fields and 'records' in fields:
            fields = None # look-up all records
        filters    = dasquery.filters
        cond       = {}
        if  filters:
            new_fields = []
            for dasfilter in filters:
                if  dasfilter == 'unique':
                    continue
                if  fields and dasfilter not in fields and \
                    dasfilter not in new_fields:
                    if  dasfilter.find('=') == -1 and dasfilter.find('<') == -1\
                    and dasfilter.find('>') == -1:
                        new_fields.append(dasfilter)
                    else:
                        cond = parse_filters(dasquery.mongo_query)
            if  not new_fields and fields:
                new_fields = list(fields)
            return new_fields, cond
        return fields, cond

    def remove_expired(self, dasquery, collection):
        """
        Remove expired records from DAS cache. We need to perform this
        operation very carefully since we don't use transactions and on-going
        commits can invoke this method (see das_core.py). Therefore we wipe
        out queries which match the DASQuery hash and have already expired,
        or queries which lived in the cache for more than the rec_ttl
        config parameter. The latter operation just prevents the DAS cache
        from growing.
        """
        conn   = db_connection(self.dburi)
        mdb    = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        col    = mdb[collection]
        # use additional delta to check data record expiration
        # we add this delta to ensure that there is no records close to
        # current timestamp which may expire during request processing
        spec = {'qhash':dasquery.qhash,
                'das.expire':{'$lt':time.time()+self.del_ttl}}
        col.delete_many(spec)
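
# A standalone sketch (toy records, illustrative del_ttl) of the expiration
# check used above: records whose das.expire falls below "now + delta" are
# treated as expired, the delta guarding against records that would expire
# while the request is still being processed.
import time

del_ttl = 60  # seconds, illustrative value
now = time.time()
records = [{'das': {'expire': now - 10}},
           {'das': {'expire': now + 30}},
           {'das': {'expire': now + 3600}}]
expired = [r for r in records if r['das']['expire'] < now + del_ttl]
print(len(expired))  # 2: the already-expired record and the one within delta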

    def check_services(self, dasquery):
        """
        Check if DAS cache contains DAS records with service response for
        given query.
        """
        das_rec  = self.find(dasquery)
        if  not das_rec:
            return False
        if  'das' not in das_rec:
            return False
        if  'services' not in das_rec['das']:
            return False
        spec = {'qhash':dasquery.qhash, 'das.system':{'$ne':'das'},
                'das.expire':{'$gt':time.time()}}
        nres = self.col.find(spec, **PYMONGO_OPTS).count()
        if  nres:
            return True
        return False

    def find(self, dasquery):
        """
        Find provided query in DAS cache.
        """
        cond = {'qhash': dasquery.qhash, 'das.system':'das',
                'das.expire': {'$gt':time.time()}}
        return find_one(self.col, cond)

    def find_specs(self, dasquery, system='das'):
        """
        Check if cache has query whose specs are identical to provided query.
        Return all matches.
        """
        if dasquery.hashes:
            cond = {'qhash':{'$in':dasquery.hashes}}
        else:
            cond = {'qhash': dasquery.qhash}
        if  system:
            cond.update({'das.system': system})
        cond.update({'das.expire':{'$gt':time.time()}})
        return self.col.find(cond, **PYMONGO_OPTS)

    def get_das_ids(self, dasquery):
        """
        Return list of DAS ids associated with given query
        """
        das_ids = []
        try:
            das_ids = \
                [r['_id'] for r in self.find_specs(dasquery, system='')]
        except:
            pass
        return das_ids

    def update_das_expire(self, dasquery, timestamp):
        "Update timestamp of all DAS data records for given query"
        nval = {'$set': {'das.expire':timestamp}}
        spec = {'qhash' : dasquery.qhash}
        self.col.update_many(spec, nval)
        self.merge.update_many(spec, nval)

    def das_record(self, dasquery):
        "Retrieve DAS record for given query"
        cond = {'qhash': dasquery.qhash, 'das.expire':{'$gt':time.time()}}
        return find_one(self.col, cond)

    def find_records(self, das_id):
        " Return all the records matching a given das_id"
        return self.col.find({'das_id': das_id}, **PYMONGO_OPTS)

    def is_error_in_records(self, dasquery, collection='cache'):
        "Scan DAS cache for error records and return true or not"
        if  collection == 'cache':
            results = self.col.find({'qhash':dasquery.qhash}, **PYMONGO_OPTS)
        else:
            results = self.merge.find({'qhash':dasquery.qhash}, **PYMONGO_OPTS)
        error  = None
        reason = None
        for row in results:
            if 'error' in row:
                error  = row.get('error')
                reason = row.get('reason', '')
                break
        return error, reason

    def add_to_record(self, dasquery, info, system=None):
        "Add to existing DAS record provided info"
        if  system:
            self.col.update_one({'query': dasquery.storage_query,
                             'das.system':system},
                            {'$set': info}, upsert=True)
        else:
            self.col.update_one({'query': dasquery.storage_query},
                            {'$set': info}, upsert=True)

    def find_min_expire(self, dasquery):
        """Find minimal expire timestamp across all records for given DAS query"""
        spec   = {'qhash': dasquery.qhash}
        min_expire = 2*time.time() # upper bound, will update
        for rec in self.col.find(spec, **PYMONGO_OPTS):
            if  'das' in rec and 'expire' in rec['das']:
                estamp = rec['das']['expire']
                if  min_expire > estamp:
                    min_expire = estamp
        return long(min_expire)

    def find_query_record(self, dasquery):
        "Find DAS query records and return them to the caller"
        spec = {'qhash':dasquery.qhash,
                'das.record':record_codes('query_record')}
        return self.col.find(spec, **PYMONGO_OPTS)

    def update_query_record(self, dasquery, status, header=None, reason=None):
        "Update DAS record for provided query"
        ctime = time.time()
        das_spec = {'qhash': dasquery.qhash, 'das.system':'das'}
        min_expire = self.find_min_expire(dasquery)
        if  header:
            system = header['das']['system']
            sts    = header['das']['status']
            expire = header['das']['expire']
            spec   = {'qhash': dasquery.qhash, 'das.system': system}
            new_expire = None
            for rec in self.col.find(spec, **PYMONGO_OPTS):
                if  'das' in rec and 'expire' in rec['das']:
                    if  rec['das']['expire'] > expire:
                        new_expire = expire
                        ndict = {'das.expire':expire, 'das.status':status}
                        cdict = {'das.ctime':ctime}
                        udict = {'$set':ndict, '$push':cdict}
                        oid   = ObjectId(rec['_id'])
                        self.col.update_one({'_id':oid}, udict)
            if  new_expire:
                udict = {'$set': {'das.expire': new_expire},
                         '$push': {'das.ctime':ctime}}
                self.col.update_one(das_spec, udict)
        else:
            udict = {'$set': {'das.status':status, 'das.expire': min_expire},
                     '$push': {'das.ctime':ctime}}
            self.col.update_one(das_spec, udict)
        if  reason:
            udict = {'$set': {'das.reason':reason}}
            self.col.update_one(das_spec, udict)
        # align all expire timestamps when we receive ok status
        if  status == 'ok':
            udict = {'$set': {'das.expire': min_expire}}
            self.col.update_one(das_spec, udict)

    def apilist(self, dasquery):
        "Return list of apis for given dasquery"
        spec = {'qhash':dasquery.qhash,
                'das.record':record_codes('query_record')}
        apis = []
        for row in self.col.find(spec, ['das.api'], **PYMONGO_OPTS):
            try:
                apis += row['das']['api']
            except Exception as _err:
                pass
        return apis

    def incache(self, dasquery, collection='merge', system=None, api=None,
            query_record=False):
        """
        Check if we have query results in cache, otherwise return null.
        Please note, input parameter query means MongoDB query, please
        consult MongoDB API for more details,
        http://api.mongodb.org/python/
        """
        if  query_record:
            record = record_codes('query_record')
        else:
            record = spec4data_records()
        spec = {'qhash':dasquery.qhash, 'das.record':record,
                'das.expire':{'$gt':time.time()}}
        if  system:
            spec.update({'das.system': system})
        if  api:
            spec.update({'das.api': api})
        conn = db_connection(self.dburi)
        mdb  = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        col  = mdb[collection]
        res  = col.find(spec, **PYMONGO_OPTS).count()
        msg  = "(%s, coll=%s) found %s results" % (dasquery, collection, res)
        self.logger.info(msg)
        if  res:
            return True
        return False

    def nresults(self, dasquery, collection='merge'):
        """Return number of results for given query."""
        if  dasquery.aggregators:
            return len(dasquery.aggregators)
        # Distinguish 2 use cases: unique filter and general query.
        # In the first one we should count only unique records, in the latter
        # we can rely on the DB count() method. Please keep in mind that
        # the fields argument of find() does not affect counting, since it
        # is only a projection over records found with spec, so we don't need it.
        fields, filter_cond = self.get_fields(dasquery)
        if  not fields:
            spec = dasquery.mongo_query.get('spec', {})
        elif dasquery.hashes:
            spec = {'qhash':{'$in':dasquery.hashes},
                    'das.record': spec4data_records()}
        else:
            spec = {'qhash':dasquery.qhash,
                    'das.record': spec4data_records()}
        if  filter_cond:
            spec.update(filter_cond)
        conn = db_connection(self.dburi)
        mdb  = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        col  = mdb[collection]
        if  dasquery.unique_filter:
            skeys = self.mongo_sort_keys(collection, dasquery)
            if  skeys:
                gen = col.find(spec, **PYMONGO_OPTS).sort(skeys)
            else:
                gen = col.find(spec, **PYMONGO_OPTS)
            res = len([r for r in unique_filter(gen)])
        else:
            res = col.find(spec, **PYMONGO_OPTS).count()
            if  not res: # double check that this is really the case
                time.sleep(1)
                res = col.find(spec, **PYMONGO_OPTS).count()
        msg = "%s" % res
        self.logger.info(msg)
        return res

    def mongo_sort_keys(self, collection, dasquery):
        """
        Find list of sort keys for a given DAS query. Check existing
        indexes and either use fields or spec keys to find them out.
        Return list of mongo sort keys in a form of (key, order).
        """
        # try to get sort keys all the time to get ordered list of
        # docs which allow unique_filter to apply afterwards
        fields = dasquery.mongo_query.get('fields')
        spec   = dasquery.mongo_query.get('spec')
        skeys  = dasquery.sortkeys
        mongo_skeys = []
        if  skeys:
            for key in skeys:
                if  key.find('-') != -1: # reverse order, e.g. desc
                    mongo_skeys.append((key.replace('-', ''), DESCENDING))
                else:
                    mongo_skeys.append((key, ASCENDING))
        else:
            existing_idx = [i for i in self.existing_indexes(collection)]
            if  fields:
                lkeys = []
                for key in fields:
                    for mkey in self.mapping.mapkeys(key):
                        if  mkey not in lkeys:
                            lkeys.append(mkey)
            else:
                lkeys = list(spec.keys())
            keys = [k for k in lkeys \
                if k.find('das') == -1 and k.find('_id') == -1 and \
                        k in existing_idx]
            mongo_skeys = [(k, ASCENDING) for k in keys]
        return mongo_skeys

    def existing_indexes(self, collection='merge'):
        """
        Get list of existing indexes in the DB. They are returned by the
        index_information API in the following form:

        .. doctest::

            {u'_id_': {u'key': [(u'_id', 1)], u'v': 0},
             u'das.expire_1': {u'key': [(u'das.expire', 1)], u'v': 0},
             ...
             u'tier.name_-1': {u'key': [(u'tier.name', -1)], u'v': 0}}
        """
        conn = db_connection(self.dburi)
        mdb  = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        col = mdb[collection]
        for val in col.index_information().values():
            for idx in val['key']:
                yield idx[0] # index name
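
# A standalone sketch of how index key names are pulled out of the
# index_information() structure shown in the docstring above (sample dict, no
# live DB connection).
index_info = {
    '_id_': {'key': [('_id', 1)], 'v': 0},
    'das.expire_1': {'key': [('das.expire', 1)], 'v': 0},
    'tier.name_-1': {'key': [('tier.name', -1)], 'v': 0},
}
names = [idx[0] for val in index_info.values() for idx in val['key']]
print(sorted(names))  # ['_id', 'das.expire', 'tier.name']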

    def get_records(self, coll, spec, fields, skeys, idx, limit, unique=False):
        "Generator to get records from MongoDB."
        try:
            conn = db_connection(self.dburi)
            mdb  = conn[self.dbname]
            mdb.add_son_manipulator(self.das_son_manipulator)
            col = mdb[coll]
            nres = col.find(spec, **PYMONGO_OPTS).count()
            if  nres == 1 or nres <= limit:
                limit = 0
            if  limit:
                res = col.find(spec, fields, sort=skeys, skip=idx, limit=limit)
            else:
                res = col.find(spec, fields, sort=skeys, **PYMONGO_OPTS)
            if  unique:
                res = unique_filter(res)
            for row in res:
                yield row
        except Exception as exp:
            print_exc(exp)
            row = {'exception': str(exp)}
            res = []
            yield row

    def get_from_cache(self, dasquery, idx=0, limit=0, collection='merge'):
        "Generator which retrieves results from the cache"
        if  dasquery.service_apis_map(): # valid DAS query
            result = self.get_das_records(dasquery, idx, limit, collection)
            for row in result:
                yield row
        else: # pure MongoDB query
            fields  = dasquery.mongo_query.get('fields', [])
            if  fields == None:
                fields = []
            spec    = dasquery.mongo_query.get('spec', {})
            if  dasquery.filters:
                if  not fields:
                    fields = []
                fields += dasquery.filters
                pkeys   = [k.split('.')[0] for k in fields]
            fields += das_record_keys()
            if  'records' in dasquery.query:
                fields = None # special case for DAS 'records' keyword
            skeys   = self.mongo_sort_keys(collection, dasquery)
            result  = self.get_records(collection, spec, fields, skeys, \
                            idx, limit, dasquery.unique_filter)
            for row in result:
                if  dasquery.filters:
                    if  pkeys and set(pkeys) & set(row.keys()):
                        yield row
                else:
                    yield row

    def get_das_records(self, dasquery, idx=0, limit=0, collection='merge'):
        "Generator which retrieves DAS records from the cache"
        msg = "(%s, %s, %s, coll=%s)" % (dasquery, idx, limit, collection)
        self.logger.info(msg)

        idx = int(idx)
        fields, filter_cond = self.get_fields(dasquery)
        if  fields == None:
            fields = []
        if  not fields:
            spec = dasquery.mongo_query.get('spec', {})
        elif dasquery.hashes:
            spec = {'qhash':{'$in':dasquery.hashes},
                    'das.record': spec4data_records()}
        else:
            spec = {'qhash':dasquery.qhash,
                    'das.record': spec4data_records()}
        if  filter_cond:
            spec.update(filter_cond)
        if  'records' in dasquery.query:
            fields  = None # retrieve all fields for records DAS query
        else:
            # be sure to extract das internal keys
            fields += das_record_keys()
        # try to get sort keys all the time to get ordered list of
        # docs which allow unique_filter to apply afterwards
        skeys   = self.mongo_sort_keys(collection, dasquery)
        res     = self.get_records(collection, spec, fields, skeys, \
                        idx, limit, dasquery.unique_filter)
        counter = 0
        for row in res:
            counter += 1
            yield row
        msg = 'qhash %s, found %s record(s) in %s collection' \
                % (dasquery.qhash, counter, collection)
        print(dastimestamp('DAS INFO '), msg)

        if  counter:
            msg = "yield %s record(s)" % counter
            self.logger.info(msg)

        # if no raw records were yielded we look up possible error records
        # and reset the timestamp for the record with system:['das']
        if  not counter:
            spec = {'qhash':dasquery.qhash}
            nrec = self.col.find(spec, **PYMONGO_OPTS).count()
            if  nrec:
                msg = "for query %s, found %s non-result record(s)" \
                        % (dasquery, nrec)
                print(dastimestamp('DAS WARNING'), msg)
                for rec in self.col.find(spec, **PYMONGO_OPTS):
                    if  'query' in rec:
                        print(dastimestamp('DAS das record'), rec)
            self.update_das_expire(dasquery, etstamp())

    def map_reduce(self, mr_input, dasquery, collection='merge'):
        """
        Wrapper around _map_reduce to allow sequential map/reduce
        operations, e.g. map/reduce out of map/reduce.

        mr_input is either alias name or list of alias names for
        map/reduce functions.

        Input dasquery which is applied to first
        iteration of map/reduce functions.
        """
        # NOTE: I need to revisit mapreduce.
        spec = dasquery.mongo_query['spec']
        if  not isinstance(mr_input, list):
            mrlist = [mr_input]
        else:
            mrlist = mr_input
        conn = db_connection(self.dburi)
        mdb  = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        coll = mdb[collection]
        for mapreduce in mrlist:
            if  mapreduce == mrlist[0]:
                cond = spec
            else:
                cond = None
            coll = self._map_reduce(coll, mapreduce, cond)
        for row in coll.find():
            yield row

    def _map_reduce(self, coll, mapreduce, spec=None):
        """
        Perform map/reduce operation over DAS cache using provided
        collection, mapreduce name and optional conditions.
        """
        self.logger.debug("(%s, %s)" % (mapreduce, spec))
        record = find_one(self.mrcol, {'name':mapreduce})
        if  not record:
            raise Exception("Map/reduce function '%s' not found" % mapreduce)
        fmap = record['map']
        freduce = record['reduce']
        if  spec:
            result = coll.map_reduce(Code(fmap), Code(freduce), query=spec)
        else:
            result = coll.map_reduce(Code(fmap), Code(freduce))
        msg = "found %s records in %s" % (result.count(), result.name)
        self.logger.info(msg)
        self.logger.debug(fmap)
        self.logger.debug(freduce)
        return result

    def get_map_reduce(self, name=None):
        """
        Return definition of map/reduce functions for provided name
        or gives full list.
        """
        spec = {}
        if  name:
            spec = {'name':name}
        result = self.mrcol.find(spec, **PYMONGO_OPTS)
        for row in result:
            yield row

    def merge_records(self, dasquery, attempt=0):
        """
        Merge DAS records for provided query. We perform the following
        steps:
        1. get all queries from das.cache by ordering them by primary key
        2. run the aggregator function to merge neighbors
        3. insert records into das.merge
        """
        ### TMP for asyncio
#         time.sleep(attempt+3) # pymongo 3.2 don't yet flush in time

        # remove any entries in merge collection for this query
        self.merge.delete_many({'qhash':dasquery.qhash})
        # proceed
        self.logger.debug(dasquery)
        id_list = []
        expire  = 9999999999 # future
        # get all API records for given DAS query
        spec    = {'qhash':dasquery.qhash,
                   'das.expire':{'$gt':time.time()},
                   'das.record':record_codes('query_record')}
        records = self.col.find(spec, **PYMONGO_OPTS)
        for row in records:
            # find smallest expire timestamp to be used by aggregator
            rexpire = row.get('das', {}).get('expire', expire)
            if  rexpire < expire:
                expire = rexpire
            if  row['_id'] not in id_list:
                id_list.append(row['_id'])
        inserted = 0
        lookup_keys = set()
        fields = dasquery.mongo_query.get('fields')
        if  not fields: # Mongo
            fields = []
        for key in fields:
            for pkey in self.mapping.mapkeys(key):
                lookup_keys.add(pkey)
        for pkey in lookup_keys:
            skey = [(pkey, DESCENDING)]
            # lookup all service records
            spec = {'das_id': {'$in': id_list}, 'das.primary_key': pkey}
            if  self.verbose:
                nrec = self.col.find(spec, **PYMONGO_OPTS).sort(skey).count()
                msg  = "merging %s records, for %s key" % (nrec, pkey)
            else:
                msg  = "merging records, for %s key" % pkey
            self.logger.debug(msg)
            # use exhaust=False since we process all records in the aggregator
            # and there can be a delay in processing
            records = self.col.find(spec, **PYMONGO_NOEXHAUST).sort(skey)
            # aggregate all records
            agen = aggregator(dasquery, records, expire)
            # diff aggregated records
            gen = das_diff(agen, self.mapping.diff_keys(pkey.split('.')[0]))
            # insert all records into das.merge using bulk insert
            size = self.cache_size
            try:
                res = self.merge.insert_many(gen)
                inserted += len(res.inserted_ids)
            except InvalidDocument as exp:
                print(dastimestamp('DAS WARNING'), 'InvalidDocument during merge', str(exp))
                msg = "Caught bson error: " + str(exp)
                self.logger.info(msg)
                records = self.col.find(spec, **PYMONGO_OPTS).sort(skey)
                gen = aggregator(dasquery, records, expire)
                genrows = parse2gridfs(self.gfs, pkey, gen, self.logger)
                das_dict = {'das':{'expire':expire,
                        'das.record': record_codes('gridfs_record'),
                        'primary_key':[k for k in lookup_keys],
                        'system': ['gridfs']}, 'qhash':dasquery.qhash,
                        'cache_id':[], 'das_id': id_list}
                for row in genrows:
                    row.update(das_dict)
                    self.merge.insert(row)
            except InvalidOperation as exp:
                pass
            except DuplicateKeyError as err:
                print(dastimestamp('DAS WARNING'), 'DuplicateKeyError during merge')
                if  not isinstance(gen, list):
                    raise err
        status = 'fail'
        if  inserted:
            status = 'ok'
        elif  not lookup_keys: # we get query w/o fields
            msg = 'qhash %s, no lookup_keys' % dasquery.qhash
            print(dastimestamp('DAS WARNING'), msg)
            status = 'ok'
        else: # we didn't merge anything, it is DB look-up failure
            msg  = 'qhash %s, did not insert into das.merge, attempt %s' \
                    % (dasquery.qhash, attempt)
            print(dastimestamp('DAS WARNING'), msg)
            empty_expire = etstamp()
            lkeys = list(lookup_keys)
            das = dict(expire=empty_expire, primary_key=lkeys[0],
                       condition_keys=lkeys,
                       instance=dasquery.instance,
                       system=['das'], services=dasquery.services,
                       record=record_codes('empty_record'),
                       ts=time.time(), api=[])
            empty_record = {'das':das, 'qhash': dasquery.qhash,
                            'cache_id':[], 'das_id': id_list}
            for key in lkeys:
                empty_record.update({key.split('.')[0]:[]})
            for key, val in dasquery.mongo_query['spec'].items():
                if  key.find('.') == -1:
                    empty_record[key] = []
                else: # it is compound key, e.g. site.name
                    newkey, newval = convert_dot_notation(key, val)
                    empty_record[newkey] = adjust_mongo_keyvalue(newval)
            self.merge.insert(empty_record)
            # update DAS records (both meta and data ones, by using qhash)
            nval = {'$set': {'das.expire':empty_expire}}
            spec = {'qhash':dasquery.qhash}
            self.col.update_many(spec, nval)
        return status

    def update_cache(self, dasquery, results, header, system, api):
        """
        Insert results into the cache. Use bulk inserts controlled by
        self.cache_size. Upon completion, ensure indexes.
        """
        # update results records in DAS cache
        gen  = self.generate_records(dasquery, results, header)
        inserted = 0
        # bulk insert
        try:
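            # ordered=False lets the bulk insert continue past individual
            # document failures; bypass_document_validation=True skips any
            # server-side schema validation configured on the collection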
            res = self.col.insert_many(gen, ordered=False, bypass_document_validation=True)
            inserted += len(res.inserted_ids)
        except InvalidOperation:
            pass

        # update query record for this sub-system
        self.update_query_record_system(dasquery, system, api, 'ok')

        if  dasquery.qcache: # custom DASQuery cache
            self.update_das_expire(dasquery, expire_timestamp(dasquery.qcache))

    def update_query_record_system(self, dasquery, system, api, status):
        "Update system status of dasquery in das.cache collection"
        spec = {'qhash': dasquery.qhash, 'das.system': system, 'das.api': api,
                'das.record':record_codes('query_record')}
        udict = {'$set': {'das.status':status}}
#         print("### update_query_record", spec)
        doc=self.col.find_one_and_update(spec, udict, return_document=ReturnDocument.AFTER)
#         print(doc)

    def insert_query_record(self, dasquery, header):
        """
        Insert query record into DAS cache.
        """
        # check presence of API record in a cache
        dasheader   = header['das']
        system      = dasheader['system']
        api         = dasheader['api']
        collection  = 'cache'
        check_query = True
        expire = dasheader.get('expire', None)
        if  expire:
            dasheader['expire'] = adjust_expire(expire)
        if  not self.incache(dasquery, collection, system, api, check_query):
            msg = "query=%s, header=%s" % (dasquery, header)
            self.logger.debug(msg)
            q_record = dict(das=dasheader, query=dasquery.storage_query)
            q_record['das']['record'] = record_codes('query_record')
            q_record['das']['status'] = "requested"
            q_record['qhash'] = dasquery.qhash
            q_record['das']['ctime'] = [time.time()]
            res = self.col.insert_one(q_record)
            if  not res:
                msg = 'unable to insert query record'
                print(dastimestamp('DAS ERROR '), dasquery, msg, ', will retry')
                time.sleep(1)
                res = self.col.insert_one(q_record)
                if  not res:
                    print(dastimestamp('DAS ERROR '), dasquery, msg)

    def generate_records(self, dasquery, results, header):
        """
        Iterate over provided results, update records and yield them
        to next level (update_cache)
        """
        self.logger.debug("(%s) store to cache" % dasquery)
        if  not results:
            return

        dasheader  = header['das']
        expire     = adjust_expire(dasheader['expire'])
        system     = dasheader['system'] # DAS service names, e.g. combined
        services   = dasheader['services'] # CMS services used to get data
        api        = dasheader['api']
        prim_key   = header.get('prim_key', None)
        if  not prim_key:
            # get primary key from a list of lookup keys which has the
            # following structure [{'api':[keys]}, {...}]
            lup_keys = header['lookup_keys']
            lkeys    = [l for i in lup_keys for k in i.values() for l in k]
            prim_key = lkeys[0] if 'summary' not in lkeys else 'summary'
        cond_keys  = list(dasquery.mongo_query['spec'].keys())
        # get API record id
        spec       = {'qhash':dasquery.qhash, 'das.system':system,
                      'das.expire': {'$gt':time.time()},
                      'das.record': record_codes('query_record')}
        counter    = 0
        rids = [str(r['_id']) for r in \
                self.col.find(spec, ['_id'], **PYMONGO_OPTS)]
        if  rids:
            if  isinstance(results, list) or isinstance(results, GeneratorType):
                for item in results:
                    counter += 1
                    if  'das' in item:
                        expire = item.get('das').get('expire', expire)
                        dasheader['expire'] = expire
                    item['das'] = dict(expire=expire, primary_key=prim_key,
                                       condition_keys=cond_keys,
                                       instance=dasquery.instance,
                                       system=system, services=services,
                                       record=record_codes('data_record'),
                                       ts=time.time(), api=api)
                    item['das_id'] = rids
                    item['qhash'] = dasquery.qhash
                    yield item
            else:
                print("\n\n ### results = ", str(results))
                raise Exception('Provided results is not a list/generator type')
        if  expire != dasheader['expire']: # update DAS records
            header['das']['expire'] = expire
        # update das record with new status
        status = 'Update DAS cache, %s API' % header['das']['api'][0]
        self.update_query_record(dasquery, status, header)

        msg = "\n%s yield %s rows" % (dasheader['system'], counter)
        self.logger.info(msg)

    def remove_from_cache(self, dasquery):
        """
        Remove query from DAS cache. To do so, we retrieve API record
        and remove all data records from das.cache and das.merge
        """
        records = self.col.find({'qhash':dasquery.qhash}, **PYMONGO_OPTS)
        id_list = []
        for row in records:
            if  row['_id'] not in id_list:
                id_list.append(row['_id'])
        spec = {'das_id':{'$in':id_list}}
        self.merge.delete_many(spec)
        self.merge.delete_many({'qhash':dasquery.qhash})
        self.col.delete_many(spec)
        self.col.delete_many({'qhash':dasquery.qhash})

    def clean_cache(self, collection=None):
        """
        Clean expired docs in das.cache and das.merge.
        """
        current_time = time.time()
        query = {'das.expire': { '$lt':current_time} }
        if  not collection or collection == 'merge':
            self.merge.delete_many(query)
        if  not collection or collection == 'cache':
            self.col.delete_many(query)

    def delete_cache(self):
        """
        Delete all results in DAS cache/merge collection, including
        internal indexes.
        """
        self.col.delete_many({})
        try:
            self.col.drop_indexes()
        except Exception:
            pass
        self.merge.delete_many({})
        try:
            self.merge.drop_indexes()
        except Exception:
            pass
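
As a side note, the bulk-insert pattern used above (insert_many() fed by a generator, with InvalidOperation swallowed when the generator turns out to be empty) can be exercised in isolation. A minimal sketch, assuming a local MongoDB instance; the collection name and documents below are illustrative and not part of DAS:

from pymongo import MongoClient
from pymongo.errors import InvalidOperation

def bulk_insert(col, docs):
    "Insert an iterable of documents and return the number actually inserted"
    try:
        res = col.insert_many(docs, ordered=False)
        return len(res.inserted_ids)
    except InvalidOperation:
        # insert_many() raises InvalidOperation when the iterable yields no
        # documents, mirroring the `except InvalidOperation: pass` branch above
        return 0

if __name__ == '__main__':
    col = MongoClient('mongodb://localhost:27017')['das']['cache']
    docs = ({'qhash': 'abc123', 'counter': idx} for idx in range(3))
    print(bulk_insert(col, docs))      # -> 3
    print(bulk_insert(col, iter([])))  # -> 0
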
Ejemplo n.º 9
0
class DASParserDB(object):
    """
    Caching layer for the PLY parser.
    """
    def __init__(self, config):
        self.verbose = config['verbose']
        self.logger = PrintManager('DASParserDB', self.verbose)
        self.dburi = config['mongodb']['dburi']
        self.dbname = config['parserdb']['dbname']
        self.sizecap = config['parserdb'].get('sizecap', 5 * 1024 * 1024)
        self.colname = config['parserdb']['collname']
        msg = "DASParserCache::__init__ %s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)
        self.create_db()

    def create_db(self):
        """
        Create db collection
        """
        conn = db_connection(self.dburi)
        dbn = conn[self.dbname]
        if self.colname not in dbn.collection_names():
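            # a capped collection is a fixed-size, insertion-order buffer:
            # once sizecap bytes are used, MongoDB overwrites the oldest
            # parser-cache entries automatically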
            dbn.create_collection(self.colname, capped=True, size=self.sizecap)
        col = dbn[self.colname]
        index_list = [('qhash', DESCENDING)]
        create_indexes(col, index_list)

    @property
    def col(self):
        "Collection object to MongoDB"
        conn = db_connection(self.dburi)
        dbn = conn[self.dbname]
        col = dbn[self.colname]
        return col

    def lookup_query(self, rawtext):
        """
        Check the parser cache for a given rawtext query.
        Search is done with the qhash of this string.
        Returns a tuple (status, value) for the cases
        (PARSERCACHE_VALID, mongo_query) - valid query found
        (PARSERCACHE_INVALID, error) - error message for invalid query
        (PARSERCACHE_NOTFOUND, None) - not in the cache
        """
        result = find_one(self.col, {'qhash':genkey(rawtext)}, \
                        fields=['query', 'error'])

        if result and result['query']:
            if self.verbose:
                self.logger.debug("DASParserCache: found valid %s->%s" %\
                                  (rawtext, result['query']))
            query = decode_mongo_query(result['query'])
            return (PARSERCACHE_VALID, query)
        elif result and result['error']:
            if self.verbose:
                self.logger.debug("DASParserCache: found invalid %s->%s" %\
                                  (rawtext, result['error']))
            return (PARSERCACHE_INVALID, result['error'])
        else:
            if self.verbose:
                self.logger.debug("DASParserCache: not found %s" %\
                                  (rawtext))
            return (PARSERCACHE_NOTFOUND, None)

    def insert_valid_query(self, rawtext, query):
        "Insert a query that was successfully transformed"
        self._insert_query(rawtext, query, None)

    def insert_invalid_query(self, rawtext, error):
        "Insert the error message for an invalid query"
        self._insert_query(rawtext, None, error)

    def _insert_query(self, rawtext, query, error):
        """Internal method to insert a query"""
        if self.verbose:
            self.logger.debug("DASParserCache: insert %s->%s/%s" %\
                           (rawtext, query, error))
        # MongoDB does not allow the '$' sign in stored queries,
        # so we need to encode the inserted query
        if query:
            encquery = encode_mongo_query(query)
        else:
            encquery = ""
        self.col.insert({
            'raw': rawtext,
            'qhash': genkey(rawtext),
            'query': encquery,
            'error': str(error)
        })
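
The three (status, value) outcomes of lookup_query() drive the parse path in the QLManager example that follows. Below is a self-contained sketch of the same capped-collection lookup, assuming a local MongoDB; the helper names and status codes are stand-ins for the PARSERCACHE_* constants and are not the DAS API:

import hashlib
import json
from pymongo import MongoClient, DESCENDING

VALID, INVALID, NOTFOUND = 1, 2, 3  # stand-ins for the PARSERCACHE_* codes

def genkey(rawtext):
    "Hash of the raw query text, used as the cache key"
    return hashlib.md5(rawtext.encode('utf-8')).hexdigest()

dbn = MongoClient('mongodb://localhost:27017')['test_parserdb']
if 'cache' not in dbn.list_collection_names():
    dbn.create_collection('cache', capped=True, size=5 * 1024 * 1024)
col = dbn['cache']
col.create_index([('qhash', DESCENDING)])

def lookup(rawtext):
    "Return (status, value) in the spirit of DASParserDB.lookup_query"
    doc = col.find_one({'qhash': genkey(rawtext)})
    if doc and doc.get('query'):
        return VALID, json.loads(doc['query'])  # cf. decode_mongo_query
    if doc and doc.get('error'):
        return INVALID, doc['error']
    return NOTFOUND, None

# queries are stored encoded (here as JSON) to avoid '$' and '.' in keys
col.insert_one({'qhash': genkey('file dataset=/a/b/c'),
                'query': json.dumps({'fields': ['file'],
                                     'spec': {'dataset.name': '/a/b/c'}}),
                'error': ''})
print(lookup('file dataset=/a/b/c'))     # (VALID, mongo query dict)
print(lookup('block site=T2_CH_CERN'))   # (NOTFOUND, None)
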
Ejemplo n.º 10
0
class QLManager(object):
    """
    DAS QL manager.
    """
    def __init__(self, config=None):
        if  not config:
            config = das_readconfig()
        if  not config.has_key('dasmapping'):
            config['dasmapping'] = DASMapping(config)
        if  not config.has_key('dasanalytics'):
            config['dasanalytics'] = DASAnalytics(config)
        if  not config['dasmapping'].check_maps():
            msg = "No DAS maps found in MappingDB"
            raise Exception(msg)
        self.map         = config['dasmapping']
        self.analytics   = config['dasanalytics']
        self.dasservices = config['services']
        self.daskeysmap  = self.map.daskeys()
        self.operators   = list(das_operators())
        self.daskeys     = list(das_special_keys())
        self.verbose     = config['verbose']
        self.logger      = PrintManager('QLManger', self.verbose)
        for val in self.daskeysmap.values():
            for item in val:
                self.daskeys.append(item)
        parserdir   = config['das']['parserdir']
        self.dasply = DASPLY(parserdir, self.daskeys, self.dasservices, 
                verbose=self.verbose)

        self.enabledb = config['parserdb']['enable']
        if  self.enabledb:
            self.parserdb = DASParserDB(config)

    def parse(self, query):
        """
        Parse input query and return query in MongoDB form.
        Optionally parsed query can be written into analytics DB.
        """
        mongo_query = self.mongo_query(query)
        self.convert2skeys(mongo_query)
        return mongo_query

    def add_to_analytics(self, query, mongo_query):
        "Add DAS query to analytics DB"
        self.analytics.add_query(query, mongo_query)

    def mongo_query(self, query):
        """
        Return mongo query for provided input query
        """
        # NOTE: somehow I need to keep the build() call just before using
        # the PLY parser, otherwise it fails to parse.
        self.dasply.build()
        if  self.verbose:
            msg = "input query='%s'" % query
            self.logger.debug(msg)
            self.dasply.test_lexer(query)
        if  self.enabledb:
            status, value = self.parserdb.lookup_query(query)
            if status == PARSERCACHE_VALID and \
                len(last_key_pattern.findall(query)) == 0:
                mongo_query = value
            elif status == PARSERCACHE_INVALID:
                raise Exception(value)
            else:
                try:
                    ply_query = self.dasply.parser.parse(query)
                    mongo_query = ply2mongo(ply_query)
                    self.parserdb.insert_valid_query(query, mongo_query)
                except Exception as exp:
                    self.parserdb.insert_invalid_query(query, exp)
                    print "Input query=%s" % query
                    raise exp
        else:
            try:
                ply_query   = self.dasply.parser.parse(query)
                mongo_query = ply2mongo(ply_query)
            except Exception as exc:
                msg = "Fail to convert input query='%s' into MongoDB format" \
                    % query
                print_exc(msg, print_traceback=False)
                raise exc
        if  set(mongo_query.keys()) & set(['fields', 'spec']) != \
                set(['fields', 'spec']):
            raise Exception('Invalid MongoDB query %s' % mongo_query)
        if  not mongo_query['fields'] and len(mongo_query['spec'].keys()) > 1:
            raise Exception(ambiguous_msg(query, mongo_query['spec'].keys()))
        for key, val in mongo_query['spec'].iteritems():
            if  isinstance(val, list):
                raise Exception(ambiguos_val_msg(query, key, val))
        return mongo_query

    def convert2skeys(self, mongo_query):
        """
        Convert DAS input keys into DAS selection keys.
        """
        if  not mongo_query['spec']:
            for key in mongo_query['fields']:
                for system in self.map.list_systems():
                    mapkey = self.map.find_mapkey(system, key)
                    if  mapkey:
                        mongo_query['spec'][mapkey] = '*'
            return
        spec = mongo_query['spec']
        to_replace = []
        for key, val in spec.iteritems():
            for system in self.map.list_systems():
                mapkey = self.map.find_mapkey(system, key, val)
                if  mapkey and mapkey != key and \
                    mongo_query['spec'].has_key(key):
                    to_replace.append((key, mapkey))
                    continue
        for key, mapkey in to_replace:
            if  mongo_query['spec'].has_key(key):
                mongo_query['spec'][mapkey] = mongo_query['spec'][key]
                del mongo_query['spec'][key]
        
    def services(self, query):
        """Find out DAS services to use for provided query"""
        skeys, cond = decompose(query)
        if  not skeys:
            skeys = []
        if  isinstance(skeys, str):
            skeys = [skeys]
        slist = []
        # look-up services from Mapping DB
        for key in skeys + [i for i in cond.keys()]:
            for service, keys in self.daskeysmap.iteritems():
                if  service not in self.dasservices:
                    continue
                value = cond.get(key, None)
                daskeys = self.map.find_daskey(service, key, value)
                if  set(keys) & set(daskeys) and service not in slist:
                    slist.append(service)
        # look-up special key condition
        requested_system = query.get('system', None)
        if  requested_system:
            if  isinstance(requested_system, str):
                requested_system = [requested_system]
            return list( set(slist) & set(requested_system) )
        return slist

    def service_apis_map(self, query):
        """
        Find out which APIs correspond to provided query.
        Return a map of found services and their apis.
        """
        skeys, cond = decompose(query)
        if  not skeys:
            skeys = []
        if  isinstance(skeys, str):
            skeys = [skeys]
        adict = {}
        mapkeys = [key for key in cond.keys() if key not in das_special_keys()]
        services = self.services(query)
        for srv in services:
            alist = self.map.list_apis(srv)
            for api in alist:
                daskeys = self.map.api_info(api)['daskeys']
                maps = [r['map'] for r in daskeys]
                if  set(mapkeys) & set(maps) == set(mapkeys): 
                    if  adict.has_key(srv):
                        new_list = adict[srv] + [api]
                        adict[srv] = list( set(new_list) )
                    else:
                        adict[srv] = [api]
        return adict

    def params(self, query):
        """
        Return dictionary of parameters to be used in DAS Core:
        selection keys, conditions and services.
        """
        skeys, cond = decompose(query)
        services = []
        for srv in self.services(query):
            if  srv not in services:
                services.append(srv)
        return dict(selkeys=skeys, conditions=cond, services=services)
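
For orientation, mongo_query() in the class above must return a dict carrying both 'fields' and 'spec' keys, and the method ends with a set of sanity checks. A minimal standalone restatement of those checks, assuming plain Python with no DAS imports (the function name and messages are illustrative):

def validate_mongo_query(mongo_query, query=''):
    "Restate the sanity checks applied at the end of QLManager.mongo_query"
    if set(mongo_query.keys()) & set(['fields', 'spec']) != set(['fields', 'spec']):
        raise Exception('Invalid MongoDB query %s' % mongo_query)
    if not mongo_query['fields'] and len(mongo_query['spec'].keys()) > 1:
        # no selection keys and several conditions: the look-up is ambiguous
        raise Exception('Ambiguous query "%s", spec=%s' % (query, mongo_query['spec']))
    for key, val in mongo_query['spec'].items():
        if isinstance(val, list):
            # a list value means several possible conditions for one key
            raise Exception('Ambiguous value %s for key %s' % (val, key))
    return mongo_query

print(validate_mongo_query({'fields': ['dataset'],
                            'spec': {'dataset.name': '/ZMM*/*/*'}},
                           query='dataset=/ZMM*/*/*'))
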
Ejemplo n.º 11
0
class DASMapping(object):
    """
    This class manages DAS mapping DB.
    """

    def __init__(self, config):
        self.verbose = config["verbose"]
        self.logger = PrintManager("DASMapping", self.verbose)
        self.services = config["services"]
        self.dburi = config["mongodb"]["dburi"]
        self.dbname = config["mappingdb"]["dbname"]
        self.colname = config["mappingdb"]["collname"]
        self.map_test = config.get("map_test", True)
        self.main_dbs = config["das"].get("main_dbs", "dbs")
        self.dbsinsts = config["das"].get("dbs_instances", [])

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        self.init()
        self.on_reload = Event()

        # Monitoring thread which performs auto-reconnection to MongoDB
        thname = "mappingdb_monitor"
        sleep = 5
        reload_time = config["mappingdb"].get("reload_time", 86400)
        reload_time_bad_maps = config["mappingdb"].get("reload_time_bad_maps", 120)
        start_new_thread(
            thname,
            db_monitor,
            (self.dburi, self.init, sleep, self.load_maps, reload_time, self.check_maps, reload_time_bad_maps),
        )

        self.daskeyscache = {}  # to be filled at run time
        self.systems = []  # to be filled at run time
        self.dasmapscache = {}  # to be filled at run time
        self.keymap = {}  # to be filled at run time
        self.presentationcache = {}  # to be filled at run time
        self.reverse_presentation = {}  # to be filled at run time
        self.notationcache = {}  # to be filled at run time
        self.diffkeycache = {}  # to be filled at run time
        self.apicache = {}  # to be filled at run time
        self.dbs_global_url = None  # to be determined at run time
        self.dbs_inst_names = None  # to be determined at run time
        self.load_maps(notify=False)

    @property
    def col(self):
        "Return MongoDB collection object"
        conn = db_connection(self.dburi)
        dbc = conn[self.dbname]
        col = dbc[self.colname]
        return col

    # ===============
    # Management APIs
    # ===============
    def load_maps(self, notify=True):
        "Helper function to reload DAS maps"
        self.init_dasmapscache()
        self.init_notationcache()
        self.init_presentationcache()
        self.systems = None  # re-initialize DAS system list
        self.list_systems()
        self.dbs_global_url = None  # re-initialize DAS dbs global url
        self.dbs_url()
        self.dbs_inst_names = None  # re-initialize DAS dbs instances
        self.dbs_instances()

        if notify:
            self.on_reload()

    def init_dasmapscache(self, records=[]):
        "Read DAS maps and initialize DAS API maps"
        if not records:
            spec = {"type": "service"}
            records = self.col.find(spec, exhaust=True)
        for row in records:
            if "urn" in row:
                api = row["urn"]
                srv = row["system"]
                for dmap in row["das_map"]:
                    for key, val in dmap.iteritems():
                        if key == "pattern":
                            pat = re.compile(val)
                            dmap[key] = pat
                key = (row["system"], row["urn"])
                self.dasmapscache[key] = row

    def init_notationcache(self):
        """
        Initialize notation cache by reading notations.
        """
        for system, notations in self.notations().iteritems():
            for row in notations:
                key = system, row["api_output"]
                if key in self.notationcache:
                    self.notationcache[key] += [(row["api"], row["rec_key"])]
                else:
                    self.notationcache[key] = [(row["api"], row["rec_key"])]

    def init_presentationcache(self):
        """
        Initialize presentation cache by reading presentation map.
        """
        spec = {"type": "presentation"}
        data = find_one(self.col, spec)
        if data:
            self.presentationcache = data["presentation"]
            for daskey, uilist in self.presentationcache.iteritems():
                for row in uilist:
                    link = None
                    if "link" in row:
                        link = row["link"]
                    if "diff" in row:
                        self.diffkeycache[daskey] = row["diff"]
                    tdict = {daskey: {"mapkey": row["das"], "link": link}}
                    if row["ui"] in self.reverse_presentation:
                        self.reverse_presentation[row["ui"]].update(tdict)
                    else:
                        self.reverse_presentation[row["ui"]] = {daskey: {"mapkey": row["das"], "link": link}}

    def das_presentation_map(self):
        "Read DAS presentation map"
        spec = {"type": "presentation"}
        data = find_one(self.col, spec)
        if data:
            for daskey, uilist in data.get("presentation", {}).iteritems():
                for row in uilist:
                    if "link" in row:
                        yield row

    def init(self):
        """
        Establish connection to MongoDB back-end and create DB.
        """
        col = None
        try:
            conn = db_connection(self.dburi)
            if conn:
                dbc = conn[self.dbname]
                col = dbc[self.colname]
        #            print "### DASMapping:init started successfully"
        except ConnectionFailure as _err:
            tstamp = dastimestamp("")
            thread = threading.current_thread()
            print "### MongoDB connection failure thread=%s, id=%s, time=%s" % (thread.name, thread.ident, tstamp)
        except Exception as exc:
            print_exc(exc)
        if col:
            index = [
                ("type", DESCENDING),
                ("system", DESCENDING),
                ("urn", DESCENDING),
                ("das_map.das_key", DESCENDING),
                ("das_map.rec_key", DESCENDING),
                ("das_map.api_arg", DESCENDING),
            ]
            create_indexes(col, index)

    def delete_db(self):
        """
        Delete mapping DB in MongoDB back-end.
        """
        conn = db_connection(self.dburi)
        if conn:
            conn.drop_database(self.dbname)

    def delete_db_collection(self):
        """
        Delete mapping DB collection in MongoDB.
        """
        conn = db_connection(self.dburi)
        if conn:
            dbc = conn[self.dbname]
            dbc.drop_collection(self.colname)

    def check_maps(self):
        """
        Check Mapping DB and return true/false based on its content
        """
        if not self.map_test:
            return True  # do not test DAS maps, useful for unit tests
        udict = defaultdict(int)
        ndict = defaultdict(int)
        pdict = defaultdict(int)
        adict = {}
        maps_hash = False
        for row in self.col.find(exhaust=True):
            check_map_record(row)
            if "urn" in row:
                udict[row["system"]] += 1
            elif "notations" in row:
                ndict[row["system"]] += 1
            elif "presentation" in row:
                pdict["presentation"] += 1
            elif "arecord" in row:
                arec = row["arecord"]
                system = arec["system"]
                rec = {arec["type"]: arec["count"]}
                if system in adict:
                    adict[system].update(rec)
                else:
                    adict[system] = rec
            elif "verification_token" in row:
                maps_hash = row["verification_token"]

        # retrieve uri/notation/presentation maps
        ulist = []
        nlist = []
        for system in adict.keys():
            if "uri" in adict[system]:
                ulist.append(adict[system]["uri"] == udict[system])
                nlist.append(adict[system]["notations"] == ndict[system])
        status_umap = sum(ulist) == len(ulist)
        status_nmap = sum(nlist) == len(nlist)
        status_pmap = adict.get("presentation", {}).get("presentation", 0) == 1
        # verify completeness of maps
        calc_token = verification_token(self.col.find(exhaust=True))
        status_complete = maps_hash and maps_hash == calc_token
        if self.verbose:
            print "### DAS map status, umap=%s, nmap=%s, pmap=%s, complete=%s" % (
                status_umap,
                status_nmap,
                status_pmap,
                status_complete,
            )
        if not status_complete:
            print "### DAS map hash do not match, got=%s calculated=%s" % (maps_hash, calc_token)
        # multiply statuses as a result of this map check
        return status_umap * status_nmap * status_pmap * status_complete

    def remove(self, spec):
        """
        Remove record in DAS Mapping DB for provided Mongo spec.
        """
        self.col.remove(spec)

    def add(self, record):
        """
        Add new record into mapping DB. Example of URI record

        .. doctest::

            {
             "system": "dbs",
             "urn": "listBlocks",
             "url": "http://a.b.com/api",
             "params": [{"apiversion": "1_2_2", "se": "*"}],
             "lookup": "block",
             "das_map": [
                 {"das_key": "block", "rec_key": "block.name"},
                 {"das_key": "site", "rec_key": "site.name",
                  "api_arg": "se", "pattern": "^T[0-3]_"},
             ]
            }

        Example of notation record:

        .. doctest::

             notations: [
                 {"api_output" : "storage_element_name", "rec_key":"site", "api": ""},
             ]
        """
        msg = "record=%s" % record
        self.logger.debug(msg)
        self.col.insert(record)
        self.init_dasmapscache([record])

    # ==================
    # Informational APIs
    # ==================
    def dbs_global_instance(self, system=None):
        "Retrieve the DBS url from the mapping DB and extract the DBS instance"
        if not system:
            system = self.main_dbs
        url = self.dbs_url(system)
        return get_dbs_instance(url)

    def dbs_url(self, system=None):
        "Retrieve the DBS url from the mapping DB"
        if not system:
            system = self.main_dbs
        systems = self.list_systems()
        dbses = set(["dbs", "dbs3"])
        if dbses & set(systems) != dbses:
            # use caching only when we operate with single DBS
            if self.dbs_global_url:
                return self.dbs_global_url
        url = None
        for srv in systems:
            if srv == system:
                apis = self.list_apis(srv)
                url = self.api_info(srv, apis[0])["url"]
                url = parse_dbs_url(srv, url)
                self.dbs_global_url = url
                return url
        return url

    def dbs_instances(self, system=None):
        "Retrieve DBS instances from the mapping DB"
        # use dbs instances from the config
        if self.dbsinsts and not system:
            return self.dbsinsts
        # default dbs
        if not system:
            system = self.main_dbs
        systems = self.list_systems()
        dbses = set(["dbs", "dbs3"])
        if dbses & set(systems) != dbses:
            # use caching only when we operate with single DBS
            if self.dbs_inst_names:
                return self.dbs_inst_names
        insts = []
        for srv in systems:
            if srv == system:
                apis = self.list_apis(srv)
                insts = self.api_info(srv, apis[0])["instances"]
                self.dbs_inst_names = insts
                return insts
        return insts

    def list_systems(self):
        """
        List all DAS systems.
        """
        if not self.systems:
            spec = {"type": "service", "system": {"$ne": None}}
            gen = (row["system"] for row in self.col.find(spec, ["system"], exhaust=True))
            self.systems = list(set(gen2list(gen)) & set(self.services))
        return self.systems

    def list_apis(self, system=None):
        """
        List all APIs.
        """
        if self.apicache and system in self.apicache:
            return self.apicache[system]
        spec = {"type": "service", "urn": {"$ne": None}}
        if system:
            spec["system"] = system
        gen = (row["urn"] for row in self.col.find(spec, ["urn"], exhaust=True))
        self.apicache[system] = gen2list(gen)
        return self.apicache[system]

    def api_info(self, srv, api_name):
        """
        Return full API info record.
        """
        return self.dasmapscache[(srv, api_name)]

    def relational_keys(self, system1, system2):
        """
        Return a list of relational keys between provided systems
        """
        for system, keys in self.daskeys().iteritems():
            if system == system1:
                keys1 = keys
            if system == system2:
                keys2 = keys
        return list(set(keys1) & set(keys2))

    def daskeys(self, das_system=None):
        """
        Return a dict with all known DAS keys.
        """
        if das_system in self.daskeyscache:
            return self.daskeyscache[das_system]

        spec = {"type": "service", "system": {"$ne": None}}
        if das_system:
            spec = {"system": das_system}
        gen = (row["system"] for row in self.col.find(spec, ["system"], exhaust=True))
        gen = [r for r in gen]
        kdict = {}
        for system in gen:
            spec = {"system": system, "urn": {"$ne": None}}
            keys = []
            for row in self.col.find(spec, exhaust=True):
                for entry in row["das_map"]:
                    if entry["das_key"] not in keys:
                        keys.append(entry["das_key"])
            kdict[system] = keys
        # cache it
        self.daskeyscache[das_system] = kdict
        return kdict

    # ============
    # Look-up APIs
    # ============
    def api_lkeys(self, das_system, api):
        """
        Return DAS lookup keys for given das system and api
        """
        entry = self.dasmapscache[(das_system, api)]
        skeys = entry["lookup"].split(",")
        return skeys

    def primary_key(self, das_system, urn):
        """
        Return DAS primary key for provided system and urn. The DAS primary key
        is a first entry in *lookup* attribute of DAS API record.
        """
        spec = {"system": das_system, "urn": urn}
        record = find_one(self.col, spec)
        if not record:
            return None
        pkey = record["lookup"]
        if pkey.find(",") != -1:
            pkey = pkey.split(",")[0]
        return pkey

    def primary_mapkey(self, das_system, urn):
        """
        Return DAS primary map key for provided system and urn. For example,
        the file DAS key is mapped to file.name, so this API will return
        file.name
        """
        spec = {"system": das_system, "urn": urn}
        record = find_one(self.col, spec)
        mapkey = []
        for row in record["das_map"]:
            lkey = record["lookup"]
            if lkey.find(",") != -1:
                lkey = lkey.split(",")[0]
            if row["das_key"] == lkey:
                return row["rec_key"]
        return mapkey

    def find_daskey(self, das_system, map_key, value=None):
        """
        Find das key for given system and map key.
        """
        msg = "system=%s\n" % das_system
        daskeys = []
        for key, record in self.dasmapscache.iteritems():
            srv, _urn = key
            if das_system != srv:
                continue
            for row in record["das_map"]:
                das_key = row["das_key"]
                rec_key = row["rec_key"]
                if rec_key != map_key:
                    continue
                pat = row.get("pattern", None)
                if value:
                    if pat:
                        if pat.match(str(value)):
                            daskeys.append(das_key)
                        else:
                            msg += "-- reject key=%s, val=%s, pat=%s\n" % (map_key, value, pat.pattern)
                            self.logger.debug(msg)
                    else:
                        daskeys.append(das_key)
                else:
                    daskeys.append(das_key)
        return daskeys

    def find_mapkey(self, das_system, das_key, value=None):
        """
        Find map key for given system and das key.
        """
        msg = "system=%s\n" % das_system
        for key, record in self.dasmapscache.iteritems():
            srv, _urn = key
            if das_system != srv:
                continue
            for row in record["das_map"]:
                if row["das_key"] != das_key:
                    continue
                rec_key = row["rec_key"]
                pat = row.get("pattern", None)
                if value:
                    if pat:
                        if pat.match(str(value)):
                            return rec_key
                        else:
                            msg += "-- reject key=%s, val=%s, pat=%s\n" % (das_key, value, pat.pattern)
                            self.logger.debug(msg)
                            continue
                    else:
                        return rec_key
                else:
                    return rec_key

    def mapkeys(self, daskey):
        """
        Find all lookup keys (primary keys) for a given daskey
        """
        if daskey in self.keymap:
            return self.keymap[daskey]
        spec = {"das_map.das_key": daskey}
        mapkeys = []
        for row in self.col.find(spec, ["das_map"], exhaust=True):
            for kmap in row["das_map"]:
                if kmap["das_key"] == daskey and kmap["rec_key"] not in mapkeys:
                    mapkeys.append(kmap["rec_key"])
        self.keymap[daskey] = mapkeys
        return self.keymap[daskey]

    def find_apis(self, das_system, map_key):
        """
        Find list of apis which correspond to provided
        system and das map key.
        """
        spec = {"system": das_system, "das_map.rec_key": map_key}
        apilist = []
        for row in self.col.find(spec, ["urn"], exhaust=True):
            if "urn" in row and row["urn"] not in apilist:
                apilist.append(row["urn"])
        return apilist

    def find_system(self, key):
        """
        Return system name for provided DAS key.
        """
        spec = {"das_map.das_key": key}
        gen = (row["system"] for row in self.col.find(spec, ["system"], exhaust=True))
        systems = []
        for system in gen:
            if system not in systems:
                systems.append(system)
        systems.sort()
        return systems

    def lookup_keys(self, system, api, daskey=None, value=None):
        """
        Returns lookup keys for given system and provided
        selection DAS key, e.g. block => block.name
        """
        entry = self.dasmapscache.get((system, api), None)
        if not entry:
            return []
        lkeys = entry.get("lookup", []).split(",")
        rkeys = []
        if daskey in lkeys:
            for dmap in entry["das_map"]:
                rec_key = dmap["rec_key"]
                if daskey:
                    if dmap["das_key"] == daskey:
                        pat = dmap.get("pattern", None)
                        if value:
                            if pat.match(str(value)):
                                rkeys.append(rec_key)
                        else:
                            if rec_key not in rkeys:
                                rkeys.append(rec_key)
                else:
                    rkeys.append(rec_key)
        return rkeys

    def api2das(self, system, api_input_name):
        """
        Translates data-service API input parameter into DAS QL key,
        e.g. run_number => run.
        """
        query = {"system": system, "das_map.api_arg": api_input_name}
        names = []
        for adas in self.col.find(query, ["das_map"], exhaust=True):
            for row in adas["das_map"]:
                try:
                    if "api_arg" in row:
                        aparam = row["api_arg"]
                        daskey = row["das_key"]
                        if aparam == api_input_name and daskey not in names:
                            names.append(daskey)
                except Exception, err:
                    print "ERROR: look-up api_param/das_key in", row
                    raise err
        return names
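
To make the das_map structure concrete, here is a small, self-contained sketch of the lookup idea behind find_mapkey/find_daskey above. The record follows the layout shown in the add() docstring, but the data and the helper are made up for illustration:

import re

# a made-up API record in the shape documented by DASMapping.add()
record = {
    'system': 'dbs', 'urn': 'listBlocks', 'lookup': 'block',
    'das_map': [
        {'das_key': 'block', 'rec_key': 'block.name'},
        {'das_key': 'site', 'rec_key': 'site.name',
         'api_arg': 'se', 'pattern': re.compile(r'^T[0-3]_')},
    ],
}

def find_mapkey(record, das_key, value=None):
    "Return the rec_key for a das_key, honouring an optional value pattern"
    for row in record['das_map']:
        if row['das_key'] != das_key:
            continue
        pat = row.get('pattern')
        if value and pat and not pat.match(str(value)):
            continue  # value does not satisfy the pattern, keep looking
        return row['rec_key']

print(find_mapkey(record, 'site', 'T2_CH_CERN'))  # -> site.name
print(find_mapkey(record, 'site', 'bogus'))       # -> None
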
Ejemplo n.º 12
0
class DASAbstractService(object):
    """
    Abstract class describing DAS service. It initialized with a name which
    is used to identify service parameters from DAS configuration file.
    Those parameters are keys, verbosity level, URL of the data-service.
    """
    def __init__(self, name, config):
        self.name = name
        try:
            self.verbose = config['verbose']
            title = 'DASAbstactService_%s' % self.name
            self.logger = PrintManager(title, self.verbose)
            self.dasmapping = config['dasmapping']
            self.write2cache = config.get('write_cache', True)
            self.multitask = config['das'].get('multitask', True)
            self.error_expire = config['das'].get('error_expire', 300)
            self.dbs_global = None  # to be configured at run time
            self.dburi = config['mongodb']['dburi']
            engine = config.get('engine', None)
            self.gfs = db_gridfs(self.dburi)
        except Exception as exc:
            print_exc(exc)
            raise Exception('fail to parse DAS config')

        # read key/cert info
        try:
            self.ckey, self.cert = get_key_cert()
        except Exception as exc:
            print_exc(exc)
            self.ckey = None
            self.cert = None

        if self.multitask:
            nworkers = config['das'].get('api_workers', 3)
            thr_weights = config['das'].get('thread_weights', [])
            for system_weight in thr_weights:
                system, weight = system_weight.split(':')
                if system == self.name:
                    nworkers *= int(weight)
#             if  engine:
#                 thr_name = 'DASAbstractService:%s:PluginTaskManager' % self.name
#                 self.taskmgr = PluginTaskManager(\
#                         engine, nworkers=nworkers, name=thr_name)
#                 self.taskmgr.subscribe()
#             else:
#                 thr_name = 'DASAbstractService:%s:TaskManager' % self.name
#                 self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
            thr_name = 'DASAbstractService:%s:TaskManager' % self.name
            self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        self.map = {}  # to be defined by data-service implementation
        self._keys = None  # to be defined at run-time in self.keys
        self._params = None  # to be defined at run-time in self.parameters
        self._notations = {}  # to be defined at run-time in self.notations

        self.logger.info('initialized')
        # define internal cache manager to put 'raw' results into cache
        if 'rawcache' in config and config['rawcache']:
            self.localcache = config['rawcache']
        else:
            msg = 'Undefined rawcache, please check your configuration'
            raise Exception(msg)

    def status(self):
        "Return status of the service"
        return self.taskmgr.status()

    def services(self):
        """
        Return sub-systems used to retrieve data records. It is used
        in the dasheader call to set up the das.services field. This method
        can be overridden in sub-classes; otherwise it returns a dict of the
        service name and the CMS systems used to retrieve data records.
        """
        return {self.name: [self.name]}

    def version(self):
        """Return data-services version, should be implemented in sub-classes"""
        return ''

    def keys(self):
        """
        Return service keys
        """
        if self._keys:
            return self._keys
        srv_keys = []
        for _api, params in self.map.items():
            for key in params['keys']:
                if not key in srv_keys:
                    srv_keys.append(key)
        self._keys = srv_keys
        return srv_keys

    def parameters(self):
        """
        Return mapped service parameters
        """
        if self._params:
            return self._params
        srv_params = []
        for _api, params in self.map.items():
            for key in params['params']:
                param_list = self.dasmapping.api2das(self.name, key)
                for par in param_list:
                    if not par in srv_params:
                        srv_params.append(par)
        self._params = srv_params
        return srv_params

    def notations(self):
        """
        Return a map of system notations.
        """
        if self._notations:
            return self._notations
        for _, rows in self.dasmapping.notations(self.name).items():
            for row in rows:
                api = row['api']
                nmap = row['rec_key']
                notation = row['api_output']
                if api in self._notations:
                    self._notations[api].update({notation: nmap})
                else:
                    self._notations[api] = {notation: nmap}
        return self._notations

    def getdata(self, url, params, expire, headers=None, post=None):
        """URL call wrapper"""
        if url.find('https:') != -1:
            return getdata(url,
                           params,
                           headers,
                           expire,
                           post,
                           self.error_expire,
                           self.verbose,
                           self.ckey,
                           self.cert,
                           system=self.name)
        else:
            return getdata(url,
                           params,
                           headers,
                           expire,
                           post,
                           self.error_expire,
                           self.verbose,
                           system=self.name)

    def call(self, dasquery):
        """
        Invoke service API to execute given query.
        Return results as a collect list set.
        """
        self.logger.info(dasquery)
        # check the cache for records with given query/system
        res = self.localcache.incache(dasquery,
                                      collection='cache',
                                      system=self.name)
        if res:
            msg = "found records in local cache"
            self.logger.info(msg)
            return
        # ask the data-service api to get results; they will be stored in the
        # cache, so at the end we return whatever is in the cache.
        self.api(dasquery)

    def write_to_cache(self, dasquery, expire, url, api, args, gen, ctime):
        """
        Write provided result set into DAS cache.
        """
        if not self.write2cache:
            return

        # before going to cache we should check/set possible misses, e.g.
        # primary key when error is thrown
        result = self.set_misses(dasquery, api, gen)

        # update the cache
        header = dasheader(self.name,
                           dasquery,
                           expire,
                           api,
                           url,
                           services=self.services())
        header['lookup_keys'] = self.lookup_keys(api)
        header['prim_key'] = self.dasmapping.primary_mapkey(self.name, api)
        header['ctime'] = ctime
        system = self.name
        self.localcache.update_cache(dasquery, result, header, system, api)

        msg = 'cache has been updated,\n'
        self.logger.debug(msg)

    def adjust_params(self, api, kwds, instance=None):
        """
        Data-service specific parser to adjust parameters according to
        its specifications. For example, the DQ service accepts a string
        of parameters rather than a parameter set, while DBS2 can reuse
        some parameters for a different API, e.g. a dataset path can be
        passed to listPrimaryDatasets as a primary_dataset pattern.
        """
        pass

    def lookup_keys(self, api):
        """
        Return look-up keys of data output for given data-service API.
        """
        lkeys = self.dasmapping.lookup_keys(self.name, api)
        return [{api: lkeys}]

    def inspect_params(self, api, args):
        """
        Perform API parameter inspection. Check if the API accepts a range
        of parameters, etc.
        """
        for key, value in args.items():
            if isinstance(value, dict):
                minval = None
                maxval = None
                for oper, val in value.items():
                    if oper == '$in':
                        minval = int(val[0])
                        maxval = int(val[-1])
                        args[key] = range(minval, maxval)
                    elif oper == '$lt':
                        maxval = int(val)
                        args[key] = maxval
                    elif oper == '$lte':
                        maxval = int(val)
                        args[key] = maxval
                    elif oper == '$gt':
                        minval = int(val)
                        args[key] = minval
                    elif oper == '$gte':
                        minval = int(val)
                        args[key] = minval
                    else:
                        msg = '%s does not support operator %s' % (api, oper)
                        raise Exception(msg)
        return args

    def get_notations(self, api):
        """Return notations used for given API"""
        notationmap = self.notations()
        if not notationmap:
            return {}
        notations = {}
        if '' in notationmap:
            notations = dict(notationmap[''])  # notations applied to all APIs
            if api in notationmap:  # overwrite the one for provided API
                notations.update(notationmap[api])
        return notations

    def parser(self, dasquery, dformat, data, api):
        """
        DAS data parser. Input parameters:

        - *query* input DAS query
        - *dformat* is a data format, e.g. XML, JSON
        - *data* is a data source, either file-like object or
          actual data
        - *api* is API name
        """
        prim_key = self.dasmapping.primary_key(self.name, api)
        counter = 0
        if dformat.lower() == 'xml':
            tags = self.dasmapping.api2daskey(self.name, api)
            gen = xml_parser(data, prim_key, tags)
            for row in gen:
                counter += 1
                yield row
        elif dformat.lower() == 'json' or dformat.lower() == 'dasjson':
            gen = json_parser(data, self.logger)
            das_dict = {}
            for row in gen:
                if dformat.lower() == 'dasjson':
                    for key, val in row.items():
                        if key != 'results':
                            das_dict[key] = val
                    row = row['results']
                if isinstance(row, list):
                    for item in row:
                        if item:
                            if prim_key in item:
                                counter += 1
                                yield item
                            else:
                                counter += 1
                                yield {prim_key: item}
                else:
                    if prim_key in row:
                        counter += 1
                        yield row
                    else:
                        counter += 1
                        yield {prim_key: row}
        else:
            msg = 'Unsupported data format="%s", API="%s"' % (dformat, api)
            raise Exception(msg)
        msg = "api=%s, format=%s " % (api, dformat)
        msg += "prim_key=%s yield %s rows" % (prim_key, counter)
        self.logger.info(msg)

    def translator(self, api, genrows):
        """
        Convert raw results into DAS records. 
        """
        prim_key = self.dasmapping.primary_key(self.name, api)
        count = 0
        for row in genrows:
            row2das(self.dasmapping.notation2das, self.name, api, row)
            count += 1
            # check for primary key existence, since it can be overridden
            # by row2das. For example, DBS3 uses a flat namespace, so we
            # override dataset=>name, while dataset is still a primary key
            if isinstance(row, list):
                yield {prim_key: row}
            elif prim_key in row:
                if prim_key in row[prim_key]:
                    yield row[prim_key]  # remapping may create nested dict
                else:
                    yield row
            else:
                yield {prim_key: row}
        msg = "yield %s rows" % count
        self.logger.debug(msg)

    def set_misses(self, dasquery, api, genrows):
        """
        Check and adjust DAS records wrt input query. If some of the DAS
        keys are missing, add it with its value to the DAS record.
        """
        # look-up primary key
        prim_key = self.dasmapping.primary_key(self.name, api)

        # Scan all docs and store those whose size above MongoDB limit into
        # GridFS
        map_key = self.dasmapping.primary_mapkey(self.name, api)
        genrows = parse2gridfs(self.gfs, map_key, genrows, self.logger)

        spec = dasquery.mongo_query['spec']
        row = next(genrows)
        ddict = DotDict(row)
        keys2adjust = []
        for key in spec.keys():
            val = ddict.get(key)
            if spec[key] != val and key not in keys2adjust:
                keys2adjust.append(key)
        msg = "adjust keys %s" % keys2adjust
        self.logger.debug(msg)
        count = 0
        if keys2adjust:
            # adjust of the rows
            for row in yield_rows(row, genrows):
                ddict = DotDict(row)
                pval = ddict.get(map_key)
                if isinstance(pval, dict) and 'error' in pval:
                    ddict[map_key] = ''
                    ddict.update({prim_key: pval})
                for key in keys2adjust:
                    value = spec[key]
                    existing_value = ddict.get(key)
                    # the way to deal with proximity/pattern/condition results
                    if  (isinstance(value, str) or isinstance(value, unicode))\
                        and value.find('*') != -1: # we got pattern
                        if existing_value:
                            value = existing_value
                    elif isinstance(value, dict) or \
                        isinstance(value, list): # we got condition
                        if existing_value:
                            value = existing_value
                        elif isinstance(value, dict) and \
                        '$in' in value: # we got a range {'$in': []}
                            value = value['$in']
                        elif isinstance(value, dict) and \
                        '$lte' in value and '$gte' in value:
                            # we got a between range
                            value = [value['$gte'], value['$lte']]
                        else:
                            value = json.dumps(value)
                    elif existing_value and value != existing_value:
                        # we got proximity results
                        if 'proximity' in ddict:
                            proximity = DotDict({key: existing_value})
                            ddict['proximity'].update(proximity)
                        else:
                            proximity = DotDict({})
                            proximity[key] = existing_value
                            ddict['proximity'] = proximity
                    else:
                        if existing_value:
                            value = existing_value
                    ddict[key] = value
                yield ddict
                count += 1
        else:
            yield row
            for row in genrows:
                yield row
                count += 1
        msg = "yield %s rows" % count
        self.logger.debug(msg)

    def api(self, dasquery):
        """
        Data-service api method; it can be redefined by a data-service class.
        It parses the input query and invokes the appropriate data-service API
        calls. All results are stored into the DAS cache, and the api call is
        recorded in the Analytics DB.
        """
        self.logger.info(dasquery)
        genrows = self.apimap(dasquery)
        if not genrows:
            return
        jobs = []
        for url, api, args, dformat, expire in genrows:
            # insert DAS query record for given API
            header = dasheader(self.name, dasquery, expire, api, url)
            self.localcache.insert_query_record(dasquery, header)
            # fetch DAS data records
            if self.multitask:
                jobs.append(self.taskmgr.spawn(self.apicall, \
                            dasquery, url, api, args, dformat, expire))
            else:
                self.apicall(dasquery, url, api, args, dformat, expire)
        if self.multitask:
            self.taskmgr.joinall(jobs)

    def apicall(self, dasquery, url, api, args, dformat, expire):
        """
        Data-service api method; it can be redefined by a data-service class.
        It parses the input query and invokes the appropriate data-service API
        call. All results are stored into the DAS cache, and the api call is
        recorded in the Analytics DB.

        We explicitly invoke the close call on our datastream instead of
        using a context manager, since this method, as well as
        getdata/parser, can be overwritten by child classes.
        """
        datastream = None
        try:
            args = self.inspect_params(api, args)
            time0 = time.time()
            headers = make_headers(dformat)
            datastream, expire = self.getdata(url, args, expire, headers)
            self.logger.info("%s expire %s" % (api, expire))
            rawrows = self.parser(dasquery, dformat, datastream, api)
            dasrows = self.translator(api, rawrows)
            ctime = time.time() - time0
            self.write_to_cache(dasquery, expire, url, api, args, dasrows,
                                ctime)
        except Exception as exc:
            msg  = 'Fail to process: url=%s, api=%s, args=%s' \
                    % (url, api, args)
            print(msg)
            print_exc(exc)
        close(datastream)

    def url_instance(self, url, _instance):
        """
        Virtual method to adjust URL for a given instance,
        must be implemented in service classes
        """
        return url

    def adjust_url(self, url, instance):
        """
        Adjust data-service URL wrt provided instance, e.g.
        DBS carry several instances
        """
        if instance:
            url = self.url_instance(url, instance)
        return url

    def apimap(self, dasquery):
        """
        Analyze input query and yield url, api, args, format, expire
        for further processing.
        """
        srv = self.name  # get local copy to avoid threading issues
        cond = getarg(dasquery.mongo_query, 'spec', {})
        instance = dasquery.mongo_query.get('instance', self.dbs_global)
        skeys = getarg(dasquery.mongo_query, 'fields', [])
        if not skeys:
            skeys = []
        self.logger.info("\n")
        for api, value in self.map.items():
            expire = value['expire']
            iformat = value['format']
            url = self.adjust_url(value['url'], instance)
            if not url:
                msg = '--- rejects API %s, no URL' % api
                self.logger.info(msg)
                continue
            args = dict(value['params'])  # make new copy, since we'll adjust
            wild = value.get('wild_card', '*')
            found = 0
            # check if input parameters are covered by API
            if not self.dasmapping.check_api_match(srv, api, cond):
                msg = '--- rejects API %s, does not cover input condition keys' \
                        % api
                self.logger.info(msg)
                continue
            # once we know that the API covers the input set of parameters,
            # we check every input parameter for pattern matching
            for key, val in cond.items():
                # check if keys from conditions are accepted by API
                # need to convert key (which is daskeys.map) into
                # input api parameter
                for apiparam in self.dasmapping.das2api(srv, api, key, val):
                    if apiparam in args:
                        args[apiparam] = val
                        found += 1
            # VK 20160708: the statement below was wrong, it caused the
            # datasets API to pass for the query "dataset in [path1, path2]".
            # I'll leave the block here until I test and verify that the
            # commented-out block will not cause other issues
            #
            # check the case when we only have a single condition key
            # and it is the key we look up
#             if  not found and skeys == [k.split('.')[0] for k in cond.keys()]:
#                 found = 1
            # check if the number of keys in cond and args is the same
            if len(cond.keys()) != found:
                msg = "--- reject API %s, not all condition keys are covered" \
                        % api
                self.logger.info(msg)
                msg = 'args=%s' % args
                self.logger.debug(msg)
                continue
            if not found:
                msg = "--- rejects API %s, parameters don't match" % api
                self.logger.info(msg)
                msg = 'args=%s' % args
                self.logger.debug(msg)
                continue
            self.adjust_params(api, args, instance)
            # delete args keys whose value is optional
            delete_keys(args, 'optional')
            # check that there is no "required" parameter left in args,
            # since such an API call will not work
            if 'required' in args.values():
                msg = '--- rejects API %s, parameter is required' % api
                self.logger.info(msg)
                msg = 'args=%s' % args
                self.logger.debug(msg)
                continue
            # adjust pattern symbols in arguments
            if wild != '*':
                for key, val in args.items():
                    if isinstance(val, str) or isinstance(val, unicode):
                        val = val.replace('*', wild)
                    args[key] = val

            # compare query selection keys with API look-up keys
            api_lkeys = self.dasmapping.api_lkeys(srv, api)
            if set(api_lkeys) != set(skeys):
                msg = "--- rejects API %s, api_lkeys(%s)!=skeys(%s)"\
                        % (api, api_lkeys, skeys)
                self.logger.info(msg)
                continue

            msg = '+++ %s passes API %s' % (srv, api)
            self.logger.info(msg)
            msg = 'args=%s' % args
            self.logger.debug(msg)

            msg = "yield "
            msg += "system ***%s***, url=%s, api=%s, args=%s, format=%s, " \
                % (srv, url, api, args, iformat)
            msg += "expire=%s, wild_card=%s" \
                % (expire, wild)
            self.logger.debug(msg)

            yield url, api, args, iformat, expire
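
# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original listing): the per-key
# matching loop used by apimap() above, reduced to plain dictionaries so it
# can run stand-alone.  The sample condition, API parameters and das2api
# mapping below are hypothetical.
def _match_api_args(api_params, cond, das2api):
    "Return adjusted args and the number of condition keys that were matched"
    args = dict(api_params)          # copy, as apimap() does
    found = 0
    for key, val in cond.items():
        for apiparam in das2api.get(key, []):
            if apiparam in args:
                args[apiparam] = val
                found += 1
    return args, found

if __name__ == '__main__':
    _cond = {'block.name': '/a/b/c#123'}          # hypothetical query spec
    _params = {'block_name': 'required'}          # hypothetical API params
    _mapping = {'block.name': ['block_name']}     # hypothetical das2api map
    print(_match_api_args(_params, _cond, _mapping))
    # prints ({'block_name': '/a/b/c#123'}, 1) -> the condition key is covered
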
Example No. 13
0
class DASMapping(object):
    """
    This class manages DAS mapping DB.
    """
    __cached_inst = None
    __cached_params = None

    def __new__(cls, config):
        """
        Creates a new instance of the class and caches it, or returns an
        existing instance if one exists (only when the params match).

        Only the last instance is cached; this simplifies the implementation,
        since the 'config' param might be a complex, unhashable object.
        """
        # check if we can reuse an existing instance
        if cls.__cached_inst and cls.__cached_params == config:
            if  config['verbose']:
                print("DASMapping::__new__: returning a cached instance")
            return cls.__cached_inst

        # otherwise create and initialize a new instance
        if  config['verbose']:
            print("DASMapping::__new__: creating a new instance")
        self = object.__new__(cls)

        self.verbose  = config['verbose']
        self.logger   = PrintManager('DASMapping', self.verbose)
        self.services = config['services']
        self.dburi    = config['mongodb']['dburi']
        self.dbname   = config['mappingdb']['dbname']
        self.colname  = config['mappingdb']['collname']
        self.map_test = config.get('map_test', True)
        self.main_dbs = config['das'].get('main_dbs', 'dbs3')
        self.dbsinsts = config['das'].get('dbs_instances', [])

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        self.das_son_manipulator = DAS_SONManipulator()
        index = [('type', DESCENDING),\
                 ('system', DESCENDING),\
                 ('urn', DESCENDING),\
                 ('das_map.das_key', DESCENDING),\
                 ('das_map.rec_key', DESCENDING),\
                 ('das_map.api_arg', DESCENDING),\
                 ]
        create_indexes(self.col, index)

        self.daskeyscache = {}         # to be filled at run time
        self.systems = []              # to be filled at run time
        self.dasmapscache = {}         # to be filled at run time
        self.keymap = {}               # to be filled at run time
        self.presentationcache = {}    # to be filled at run time
        self.reverse_presentation = {} # to be filled at run time
        self.notationcache = {}        # to be filled at run time
        self.diffkeycache = {}         # to be filled at run time
        self.apicache = {}             # to be filled at run time
        self.dbs_global_url = None     # to be determined at run time
        self.dbs_inst_names = None     # to be determined at run time
        self.load_maps()

        # cache the instance and return it
        DASMapping.__cached_inst = self
        DASMapping.__cached_params = config
        return self

    @property
    def col(self):
        "col property provides access to DAS mapping collection"
        conn = db_connection(self.dburi)
        mdb  = conn[self.dbname]
        colnames = mdb.collection_names()
        if  not colnames or self.colname not in colnames:
            try:
                mdb.create_collection(self.colname)
            except OperationFailure:
                pass
        mdb.add_son_manipulator(self.das_son_manipulator)
        return mdb[self.colname]

    # ===============
    # Management APIs
    # ===============
    def load_maps(self):
        "Helper function to reload DAS maps"
        self.init_dasmapscache()
        self.init_notationcache()
        self.init_presentationcache()
        self.systems = None        # re-initialize DAS system list
        self.list_systems()
        self.dbs_global_url = None # re-initialize DAS dbs global url
        self.dbs_url()
        self.dbs_inst_names = None # re-initialize DAS dbs instances
        self.dbs_instances()

    def init_dasmapscache(self, records=None):
        "Read DAS maps and initialize DAS API maps"
        if  not records:
            spec = {'type':'service'}
            records = self.col.find(spec, **PYMONGO_OPTS)
        for row in records:
            if  'urn' in row:
                for dmap in row['das_map']:
                    for key, val in dmap.items():
                        if  key == 'pattern':
                            pat = re.compile(val)
                            dmap[key] = pat
                key = (row['system'], row['urn'])
                self.dasmapscache[key] = row

    def init_notationcache(self):
        """
        Initialize notation cache by reading notations.
        """
        for system, notations in self.notations().items():
            for row in notations:
                key = system, row['api_output']
                if  key in self.notationcache:
                    self.notationcache[key] += [ (row['api'], row['rec_key']) ]
                else:
                    self.notationcache[key] = [ (row['api'], row['rec_key']) ]

    def init_presentationcache(self):
        """
        Initialize presentation cache by reading presentation map.
        """
        spec  = {'type':'presentation'}
        data  = find_one(self.col, spec)
        if  data:
            self.presentationcache = data['presentation']
            for daskey, uilist in self.presentationcache.items():
                for row in uilist:
                    link = None
                    if  'link' in row:
                        link = row['link']
                    if  'diff' in row:
                        self.diffkeycache[daskey] = row['diff']
                    tdict = {daskey : {'mapkey': row['das'], 'link': link}}
                    if  row['ui'] in self.reverse_presentation:
                        self.reverse_presentation[row['ui']].update(tdict)
                    else:
                        self.reverse_presentation[row['ui']] = \
                                {daskey : {'mapkey': row['das'], 'link': link}}

    def das_presentation_map(self):
        "Read DAS presentation map"
        spec  = {'type':'presentation'}
        data  = find_one(self.col, spec)
        if  data:
            for _, uilist in data.get('presentation', {}).items():
                for row in uilist:
                    if  'link' in row:
                        yield row

    def delete_db(self):
        """
        Delete mapping DB in MongoDB back-end.
        """
        conn = db_connection(self.dburi)
        if  conn:
            conn.drop_database(self.dbname)

    def delete_db_collection(self):
        """
        Delete mapping DB collection in MongoDB.
        """
        conn = db_connection(self.dburi)
        if  conn:
            dbc  = conn[self.dbname]
            dbc.drop_collection(self.colname)

    def check_maps(self):
        """
        Check Mapping DB and return true/false based on its content
        """
        if  not self.map_test:
            return True # do not test DAS maps, useful for unit tests
        udict = defaultdict(int)
        ndict = defaultdict(int)
        pdict = defaultdict(int)
        adict = {}
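        # udict/ndict count API (urn) and notation records per system,
        # pdict counts presentation records, and adict collects the expected
        # counts declared in 'arecord' summary rows for later comparison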
        maps_hash = False
        for row in self.col.find(**PYMONGO_OPTS):
            check_map_record(row)
            if  'urn' in row:
                udict[row['system']] += 1
            elif 'notations' in row:
                ndict[row['system']] += 1
            elif 'presentation' in row:
                pdict['presentation'] += 1
            elif 'arecord' in row:
                arec = row['arecord']
                system = arec['system']
                rec = {arec['type']:arec['count']}
                if  system in adict:
                    adict[system].update(rec)
                else:
                    adict[system] = rec
            elif 'verification_token' in row:
                maps_hash = row['verification_token']

        # retrieve uri/notation/presentation maps
        ulist = []
        nlist = []
        for system in adict.keys():
            if  'uri' in adict[system]:
                ulist.append(adict[system]['uri'] == udict[system])
                nlist.append(adict[system]['notations'] == ndict[system])
        status_umap = sum(ulist) == len(ulist)
        status_nmap = sum(nlist) == len(nlist)
        status_pmap = adict.get('presentation', {}).get('presentation', 0) == 1
        # verify completeness of maps
        calc_token = verification_token(self.col.find(**PYMONGO_OPTS))
        status_complete = maps_hash and maps_hash == calc_token
        if  self.verbose:
            print("### DAS map status, umap=%s, nmap=%s, pmap=%s, complete=%s" \
                    % (status_umap, status_nmap, status_pmap, status_complete))
        if not status_complete:
            print("### DAS map hash do not match, got=%s calculated=%s" \
                    % (maps_hash, calc_token))
        # multiply statuses as a result of this map check
        return status_umap*status_nmap*status_pmap*status_complete

    def remove(self, spec):
        """
        Remove record in DAS Mapping DB for provided Mongo spec.
        """
        self.col.remove(spec)

    def add(self, record):
        """
        Add new record into mapping DB. Example of URI record

        .. doctest::

            {
             system:dbs,
             urn : listBlocks,
             url : "http://a.b.com/api"
             params : [{"apiversion":1_2_2, se:"*"}]
             lookup : block
             das_map: [
                 {"das_key":"block", "rec_key":"block.name"},
                 {"das_key":"site", "rec_key":"site.name", "api_arg":"se", "pattern":"^T[0-3]_},
             ]
            }

        Example of notation record:

        .. doctest::

             notations: [
                 {"api_output" : "storage_element_name", "rec_key":"site", "api": ""},
             ]
        """
        msg = 'record=%s' % record
        self.logger.debug(msg)
        self.col.insert(record)
        self.init_dasmapscache([record])

    # ==================
    # Informational APIs
    # ==================
    def dbs_global_instance(self, system=None):
        "Retrive from mapping DB DBS url and extract DBS instance"
        if  not system:
            system = self.main_dbs
        url = self.dbs_url(system)
        return get_dbs_instance(url)

    def dbs_url(self, system=None):
        "Retrive from mapping DB DBS url"
        if  not system:
            system = self.main_dbs
        systems = self.list_systems()
        dbses   = set(['dbs3'])
        if  dbses & set(systems) != dbses:
            # use caching only when we operate with single DBS
            if  self.dbs_global_url:
                return self.dbs_global_url
        url = None
        for srv in systems:
            if  srv == system:
                apis = self.list_apis(srv)
                url  = self.api_info(srv, apis[0])['url']
                url  = parse_dbs_url(srv, url)
                self.dbs_global_url = url
                return url
        return url

    def dbs_instances(self, system=None):
        "Retrive from mapping DB DBS instances"
        # use dbs istances from the config
        if  self.dbsinsts and not system:
            return self.dbsinsts
        # default dbs
        if  not system:
            system = self.main_dbs
        systems = self.list_systems()
        dbses   = set(['dbs3'])
        if  dbses & set(systems) != dbses:
            # use caching only when we operate with single DBS
            if  self.dbs_inst_names:
                return self.dbs_inst_names
        insts = []
        dbs_global_inst = self.dbs_global_instance(system)
        if  system == 'dbs3' and dbs_global_inst:
            dbs_namespace = dbs_global_inst.split('/')[0]
        else:
            dbs_namespace = None
        for srv in systems:
            if  srv == system:
                apis  = self.list_apis(srv)
                insts = self.api_info(srv, apis[0])['instances']
                if  dbs_namespace:
                    insts = [d for d in insts if d.startswith(dbs_namespace)]
                self.dbs_inst_names = insts
                return insts
        return insts

    def list_systems(self):
        """
        List all DAS systems.
        """
        if  not self.systems:
            spec = { 'type': 'service', 'system' : { '$ne' : None } }
            gen  = (row['system'] \
                    for row in self.col.find(spec, ['system'], **PYMONGO_OPTS))
            self.systems = list( set(gen2list(gen)) & set(self.services) )
        return self.systems

    def list_apis(self, system=None):
        """
        List all APIs.
        """
        if  self.apicache and system in self.apicache:
            return self.apicache[system]
        spec = { 'type': 'service', 'urn' : { '$ne' : None } }
        if  system:
            spec['system'] = system
        gen  = (row['urn'] \
                for row in self.col.find(spec, ['urn'], **PYMONGO_OPTS))
        self.apicache[system] = gen2list(gen)
        return self.apicache[system]

    def api_info(self, srv, api_name):
        """
        Return full API info record.
        """
        return self.dasmapscache[(srv, api_name)]

    def relational_keys(self, system1, system2):
        """
        Return a list of relational keys between provided systems
        """
        for system, keys in self.daskeys().items():
            if  system == system1:
                keys1 = keys
            if  system == system2:
                keys2 = keys
        return list( set(keys1) & set(keys2) )

    def daskeys(self, das_system=None):
        """
        Return a dict with all known DAS keys.
        """
        if  das_system in self.daskeyscache:
            return self.daskeyscache[das_system]

        spec  = { 'type': 'service', 'system' : { '$ne' : None } }
        if  das_system:
            spec  = { 'system' : das_system }
        gen   = (row['system'] \
                for row in self.col.find(spec, ['system'], **PYMONGO_OPTS))
        gen   = [r for r in gen]
        kdict = {}
        for system in gen:
            spec = {'system':system, 'urn':{'$ne':None}}
            keys = []
            for row in self.col.find(spec, **PYMONGO_OPTS):
                for entry in row['das_map']:
                    if  entry['das_key'] not in keys:
                        keys.append(entry['das_key'])
            kdict[system] = keys
        # cache it
        self.daskeyscache[das_system] = kdict
        return kdict

    # ============
    # Look-up APIs
    # ============
    def api_lkeys(self, das_system, api):
        """
        Return DAS lookup keys for given das system and api
        """
        entry = self.dasmapscache[(das_system, api)]
        skeys = entry['lookup'].split(',')
        return skeys

    def primary_key(self, das_system, urn):
        """
        Return DAS primary key for provided system and urn. The DAS primary key
        is the first entry in the *lookup* attribute of the DAS API record.
        """
        spec = {'system':das_system, 'urn':urn}
        record = find_one(self.col, spec)
        if  not record:
            return None
        pkey = record['lookup']
        if  pkey.find(',') != -1:
            pkey = pkey.split(',')[0]
        return pkey

    def primary_mapkey(self, das_system, urn):
        """
        Return DAS primary map key for provided system and urn. For example,
        the file DAS key is mapped to file.name, so this API will return
        file.name
        """
        spec = {'system':das_system, 'urn':urn}
        record = find_one(self.col, spec)
        mapkey = []
        for row in record['das_map']:
            lkey = record['lookup']
            if  lkey.find(',') != -1:
                lkey = lkey.split(',')[0]
            if  row['das_key'] == lkey:
                return row['rec_key']
        return mapkey

    def find_daskey(self, das_system, map_key, value=None):
        """
        Find das key for given system and map key.
        """
        msg   = 'system=%s\n' % das_system
        daskeys = []
        for key, record in self.dasmapscache.items():
            srv, _ = key
            if  das_system != srv:
                continue
            for row in record['das_map']:
                das_key = row['das_key']
                rec_key = row['rec_key']
                if  rec_key != map_key:
                    continue
                pat = row.get('pattern', None)
                if  value:
                    if  pat:
                        if  pat.match(str(value)):
                            daskeys.append(das_key)
                        else:
                            msg += '-- reject key=%s, val=%s, pat=%s\n'\
                                    % (map_key, value, pat.pattern)
                            self.logger.debug(msg)
                    else:
                        daskeys.append(das_key)
                else:
                    daskeys.append(das_key)
        return daskeys

    def find_mapkey(self, das_system, das_key, value=None):
        """
        Find map key for given system and das key.
        """
        msg   = 'system=%s\n' % das_system
        for key, record in self.dasmapscache.items():
            srv, _ = key
            if  das_system != srv:
                continue
            for row in record['das_map']:
                if  row['das_key'] != das_key:
                    continue
                rec_key = row['rec_key']
                pat = row.get('pattern', None)
                if  value:
                    if  pat:
                        if  pat.match(str(value)):
                            return rec_key
                        else:
                            msg += '-- reject key=%s, val=%s, pat=%s\n'\
                                    % (das_key, value, pat.pattern)
                            self.logger.debug(msg)
                            continue
                    else:
                        return rec_key
                else:
                    return rec_key

    def mapkeys(self, daskey):
        """
        Find all lookup keys (primary keys) for a given daskey
        """
        if  daskey in self.keymap:
            return self.keymap[daskey]
        spec = {'das_map.das_key' : daskey}
        mapkeys = []
        for row in self.col.find(spec, ['das_map'], **PYMONGO_OPTS):
            for kmap in row['das_map']:
                if  kmap['das_key'] == daskey and \
                    kmap['rec_key'] not in mapkeys:
                    mapkeys.append(kmap['rec_key'])
        self.keymap[daskey] = mapkeys
        return self.keymap[daskey]

    def find_apis(self, das_system, map_key):
        """
        Find list of apis which correspond to provided
        system and das map key.
        """
        spec  = { 'system' : das_system, 'das_map.rec_key': map_key }
        apilist = []
        for row in self.col.find(spec, ['urn'], **PYMONGO_OPTS):
            if  'urn' in row and row['urn'] not in apilist:
                apilist.append(row['urn'])
        return apilist

    def find_system(self, key):
        """
        Return system name for provided DAS key.
        """
        spec = { 'das_map.das_key' : key }
        gen  = (row['system'] \
                for row in self.col.find(spec, ['system'], **PYMONGO_OPTS))
        systems = []
        for system in gen:
            if  system not in systems:
                systems.append(system)
        systems.sort()
        return systems

    def lookup_keys(self, system, api, daskey=None, value=None):
        """
        Returns lookup keys for given system and provided
        selection DAS key, e.g. block => block.name
        """
        entry = self.dasmapscache.get((system, api), None)
        if  not entry:
            return []
        lkeys = entry.get('lookup', []).split(',')
        rkeys = []
        if  daskey in lkeys:
            for dmap in entry['das_map']:
                rec_key = dmap['rec_key']
                if  daskey:
                    if  dmap['das_key'] == daskey:
                        pat = dmap.get('pattern', None)
                        if  value:
                            # guard against maps without a pattern defined
                            if  not pat or pat.match(str(value)):
                                rkeys.append(rec_key)
                        else:
                            if  rec_key not in rkeys:
                                rkeys.append(rec_key)
                else:
                    rkeys.append(rec_key)
        return rkeys

    def api2das(self, system, api_input_name):
        """
        Translates data-service API input parameter into DAS QL key,
        e.g. run_number => run.
        """
        query = {'system':system, 'das_map.api_arg' : api_input_name}
        names = []
        for adas in self.col.find(query, ['das_map'], **PYMONGO_OPTS):
            for row in adas['das_map']:
                try:
                    if  'api_arg' in row:
                        aparam = row['api_arg']
                        daskey = row['das_key']
                        if  aparam == api_input_name and daskey not in names:
                            names.append(daskey)
                except Exception as err:
                    print("ERROR: look-up api_param/das_key in", row)
                    raise err
        return names

    def check_api_match(self, system, api, icond):
        "Check if given API covers condition parameters"
        entry = self.dasmapscache.get((system, api), None)
        if  not entry:
            return False
        ikeys = [k.split('.')[0] for k in icond.keys()]
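        # ikeys are the top-level DAS keys of the query condition
        # (e.g. 'block.name' -> 'block'); dkeys collects the DAS keys the
        # API map declares, so the subset test below tells whether the API
        # covers every condition key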
        dkeys = []
        for row in entry.get('das_map', []):
            if  'api_arg' in row:
                das_key = row['das_key']
                dkeys.append(das_key)
            else:
                dkeys.append(row['das_key'])
        if  set(ikeys) & set(dkeys) == set(ikeys):
            return True
        return False

    def das2api(self, system, api, rec_key, value=None):
        """
        Translates DAS record key into data-service API input parameter,
        e.g. run.number => run_number
        """
        entry = self.dasmapscache.get((system, api), None)
        names = []
        if  not entry:
            return [rec_key]
        for row in entry.get('das_map', []):
            if  'api_arg' in row:
                api_param = row['api_arg']
                pat = row.get('pattern', None)
                if  row['rec_key'] != rec_key:
                    continue
                if  value and pat:
                    if  isinstance(value, dict):
                        # for dict values match the pattern against their
                        # first value (list() keeps this py3 compatible)
                        if  pat.match(json.dumps(list(value.values())[0])):
                            if  api_param not in names:
                                names.append(api_param)
                    elif pat.match(str(value)):
                        if  api_param not in names:
                            names.append(api_param)
                else:
                    if  api_param not in names:
                        names.append(api_param)
            else:
                names.append(row['rec_key'])
        return names

    def notations(self, system=None):
        """
        Return DAS notation map.
        """
        notationmap = {}
        spec = {'type':'notation'}
        if  system:
            spec['system'] = system
        for item in self.col.find(spec, **PYMONGO_OPTS):
            notationmap[item['system']] = item['notations']
        return notationmap

    def notation2das(self, system, api_param, api=""):
        """
        Translates a data-service API parameter name into a DAS name, e.g.
        run_number => run. If api_param is not present in the DB,
        it is returned unchanged.
        """
        if  not self.notationcache:
            self.init_notationcache()
        name = api_param
        if  (system, api_param) in self.notationcache:
            for item in self.notationcache[(system, api_param)]:
                _api, das_name = item
                if  _api:
                    if  _api == api:
                        name = das_name
                        break
                else: # valid for all API names
                    name = das_name
        return name

    def api2daskey(self, system, api):
        """
        Returns list of DAS keys which cover provided data-service API
        """
        spec = {'system':system, 'urn':api}
        keys = []
        for row in self.col.find(spec, **PYMONGO_OPTS):
            for entry in row['das_map']:
                keys.append(entry['das_key'])
        return keys

    def servicemap(self, system):
        """
        Constructs data-service map, e.g.

        .. doctest::

            {api: {keys:[list of DAS keys], params: args,
             url:url, format:ext, expire:exp} }
        """
        spec = {'system':system, 'urn':{'$ne':None}}
        smap = {}
        for row in self.col.find(spec, **PYMONGO_OPTS):
            url  = row['url']
            exp  = row['expire']
            ext  = row['format']
            api  = row['urn']
            lookup = row['lookup']
            wild = row.get('wild_card', '*')
            ckey = row.get('ckey')
            cert = row.get('cert')
            services = row.get('services', '')
            keys = []
            for entry in row['das_map']:
                keys.append(entry['das_key'])
            params = dict(row['params'])
            smap[api] = dict(keys=keys, params=params, url=url, expire=exp,\
                            format=ext, wild_card=wild, ckey=ckey, cert=cert,\
                            services=services, lookup=lookup)
        return smap

    def presentation(self, daskey):
        """
        Return web UI presentation keys for the provided DAS keyword.
        For example, when asked for block we present block.name, block.size, etc.
        """
        if  daskey in self.presentationcache:
            return self.presentationcache[daskey]
        return [daskey]

    def daskey_from_presentation(self, uikey):
        """
        Return triplet (DAS key, DAS access key, link)
        associated with provided UI key.
        """
        if  uikey in self.reverse_presentation:
            return self.reverse_presentation[uikey]

    def diff_keys(self, daskey):
        """
        Return diff keys for provided DAS key.
        """
        if  daskey in self.diffkeycache:
            return self.diffkeycache[daskey]
        return []

    def inputvalues_uris(self):
        """
        Return the info on how to fetch the list of allowed input values for
        certain commonly used input fields (from enabled DAS systems only)
        """
        uris = []
        for row in self.col.find({'type': 'input_values'}, **PYMONGO_OPTS):
            # check that system is active
            if row['system'] not in self.services:
                continue
            uris.extend(row['input_values'])
        return uris
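
# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original listing): the set-cover test
# performed by DASMapping.check_api_match() above, reduced to plain data
# structures.  The sample condition and das_map entries are hypothetical.
def _api_covers_condition(icond, das_map):
    "True if every top-level DAS key of the condition appears in the API map"
    ikeys = set(k.split('.')[0] for k in icond)
    dkeys = set(row['das_key'] for row in das_map)
    return ikeys <= dkeys

if __name__ == '__main__':
    _cond = {'block.name': '/a/b/c#123', 'site.name': 'T2_XX_Site'}
    _dmap = [{'das_key': 'block', 'rec_key': 'block.name'},
             {'das_key': 'site', 'rec_key': 'site.name', 'api_arg': 'se'}]
    print(_api_covers_condition(_cond, _dmap))    # prints True: both covered
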
Example No. 14
0
class DASMongocache(object):
    """
    DAS cache based MongoDB.
    """
    def __init__(self, config):
        self.config = config
        self.emptyset_expire = \
                expire_timestamp(config['das'].get('emptyset_expire', 5))
        self.dburi = config['mongodb']['dburi']
        self.cache_size = config['mongodb']['bulkupdate_size']
        self.dbname = config['dasdb']['dbname']
        self.verbose = config['verbose']
        self.logger = PrintManager('DASMongocache', self.verbose)
        self.mapping = config['dasmapping']
        self.logging = config['dasdb'].get('logging', False)
        self.rec_ttl = config['dasdb'].get('record_ttl', 24 * 60 * 60)
        self.del_ttl = config['dasdb'].get('delta_ttl', 60)
        self.cleanup_del_ttl = config['dasdb'].get('cleanup_delta_ttl', 3600)
        self.retry = config['dasdb'].get('retry', 3)
        self.das_son_manipulator = DAS_SONManipulator()

        # Initialize MongoDB connection
        self.col_ = self.config['dasdb']['cachecollection']
        self.mrcol_ = self.config['dasdb']['mrcollection']
        self.merge_ = self.config['dasdb']['mergecollection']
        self.gfs = db_gridfs(self.dburi)

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        # ensure that we have the following indexes
        common_idx = [
            ('file.name', DESCENDING),
            ('dataset.name', DESCENDING),
            ('block.name', DESCENDING),
            ('run.run_number', DESCENDING),
        ]
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('das.system', ASCENDING), ('qhash', DESCENDING),
                      ('das.record', ASCENDING)]
        create_indexes(self.col, index_list + common_idx)
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('qhash', DESCENDING), ('das.record', ASCENDING),
                      ('das.ts', ASCENDING)]
        create_indexes(self.merge, index_list)
        # NOTE: I found that creating an index in the merge collection leads
        # to a MongoDB error when records contain multiple arrays on indexed
        # keys. For example, when we query file,run,lumi both file and run
        # are arrays in MongoDB. In this case the final sort in MongoDB
        # barks with the following message:
        # cannot sort with keys that are parallel arrays
        # it looks like there is no fix for that yet, see
        # http://stackoverflow.com/questions/6516725/how-do-i-index-two-arrays-in-mongodb
        # therefore I temporarily disabled the create_indexes call on the
        # merge collection which was used to create an index to ease the
        # final sort, especially when a lot of records correspond to the
        # initial query, e.g. file records.
        # On the other hand, the most common use case where the sort fails is
        # getting file records, and I can add one compound key to ease the
        # sort, but I can't add another compound key on an array field,
        # e.g. run
        common_idx = [[('qhash', DESCENDING), ('file.name', DESCENDING)]]
        create_indexes(self.merge, index_list + common_idx)

        # thread which cleans up DAS collections
        thname = 'mongocache_cleanup'
        cols = [
            config['dasdb']['cachecollection'],
            config['dasdb']['mrcollection'], config['dasdb']['mergecollection']
        ]

    @property
    def col(self):
        "col property provides access to DAS cache collection"
        conn = db_connection(self.dburi)
        mdb = conn[self.dbname]
        colnames = mdb.collection_names()
        if not colnames or self.col_ not in colnames:
            try:
                mdb.create_collection(self.col_)
            except OperationFailure:
                pass
        mdb.add_son_manipulator(self.das_son_manipulator)
        return mdb[self.col_]

    @property
    def merge(self):
        "merge property provides access to DAS merge collection"
        conn = db_connection(self.dburi)
        mdb = conn[self.dbname]
        colnames = mdb.collection_names()
        if not colnames or self.merge_ not in colnames:
            try:
                mdb.create_collection(self.merge_)
            except OperationFailure:
                pass
        mdb.add_son_manipulator(self.das_son_manipulator)
        return mdb[self.merge_]

    @property
    def mrcol(self):
        "mrcol property provides access to DAS map-reduce collection"
        conn = db_connection(self.dburi)
        mdb = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        return mdb[self.mrcol_]

    def get_dataset_hashes(self, dasquery):
        "Get dataset hashes from DBS database"
        spec = dasquery.mongo_query.get('spec', {})
        inst = dasquery.instance
        conn = db_connection(self.dburi)
        if spec and inst:
            dataset = spec.get('dataset.name', None)
            if dataset:
                if dataset.find('*') != -1:
                    cond = {'dataset': re.compile(dataset.replace('*', '.*'))}
                else:
                    cond = {'dataset': dataset}
                for row in conn['dbs'][inst].find(cond):
                    if 'qhash' in row:
                        yield row['qhash']

    def check_datasets(self, dasquery):
        "Check dataset presence in DAS cache for given das query"
        hashes = [r for r in self.get_dataset_hashes(dasquery)]
        if hashes:
            spec = {'qhash': {'$in': hashes}}
            if len(hashes) == self.merge.find(spec, **PYMONGO_OPTS).count():
                dasquery._hashes = hashes

    def get_superset_keys(self, key, value):
        """
        This is a special-case version of similar_keys,
        intended for analysers that want to quickly
        find possible superset queries of a simple
        query of the form key=value.
        """

        msg = "%s=%s" % (key, value)
        self.logger.debug(msg)
        cond = {'query.spec.key': key}
        for row in self.col.find(cond, **PYMONGO_OPTS):
            mongo_query = decode_mongo_query(row['query'])
            for thiskey, thisvalue in mongo_query.items():
                if thiskey == key:
                    if fnmatch.fnmatch(value, thisvalue):
                        yield thisvalue

    def get_fields(self, dasquery):
        "Prepare fields to extract from MongoDB"
        fields = dasquery.mongo_query.get('fields', [])
        if fields and 'records' in fields:
            fields = None  # look-up all records
        filters = dasquery.filters
        cond = {}
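        # plain filter names (no =, <, >) become extra projection fields,
        # while comparison filters are turned into query conditions below
        # via parse_filters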
        if filters:
            new_fields = []
            for dasfilter in filters:
                if dasfilter == 'unique':
                    continue
                if  fields and dasfilter not in fields and \
                    dasfilter not in new_fields:
                    if  dasfilter.find('=') == -1 and dasfilter.find('<') == -1\
                    and dasfilter.find('>') == -1:
                        new_fields.append(dasfilter)
                    else:
                        cond = parse_filters(dasquery.mongo_query)
            if not new_fields and fields:
                new_fields = list(fields)
            return new_fields, cond
        return fields, cond

    def remove_expired(self, dasquery, collection):
        """
        Remove expired records from the DAS cache. We need to perform this
        operation very carefully since we don't use transactions and on-going
        commits can invoke this method (see das_core.py). Therefore we use
        the MongoDB $or operator to wipe out queries which match the DASQuery
        hash and have already expired, or queries which lived in the cache
        longer than the rec_ttl config parameter. The latter operation just
        prevents the DAS cache from growing.
        """
        conn = db_connection(self.dburi)
        mdb = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        col = mdb[collection]
        # use additional delta to check data record expiration
        # we add this delta to ensure that there are no records close to
        # the current timestamp which may expire during request processing
        spec = {
            'qhash': dasquery.qhash,
            'das.expire': {
                '$lt': time.time() + self.del_ttl
            }
        }
        col.delete_many(spec)

    def check_services(self, dasquery):
        """
        Check if DAS cache contains DAS records with service response for
        given query.
        """
        das_rec = self.find(dasquery)
        if not das_rec:
            return False
        if 'das' not in das_rec:
            return False
        if 'services' not in das_rec['das']:
            return False
        spec = {
            'qhash': dasquery.qhash,
            'das.system': {
                '$ne': 'das'
            },
            'das.expire': {
                '$gt': time.time()
            }
        }
        nres = self.col.find(spec, **PYMONGO_OPTS).count()
        if nres:
            return True
        return False

    def find(self, dasquery):
        """
        Find provided query in DAS cache.
        """
        cond = {
            'qhash': dasquery.qhash,
            'das.system': 'das',
            'das.expire': {
                '$gt': time.time()
            }
        }
        return find_one(self.col, cond)

    def find_specs(self, dasquery, system='das'):
        """
        Check if cache has query whose specs are identical to provided query.
        Return all matches.
        """
        if dasquery.hashes:
            cond = {'qhash': {'$in': dasquery.hashes}}
        else:
            cond = {'qhash': dasquery.qhash}
        if system:
            cond.update({'das.system': system})
        cond.update({'das.expire': {'$gt': time.time()}})
        return self.col.find(cond, **PYMONGO_OPTS)

    def get_das_ids(self, dasquery):
        """
        Return list of DAS ids associated with given query
        """
        das_ids = []
        try:
            das_ids = \
                [r['_id'] for r in self.find_specs(dasquery, system='')]
        except:
            pass
        return das_ids

    def update_das_expire(self, dasquery, timestamp):
        "Update timestamp of all DAS data records for given query"
        nval = {'$set': {'das.expire': timestamp}}
        spec = {'qhash': dasquery.qhash}
        self.col.update_many(spec, nval)
        self.merge.update_many(spec, nval)

    def das_record(self, dasquery):
        "Retrieve DAS record for given query"
        cond = {'qhash': dasquery.qhash, 'das.expire': {'$gt': time.time()}}
        return find_one(self.col, cond)

    def find_records(self, das_id):
        " Return all the records matching a given das_id"
        return self.col.find({'das_id': das_id}, **PYMONGO_OPTS)

    def is_error_in_records(self, dasquery, collection='cache'):
        "Scan DAS cache for error records and return true or not"
        if collection == 'cache':
            results = self.col.find({'qhash': dasquery.qhash}, **PYMONGO_OPTS)
        else:
            results = self.merge.find({'qhash': dasquery.qhash},
                                      **PYMONGO_OPTS)
        error = None
        reason = None
        for row in results:
            if 'error' in row:
                error = row.get('error')
                reason = row.get('reason', '')
                break
        return error, reason

    def add_to_record(self, dasquery, info, system=None):
        "Add to existing DAS record provided info"
        if system:
            self.col.update_one(
                {
                    'query': dasquery.storage_query,
                    'das.system': system
                }, {'$set': info},
                upsert=True)
        else:
            self.col.update_one({'query': dasquery.storage_query},
                                {'$set': info},
                                upsert=True)

    def find_min_expire(self, dasquery):
        """Find minimal expire timestamp across all records for given DAS query"""
        spec = {'qhash': dasquery.qhash}
        min_expire = 2 * time.time()  # upper bound, will update
        for rec in self.col.find(spec, **PYMONGO_OPTS):
            if 'das' in rec and 'expire' in rec['das']:
                estamp = rec['das']['expire']
                if min_expire > estamp:
                    min_expire = estamp
        return long(min_expire)

    def find_query_record(self, dasquery):
        "Find DAS query records and return them to the caller"
        spec = {
            'qhash': dasquery.qhash,
            'das.record': record_codes('query_record')
        }
        return self.col.find(spec, **PYMONGO_OPTS)

    def update_query_record(self, dasquery, status, header=None, reason=None):
        "Update DAS record for provided query"
        ctime = time.time()
        das_spec = {'qhash': dasquery.qhash, 'das.system': 'das'}
        min_expire = self.find_min_expire(dasquery)
        if header:
            system = header['das']['system']
            sts = header['das']['status']
            expire = header['das']['expire']
            spec = {'qhash': dasquery.qhash, 'das.system': system}
            new_expire = None
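            # shrink the expire timestamp of any record that currently
            # expires later than the incoming header, and remember that we
            # did so to also update the 'das' system record below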
            for rec in self.col.find(spec, **PYMONGO_OPTS):
                if 'das' in rec and 'expire' in rec['das']:
                    if rec['das']['expire'] > expire:
                        new_expire = expire
                        ndict = {'das.expire': expire, 'das.status': status}
                        cdict = {'das.ctime': ctime}
                        udict = {'$set': ndict, '$push': cdict}
                        oid = ObjectId(rec['_id'])
                        self.col.update_one({'_id': oid}, udict)
            if new_expire:
                udict = {
                    '$set': {
                        'das.expire': new_expire
                    },
                    '$push': {
                        'das.ctime': ctime
                    }
                }
                self.col.update_one(das_spec, udict)
        else:
            udict = {
                '$set': {
                    'das.status': status,
                    'das.expire': min_expire
                },
                '$push': {
                    'das.ctime': ctime
                }
            }
            self.col.update_one(das_spec, udict)
        if reason:
            udict = {'$set': {'das.reason': reason}}
            self.col.update_one(das_spec, udict)
        # align all expire timestamps when we receive ok status
        if status == 'ok':
            udict = {'$set': {'das.expire': min_expire}}
            self.col.update_one(das_spec, udict)

    def apilist(self, dasquery):
        "Return list of apis for given dasquery"
        spec = {
            'qhash': dasquery.qhash,
            'das.record': record_codes('query_record')
        }
        apis = []
        for row in self.col.find(spec, ['das.api'], **PYMONGO_OPTS):
            try:
                apis += row['das']['api']
            except Exception as _err:
                pass
        return apis

    def incache(self,
                dasquery,
                collection='merge',
                system=None,
                api=None,
                query_record=False):
        """
        Check if we have query results in the cache; return True or False.
        Please note, the input parameter query means a MongoDB query, please
        consult the MongoDB API for more details,
        http://api.mongodb.org/python/
        """
        if query_record:
            record = record_codes('query_record')
        else:
            record = spec4data_records()
        spec = {
            'qhash': dasquery.qhash,
            'das.record': record,
            'das.expire': {
                '$gt': time.time()
            }
        }
        if system:
            spec.update({'das.system': system})
        if api:
            spec.update({'das.api': api})
        conn = db_connection(self.dburi)
        mdb = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        col = mdb[collection]
        res = col.find(spec, **PYMONGO_OPTS).count()
        msg = "(%s, coll=%s) found %s results" % (dasquery, collection, res)
        self.logger.info(msg)
        if res:
            return True
        return False

    def nresults(self, dasquery, collection='merge'):
        """Return number of results for given query."""
        if dasquery.aggregators:
            return len(dasquery.aggregators)
        # Distinguish 2 use cases, unique filter and general query:
        # in the first one we should count only unique records, in the latter
        # we can rely on the DB count() method. Please keep in mind that
        # usage of fields in find doesn't affect counting, since it
        # is a view over records found with spec, so we don't need to use it.
        fields, filter_cond = self.get_fields(dasquery)
        if not fields:
            spec = dasquery.mongo_query.get('spec', {})
        elif dasquery.hashes:
            spec = {
                'qhash': {
                    '$in': dasquery.hashes
                },
                'das.record': spec4data_records()
            }
        else:
            spec = {'qhash': dasquery.qhash, 'das.record': spec4data_records()}
        if filter_cond:
            spec.update(filter_cond)
        conn = db_connection(self.dburi)
        mdb = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        col = mdb[collection]
        if dasquery.unique_filter:
            skeys = self.mongo_sort_keys(collection, dasquery)
            if skeys:
                gen = col.find(spec, **PYMONGO_OPTS).sort(skeys)
            else:
                gen = col.find(spec, **PYMONGO_OPTS)
            res = len([r for r in unique_filter(gen)])
        else:
            res = col.find(spec, **PYMONGO_OPTS).count()
            if not res:  # double check that this is really the case
                time.sleep(1)
                res = col.find(spec, **PYMONGO_OPTS).count()
        msg = "%s" % res
        self.logger.info(msg)
        return res

    def mongo_sort_keys(self, collection, dasquery):
        """
        Find list of sort keys for a given DAS query. Check existing
        indexes and either use fields or spec keys to find them out.
        Return list of mongo sort keys in a form of (key, order).
        """
        # try to get sort keys all the time to get ordered list of
        # docs which allow unique_filter to apply afterwards
        fields = dasquery.mongo_query.get('fields')
        spec = dasquery.mongo_query.get('spec')
        skeys = dasquery.sortkeys
        mongo_skeys = []
        if skeys:
            for key in skeys:
                if key.find('-') != -1:  # reverse order, e.g. desc
                    mongo_skeys.append((key.replace('-', ''), DESCENDING))
                else:
                    mongo_skeys.append((key, ASCENDING))
        else:
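            # no user-supplied sort keys: fall back to keys that appear in
            # the query fields/spec and are covered by an existing index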
            existing_idx = [i for i in self.existing_indexes(collection)]
            if fields:
                lkeys = []
                for key in fields:
                    for mkey in self.mapping.mapkeys(key):
                        if mkey not in lkeys:
                            lkeys.append(mkey)
            else:
                lkeys = list(spec.keys())
            keys = [k for k in lkeys \
                if k.find('das') == -1 and k.find('_id') == -1 and \
                        k in existing_idx]
            mongo_skeys = [(k, ASCENDING) for k in keys]
        return mongo_skeys

    def existing_indexes(self, collection='merge'):
        """
        Get the list of existing indexes in the DB. They are returned by
        the index_information API in the following form:

        .. doctest::

            {u'_id_': {u'key': [(u'_id', 1)], u'v': 0},
             u'das.expire_1': {u'key': [(u'das.expire', 1)], u'v': 0},
             ...
             u'tier.name_-1': {u'key': [(u'tier.name', -1)], u'v': 0}}
        """
        conn = db_connection(self.dburi)
        mdb = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        col = mdb[collection]
        for val in col.index_information().values():
            for idx in val['key']:
                yield idx[0]  # index name

    def get_records(self, coll, spec, fields, skeys, idx, limit, unique=False):
        "Generator to get records from MongoDB."
        try:
            conn = db_connection(self.dburi)
            mdb = conn[self.dbname]
            mdb.add_son_manipulator(self.das_son_manipulator)
            col = mdb[coll]
            nres = col.find(spec, **PYMONGO_OPTS).count()
            if nres == 1 or nres <= limit:
                limit = 0
            if limit:
                res = col.find(spec, fields, sort=skeys, skip=idx, limit=limit)
            else:
                res = col.find(spec, fields, sort=skeys, **PYMONGO_OPTS)
            if unique:
                res = unique_filter(res)
            for row in res:
                yield row
        except Exception as exp:
            print_exc(exp)
            row = {'exception': str(exp)}
            res = []
            yield row

    def get_from_cache(self, dasquery, idx=0, limit=0, collection='merge'):
        "Generator which retrieves results from the cache"
        if dasquery.service_apis_map():  # valid DAS query
            result = self.get_das_records(dasquery, idx, limit, collection)
            for row in result:
                yield row
        else:  # pure MongoDB query
            fields = dasquery.mongo_query.get('fields', [])
            if fields == None:
                fields = []
            spec = dasquery.mongo_query.get('spec', {})
            if dasquery.filters:
                if not fields:
                    fields = []
                fields += dasquery.filters
                pkeys = [k.split('.')[0] for k in fields]
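                # pkeys holds the top-level keys of the filter fields; rows
                # are yielded below only if they carry at least one of them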
            fields += das_record_keys()
            if 'records' in dasquery.query:
                fields = None  # special case for DAS 'records' keyword
            skeys = self.mongo_sort_keys(collection, dasquery)
            result  = self.get_records(collection, spec, fields, skeys, \
                            idx, limit, dasquery.unique_filter)
            for row in result:
                if dasquery.filters:
                    if pkeys and set(pkeys) & set(row.keys()):
                        yield row
                else:
                    yield row

    def get_das_records(self, dasquery, idx=0, limit=0, collection='merge'):
        "Generator which retrieves DAS records from the cache"
        msg = "(%s, %s, %s, coll=%s)" % (dasquery, idx, limit, collection)
        self.logger.info(msg)

        idx = int(idx)
        fields, filter_cond = self.get_fields(dasquery)
        if fields == None:
            fields = []
        if not fields:
            spec = dasquery.mongo_query.get('spec', {})
        elif dasquery.hashes:
            spec = {
                'qhash': {
                    '$in': dasquery.hashes
                },
                'das.record': spec4data_records()
            }
        else:
            spec = {'qhash': dasquery.qhash, 'das.record': spec4data_records()}
        if filter_cond:
            spec.update(filter_cond)
        if 'records' in dasquery.query:
            fields = None  # retrieve all fields for records DAS query
        else:
            # be sure to extract das internal keys
            fields += das_record_keys()
        # try to get sort keys all the time to get ordered list of
        # docs which allow unique_filter to apply afterwards
        skeys = self.mongo_sort_keys(collection, dasquery)
        res     = self.get_records(collection, spec, fields, skeys, \
                        idx, limit, dasquery.unique_filter)
        counter = 0
        for row in res:
            counter += 1
            yield row
        msg = 'qhash %s, found %s record(s) in %s collection' \
                % (dasquery.qhash, counter, collection)
        print(dastimestamp('DAS INFO '), msg)

        if counter:
            msg = "yield %s record(s)" % counter
            self.logger.info(msg)

        # if no raw records were yielded we look up possible error records
        # and reset timestamp for record with system:['das']
        if not counter:
            spec = {'qhash': dasquery.qhash}
            nrec = self.col.find(spec, **PYMONGO_OPTS).count()
            if nrec:
                msg = "for query %s, found %s non-result record(s)" \
                        % (dasquery, nrec)
                print(dastimestamp('DAS WARNING'), msg)
                for rec in self.col.find(spec, **PYMONGO_OPTS):
                    if 'query' in rec:
                        print(dastimestamp('DAS das record'), rec)
            self.update_das_expire(dasquery, etstamp())

    def map_reduce(self, mr_input, dasquery, collection='merge'):
        """
        Wrapper around _map_reduce to allow sequential map/reduce
        operations, e.g. map/reduce over the output of another map/reduce.

        mr_input is either an alias name or a list of alias names of
        map/reduce functions.

        The input dasquery is applied to the first iteration of the
        map/reduce chain.
        """
        # NOTE: I need to revisit mapreduce.
        spec = dasquery.mongo_query['spec']
        if not isinstance(mr_input, list):
            mrlist = [mr_input]
        else:
            mrlist = mr_input
        conn = db_connection(self.dburi)
        mdb = conn[self.dbname]
        mdb.add_son_manipulator(self.das_son_manipulator)
        coll = mdb[collection]
        for mapreduce in mrlist:
            if mapreduce == mrlist[0]:
                cond = spec
            else:
                cond = None
            coll = self._map_reduce(coll, mapreduce, cond)
        for row in coll.find():
            yield row

    def _map_reduce(self, coll, mapreduce, spec=None):
        """
        Perform a map/reduce operation over the DAS cache using the provided
        collection, map/reduce name and optional conditions.
        """
        self.logger.debug("(%s, %s)" % (mapreduce, spec))
        record = find_one(self.mrcol, {'name': mapreduce})
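        # 'record' is assumed to have the shape
        #   {'name': <alias>, 'map': <JS map source>, 'reduce': <JS reduce source>}
        # since its 'map'/'reduce' values are compiled into bson Code below.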
        if not record:
            raise Exception("Map/reduce function '%s' not found" % mapreduce)
        fmap = record['map']
        freduce = record['reduce']
        if spec:
            result = coll.map_reduce(Code(fmap), Code(freduce), query=spec)
        else:
            result = coll.map_reduce(Code(fmap), Code(freduce))
        msg = "found %s records in %s" % (result.count(), result.name)
        self.logger.info(msg)
        self.logger.debug(fmap)
        self.logger.debug(freduce)
        return result

    def get_map_reduce(self, name=None):
        """
        Return the definition of the map/reduce function for the provided
        name, or the full list if no name is given.
        """
        spec = {}
        if name:
            spec = {'name': name}
        result = self.mrcol.find(spec, **PYMONGO_OPTS)
        for row in result:
            yield row

    def merge_records(self, dasquery, attempt=0):
        """
        Merge DAS records for the provided query. We perform the following
        steps:
        1. get all query records from das.cache, ordered by primary key
        2. run the aggregator function to merge neighbors
        3. insert the merged records into das.merge
        """
        ### TMP for asyncio
        #         time.sleep(attempt+3) # pymongo 3.2 doesn't yet flush in time

        # remove any entries in merge collection for this query
        self.merge.delete_many({'qhash': dasquery.qhash})
        # proceed
        self.logger.debug(dasquery)
        id_list = []
        expire = 9999999999  # future
        # get all API records for given DAS query
        spec = {
            'qhash': dasquery.qhash,
            'das.expire': {
                '$gt': time.time()
            },
            'das.record': record_codes('query_record')
        }
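        # Sketch of the spec above with assumed values (illustrative only):
        #   {'qhash': '<md5 of the query>',
        #    'das.expire': {'$gt': <current timestamp>},
        #    'das.record': <query_record code>}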
        records = self.col.find(spec, **PYMONGO_OPTS)
        for row in records:
            # find smallest expire timestamp to be used by aggregator
            rexpire = row.get('das', {}).get('expire', expire)
            if rexpire < expire:
                expire = rexpire
            if row['_id'] not in id_list:
                id_list.append(row['_id'])
        inserted = 0
        lookup_keys = set()
        fields = dasquery.mongo_query.get('fields')
        if not fields:  # Mongo
            fields = []
        for key in fields:
            for pkey in self.mapping.mapkeys(key):
                lookup_keys.add(pkey)
        for pkey in lookup_keys:
            skey = [(pkey, DESCENDING)]
            # lookup all service records
            spec = {'das_id': {'$in': id_list}, 'das.primary_key': pkey}
            if self.verbose:
                nrec = self.col.find(spec, **PYMONGO_OPTS).sort(skey).count()
                msg = "merging %s records, for %s key" % (nrec, pkey)
            else:
                msg = "merging records, for %s key" % pkey
            self.logger.debug(msg)
            # use exhaust=False since we process all records in the aggregator
            # and there can be a delay in processing
            records = self.col.find(spec, **PYMONGO_NOEXHAUST).sort(skey)
            # aggregate all records
            agen = aggregator(dasquery, records, expire)
            # diff aggregated records
            gen = das_diff(agen, self.mapping.diff_keys(pkey.split('.')[0]))
            # insert all records into das.merge using bulk insert
            try:
                res = self.merge.insert_many(gen)
                inserted += len(res.inserted_ids)
            except InvalidDocument as exp:
                print(dastimestamp('DAS WARNING'),
                      'InvalidDocument during merge', str(exp))
                msg = "Caught bson error: " + str(exp)
                self.logger.info(msg)
                records = self.col.find(spec, **PYMONGO_OPTS).sort(skey)
                gen = aggregator(dasquery, records, expire)
                genrows = parse2gridfs(self.gfs, pkey, gen, self.logger)
                das_dict = {
                    'das': {
                        'expire': expire,
                        'record': record_codes('gridfs_record'),
                        'primary_key': [k for k in lookup_keys],
                        'system': ['gridfs']
                    },
                    'qhash': dasquery.qhash,
                    'cache_id': [],
                    'das_id': id_list
                }
                for row in genrows:
                    row.update(das_dict)
                    self.merge.insert_one(row)
            except InvalidOperation:
                pass
            except DuplicateKeyError as err:
                print(dastimestamp('DAS WARNING'),
                      'DuplicateKeyError during merge')
                if not isinstance(gen, list):
                    raise err
        status = 'fail'
        if inserted:
            status = 'ok'
        elif not lookup_keys:  # we got a query w/o fields
            msg = 'qhash %s, no lookup_keys' % dasquery.qhash
            print(dastimestamp('DAS WARNING'), msg)
            status = 'ok'
        else:  # we didn't merge anything, i.e. a DB look-up failure
            msg  = 'qhash %s, did not insert into das.merge, attempt %s' \
                    % (dasquery.qhash, attempt)
            print(dastimestamp('DAS WARNING'), msg)
            empty_expire = etstamp()
            lkeys = list(lookup_keys)
            das = dict(expire=empty_expire,
                       primary_key=lkeys[0],
                       condition_keys=lkeys,
                       instance=dasquery.instance,
                       system=['das'],
                       services=dasquery.services,
                       record=record_codes('empty_record'),
                       ts=time.time(),
                       api=[])
            empty_record = {
                'das': das,
                'qhash': dasquery.qhash,
                'cache_id': [],
                'das_id': id_list
            }
            for key in lkeys:
                empty_record.update({key.split('.')[0]: []})
            for key, val in dasquery.mongo_query['spec'].items():
                if key.find('.') == -1:
                    empty_record[key] = []
                else:  # it is compound key, e.g. site.name
                    newkey, newval = convert_dot_notation(key, val)
                    empty_record[newkey] = adjust_mongo_keyvalue(newval)
            self.merge.insert_one(empty_record)
            # update DAS records (both meta and data ones, by using qhash)
            nval = {'$set': {'das.expire': empty_expire}}
            spec = {'qhash': dasquery.qhash}
            self.col.update_many(spec, nval)
        return status

    def update_cache(self, dasquery, results, header, system, api):
        """
        Insert results into the cache using bulk insert (controlled by
        self.cache_size). Upon completion, ensure indexes.
        """
        # update results records in DAS cache
        gen = self.generate_records(dasquery, results, header)
        inserted = 0
        # bulk insert
        try:
            res = self.col.insert_many(gen,
                                       ordered=False,
                                       bypass_document_validation=True)
            inserted += len(res.inserted_ids)
        except InvalidOperation:
            pass

        # update query record for this sub-system
        self.update_query_record_system(dasquery, system, api, 'ok')

        if dasquery.qcache:  # custom DASQuery cache
            self.update_das_expire(dasquery, expire_timestamp(dasquery.qcache))

    def update_query_record_system(self, dasquery, system, api, status):
        "Update system status of dasquery in das.cache collection"
        spec = {
            'qhash': dasquery.qhash,
            'das.system': system,
            'das.api': api,
            'das.record': record_codes('query_record')
        }
        udict = {'$set': {'das.status': status}}
        self.col.find_one_and_update(
            spec, udict, return_document=ReturnDocument.AFTER)

    def insert_query_record(self, dasquery, header):
        """
        Insert query record into DAS cache.
        """
        # check presence of API record in a cache
        dasheader = header['das']
        system = dasheader['system']
        api = dasheader['api']
        collection = 'cache'
        check_query = True
        expire = dasheader.get('expire', None)
        if expire:
            dasheader['expire'] = adjust_expire(expire)
        if not self.incache(dasquery, collection, system, api, check_query):
            msg = "query=%s, header=%s" % (dasquery, header)
            self.logger.debug(msg)
            q_record = dict(das=dasheader, query=dasquery.storage_query)
            q_record['das']['record'] = record_codes('query_record')
            q_record['das']['status'] = "requested"
            q_record['qhash'] = dasquery.qhash
            q_record['das']['ctime'] = [time.time()]
            res = self.col.insert_one(q_record)
            if not res:
                msg = 'unable to insert query record'
                print(dastimestamp('DAS ERROR '), dasquery, msg,
                      ', will retry')
                time.sleep(1)
                res = self.col.insert_one(q_record)
                if not res:
                    print(dastimestamp('DAS ERROR '), dasquery, msg)

    def generate_records(self, dasquery, results, header):
        """
        Iterate over the provided results, update each record and yield it
        to the next level (update_cache).
        """
        self.logger.debug("(%s) store to cache" % dasquery)
        if not results:
            return

        dasheader = header['das']
        expire = adjust_expire(dasheader['expire'])
        system = dasheader['system']  # DAS service names, e.g. combined
        services = dasheader['services']  # CMS services used to get data
        api = dasheader['api']
        prim_key = header.get('prim_key', None)
        if not prim_key:
            # get primary key from a list of lookup keys which has the
            # following structure [{'api':[keys]}, {...}]
            lup_keys = header['lookup_keys']
            lkeys = [l for i in lup_keys for k in i.values() for l in k]
            prim_key = lkeys[0] if 'summary' not in lkeys else 'summary'
        cond_keys = list(dasquery.mongo_query['spec'].keys())
        # get API record id
        spec = {
            'qhash': dasquery.qhash,
            'das.system': system,
            'das.expire': {
                '$gt': time.time()
            },
            'das.record': record_codes('query_record')
        }
        counter = 0
        rids = [str(r['_id']) for r in \
                self.col.find(spec, ['_id'], **PYMONGO_OPTS)]
        if rids:
            if isinstance(results, (list, GeneratorType)):
                for item in results:
                    counter += 1
                    if 'das' in item:
                        expire = item.get('das').get('expire', expire)
                        dasheader['expire'] = expire
                    item['das'] = dict(expire=expire,
                                       primary_key=prim_key,
                                       condition_keys=cond_keys,
                                       instance=dasquery.instance,
                                       system=system,
                                       services=services,
                                       record=record_codes('data_record'),
                                       ts=time.time(),
                                       api=api)
                    item['das_id'] = rids
                    item['qhash'] = dasquery.qhash
                    yield item
            else:
                print("\n\n ### results = ", str(results))
                raise Exception(
                    'Provided results is not a list/generator type')
        if expire != dasheader['expire']:  # update DAS records
            header['das']['expire'] = expire
        # update das record with new status
        status = 'Update DAS cache, %s API' % header['das']['api'][0]
        self.update_query_record(dasquery, status, header)

        msg = "\n%s yield %s rows" % (dasheader['system'], counter)
        self.logger.info(msg)

    def remove_from_cache(self, dasquery):
        """
        Remove a query from the DAS cache. To do so, we retrieve the API
        records and remove all associated records from das.cache and das.merge.
        """
        records = self.col.find({'qhash': dasquery.qhash}, **PYMONGO_OPTS)
        id_list = []
        for row in records:
            if row['_id'] not in id_list:
                id_list.append(row['_id'])
        spec = {'das_id': {'$in': id_list}}
        self.merge.delete_many(spec)
        self.merge.delete_many({'qhash': dasquery.qhash})
        self.col.delete_many(spec)
        self.col.delete_many({'qhash': dasquery.qhash})

    def clean_cache(self, collection=None):
        """
        Clean expired docs in das.cache and das.merge.
        """
        current_time = time.time()
        query = {'das.expire': {'$lt': current_time}}
        if not collection or collection == 'merge':
            self.merge.delete_many(query)
        if not collection or collection == 'cache':
            self.col.delete_many(query)

    def delete_cache(self):
        """
        Delete all results in DAS cache/merge collection, including
        internal indexes.
        """
        self.col.delete_many({})
        try:
            self.col.drop_indexes()
        except Exception:
            pass
        self.merge.delete_many({})
        try:
            self.merge.drop_indexes()
        except Exception:
            pass