    def test_debug(self):
        "Test logger debug method"
        old_stdout = sys.stdout
        logger = PrintManager(self.name, verbose=2)
        sys.stdout = StringIO()
        logger.debug('test')
        result = sys.stdout.getvalue()
        expect = 'DEBUG %s:%s test\n' % (self.name, funcname())
        self.assertEqual(expect, result)
        sys.stdout = old_stdout
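# ---------------------------------------------------------------------------
# The test above performs the redirect/restore dance by hand. Below is a
# minimal, self-contained sketch of that stdout-capture pattern as a context
# manager; `capture_stdout` is a hypothetical helper, not part of DAS, shown
# only to illustrate the technique the test relies on.
import sys
from contextlib import contextmanager
from StringIO import StringIO # Python 2; on Python 3 use: from io import StringIO

@contextmanager
def capture_stdout():
    "Temporarily redirect sys.stdout to a StringIO buffer and yield it"
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        yield sys.stdout
    finally:
        sys.stdout = old_stdout # always restore, even if the body raises

# usage sketch:
#     with capture_stdout() as buf:
#         logger.debug('test')
#     assert buf.getvalue().startswith('DEBUG')
# ---------------------------------------------------------------------------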
class DASAbstractService(object):
    """
    Abstract class describing a DAS service. It is initialized with a name
    which is used to identify service parameters from the DAS configuration
    file. Those parameters are keys, verbosity level and the URL of the
    data-service.
    """
    def __init__(self, name, config):
        self.name = name
        try:
            self.verbose = config['verbose']
            title = 'DASAbstactService_%s' % self.name
            self.logger = PrintManager(title, self.verbose)
            self.dasmapping = config['dasmapping']
            self.write2cache = config.get('write_cache', True)
            self.multitask = config['das'].get('multitask', True)
            self.error_expire = config['das'].get('error_expire', 300)
            self.dbs_global = None # to be configured at run time
            self.dburi = config['mongodb']['dburi']
            engine = config.get('engine', None)
            self.gfs = db_gridfs(self.dburi)
        except Exception as exc:
            print_exc(exc)
            raise Exception('fail to parse DAS config')

        # read key/cert info
        try:
            self.ckey, self.cert = get_key_cert()
        except Exception as exc:
            print_exc(exc)
            self.ckey = None
            self.cert = None

        if self.multitask:
            nworkers = config['das'].get('api_workers', 3)
            thr_weights = config['das'].get('thread_weights', [])
            for system_weight in thr_weights:
                system, weight = system_weight.split(':')
                if system == self.name:
                    nworkers *= int(weight)
            if engine:
                thr_name = 'DASAbstractService:%s:PluginTaskManager' % self.name
                self.taskmgr = PluginTaskManager(\
                        engine, nworkers=nworkers, name=thr_name)
                self.taskmgr.subscribe()
            else:
                thr_name = 'DASAbstractService:%s:TaskManager' % self.name
                self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        self.map = {}        # to be defined by data-service implementation
        self._keys = None    # to be defined at run-time in self.keys
        self._params = None  # to be defined at run-time in self.parameters
        self._notations = {} # to be defined at run-time in self.notations

        self.logger.info('initialized')
        # define internal cache manager to put 'raw' results into cache
        if 'rawcache' in config and config['rawcache']:
            self.localcache = config['rawcache']
        else:
            msg = 'Undefined rawcache, please check your configuration'
            raise Exception(msg)

    def services(self):
        """
        Return sub-subsystems used to retrieve data records. It is used
        in the dasheader call to set up the das.services field. This method
        can be overwritten in sub-classes; otherwise it returns a dict of
        the service name and CMS systems used to retrieve data records.
        """
        return {self.name: [self.name]}

    def version(self):
        """Return data-services version, should be implemented in sub-classes"""
        return ''

    def keys(self):
        """
        Return service keys
        """
        if self._keys:
            return self._keys
        srv_keys = []
        for _api, params in self.map.items():
            for key in params['keys']:
                if not key in srv_keys:
                    srv_keys.append(key)
        self._keys = srv_keys
        return srv_keys

    def parameters(self):
        """
        Return mapped service parameters
        """
        if self._params:
            return self._params
        srv_params = []
        for _api, params in self.map.items():
            for key in params['params']:
                param_list = self.dasmapping.api2das(self.name, key)
                for par in param_list:
                    if not par in srv_params:
                        srv_params.append(par)
        self._params = srv_params
        return srv_params

    def notations(self):
        """
        Return a map of system notations.
        """
        if self._notations:
            return self._notations
        for _, rows in self.dasmapping.notations(self.name).items():
            for row in rows:
                api = row['api']
                nmap = row['rec_key']
                notation = row['api_output']
                if api in self._notations:
                    self._notations[api].update({notation: nmap})
                else:
                    self._notations[api] = {notation: nmap}
        return self._notations

    def getdata(self, url, params, expire, headers=None, post=None):
        """URL call wrapper"""
        if url.find('https:') != -1:
            return getdata(url, params, headers, expire, post,
                    self.error_expire, self.verbose, self.ckey, self.cert,
                    system=self.name)
        else:
            return getdata(url, params, headers, expire, post,
                    self.error_expire, self.verbose, system=self.name)

    def call(self, dasquery):
        """
        Invoke service API to execute given query.
        Return results as a collect list set.
        """
        self.logger.info(dasquery)
        # check the cache for records with given query/system
        res = self.localcache.incache(dasquery,
                                      collection='cache',
                                      system=self.name)
        if res:
            msg = "found records in local cache"
            self.logger.info(msg)
            return
        # ask data-service api to get results; they'll be stored in the
        # cache, so return at the end what we have in cache.
        self.api(dasquery)

    def write_to_cache(self, dasquery, expire, url, api, args, gen, ctime):
        """
        Write provided result set into DAS cache.
        """
        if not self.write2cache:
            return
        # before going to cache we should check/set possible misses, e.g.
        # primary key when error is thrown
        result = self.set_misses(dasquery, api, gen)
        # update the cache
        header = dasheader(self.name, dasquery, expire, api, url,
                services=self.services())
        header['lookup_keys'] = self.lookup_keys(api)
        header['prim_key'] = self.dasmapping.primary_mapkey(self.name, api)
        header['ctime'] = ctime
        self.localcache.update_cache(dasquery, result, header)
        msg = 'cache has been updated,\n'
        self.logger.debug(msg)

    def adjust_params(self, api, kwds, instance=None):
        """
        Data-service specific parser to adjust parameters according to its
        specifications. For example, the DQ service accepts a string of
        parameters rather than a parameter set, while DBS2 can reuse some
        parameters for different APIs, e.g. a dataset path can be passed
        to listPrimaryDatasets as a primary_dataset pattern.
        """
        pass

    def lookup_keys(self, api):
        """
        Return look-up keys of data output for given data-service API.
        """
        lkeys = self.dasmapping.lookup_keys(self.name, api)
        return [{api: lkeys}]

    def inspect_params(self, api, args):
        """
        Perform API parameter inspection. Check if the API accepts a range
        of parameters, etc.
        """
        for key, value in args.items():
            if isinstance(value, dict):
                minval = None
                maxval = None
                for oper, val in value.items():
                    if oper == '$in':
                        minval = int(val[0])
                        maxval = int(val[-1])
                        args[key] = range(minval, maxval)
                    elif oper == '$lt':
                        maxval = int(val)
                        args[key] = maxval
                    elif oper == '$lte':
                        maxval = int(val)
                        args[key] = maxval
                    elif oper == '$gt':
                        minval = int(val)
                        args[key] = minval
                    elif oper == '$gte':
                        minval = int(val)
                        args[key] = minval
                    else:
                        msg = '%s does not support operator %s' % (api, oper)
                        raise Exception(msg)
        return args

    def get_notations(self, api):
        """Return notations used for given API"""
        notationmap = self.notations()
        if not notationmap:
            return {}
        notations = {}
        if '' in notationmap:
            notations = dict(notationmap['']) # notations applied to all APIs
        if api in notationmap: # overwrite the one for provided API
            notations.update(notationmap[api])
        return notations

    def parser(self, dasquery, dformat, data, api):
        """
        DAS data parser. Input parameters:

        - *query* input DAS query
        - *dformat* is a data format, e.g. XML, JSON
        - *data* is a data source, either a file-like object or actual data
        - *api* is the API name
        """
        prim_key = self.dasmapping.primary_key(self.name, api)
        counter = 0
        if dformat.lower() == 'xml':
            tags = self.dasmapping.api2daskey(self.name, api)
            gen = xml_parser(data, prim_key, tags)
            for row in gen:
                counter += 1
                yield row
        elif dformat.lower() == 'json' or dformat.lower() == 'dasjson':
            gen = json_parser(data, self.logger)
            das_dict = {}
            for row in gen:
                if dformat.lower() == 'dasjson':
                    for key, val in row.items():
                        if key != 'results':
                            das_dict[key] = val
                    row = row['results']
                if isinstance(row, list):
                    for item in row:
                        if item:
                            if prim_key in item:
                                counter += 1
                                yield item
                            else:
                                counter += 1
                                yield {prim_key: item}
                else:
                    if prim_key in row:
                        counter += 1
                        yield row
                    else:
                        counter += 1
                        yield {prim_key: row}
        else:
            msg = 'Unsupported data format="%s", API="%s"' % (dformat, api)
            raise Exception(msg)
        msg  = "api=%s, format=%s " % (api, dformat)
        msg += "prim_key=%s yield %s rows" % (prim_key, counter)
        self.logger.info(msg)

    def translator(self, api, genrows):
        """
        Convert raw results into DAS records.
        """
        prim_key = self.dasmapping.primary_key(self.name, api)
        count = 0
        for row in genrows:
            row2das(self.dasmapping.notation2das, self.name, api, row)
            count += 1
            # check for primary key existence, since it can be overridden
            # by row2das. For example DBS3 uses a flat namespace, so we
            # override dataset=>name, while dataset is still a primary key
            if isinstance(row, list):
                yield {prim_key: row}
            elif prim_key in row:
                if prim_key in row[prim_key]:
                    yield row[prim_key] # remapping may create nested dict
                else:
                    yield row
            else:
                yield {prim_key: row}
        msg = "yield %s rows" % count
        self.logger.debug(msg)

    def set_misses(self, dasquery, api, genrows):
        """
        Check and adjust DAS records wrt the input query. If some of the
        DAS keys are missing, add them with their values to the DAS record.
        """
        # look-up primary key
        prim_key = self.dasmapping.primary_key(self.name, api)
        # scan all docs and store those whose size is above the MongoDB
        # limit into GridFS
        map_key = self.dasmapping.primary_mapkey(self.name, api)
        genrows = parse2gridfs(self.gfs, map_key, genrows, self.logger)

        spec = dasquery.mongo_query['spec']
        row = next(genrows)
        ddict = DotDict(row)
        keys2adjust = []
        for key in spec.keys():
            val = ddict.get(key)
            if spec[key] != val and key not in keys2adjust:
                keys2adjust.append(key)
        msg = "adjust keys %s" % keys2adjust
        self.logger.debug(msg)
        count = 0
        if keys2adjust:
            # adjust of the rows
            for row in yield_rows(row, genrows):
                ddict = DotDict(row)
                pval = ddict.get(map_key)
                if isinstance(pval, dict) and 'error' in pval:
                    ddict[map_key] = ''
                    ddict.update({prim_key: pval})
                for key in keys2adjust:
                    value = spec[key]
                    existing_value = ddict.get(key)
                    # the way to deal with proximity/pattern/condition results
                    if (isinstance(value, str) or isinstance(value, unicode))\
                        and value.find('*') != -1: # we got pattern
                        if existing_value:
                            value = existing_value
                    elif isinstance(value, dict) or \
                        isinstance(value, list): # we got condition
                        if existing_value:
                            value = existing_value
                        elif isinstance(value, dict) and \
                            '$in' in value: # we got a range {'$in': []}
                            value = value['$in']
                        elif isinstance(value, dict) and \
                            '$lte' in value and '$gte' in value:
                            # we got a between range
                            value = [value['$gte'], value['$lte']]
                        else:
                            value = json.dumps(value)
                    elif existing_value and value != existing_value:
                        # we got proximity results
                        if 'proximity' in ddict:
                            proximity = DotDict({key: existing_value})
                            ddict['proximity'].update(proximity)
                        else:
                            proximity = DotDict({})
                            proximity[key] = existing_value
                            ddict['proximity'] = proximity
                    else:
                        if existing_value:
                            value = existing_value
                    ddict[key] = value
                yield ddict
                count += 1
        else:
            yield row
            for row in genrows:
                yield row
                count += 1
        msg = "yield %s rows" % count
        self.logger.debug(msg)

    def api(self, dasquery):
        """
        Data-service api method; can be redefined by the data-service class.
        It parses the input query and invokes the appropriate data-service
        API call. All results are stored into the DAS cache along with the
        api call inserted into the Analytics DB.
        """
        self.logger.info(dasquery)
        genrows = self.apimap(dasquery)
        if not genrows:
            return
        jobs = []
        for url, api, args, dformat, expire in genrows:
            # insert DAS query record for given API
            header = dasheader(self.name, dasquery, expire, api, url)
            self.localcache.insert_query_record(dasquery, header)
            # fetch DAS data records
            if self.multitask:
                jobs.append(self.taskmgr.spawn(self.apicall, \
                        dasquery, url, api, args, dformat, expire))
            else:
                self.apicall(dasquery, url, api, args, dformat, expire)
        if self.multitask:
            self.taskmgr.joinall(jobs)

    def apicall(self, dasquery, url, api, args, dformat, expire):
        """
        Data-service api method; can be redefined by the data-service class.
        It parses the input query and invokes the appropriate data-service
        API call. All results are stored into the DAS cache along with the
        api call inserted into the Analytics DB.

        We invoke an explicit close call on our datastream instead of using
        a context manager since this method, as well as getdata/parser, can
        be overwritten by child classes.
        """
        datastream = None
        try:
            args = self.inspect_params(api, args)
            time0 = time.time()
            headers = make_headers(dformat)
            datastream, expire = self.getdata(url, args, expire, headers)
            self.logger.info("%s expire %s" % (api, expire))
            rawrows = self.parser(dasquery, dformat, datastream, api)
            dasrows = self.translator(api, rawrows)
            ctime = time.time() - time0
            self.write_to_cache(dasquery, expire, url, api, args,
                    dasrows, ctime)
        except Exception as exc:
            msg = 'Fail to process: url=%s, api=%s, args=%s' \
                    % (url, api, args)
            print(msg)
            print_exc(exc)
        close(datastream)

    def url_instance(self, url, _instance):
        """
        Virtual method to adjust URL for a given instance,
        must be implemented in service classes
        """
        return url

    def adjust_url(self, url, instance):
        """
        Adjust data-service URL wrt provided instance,
        e.g. DBS carries several instances
        """
        if instance:
            url = self.url_instance(url, instance)
        return url

    def apimap(self, dasquery):
        """
        Analyze input query and yield url, api, args, format, expire
        for further processing.
        """
        srv = self.name # get local copy to avoid threading issues
        cond = getarg(dasquery.mongo_query, 'spec', {})
        instance = dasquery.mongo_query.get('instance', self.dbs_global)
        skeys = getarg(dasquery.mongo_query, 'fields', [])
        if not skeys:
            skeys = []
        self.logger.info("\n")
        for api, value in self.map.items():
            expire = value['expire']
            iformat = value['format']
            url = self.adjust_url(value['url'], instance)
            if not url:
                msg = '--- rejects API %s, no URL' % api
                self.logger.info(msg)
                continue
            args = dict(value['params']) # make new copy, since we'll adjust
            wild = value.get('wild_card', '*')
            found = 0
            # check if input parameters are covered by API
            if not self.dasmapping.check_api_match(srv, api, cond):
                msg = '--- rejects API %s, does not cover input condition keys' \
                        % api
                self.logger.info(msg)
                continue
            # once we know that the API covers the input set of parameters
            # we check every input parameter for pattern matching
            for key, val in cond.items():
                # check if keys from conditions are accepted by API;
                # need to convert key (which is daskeys.map) into
                # input api parameter
                for apiparam in self.dasmapping.das2api(srv, api, key, val):
                    if apiparam in args:
                        args[apiparam] = val
                        found += 1
            # VK 20160708, wrong statement, it caused to pass
            # datasets API for query dataset in [path1, path2]
            # I'll leave block here until I test and verify that
            # commented out block will not cause other issues
            #
            # check the case when we only have single condition key
            # and it is the key we look-up
#            if not found and skeys == [k.split('.')[0] for k in cond.keys()]:
#                found = 1
            # check if number of keys in cond and args are the same
            if len(cond.keys()) != found:
                msg = "--- rejects API %s, not all condition keys are covered" \
                        % api
                self.logger.info(msg)
                msg = 'args=%s' % args
                self.logger.debug(msg)
                continue
            if not found:
                msg = "--- rejects API %s, parameters don't match" % api
                self.logger.info(msg)
                msg = 'args=%s' % args
                self.logger.debug(msg)
                continue
            self.adjust_params(api, args, instance)
            # delete args keys whose value is optional
            delete_keys(args, 'optional')
            # check that there is no "required" parameter left in args,
            # since such api will not work
            if 'required' in args.values():
                msg = '--- rejects API %s, parameter is required' % api
                self.logger.info(msg)
                msg = 'args=%s' % args
                self.logger.debug(msg)
                continue
            # adjust pattern symbols in arguments
            if wild != '*':
                for key, val in args.items():
                    if isinstance(val, str) or isinstance(val, unicode):
                        val = val.replace('*', wild)
                    args[key] = val
            # compare query selection keys with API look-up keys
            api_lkeys = self.dasmapping.api_lkeys(srv, api)
            if set(api_lkeys) != set(skeys):
                msg = "--- rejects API %s, api_lkeys(%s)!=skeys(%s)"\
                        % (api, api_lkeys, skeys)
                self.logger.info(msg)
                continue
            msg = '+++ %s passes API %s' % (srv, api)
            self.logger.info(msg)
            msg = 'args=%s' % args
            self.logger.debug(msg)
            msg  = "yield "
            msg += "system ***%s***, url=%s, api=%s, args=%s, format=%s, " \
                    % (srv, url, api, args, iformat)
            msg += "expire=%s, wild_card=%s" \
                    % (expire, wild)
            self.logger.debug(msg)
            yield url, api, args, iformat, expire
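# ---------------------------------------------------------------------------
# A minimal sketch (not part of DAS) of how a concrete data-service could
# populate self.map, which keys()/parameters()/apimap() above read. The
# service name, URL and parameter names are hypothetical; the structure
# ('url', 'params', 'keys', 'format', 'expire', 'wild_card') and the
# 'required'/'optional' placeholder convention follow the apimap() logic
# shown above.
class HypotheticalService(DASAbstractService):
    "Example data-service implementation"
    def __init__(self, config):
        DASAbstractService.__init__(self, 'hypothetical', config)
        self.map = {
            'listDatasets': {              # API name (urn)
                'url': 'http://a.b.com/api/listDatasets',
                'params': {
                    'dataset': 'required', # must be filled from query conditions
                    'detail': 'optional',  # pruned by delete_keys(args, 'optional')
                },
                'keys': ['dataset'],       # DAS keys served by this API
                'format': 'JSON',          # consumed by parser()
                'expire': 3600,            # cache lifetime, seconds
                'wild_card': '*',          # pattern symbol used by apimap()
            },
        }
# ---------------------------------------------------------------------------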
class DASAnalytics(object):
    """
    DAS analytics DB manager.
    """
    def __init__(self, config):
        self.verbose = config['verbose']
        self.logger = PrintManager('DASAnalytics', self.verbose)
        self.dburi = config['mongodb']['dburi']
        self.dbname = config['analyticsdb']['dbname']
        self.colname = config['analyticsdb']['collname']
        self.history = config['analyticsdb']['history']
        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)
        self.create_db()

    def create_db(self):
        """
        Create analytics DB in MongoDB back-end.
        """
        self.conn = db_connection(self.dburi)
        database = self.conn[self.dbname]
        das_son_manipulator = DAS_SONManipulator()
        database.add_son_manipulator(das_son_manipulator)
        self.col = database[self.colname]
#        if self.dbname not in self.conn.database_names():
#            capped_size = 104857600
#            options = {'capped': True, 'size': capped_size}
#            database = self.conn[self.dbname]
#            database.create_collection('self.colname', **options)
#            print "####CREATE CAPPED ANALYTICS"
#        self.col = self.conn[self.dbname][self.colname]

    def delete_db(self):
        """
        Delete analytics DB in MongoDB back-end.
        """
        self.conn.drop_database(self.dbname)

    def delete_db_collection(self):
        """
        Delete analytics DB collection in MongoDB.
        """
        self.conn.drop_collection(self.colname)

    def add_query(self, query, mongoquery):
        """
        Add DAS-QL/MongoDB-QL queries into analytics. A single record is
        kept for each (qhash, dhash) pair, holding an array of call-times.
        """
        if isinstance(mongoquery, dict):
            mongoquery = encode_mongo_query(mongoquery)
        msg = 'query=%s, mongoquery=%s' % (query, mongoquery)
        self.logger.debug(msg)
        dhash = genkey(query)
        qhash = genkey(mongoquery)
        now = time.time()
        existing = self.col.find_one({'qhash': qhash, 'dhash': dhash})
        if existing:
            # check if times contains very old timestamps
            rec = self.col.find({'_id': ObjectId(existing['_id']),
                                 'times': {'$lt': now - self.history}})
            if rec:
                self.col.update({'_id': ObjectId(existing['_id'])},
                        {'$pull': {'times': {'$lt': now - self.history}}})
            # update times array with new timestamp
            self.col.update({'_id': ObjectId(existing['_id'])},
                            {'$push': {'times': now}})
        else:
            record = dict(query=query, mongoquery=mongoquery,
                          qhash=qhash, dhash=dhash, times=[now])
            self.col.insert(record)
        index = [('qhash', DESCENDING), ('dhash', DESCENDING)]
        create_indexes(self.col, index)

    def clean_queries(self):
        """
        Standalone method to clean up expired call-times from query records,
        since otherwise only the active record is cleaned. This is too
        expensive to do with every operation, and MongoDB does not allow
        multiple modifications to a single field in a single update
        operation (i.e. we can't do $push and $pull in one update), so it
        should probably be done asynchronously at fixed intervals.
        """
        self.logger.debug('')
        now = time.time()
        # clean out the times array
        self.col.update({'times': {'$exists': True}},
                        {'$pull': {'times': {'$lt': now - self.history}}})
        # now delete any with no times
        self.col.remove({'times': {'$size': 0}})
        # and should maybe delete anything with the same qhash here?

    def remove_expired(self):
        "Moved from AbstractService - remove old apicall records"
        spec = {'apicall.expire': {'$lt': int(time.time())}}
        self.col.remove(spec)

    def add_summary(self, identifier, start, finish, **payload):
        """
        Add an analyzer summary, with given analyzer identifier, start and
        finish times and payload. It is intended that a summary document is
        deposited on each run of an analyzer (if desirable) and is
        thereafter immutable.
        """
        msg = '(%s, %s->%s, %s)' % (identifier, start, finish, payload)
        self.logger.debug(msg)
        # clean-up analyzer records whose start timestamp is too old
        spec = {'start': {'$lt': time.time() - self.history},
                'analyzer': {'$exists': True}}
        self.col.remove(spec)
        # insert new analyzer record
        record = {'analyzer': identifier, 'start': start, 'finish': finish}
        payload.update(record) # ensure key fields are set correctly
        self.col.insert(payload)
        # ensure summary items are indexed for quick extract
        create_indexes(self.col, [('analyzer', DESCENDING),
                                  ('start', ASCENDING)])

    def get_summary(self, identifier, after=None, before=None, **query):
        """
        Retrieve a summary document for a given analyzer-identifier,
        optionally specifying a time range.
        """
        cond = {'analyzer': identifier}
        if after:
            cond['start'] = {'$gte': after}
        if before:
            cond['finish'] = {'$lte': before}
        if query:
            cond.update(query)
        return list(self.col.find(cond))

    def add_api(self, system, query, api, args):
        """
        Add API info to analytics DB. Here args is a dict of API parameters.
        """
        orig_query = query
        if isinstance(query, dict):
            query = encode_mongo_query(query)
        msg = '(%s, %s, %s, %s)' % (system, query, api, args)
        self.logger.debug(msg)
        # find query record
        qhash = genkey(query)
        record = self.col.find_one({'qhash': qhash}, fields=['dasquery'])
        if not record:
            self.add_query("", orig_query)
        # find api record
        record = self.col.find_one({'qhash': qhash, 'system': system,
                                    'api.name': api, 'api.params': args})
        apidict = dict(name=api, params=args)
        if record:
            self.col.update({'_id': record['_id']}, {'$inc': {'counter': 1}})
        else:
            record = dict(system=system, api=apidict, qhash=qhash, counter=1)
            self.col.insert(record)
        index = [('system', DESCENDING), ('dasquery', DESCENDING),
                 ('api.name', DESCENDING), ('qhash', DESCENDING)]
        create_indexes(self.col, index)

    def insert_apicall(self, system, query, url, api, api_params, expire):
        """
        Remove obsolete apicall records and insert provided information
        about an API call into the Analytics DB. Moved from AbstractService.

        Updated so that we do not have multiple records when performing
        forced updates (i.e. the old record is not yet expired) - now look
        for an existing record with the same parameters (I'm hoping the
        fact that some of the variables are indexed will make this fast
        even though not all are), and if it exists just update the expiry.
        Otherwise insert a new record.
        """
        msg = 'query=%s, url=%s,' % (query, url)
        msg += 'api=%s, args=%s, expire=%s' % (api, api_params, expire)
        self.logger.debug(msg)
        expire = expire_timestamp(expire)
        query = encode_mongo_query(query)
        qhash = genkey(query)
        self.remove_expired()
        existing = self.col.find_one({'apicall.system': system,
                                      'apicall.url': url,
                                      'apicall.api': api,
                                      'apicall.api_params': api_params,
                                      'apicall.qhash': qhash})
        if existing:
            self.logger.debug("updating")
            self.col.update({'_id': existing['_id']},
                            {'$set': {'apicall.expire': expire}})
        else:
            self.col.insert({'apicall': {'api_params': api_params,
                                         'url': url,
                                         'api': api,
                                         'system': system,
                                         'expire': expire,
                                         'qhash': qhash}})
        index_list = [('apicall.url', DESCENDING),
                      ('apicall.api', DESCENDING),
                      ('qhash', DESCENDING)]
        create_indexes(self.col, index_list)

    def update_apicall(self, query, das_dict):
        """
        Update apicall record with provided DAS dict.
        Moved from AbstractService.
        """
        msg = 'DBSAnalytics::update_apicall, query=%s, das_dict=%s'\
                % (query, das_dict)
        self.logger.debug(msg)
        spec = {'apicall.qhash': genkey(encode_mongo_query(query))}
        record = self.col.find_one(spec)
        self.col.update({'_id': ObjectId(record['_id'])},
                {'$set': {'dasapi': das_dict,
                          'apicall.expire': das_dict['response_expires']}})

    def update(self, system, query):
        """
        Update records for given system/query.
        """
        if isinstance(query, dict):
            query = encode_mongo_query(query)
        msg = 'system=%s, query=%s' % (system, query)
        self.logger.debug(msg)
        qhash = genkey(query)
        if system:
            cond = {'qhash': qhash, 'system': system}
        else:
            cond = {'qhash': qhash}
        self.col.update(cond, {'$inc': {'counter': 1}}, multi=True)

    def list_systems(self):
        """
        List all DAS systems.
        """
        cond = {'system': {'$ne': None}}
        gen = (row['system'] for row in self.col.find(cond, ['system']))
        return gen2list(gen)

    def list_queries(self, qhash=None, dhash=None, query_regex=None,
                     key=None, after=None, before=None):
        """
        List inserted queries based on many criteria.
        """
        cond = {'mongoquery': {'$exists': True}}
        if qhash:
            cond['qhash'] = qhash
        if dhash:
            cond['dhash'] = dhash
        if query_regex:
            cond['dasquery'] = {'$regex': query_regex}
        if key:
            cond['mongoquery.spec.key'] = key
        # in this case we need a specific element to be within the range,
        # so we need to use elemMatch
        if before and after:
            cond['times'] = {'$gt': after, '$lt': before}
        # in these cases we only need to match any element
        elif after:
            cond['times'] = {'$gt': after}
        elif before:
            cond['times'] = {'$lt': before}
        return self.col.find(cond)

    def get_popular_queries(self, spec):
        """
        Get popular queries based on provided spec, which can be in a form
        of time stamp range, etc.
        """
        cond = {'counter': {'$exists': True}}
        for row in self.col.find(fields=['qhash'], spec=cond).\
                sort('counter', DESCENDING):
            spec = {'qhash': row['qhash'], 'counter': {'$exists': False}}
            for res in self.col.find(spec=spec):
                yield res

    def list_apis(self, system=None):
        """
        List all APIs.
        """
        cond = {'api.name': {'$ne': None}}
        if system:
            cond['system'] = system
        gen = (row['api']['name'] for row in \
                self.col.find(cond, ['api.name']))
        return gen2list(gen)

    def list_apicalls(self, qhash=None, api=None, url=None):
        "Replace ad-hoc calls in AbstractService"
        cond = {}
        if qhash:
            cond['apicall.qhash'] = qhash
        if api:
            cond['apicall.api'] = api
        if url:
            cond['apicall.url'] = url
        return list(self.col.find(cond))

    def api_params(self, api):
        """
        Retrieve API parameters from analytics DB
        """
        cond = {'api.name': api}
        gen = (row['api']['params'] for row in \
                self.col.find(cond, ['api.params']))
        return gen2list(gen)

    def api_counter(self, api, args=None):
        """
        Retrieve API counter from analytics DB. The user must supply an API
        name and an optional dict of parameters.
        """
        cond = {'api.name': api}
        if args:
            for key, val in args.iteritems():
                cond[key] = val
        return self.col.find_one(cond, ['counter'])['counter']
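# ---------------------------------------------------------------------------
# Sketch of the two-step $pull/$push pattern used by add_query() and
# clean_queries() above: MongoDB does not allow modifying the same field
# twice in a single update, hence two separate update() calls. The helper
# name is hypothetical; `col` is a pymongo collection holding the analytics
# records described above.
import time

def touch_query_record(col, qhash, dhash, history):
    "Record one more call-time, trimming entries older than `history` seconds"
    now = time.time()
    spec = {'qhash': qhash, 'dhash': dhash}
    # step 1: drop expired call-times
    col.update(spec, {'$pull': {'times': {'$lt': now - history}}})
    # step 2: append the new call-time
    col.update(spec, {'$push': {'times': now}})
# ---------------------------------------------------------------------------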
class DASMapping(object):
    """
    This class manages DAS mapping DB.
    """
    def __init__(self, config):
        self.verbose = config["verbose"]
        self.logger = PrintManager("DASMapping", self.verbose)
        self.services = config["services"]
        self.dburi = config["mongodb"]["dburi"]
        self.dbname = config["mappingdb"]["dbname"]
        self.colname = config["mappingdb"]["collname"]
        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)
        self.create_db()

        self.keymap = {}               # to be filled at run time
        self.presentationcache = {}    # to be filled at run time
        self.reverse_presentation = {} # to be filled at run time
        self.notationcache = {}        # to be filled at run time
        self.diffkeycache = {}         # to be filled at run time
        self.apicache = {}             # to be filled at run time
        self.apiinfocache = {}         # to be filled at run time

        self.init_notationcache()
        self.init_presentationcache()

    # ===============
    # Management APIs
    # ===============
    def init_notationcache(self):
        """
        Initialize notation cache by reading notations.
        """
        for system, notations in self.notations().iteritems():
            for row in notations:
                key = system, row["notation"]
                if self.notationcache.has_key(key):
                    self.notationcache[key] += [(row["api"], row["map"])]
                else:
                    self.notationcache[key] = [(row["api"], row["map"])]

    def init_presentationcache(self):
        """
        Initialize presentation cache by reading presentation map.
        """
        query = {"presentation": {"$ne": None}}
        data = self.col.find_one(query)
        if data:
            self.presentationcache = data["presentation"]
            for daskey, uilist in self.presentationcache.iteritems():
                for row in uilist:
                    link = None
                    if row.has_key("link"):
                        link = row["link"]
                    if row.has_key("diff"):
                        self.diffkeycache[daskey] = row["diff"]
                    tdict = {daskey: {"mapkey": row["das"], "link": link}}
                    if self.reverse_presentation.has_key(row["ui"]):
                        self.reverse_presentation[row["ui"]].update(tdict)
                    else:
                        self.reverse_presentation[row["ui"]] = \
                                {daskey: {"mapkey": row["das"], "link": link}}

    def create_db(self):
        """
        Establish connection to MongoDB back-end and create DB.
        """
        self.conn = db_connection(self.dburi)
        self.db = self.conn[self.dbname]
        self.col = self.db[self.colname]

    def delete_db(self):
        """
        Delete mapping DB in MongoDB back-end.
        """
        self.conn.drop_database(self.dbname)

    def delete_db_collection(self):
        """
        Delete mapping DB collection in MongoDB.
        """
        self.db.drop_collection(self.colname)

    def check_maps(self):
        """
        Check if there are records in Mapping DB.
        """
        return self.col.count()

    def remove(self, spec):
        """
        Remove record in DAS Mapping DB for provided Mongo spec.
        """
        self.col.remove(spec)

    def add(self, record):
        """
        Add new record into mapping DB. Example of URI record:

        .. doctest::

            {system: dbs,
             urn: listBlocks,
             url: "http://a.b.com/api",
             params: [{"apiversion": 1_2_2, test: "*"}],
             daskeys: [{"key": "block", "map": "block.name", "pattern": ""}],
             das2api: [{"das_key": "site", "api_param": "se",
                        "pattern": "re.compile('^T[0-3]_')"}]
            }

        Example of notation record:

        .. doctest::

            notations: [
                {"notation": "storage_element_name", "map": "site", "api": ""},
            ]
        """
        msg = "record=%s" % record
        self.logger.debug(msg)
        self.col.insert(record)
        index = None
        if record.has_key("urn"):
            index = [("system", DESCENDING),
                     ("daskeys", DESCENDING),
                     ("urn", DESCENDING)]
        elif record.has_key("notations"):
            index = [("system", DESCENDING),
                     ("notations.api_param", DESCENDING)]
        elif record.has_key("presentation"):
            index = []
        else:
            msg = "Invalid record %s" % record
            raise Exception(msg)
        if index:
            create_indexes(self.col, index)

    # ==================
    # Informational APIs
    # ==================
    def list_systems(self):
        """
        List all DAS systems.
        """
        cond = {"system": {"$ne": None}}
        gen = (row["system"] for row in self.col.find(cond, ["system"]))
        return list(set(gen2list(gen)) & set(self.services))

    def list_apis(self, system=None):
        """
        List all APIs.
        """
        if self.apicache and self.apicache.has_key(system):
            return self.apicache[system]
        cond = {"urn": {"$ne": None}}
        if system:
            cond["system"] = system
        gen = (row["urn"] for row in self.col.find(cond, ["urn"]))
        self.apicache[system] = gen2list(gen)
        return self.apicache[system]

    def api_info(self, api_name):
        """
        Return full API info record.
        """
        return self.apiinfocache.get(api_name,
                                     self.col.find_one({"urn": api_name}))

    def relational_keys(self, system1, system2):
        """
        Return a list of relational keys between provided systems.
        """
        for system, keys in self.daskeys().iteritems():
            if system == system1:
                keys1 = keys
            if system == system2:
                keys2 = keys
        return list(set(keys1) & set(keys2))

    def daskeys(self, das_system=None):
        """
        Return a dict with all known DAS keys.
        """
        cond = {"system": {"$ne": None}}
        if das_system:
            cond = {"system": das_system}
        gen = (row["system"] for row in self.col.find(cond, ["system"]))
        kdict = {}
        for system in gen:
            query = {"system": system, "urn": {"$ne": None}}
            keys = []
            for row in self.col.find(query):
                for entry in row["daskeys"]:
                    if entry["key"] not in keys:
                        keys.append(entry["key"])
            kdict[system] = keys
        return kdict

    # ============
    # Look-up APIs
    # ============
    def primary_key(self, das_system, urn):
        """
        Return DAS primary key for provided system and urn.
        """
        cond = {"system": das_system, "urn": urn}
        daskeys = self.col.find(cond, ["daskeys.key"])
        for row in daskeys:
            if row and row.has_key("daskeys"):
                for dkey in row["daskeys"]:
                    if dkey.has_key("key"):
                        return dkey["key"]

    def primary_mapkey(self, das_system, urn):
        """
        Return DAS primary map key for provided system and urn.
        """
        cond = {"system": das_system, "urn": urn}
        mapkeys = self.col.find(cond, ["daskeys.map"])
        for row in mapkeys:
            if row and row.has_key("daskeys"):
                for mkey in row["daskeys"]:
                    if mkey.has_key("map"):
                        return mkey["map"]

    def find_daskey(self, das_system, map_key, value=None):
        """
        Find das key for given system and map key.
        """
        msg = "system=%s\n" % das_system
        cond = {"system": das_system, "daskeys.map": map_key}
        daskeys = []
        for row in self.col.find(cond, ["daskeys"]):
            if row and row.has_key("daskeys"):
                for dkey in row["daskeys"]:
                    if dkey.has_key("key"):
                        if value:
                            pval = dkey.get("pattern", "")
                            if pval:
                                pat = re.compile(pval)
                                if pat.match(str(value)):
                                    daskeys.append(dkey["key"])
                                else:
                                    msg += "-- reject key=%s, val=%s, pat=%s\n" \
                                            % (map_key, value, pval)
                                    self.logger.debug(msg)
                            else:
                                daskeys.append(dkey["key"])
                        else:
                            daskeys.append(dkey["key"])
        return daskeys

    def find_mapkey(self, das_system, das_key, value=None):
        """
        Find map key for given system and das key.
        """
        msg = "system=%s\n" % das_system
        cond = {"system": das_system, "daskeys.key": das_key}
        for row in self.col.find(cond, ["daskeys", "urn"]):
            if row and row.has_key("daskeys"):
                for key in row["daskeys"]:
                    if key.has_key("map") and key["key"] == das_key:
                        if value:
                            pval = key.get("pattern", "")
                            pat = re.compile(pval)
                            if pat.match(str(value)):
                                return key["map"]
                            else:
                                msg += "-- reject key=%s, val=%s, pat=%s\n" \
                                        % (das_key, value, key["pattern"])
                                self.logger.debug(msg)
                                continue
                        else:
                            return key["map"]

    def mapkeys(self, daskey):
        """
        Find primary key for a given daskey.
        """
        if self.keymap.has_key(daskey):
            return self.keymap[daskey]
        spec = {"daskeys.key": daskey}
        mapkeys = []
        for row in self.col.find(spec, ["daskeys"]):
            for kmap in row["daskeys"]:
                if kmap["key"] == daskey and kmap["map"] not in mapkeys:
                    mapkeys.append(kmap["map"])
        self.keymap[daskey] = mapkeys
        return self.keymap[daskey]

    def find_apis(self, das_system, map_key):
        """
        Find list of apis which correspond to provided system and
        das map key.
        """
        cond = {"system": das_system, "daskeys.map": map_key}
        apilist = []
        for row in self.col.find(cond, ["urn"]):
            if row.has_key("urn") and row["urn"] not in apilist:
                apilist.append(row["urn"])
        return apilist

    def check_dasmap(self, system, urn, das_map, value=None):
        """
        Check if provided system/urn/das_map is a valid combination in
        mapping db. If a value for the das_map key is provided we verify
        it against the pattern in the DB.
        """
        if not value:
            cond = {"system": system, "daskeys.map": das_map, "urn": urn}
            return self.col.find(cond).count()
        cond = {"system": system, "daskeys.map": das_map, "urn": urn}
        for row in self.col.find(cond, ["daskeys.pattern"]):
            for item in row["daskeys"]:
                pat = re.compile(item["pattern"])
                if pat.match(str(value)):
                    return True
        return False

    def find_system(self, key):
        """
        Return system name for provided DAS key.
        """
        cond = {"daskeys.key": key}
        gen = (row["system"] for row in self.col.find(cond, ["system"]))
        systems = []
        for system in gen:
            if system not in systems:
                systems.append(system)
        systems.sort()
        return systems

    def lookup_keys(self, system, daskey, api=None, value=None):
        """
        Returns lookup keys for given system and provided selection
        DAS key, e.g. block => block.name.
        """
        query = {"system": system, "daskeys.key": daskey}
        if api:
            query["urn"] = api
        lookupkeys = []
        for row in self.col.find(query):
            for kdict in row["daskeys"]:
                if kdict["key"] == daskey:
                    lkey = kdict["map"]
                else:
                    continue
                if value and kdict["pattern"]:
                    pat = re.compile(kdict["pattern"])
                    if pat.match(str(value)):
                        if lkey not in lookupkeys:
                            lookupkeys.append(lkey)
                else:
                    if lkey not in lookupkeys:
                        lookupkeys.append(lkey)
        if not lookupkeys:
            msg = "Unable to find look-up key for "
            msg += "system=%s, daskey=%s, api=%s, value=%s" \
                    % (system, daskey, api, value)
            raise Exception(msg)
        return lookupkeys

    def api2das(self, system, api_input_name):
        """
        Translates data-service API input parameter into DAS QL key,
        e.g. run_number => run.
        """
        query = {"system": system, "das2api.api_param": api_input_name}
        names = []
        for adas in self.col.find(query, ["das2api"]):
            for row in adas["das2api"]:
                try:
                    aparam = row["api_param"]
                    daskey = row["das_key"]
                    if aparam == api_input_name and daskey not in names:
                        names.append(daskey)
                except Exception, err:
                    print "ERROR: look-up api_param/das_key in", row
                    raise err
        return names
class DASMongocache(object):
    """
    DAS cache based on MongoDB.
    """
    def __init__(self, config):
        self.emptyset_expire = expire_timestamp(\
                config['das'].get('emptyset_expire', 5))
        self.dburi = config['mongodb']['dburi']
        self.cache_size = config['mongodb']['bulkupdate_size']
        self.dbname = config['dasdb']['dbname']
        self.verbose = config['verbose']
        self.logger = PrintManager('DASMongocache', self.verbose)
        self.mapping = config['dasmapping']

        self.conn = db_connection(self.dburi)
        self.mdb = self.conn[self.dbname]
        self.col = self.mdb[config['dasdb']['cachecollection']]
        self.mrcol = self.mdb[config['dasdb']['mrcollection']]
        self.merge = self.mdb[config['dasdb']['mergecollection']]
        self.gfs = db_gridfs(self.dburi)
        self.logdb = DASLogdb(config)

        self.das_internal_keys = ['das_id', 'das', 'cache_id', 'qhash']

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        self.add_manipulator()

        # ensure that we have the following indexes
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('das.system', ASCENDING), ('qhash', DESCENDING),
                      ('das.empty_record', ASCENDING)]
        create_indexes(self.col, index_list)
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('qhash', DESCENDING), ('das.empty_record', ASCENDING),
                      ('das.ts', ASCENDING)]
        create_indexes(self.merge, index_list)

    def add_manipulator(self):
        """
        Add DAS-specific MongoDB SON manipulator to perform conversion
        of inserted data into DAS cache.
        """
        das_son_manipulator = DAS_SONManipulator()
        self.mdb.add_son_manipulator(das_son_manipulator)
        msg = "DAS_SONManipulator %s" \
                % das_son_manipulator
        self.logger.debug(msg)

    def similar_queries(self, dasquery):
        """
        Check if we have query results in cache whose conditions are a
        superset of the provided query. The method only works for a single
        key whose value is a substring of the value in the input query.
        For example, if the cache contains records about T1 sites, then
        the input query T1_CH_CERN is a subset of the results stored in
        the cache.
        """
        spec = dasquery.mongo_query.get('spec', {})
        cond = {'query.spec.key': {'$in': spec.keys()},
                'qhash': dasquery.qhash}
        for row in self.col.find(cond):
            found_query = DASQuery(row['query'])
            if dasquery.qhash == found_query.qhash:
                msg = "%s similar to %s" % (dasquery, found_query)
                self.logger.info(msg)
                return found_query
        return False

    def get_superset_keys(self, key, value):
        """
        This is a special-case version of similar_keys, intended for
        analysers that want to quickly find possible superset queries
        of a simple query of the form key=value.
        """
        msg = "%s=%s" % (key, value)
        self.logger.debug(msg)
        cond = {'query.spec.key': key}
        for row in self.col.find(cond):
            mongo_query = decode_mongo_query(row['query'])
            for thiskey, thisvalue in mongo_query.iteritems():
                if thiskey == key:
                    if fnmatch.fnmatch(value, thisvalue):
                        yield thisvalue

    def get_fields(self, dasquery):
        "Prepare fields to extract from MongoDB"
        fields = dasquery.mongo_query.get('fields', None)
        if fields == ['records']:
            fields = None # look-up all records
        filters = dasquery.filters
        cond = {}
        if filters:
            new_fields = []
            for dasfilter in filters:
                if dasfilter == 'unique':
                    continue
                if dasfilter not in fields and \
                   dasfilter not in new_fields:
                    if dasfilter.find('=') == -1 and \
                       dasfilter.find('<') == -1 and \
                       dasfilter.find('>') == -1:
                        new_fields.append(dasfilter)
                    else:
                        cond = parse_filters(dasquery.mongo_query)
            if not new_fields and fields:
                new_fields = list(fields)
            return new_fields, cond
        return fields, cond

    def remove_expired(self, collection):
        """
        Remove expired records from DAS cache.
        """
        timestamp = int(time.time())
        col = self.mdb[collection]
        spec = {'das.expire': {'$lt': timestamp}}
        if self.verbose:
            nrec = col.find(spec).count()
            msg = "will remove %s records" % nrec
            msg += ", localtime=%s" % timestamp
            self.logger.debug(msg)
        self.logdb.insert(collection, {'delete': col.find(spec).count()})
        col.remove(spec)

    def find(self, dasquery):
        """
        Find provided query in DAS cache.
        """
        cond = {'qhash': dasquery.qhash, 'das.system': 'das'}
        return self.col.find_one(cond)

    def find_specs(self, dasquery, system='das'):
        """
        Check if cache has query whose specs are identical to provided
        query. Return all matches.
        """
        cond = {'qhash': dasquery.qhash}
        if system:
            cond.update({'das.system': system})
        return self.col.find(cond)

    def get_das_ids(self, dasquery):
        """
        Return list of DAS ids associated with given query
        """
        das_ids = []
        try:
            das_ids = \
                [r['_id'] for r in self.find_specs(dasquery, system='')]
        except:
            pass
        return das_ids

    def update_das_expire(self, dasquery, timestamp):
        "Update timestamp of all DAS data records for given query"
        nval = {'$set': {'das.expire': timestamp}}
        spec = {'qhash': dasquery.qhash}
        self.col.update(spec, nval, multi=True, safe=True)
        self.merge.update(spec, nval, multi=True, safe=True)

    def das_record(self, dasquery):
        "Retrieve DAS record for given query"
        return self.col.find_one({'qhash': dasquery.qhash})

    def find_records(self, das_id):
        "Return all the records matching a given das_id"
        return self.col.find({'das_id': das_id})

    def add_to_record(self, dasquery, info, system=None):
        "Add provided info to an existing DAS record"
        if system:
            self.col.update({'query': dasquery.storage_query,
                             'das.system': system},
                            {'$set': info}, upsert=True, safe=True)
        else:
            self.col.update({'query': dasquery.storage_query},
                            {'$set': info}, upsert=True, safe=True)

    def update_query_record(self, dasquery, status, header=None):
        "Update DAS record for provided query"
        if header:
            system = header['das']['system']
            spec1 = {'qhash': dasquery.qhash, 'das.system': 'das'}
            dasrecord = self.col.find_one(spec1)
            spec2 = {'qhash': dasquery.qhash, 'das.system': system}
            sysrecord = self.col.find_one(spec2)
            hexpire = header['das']['expire']
            dexpire = hexpire
            if dasrecord and dasrecord.has_key('das'):
                dexpire = dasrecord['das'].get('expire', None)
            if dexpire and hexpire > dexpire:
                expire = dexpire
            else:
                expire = hexpire
            if sysrecord:
                api = header['das']['api']
                url = header['das']['url']
                sapi = sysrecord['das'].get('api', [])
                surl = sysrecord['das'].get('url', [])
                if set(api) & set(sapi) == set(api) and \
                   set(url) & set(surl) == set(url):
                    self.col.update({'_id': ObjectId(sysrecord['_id'])},
                                    {'$set': {'das.expire': expire,
                                              'das.status': status}},
                                    safe=True)
                else:
                    self.col.update({'_id': ObjectId(sysrecord['_id'])},
                            {'$pushAll': {'das.api': header['das']['api'],
                                          'das.urn': header['das']['api'],
                                          'das.url': header['das']['url'],
                                          'das.ctime': header['das']['ctime'],
                                         },
                             '$set': {'das.expire': expire,
                                      'das.status': status}},
                            safe=True)
            if dasrecord:
                self.col.update({'_id': ObjectId(dasrecord['_id'])},
                                {'$set': {'das.expire': expire}}, safe=True)
        else:
            self.col.update({'qhash': dasquery.qhash, 'das.system': 'das'},
                            {'$set': {'das.status': status}}, safe=True)

    def incache(self, dasquery, collection='merge', system=None):
        """
        Check if we have query results in cache, otherwise return False.
        Please note, the input parameter query means a MongoDB query;
        please consult the MongoDB API for more details,
        http://api.mongodb.org/python/
        """
        self.remove_expired(collection)
        col = self.mdb[collection]
        spec = {'qhash': dasquery.qhash}
        if system:
            spec.update({'das.system': system})
        res = col.find(spec=spec).count()
        msg = "(%s, coll=%s) found %s results" % (dasquery, collection, res)
        self.logger.info(msg)
        if res:
            return True
        return False

    def nresults(self, dasquery, collection='merge'):
        """Return number of results for given query."""
        if dasquery.aggregators:
            return len(dasquery.aggregators)
        # Distinguish 2 use cases: unique filter and general query.
        # In the first we should count only unique records, in the latter
        # we can rely on the DB count() method. Please keep in mind that
        # usage of fields in find doesn't affect counting, since it is a
        # view over records found with spec, so we don't need to use it.
        col = self.mdb[collection]
        fields, filter_cond = self.get_fields(dasquery)
        if not fields:
            spec = dasquery.mongo_query.get('spec', {})
        else:
            spec = {'qhash': dasquery.qhash, 'das.empty_record': 0}
        if filter_cond:
            spec.update(filter_cond)
        if dasquery.unique_filter:
            skeys = self.mongo_sort_keys(collection, dasquery)
            if skeys:
                gen = col.find(spec=spec).sort(skeys)
            else:
                gen = col.find(spec=spec)
            res = len([r for r in unique_filter(gen)])
        else:
            res = col.find(spec=spec).count()
        msg = "%s" % res
        self.logger.info(msg)
        return res

    def mongo_sort_keys(self, collection, dasquery):
        """
        Find list of sort keys for a given DAS query. Check existing
        indexes and either use fields or spec keys to find them out.
        Return list of mongo sort keys in a form of (key, order).
        """
        # try to get sort keys all the time to get ordered list of
        # docs which allow unique_filter to apply afterwards
        fields = dasquery.mongo_query.get('fields')
        spec = dasquery.mongo_query.get('spec')
        skeys = dasquery.sortkeys
        mongo_skeys = []
        if skeys:
            for key in skeys:
                if key.find('-') != -1: # reverse order, e.g. desc
                    mongo_skeys.append((key.replace('-', ''), DESCENDING))
                else:
                    mongo_skeys.append((key, ASCENDING))
        else:
            existing_idx = [i for i in self.existing_indexes(collection)]
            if fields:
                lkeys = []
                for key in fields:
                    for mkey in self.mapping.mapkeys(key):
                        if mkey not in lkeys:
                            lkeys.append(mkey)
            else:
                lkeys = spec.keys()
            keys = [k for k in lkeys \
                    if k.find('das') == -1 and k.find('_id') == -1 and \
                       k in existing_idx]
            mongo_skeys = [(k, ASCENDING) for k in keys]
        return mongo_skeys

    def existing_indexes(self, collection='merge'):
        """
        Get list of existing indexes in DB. They are returned by the
        index_information API in the following form:

        .. doctest::

            {u'_id_': {u'key': [(u'_id', 1)], u'v': 0},
             u'das.expire_1': {u'key': [(u'das.expire', 1)], u'v': 0},
             ...
             u'tier.name_-1': {u'key': [(u'tier.name', -1)], u'v': 0}}
        """
        col = self.mdb[collection]
        for val in col.index_information().values():
            for idx in val['key']:
                yield idx[0] # index name

    def get_records(self, col, spec, fields, skeys, idx, limit, unique=False):
        """
        Generator to get records from MongoDB. It correctly applies
        skip/limit even when the unique filter is requested.
        """
        if fields:
            for key in fields: # ensure that fields keys will be present
                if key not in self.das_internal_keys and \
                   not spec.has_key(key):
                    spec.update({key: {'$exists': True}})
        try:
            res = col.find(spec=spec, fields=fields)
            if skeys:
                res = res.sort(skeys)
            if not unique:
                if idx:
                    res = res.skip(idx)
                if limit:
                    res = res.limit(limit)
        except Exception as exp:
            print_exc(exp)
            row = {'exception': str(exp)}
            res = []
            yield row
        if unique:
            if limit:
                gen = itertools.islice(unique_filter(res), idx, idx + limit)
            else:
                gen = unique_filter(res)
            for row in gen:
                yield row
        else:
            for row in res:
                yield row

    def get_from_cache(self, dasquery, idx=0, limit=0, collection='merge'):
        "Generator which retrieves results from the cache"
        if dasquery.service_apis_map(): # valid DAS query
            result = self.get_das_records(dasquery, idx, limit, collection)
        else: # pure MongoDB query
            coll = self.mdb[collection]
            fields = dasquery.mongo_query.get('fields', None)
            spec = dasquery.mongo_query.get('spec', {})
            if dasquery.filters:
                if fields == None:
                    fields = dasquery.filters
                else:
                    fields += dasquery.filters
            skeys = self.mongo_sort_keys(collection, dasquery)
            result = self.get_records(coll, spec, fields, skeys, \
                    idx, limit, dasquery.unique_filter)
        for row in result:
            yield row

    def get_das_records(self, dasquery, idx=0, limit=0, collection='merge'):
        "Generator which retrieves DAS records from the cache"
        col = self.mdb[collection]
        msg = "(%s, %s, %s, coll=%s)" % (dasquery, idx, limit, collection)
        self.logger.info(msg)

        idx = int(idx)
        fields, filter_cond = self.get_fields(dasquery)
        if not fields:
            spec = dasquery.mongo_query.get('spec', {})
        else:
            spec = {'qhash': dasquery.qhash, 'das.empty_record': 0}
        if filter_cond:
            spec.update(filter_cond)
        if fields: # be sure to extract das internal keys
            fields += self.das_internal_keys
        # try to get sort keys all the time to get ordered list of
        # docs which allow unique_filter to apply afterwards
        skeys = self.mongo_sort_keys(collection, dasquery)
        res = self.get_records(col, spec, fields, skeys, \
                idx, limit, dasquery.unique_filter)
        counter = 0
        for row in res:
            counter += 1
            yield row
        if counter:
            msg = "yield %s record(s)" % counter
            self.logger.info(msg)
        # if no raw records were yielded we look-up possible error records
        if not counter:
            nrec = self.col.find({'qhash': dasquery.qhash}).count()
            if nrec:
                msg = "for query %s, found %s non-result record(s)" \
                        % (dasquery, nrec)
                prf = 'DAS WARNING, mongocache:get_from_cache '
                print dastimestamp(prf), msg

    def map_reduce(self, mr_input, dasquery, collection='merge'):
        """
        Wrapper around _map_reduce to allow sequential map/reduce
        operations, e.g. map/reduce out of map/reduce.

        mr_input is either an alias name or a list of alias names for
        map/reduce functions.

        The input dasquery is applied to the first iteration of the
        map/reduce functions.
        """
        # NOTE: I need to revisit mapreduce.
        spec = dasquery.mongo_query['spec']
        if not isinstance(mr_input, list):
            mrlist = [mr_input]
        else:
            mrlist = mr_input
        coll = self.mdb[collection]
        for mapreduce in mrlist:
            if mapreduce == mrlist[0]:
                cond = spec
            else:
                cond = None
            coll = self._map_reduce(coll, mapreduce, cond)
        for row in coll.find():
            yield row

    def _map_reduce(self, coll, mapreduce, spec=None):
        """
        Perform map/reduce operation over DAS cache using provided
        collection, mapreduce name and optional conditions.
        """
        self.logger.debug("(%s, %s)" % (mapreduce, spec))
        record = self.mrcol.find_one({'name': mapreduce})
        if not record:
            raise Exception("Map/reduce function '%s' not found" % mapreduce)
        fmap = record['map']
        freduce = record['reduce']
        if spec:
            result = coll.map_reduce(Code(fmap), Code(freduce), query=spec)
        else:
            result = coll.map_reduce(Code(fmap), Code(freduce))
        msg = "found %s records in %s" % (result.count(), result.name)
        self.logger.info(msg)
        self.logger.debug(fmap)
        self.logger.debug(freduce)
        return result

    def get_map_reduce(self, name=None):
        """
        Return definition of map/reduce functions for provided name
        or the full list otherwise.
        """
        spec = {}
        if name:
            spec = {'name': name}
        result = self.mrcol.find(spec)
        for row in result:
            yield row

    def merge_records(self, dasquery):
        """
        Merge DAS records for provided query. We perform the following
        steps:

        1. get all queries from das.cache by ordering them by primary key
        2. run aggregator function to merge neighbors
        3. insert records into das.merge
        """
        self.logger.debug(dasquery)
        id_list = []
        expire = 9999999999 # future
        # get all API records for given DAS query
        spec = {'qhash': dasquery.qhash, 'query': {'$exists': True}}
        records = self.col.find(spec)
        for row in records:
            # find smallest expire timestamp to be used by aggregator
            if row['das']['expire'] < expire:
                expire = row['das']['expire']
            if row['_id'] not in id_list:
                id_list.append(row['_id'])
        inserted = 0
        lookup_keys = set()
        fields = dasquery.mongo_query.get('fields')
        if not fields: # Mongo query
            fields = []
        for key in fields:
            for pkey in self.mapping.mapkeys(key):
                lookup_keys.add(pkey)
        for pkey in lookup_keys:
            skey = [(pkey, DESCENDING)]
            # lookup all service records
            spec = {'das_id': {'$in': id_list}, 'das.primary_key': pkey}
            if self.verbose:
                nrec = self.col.find(spec).sort(skey).count()
                msg = "merging %s records, for %s key" % (nrec, pkey)
            else:
                msg = "merging records, for %s key" % pkey
            self.logger.debug(msg)
            records = self.col.find(spec).sort(skey)
            # aggregate all records
            agen = aggregator(dasquery, records, expire)
            # diff aggregated records
            gen = das_diff(agen, self.mapping.diff_keys(pkey.split('.')[0]))
            # insert all records into das.merge using bulk insert
            size = self.cache_size
            try:
                while True:
                    nres = self.merge.insert(\
                            itertools.islice(gen, size), safe=True)
                    if nres and isinstance(nres, list):
                        inserted += len(nres)
                    else:
                        break
            except InvalidDocument as exp:
                msg = "Caught bson error: " + str(exp)
                self.logger.info(msg)
                records = self.col.find(spec).sort(skey)
                gen = aggregator(dasquery, records, expire)
                genrows = parse2gridfs(self.gfs, pkey, gen, self.logger)
                das_dict = {'das': {'expire': expire, 'empty_record': 0,
                                    'primary_key': [k for k in lookup_keys],
                                    'system': ['gridfs']},
                            'qhash': dasquery.qhash,
                            'cache_id': [], 'das_id': id_list}
                for row in genrows:
                    row.update(das_dict)
                    self.merge.insert(row, safe=True)
            except InvalidOperation:
                pass
        if inserted:
            self.logdb.insert('merge', {'insert': inserted})
        elif not lookup_keys: # we get query w/o fields
            pass
        else: # we didn't merge anything, it is a DB look-up failure
            empty_expire = time.time() + 20 # secs, short enough to expire
            empty_record = {'das': {'expire': empty_expire,
                                    'primary_key': list(lookup_keys),
                                    'empty_record': 1},
                            'cache_id': [], 'das_id': id_list}
            for key, val in dasquery.mongo_query['spec'].iteritems():
                if key.find('.') == -1:
                    empty_record[key] = []
                else: # it is a compound key, e.g. site.name
                    newkey, newval = convert_dot_notation(key, val)
                    empty_record[newkey] = adjust_mongo_keyvalue(newval)
            self.merge.insert(empty_record, safe=True)
            # update DAS records (both meta and data ones, by using qhash)
            nval = {'$set': {'das.expire': empty_expire}}
            spec = {'qhash': dasquery.qhash}
            self.col.update(spec, nval, multi=True, safe=True)

    def update_cache(self, dasquery, results, header):
        """
        Insert results into cache. Use bulk insert controlled by
        self.cache_size. Upon completion ensure indexes.
        """
        # insert/check query record in DAS cache
        self.insert_query_record(dasquery, header)

        # update results records in DAS cache
        gen = self.generate_records(dasquery, results, header)
        inserted = 0
        # bulk insert
        try:
            while True:
                nres = self.col.insert(\
                        itertools.islice(gen, self.cache_size), safe=True)
                if nres and isinstance(nres, list):
                    inserted += len(nres)
                else:
                    break
        except InvalidOperation:
            pass
        if inserted:
            self.logdb.insert('cache', {'insert': inserted})

    def insert_query_record(self, dasquery, header):
        """
        Insert query record into DAS cache.
        """
        dasheader = header['das']
        # check presence of API record in a cache
        system = dasheader['system']
        if not self.incache(dasquery, collection='cache', system=system):
            msg = "query=%s, header=%s" % (dasquery, header)
            self.logger.debug(msg)
            q_record = dict(das=dasheader, query=dasquery.storage_query)
            q_record['das']['empty_record'] = 0
            q_record['das']['status'] = "requested"
            q_record['qhash'] = dasquery.qhash
            self.col.insert(q_record, safe=True)

    def generate_records(self, dasquery, results, header):
        """
        Iterate over provided results, update records and yield them
        to the next level (update_cache).
        """
        self.logger.debug("(%s) store to cache" % dasquery)
        if not results:
            return
        # update das record with new status
        status = 'Update DAS cache, %s API' % header['das']['api'][0]
        self.update_query_record(dasquery, status, header)

        dasheader = header['das']
        expire = dasheader['expire']
        system = dasheader['system']
        rec = [k for i in header['lookup_keys'] for k in i.values()]
        cond_keys = dasquery.mongo_query['spec'].keys()
        # get API record id
        spec = {'qhash': dasquery.qhash, 'das.system': system}
        record = self.col.find_one(spec, fields=['_id'])
        counter = 0
        prim_key = rec[0][0] # use rec instead of lkeys[0] which re-orders items
        if record:
            objid = record['_id']
            if isinstance(results, list) or isinstance(results, GeneratorType):
                for item in results:
                    counter += 1
                    item['das'] = dict(expire=expire, primary_key=prim_key,
                                       condition_keys=cond_keys,
                                       instance=dasquery.instance,
                                       system=system, empty_record=0)
                    item['das_id'] = str(objid)
                    item['qhash'] = dasquery.qhash
                    yield item
            else:
                print "\n\n ### results = ", str(results)
                raise Exception('Provided results is not a list/generator type')
        self.logger.info("\n")
        msg = "%s yield %s rows" % (dasheader['system'], counter)
        self.logger.info(msg)

    def remove_from_cache(self, dasquery):
        """
        Remove query from DAS cache. To do so, we retrieve the API record
        and remove all data records from das.cache and das.merge.
        """
        records = self.col.find({'qhash': dasquery.qhash})
        id_list = []
        for row in records:
            if row['_id'] not in id_list:
                id_list.append(row['_id'])
        spec = {'das_id': {'$in': id_list}}
        self.logdb.insert('merge', {'delete': self.merge.find(spec).count()})
        self.merge.remove(spec)
        self.logdb.insert('cache', {'delete': self.col.find(spec).count()})
        self.col.remove(spec)
        self.col.remove({'qhash': dasquery.qhash})

    def clean_cache(self):
        """
        Clean expired docs in das.cache and das.merge.
        """
        current_time = time.time()
        query = {'das.expire': {'$lt': current_time}}
        self.logdb.insert('merge', {'delete': self.merge.find(query).count()})
        self.merge.remove(query)
        self.logdb.insert('cache', {'delete': self.col.find(query).count()})
        self.col.remove(query)

    def delete_cache(self):
        """
        Delete all results in DAS cache/merge collection, including
        internal indexes.
        """
        self.logdb.insert('cache', {'delete': self.col.count()})
        self.col.remove({})
        try:
            self.col.drop_indexes()
        except:
            pass
        self.logdb.insert('merge', {'delete': self.merge.count()})
        self.merge.remove({})
        try:
            self.merge.drop_indexes()
        except:
            pass
class DASParserDB(object):
    """
    Caching layer for the PLY parser.
    """
    def __init__(self, config):
        self.verbose = config['verbose']
        self.logger = PrintManager('DASParserDB', self.verbose)
        self.dburi = config['mongodb']['dburi']
        self.dbname = config['parserdb']['dbname']
        self.sizecap = config['parserdb'].get('sizecap', 5*1024*1024)
        self.colname = config['parserdb']['collname']
        msg = "DASParserCache::__init__ %s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)
        self.col = None
        self.create_db()

    def create_db(self):
        """
        Create db collection
        """
        conn = db_connection(self.dburi)
        dbn = conn[self.dbname]
        if self.colname not in dbn.collection_names():
            dbn.create_collection(self.colname, capped=True, size=self.sizecap)
        self.col = dbn[self.colname]

    def lookup_query(self, rawtext):
        """
        Check the parser cache for a given rawtext query.
        Search is done with the hash of this string.

        Returns a tuple (status, value) for the cases:

        (PARSERCACHE_VALID, mongo_query) - valid query found
        (PARSERCACHE_INVALID, error) - error message for invalid query
        (PARSERCACHE_NOTFOUND, None) - not in the cache
        """
        result = self.col.find_one({'hash': genkey(rawtext)},
                                   fields=['query', 'error'])
        if result and result['query']:
            if self.verbose:
                self.logger.debug("DASParserCache: found valid %s->%s" %\
                        (rawtext, result['query']))
            query = decode_mongo_query(result['query'])
            return (PARSERCACHE_VALID, query)
        elif result and result['error']:
            if self.verbose:
                self.logger.debug("DASParserCache: found invalid %s->%s" %\
                        (rawtext, result['error']))
            return (PARSERCACHE_INVALID, result['error'])
        else:
            if self.verbose:
                self.logger.debug("DASParserCache: not found %s" %\
                        (rawtext))
            return (PARSERCACHE_NOTFOUND, None)

    def insert_valid_query(self, rawtext, query):
        "Insert a query that was successfully transformed"
        self._insert_query(rawtext, query, None)

    def insert_invalid_query(self, rawtext, error):
        "Insert the error message for an invalid query"
        self._insert_query(rawtext, None, error)

    def _insert_query(self, rawtext, query, error):
        """Internal method to insert a query"""
        if self.verbose:
            self.logger.debug("DASParserCache: insert %s->%s/%s" %\
                    (rawtext, query, error))
        # since MongoDB does not support insertion of the $ sign in queries
        # we need to encode the inserted query
        if query:
            encquery = encode_mongo_query(query)
        else:
            encquery = ""
        self.col.insert({'raw': rawtext, 'hash': genkey(rawtext),
                         'query': encquery, 'error': str(error)})
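# ---------------------------------------------------------------------------
# Sketch of how callers are expected to branch on the (status, value) tuple
# returned by lookup_query() above. The wrapper name and the `parse`
# callable are hypothetical; the status constants and insert_* methods are
# the ones defined in this module.
def parse_with_cache(parserdb, parse, rawtext):
    "Consult the parser cache before invoking the (expensive) PLY parser"
    status, value = parserdb.lookup_query(rawtext)
    if status == PARSERCACHE_VALID:
        return value # cached mongo query
    if status == PARSERCACHE_INVALID:
        raise Exception(value) # cached error message
    try:
        query = parse(rawtext) # not cached yet, parse now
        parserdb.insert_valid_query(rawtext, query)
        return query
    except Exception as exc:
        parserdb.insert_invalid_query(rawtext, str(exc))
        raise
# ---------------------------------------------------------------------------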
class DASMongocache(object): """ DAS cache based on MongoDB. """ def __init__(self, config): self.config = config self.emptyset_expire = \ expire_timestamp(config['das'].get('emptyset_expire', 5)) self.dburi = config['mongodb']['dburi'] self.cache_size = config['mongodb']['bulkupdate_size'] self.dbname = config['dasdb']['dbname'] self.verbose = config['verbose'] self.logger = PrintManager('DASMongocache', self.verbose) self.mapping = config['dasmapping'] self.logging = config['dasdb'].get('logging', False) self.rec_ttl = config['dasdb'].get('record_ttl', 24*60*60) self.del_ttl = config['dasdb'].get('delta_ttl', 60) self.cleanup_del_ttl = config['dasdb'].get('cleanup_delta_ttl', 3600) self.retry = config['dasdb'].get('retry', 3) self.das_son_manipulator = DAS_SONManipulator() # Initialize MongoDB connection self.col_ = self.config['dasdb']['cachecollection'] self.mrcol_ = self.config['dasdb']['mrcollection'] self.merge_ = self.config['dasdb']['mergecollection'] self.gfs = db_gridfs(self.dburi) msg = "%s@%s" % (self.dburi, self.dbname) self.logger.info(msg) # ensure that we have the following indexes common_idx = [ ('file.name', DESCENDING), ('dataset.name', DESCENDING), ('block.name', DESCENDING), ('run.run_number', DESCENDING), ] index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING), ('das.system', ASCENDING), ('qhash', DESCENDING), ('das.record', ASCENDING)] create_indexes(self.col, index_list + common_idx) index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING), ('qhash', DESCENDING), ('das.record', ASCENDING), ('das.ts', ASCENDING)] create_indexes(self.merge, index_list) # NOTE: I found that creating an index in the merge collection leads to # a MongoDB error when records contain multiple arrays on indexed # keys. For example, when we query file,run,lumi both file and run # are arrays in MongoDB. In this case the final sort in MongoDB # barks with the following message: # cannot sort with keys that are parallel arrays # it looks like there is no fix for that yet # see # http://stackoverflow.com/questions/6516725/how-do-i-index-two-arrays-in-mongodb # therefore I temporarily disabled the create_indexes call on the merge # collection which was used to have an index to ease the final sort, # especially in a case when a lot of records correspond to the initial # query, e.g. file records. # On the other hand, the most common use case where sort fails is # getting file records, and I can add one compound key to ease sort # but I can't add another compound key on an array field, e.g.
run common_idx = [[('qhash', DESCENDING), ('file.name', DESCENDING)]] create_indexes(self.merge, index_list + common_idx) # thread which cleans up DAS collections thname = 'mongocache_cleanup' cols = [config['dasdb']['cachecollection'], config['dasdb']['mrcollection'], config['dasdb']['mergecollection']] @property def col(self): "col property provides access to DAS cache collection" conn = db_connection(self.dburi) mdb = conn[self.dbname] colnames = mdb.collection_names() if not colnames or self.col_ not in colnames: try: mdb.create_collection(self.col_) except OperationFailure: pass mdb.add_son_manipulator(self.das_son_manipulator) return mdb[self.col_] @property def merge(self): "merge property provides access to DAS merge collection" conn = db_connection(self.dburi) mdb = conn[self.dbname] colnames = mdb.collection_names() if not colnames or self.merge_ not in colnames: try: mdb.create_collection(self.merge_) except OperationFailure: pass mdb.add_son_manipulator(self.das_son_manipulator) return mdb[self.merge_] @property def mrcol(self): "mrcol property provides access to DAS map-reduce collection" conn = db_connection(self.dburi) mdb = conn[self.dbname] mdb.add_son_manipulator(self.das_son_manipulator) return mdb[self.mrcol_] def get_dataset_hashes(self, dasquery): "Get dataset hashes from DBS database" spec = dasquery.mongo_query.get('spec', {}) inst = dasquery.instance conn = db_connection(self.dburi) if spec and inst: dataset = spec.get('dataset.name', None) if dataset: if dataset.find('*') != -1: cond = {'dataset':re.compile(dataset.replace('*', '.*'))} else: cond = {'dataset': dataset} for row in conn['dbs'][inst].find(cond): if 'qhash' in row: yield row['qhash'] def check_datasets(self, dasquery): "Check dataset presence in DAS cache for given das query" hashes = [r for r in self.get_dataset_hashes(dasquery)] if hashes: spec = {'qhash': {'$in': hashes}} if len(hashes) == self.merge.find(spec, **PYMONGO_OPTS).count(): dasquery._hashes = hashes def get_superset_keys(self, key, value): """ This is a special-case version of similar_keys, intended for analysers that want to quickly find possible superset queries of a simple query of the form key=value. """ msg = "%s=%s" % (key, value) self.logger.debug(msg) cond = {'query.spec.key': key} for row in self.col.find(cond, **PYMONGO_OPTS): mongo_query = decode_mongo_query(row['query']) for thiskey, thisvalue in mongo_query.items(): if thiskey == key: if fnmatch.fnmatch(value, thisvalue): yield thisvalue def get_fields(self, dasquery): "Prepare fields to extract from MongoDB" fields = dasquery.mongo_query.get('fields', []) if fields and 'records' in fields: fields = None # look-up all records filters = dasquery.filters cond = {} if filters: new_fields = [] for dasfilter in filters: if dasfilter == 'unique': continue if fields and dasfilter not in fields and \ dasfilter not in new_fields: if dasfilter.find('=') == -1 and dasfilter.find('<') == -1\ and dasfilter.find('>') == -1: new_fields.append(dasfilter) else: cond = parse_filters(dasquery.mongo_query) if not new_fields and fields: new_fields = list(fields) return new_fields, cond return fields, cond def remove_expired(self, dasquery, collection): """ Remove expired records from DAS cache. We need to perform this operation very carefully since we don't use transactions and on-going commits can invoke this method (see das_core.py).
Therefore we use the MongoDB $or operator to wipe out queries which match the DASQuery hash and have already expired, or queries which lived in cache more than the rec_ttl config parameter. The latter operation simply prevents the DAS cache from growing. """ conn = db_connection(self.dburi) mdb = conn[self.dbname] mdb.add_son_manipulator(self.das_son_manipulator) col = mdb[collection] # use additional delta to check data record expiration # we add this delta to ensure that there are no records close to # current timestamp which may expire during request processing spec = {'qhash':dasquery.qhash, 'das.expire':{'$lt':time.time()+self.del_ttl}} col.delete_many(spec) def check_services(self, dasquery): """ Check if DAS cache contains DAS records with service response for given query. """ das_rec = self.find(dasquery) if not das_rec: return False if 'das' not in das_rec: return False if 'services' not in das_rec['das']: return False spec = {'qhash':dasquery.qhash, 'das.system':{'$ne':'das'}, 'das.expire':{'$gt':time.time()}} nres = self.col.find(spec, **PYMONGO_OPTS).count() if nres: return True return False def find(self, dasquery): """ Find provided query in DAS cache. """ cond = {'qhash': dasquery.qhash, 'das.system':'das', 'das.expire': {'$gt':time.time()}} return find_one(self.col, cond) def find_specs(self, dasquery, system='das'): """ Check if cache has query whose specs are identical to provided query. Return all matches. """ if dasquery.hashes: cond = {'qhash':{'$in':dasquery.hashes}} else: cond = {'qhash': dasquery.qhash} if system: cond.update({'das.system': system}) cond.update({'das.expire':{'$gt':time.time()}}) return self.col.find(cond, **PYMONGO_OPTS) def get_das_ids(self, dasquery): """ Return list of DAS ids associated with given query """ das_ids = [] try: das_ids = \ [r['_id'] for r in self.col.find_specs(dasquery, system='')] except: pass return das_ids def update_das_expire(self, dasquery, timestamp): "Update timestamp of all DAS data records for given query" nval = {'$set': {'das.expire':timestamp}} spec = {'qhash' : dasquery.qhash} self.col.update_many(spec, nval) self.merge.update_many(spec, nval) def das_record(self, dasquery): "Retrieve DAS record for given query" cond = {'qhash': dasquery.qhash, 'das.expire':{'$gt':time.time()}} return find_one(self.col, cond) def find_records(self, das_id): " Return all the records matching a given das_id" return self.col.find({'das_id': das_id}, **PYMONGO_OPTS) def is_error_in_records(self, dasquery, collection='cache'): "Scan DAS cache for error records and return true or not" if collection == 'cache': results = self.col.find({'qhash':dasquery.qhash}, **PYMONGO_OPTS) else: results = self.merge.find({'qhash':dasquery.qhash}, **PYMONGO_OPTS) error = None reason = None for row in results: if 'error' in row: error = row.get('error') reason = row.get('reason', '') break return error, reason def add_to_record(self, dasquery, info, system=None): "Add provided info to an existing DAS record" if system: self.col.update_one({'query': dasquery.storage_query, 'das.system':system}, {'$set': info}, upsert=True) else: self.col.update_one({'query': dasquery.storage_query}, {'$set': info}, upsert=True) def find_min_expire(self, dasquery): """Find minimal expire timestamp across all records for given DAS query""" spec = {'qhash': dasquery.qhash} min_expire = 2*time.time() # upper bound, will update for rec in self.col.find(spec, **PYMONGO_OPTS): if 'das' in rec and 'expire' in rec['das']: estamp = rec['das']['expire'] if min_expire > estamp: min_expire = estamp return
long(min_expire) def find_query_record(self, dasquery): "Find DAS query records and return them to the caller" spec = {'qhash':dasquery.qhash, 'das.record':record_codes('query_record')} return self.col.find(spec, **PYMONGO_OPTS) def update_query_record(self, dasquery, status, header=None, reason=None): "Update DAS record for provided query" ctime = time.time() das_spec = {'qhash': dasquery.qhash, 'das.system':'das'} min_expire = self.find_min_expire(dasquery) if header: system = header['das']['system'] sts = header['das']['status'] expire = header['das']['expire'] spec = {'qhash': dasquery.qhash, 'das.system': system} new_expire = None for rec in self.col.find(spec, **PYMONGO_OPTS): if 'das' in rec and 'expire' in rec['das']: if rec['das']['expire'] > expire: new_expire = expire ndict = {'das.expire':expire, 'das.status':status} cdict = {'das.ctime':ctime} udict = {'$set':ndict, '$push':cdict} oid = ObjectId(rec['_id']) self.col.update_one({'_id':oid}, udict) if new_expire: udict = {'$set': {'das.expire': new_expire}, '$push': {'das.ctime':ctime}} self.col.update_one(das_spec, udict) else: udict = {'$set': {'das.status':status, 'das.expire': min_expire}, '$push': {'das.ctime':ctime}} self.col.update_one(das_spec, udict) if reason: udict = {'$set': {'das.reason':reason}} self.col.update_one(das_spec, udict) # align all expire timestamps when we receive ok status if status == 'ok': udict = {'$set': {'das.expire': min_expire}} self.col.update_one(das_spec, udict) def apilist(self, dasquery): "Return list of apis for given dasquery" spec = {'qhash':dasquery.qhash, 'das.record':record_codes('query_record')} apis = [] for row in self.col.find(spec, ['das.api'], **PYMONGO_OPTS): try: apis += row['das']['api'] except Exception as _err: pass return apis def incache(self, dasquery, collection='merge', system=None, api=None, query_record=False): """ Check if we have query results in cache, otherwise return False. Please note, the input parameter query means a MongoDB query; please consult the MongoDB API for more details, http://api.mongodb.org/python/ """ if query_record: record = record_codes('query_record') else: record = spec4data_records() spec = {'qhash':dasquery.qhash, 'das.record':record, 'das.expire':{'$gt':time.time()}} if system: spec.update({'das.system': system}) if api: spec.update({'das.api': api}) conn = db_connection(self.dburi) mdb = conn[self.dbname] mdb.add_son_manipulator(self.das_son_manipulator) col = mdb[collection] res = col.find(spec, **PYMONGO_OPTS).count() msg = "(%s, coll=%s) found %s results" % (dasquery, collection, res) self.logger.info(msg) if res: return True return False def nresults(self, dasquery, collection='merge'): """Return number of results for given query.""" if dasquery.aggregators: return len(dasquery.aggregators) # Distinguish 2 use cases: unique filter and general query. # In the first one we should count only unique records; in the latter # we can rely on the DB count() method. Please keep in mind that # usage of fields in find doesn't affect counting, since it # is a view over records found with spec, so we don't need to use it.
fields, filter_cond = self.get_fields(dasquery) if not fields: spec = dasquery.mongo_query.get('spec', {}) elif dasquery.hashes: spec = {'qhash':{'$in':dasquery.hashes}, 'das.record': spec4data_records()} else: spec = {'qhash':dasquery.qhash, 'das.record': spec4data_records()} if filter_cond: spec.update(filter_cond) conn = db_connection(self.dburi) mdb = conn[self.dbname] mdb.add_son_manipulator(self.das_son_manipulator) col = mdb[collection] if dasquery.unique_filter: skeys = self.mongo_sort_keys(collection, dasquery) if skeys: gen = col.find(spec, **PYMONGO_OPTS).sort(skeys) else: gen = col.find(spec, **PYMONGO_OPTS) res = len([r for r in unique_filter(gen)]) else: res = col.find(spec, **PYMONGO_OPTS).count() if not res: # double check that this is really the case time.sleep(1) res = col.find(spec, **PYMONGO_OPTS).count() msg = "%s" % res self.logger.info(msg) return res def mongo_sort_keys(self, collection, dasquery): """ Find list of sort keys for a given DAS query. Check existing indexes and either use fields or spec keys to find them out. Return list of mongo sort keys in a form of (key, order). """ # try to get sort keys all the time to get ordered list of # docs which allow unique_filter to apply afterwards fields = dasquery.mongo_query.get('fields') spec = dasquery.mongo_query.get('spec') skeys = dasquery.sortkeys mongo_skeys = [] if skeys: for key in skeys: if key.find('-') != -1: # reverse order, e.g. desc mongo_skeys.append((key.replace('-', ''), DESCENDING)) else: mongo_skeys.append((key, ASCENDING)) else: existing_idx = [i for i in self.existing_indexes(collection)] if fields: lkeys = [] for key in fields: for mkey in self.mapping.mapkeys(key): if mkey not in lkeys: lkeys.append(mkey) else: lkeys = list(spec.keys()) keys = [k for k in lkeys \ if k.find('das') == -1 and k.find('_id') == -1 and \ k in existing_idx] mongo_skeys = [(k, ASCENDING) for k in keys] return mongo_skeys def existing_indexes(self, collection='merge'): """ Get list of existing indexes in DB. They are returned by index_information API in the following for: .. doctest:: {u'_id_': {u'key': [(u'_id', 1)], u'v': 0}, u'das.expire_1': {u'key': [(u'das.expire', 1)], u'v': 0}, ... u'tier.name_-1': {u'key': [(u'tier.name', -1)], u'v': 0}} """ conn = db_connection(self.dburi) mdb = conn[self.dbname] mdb.add_son_manipulator(self.das_son_manipulator) col = mdb[collection] for val in col.index_information().values(): for idx in val['key']: yield idx[0] # index name def get_records(self, coll, spec, fields, skeys, idx, limit, unique=False): "Generator to get records from MongoDB." 
try: conn = db_connection(self.dburi) mdb = conn[self.dbname] mdb.add_son_manipulator(self.das_son_manipulator) col = mdb[coll] nres = col.find(spec, **PYMONGO_OPTS).count() if nres == 1 or nres <= limit: limit = 0 if limit: res = col.find(spec, fields, sort=skeys, skip=idx, limit=limit) else: res = col.find(spec, fields, sort=skeys, **PYMONGO_OPTS) if unique: res = unique_filter(res) for row in res: yield row except Exception as exp: print_exc(exp) row = {'exception': str(exp)} res = [] yield row def get_from_cache(self, dasquery, idx=0, limit=0, collection='merge'): "Generator which retrieves results from the cache" if dasquery.service_apis_map(): # valid DAS query result = self.get_das_records(dasquery, idx, limit, collection) for row in result: yield row else: # pure MongoDB query fields = dasquery.mongo_query.get('fields', []) if fields == None: fields = [] spec = dasquery.mongo_query.get('spec', {}) if dasquery.filters: if not fields: fields = [] fields += dasquery.filters pkeys = [k.split('.')[0] for k in fields] fields += das_record_keys() if 'records' in dasquery.query: fields = None # special case for DAS 'records' keyword skeys = self.mongo_sort_keys(collection, dasquery) result = self.get_records(collection, spec, fields, skeys, \ idx, limit, dasquery.unique_filter) for row in result: if dasquery.filters: if pkeys and set(pkeys) & set(row.keys()): yield row else: yield row def get_das_records(self, dasquery, idx=0, limit=0, collection='merge'): "Generator which retrieves DAS records from the cache" msg = "(%s, %s, %s, coll=%s)" % (dasquery, idx, limit, collection) self.logger.info(msg) idx = int(idx) fields, filter_cond = self.get_fields(dasquery) if fields == None: fields = [] if not fields: spec = dasquery.mongo_query.get('spec', {}) elif dasquery.hashes: spec = {'qhash':{'$in':dasquery.hashes}, 'das.record': spec4data_records()} else: spec = {'qhash':dasquery.qhash, 'das.record': spec4data_records()} if filter_cond: spec.update(filter_cond) if 'records' in dasquery.query: fields = None # retrieve all fields for records DAS query else: # be sure to extract das internal keys fields += das_record_keys() # try to get sort keys all the time to get ordered list of # docs which allow unique_filter to apply afterwards skeys = self.mongo_sort_keys(collection, dasquery) res = self.get_records(collection, spec, fields, skeys, \ idx, limit, dasquery.unique_filter) counter = 0 for row in res: counter += 1 yield row msg = 'qhash %s, found %s record(s) in %s collection' \ % (dasquery.qhash, counter, collection) print(dastimestamp('DAS INFO '), msg) if counter: msg = "yield %s record(s)" % counter self.logger.info(msg) # if no raw records were yield we look-up possible error records # and reset timestamp for record with system:['das'] if not counter: spec = {'qhash':dasquery.qhash} nrec = self.col.find(spec, **PYMONGO_OPTS).count() if nrec: msg = "for query %s, found %s non-result record(s)" \ % (dasquery, nrec) print(dastimestamp('DAS WARNING'), msg) for rec in self.col.find(spec, **PYMONGO_OPTS): if 'query' in rec: print(dastimestamp('DAS das record'), rec) self.update_das_expire(dasquery, etstamp()) def map_reduce(self, mr_input, dasquery, collection='merge'): """ Wrapper around _map_reduce to allow sequential map/reduce operations, e.g. map/reduce out of map/reduce. mr_input is either alias name or list of alias names for map/reduce functions. Input dasquery which is applied to first iteration of map/reduce functions. """ # NOTE: I need to revisit mapreduce. 
spec = dasquery.mongo_query['spec'] if not isinstance(mr_input, list): mrlist = [mr_input] else: mrlist = mr_input conn = db_connection(self.dburi) mdb = conn[self.dbname] mdb.add_son_manipulator(self.das_son_manipulator) coll = mdb[collection] for mapreduce in mrlist: if mapreduce == mrlist[0]: cond = spec else: cond = None coll = self._map_reduce(coll, mapreduce, cond) for row in coll.find(): yield row def _map_reduce(self, coll, mapreduce, spec=None): """ Perform map/reduce operation over DAS cache using provided collection, mapreduce name and optional conditions. """ self.logger.debug("(%s, %s)" % (mapreduce, spec)) record = find_one(self.mrcol, {'name':mapreduce}) if not record: raise Exception("Map/reduce function '%s' not found" % mapreduce) fmap = record['map'] freduce = record['reduce'] if spec: result = coll.map_reduce(Code(fmap), Code(freduce), query=spec) else: result = coll.map_reduce(Code(fmap), Code(freduce)) msg = "found %s records in %s" % (result.count(), result.name) self.logger.info(msg) self.logger.debug(fmap) self.logger.debug(freduce) return result def get_map_reduce(self, name=None): """ Return definition of map/reduce functions for provided name or gives full list. """ spec = {} if name: spec = {'name':name} result = self.mrcol.find(spec, **PYMONGO_OPTS) for row in result: yield row def merge_records(self, dasquery, attempt=0): """ Merge DAS records for provided query. We perform the following steps: 1. get all queries from das.cache by ordering them by primary key 2. run aggregtor function to merge neighbors 3. insert records into das.merge """ ### TMP for asyncio # time.sleep(attempt+3) # pymongo 3.2 don't yet flush in time # remove any entries in merge collection for this query self.merge.delete_many({'qhash':dasquery.qhash}) # proceed self.logger.debug(dasquery) id_list = [] expire = 9999999999 # future # get all API records for given DAS query spec = {'qhash':dasquery.qhash, 'das.expire':{'$gt':time.time()}, 'das.record':record_codes('query_record')} records = self.col.find(spec, **PYMONGO_OPTS) for row in records: # find smallest expire timestamp to be used by aggregator rexpire = row.get('das', {}).get('expire', expire) if rexpire < expire: expire = rexpire if row['_id'] not in id_list: id_list.append(row['_id']) inserted = 0 lookup_keys = set() fields = dasquery.mongo_query.get('fields') if not fields: # Mongo fields = [] for key in fields: for pkey in self.mapping.mapkeys(key): lookup_keys.add(pkey) for pkey in lookup_keys: skey = [(pkey, DESCENDING)] # lookup all service records spec = {'das_id': {'$in': id_list}, 'das.primary_key': pkey} if self.verbose: nrec = self.col.find(spec, **PYMONGO_OPTS).sort(skey).count() msg = "merging %s records, for %s key" % (nrec, pkey) else: msg = "merging records, for %s key" % pkey self.logger.debug(msg) # use exhaust=False since we process all records in aggregator # and it can be delay in processing records = self.col.find(spec, **PYMONGO_NOEXHAUST).sort(skey) # aggregate all records agen = aggregator(dasquery, records, expire) # diff aggregated records gen = das_diff(agen, self.mapping.diff_keys(pkey.split('.')[0])) # insert all records into das.merge using bulk insert size = self.cache_size try: res = self.merge.insert_many(gen) inserted += len(res.inserted_ids) except InvalidDocument as exp: print(dastimestamp('DAS WARNING'), 'InvalidDocument during merge', str(exp)) msg = "Caught bson error: " + str(exp) self.logger.info(msg) records = self.col.find(spec, **PYMONGO_OPTS).sort(skey) gen = aggregator(dasquery, 
records, expire) genrows = parse2gridfs(self.gfs, pkey, gen, self.logger) das_dict = {'das':{'expire':expire, 'das.record': record_codes('gridfs_record'), 'primary_key':[k for k in lookup_keys], 'system': ['gridfs']}, 'qhash':dasquery.qhash, 'cache_id':[], 'das_id': id_list} for row in genrows: row.update(das_dict) self.merge.insert(row) except InvalidOperation as exp: pass except DuplicateKeyError as err: print(dastimestamp('DAS WARNING'), 'DuplicateKeyError during merge') if not isinstance(gen, list): raise err status = 'fail' if inserted: status = 'ok' elif not lookup_keys: # we got a query w/o fields msg = 'qhash %s, no lookup_keys' % dasquery.qhash print(dastimestamp('DAS WARNING'), msg) status = 'ok' else: # we didn't merge anything, it is a DB look-up failure msg = 'qhash %s, did not insert into das.merge, attempt %s' \ % (dasquery.qhash, attempt) print(dastimestamp('DAS WARNING'), msg) empty_expire = etstamp() lkeys = list(lookup_keys) das = dict(expire=empty_expire, primary_key=lkeys[0], condition_keys=lkeys, instance=dasquery.instance, system=['das'], services=dasquery.services, record=record_codes('empty_record'), ts=time.time(), api=[]) empty_record = {'das':das, 'qhash': dasquery.qhash, 'cache_id':[], 'das_id': id_list} for key in lkeys: empty_record.update({key.split('.')[0]:[]}) for key, val in dasquery.mongo_query['spec'].items(): if key.find('.') == -1: empty_record[key] = [] else: # it is a compound key, e.g. site.name newkey, newval = convert_dot_notation(key, val) empty_record[newkey] = adjust_mongo_keyvalue(newval) self.merge.insert(empty_record) # update DAS records (both meta and data ones, by using qhash) nval = {'$set': {'das.expire':empty_expire}} spec = {'qhash':dasquery.qhash} self.col.update_many(spec, nval) return status def update_cache(self, dasquery, results, header, system, api): """ Insert results into cache. Use bulk inserts controlled by self.cache_size. Upon completion ensure indexes. """ # update results records in DAS cache gen = self.generate_records(dasquery, results, header) inserted = 0 # bulk insert try: res = self.col.insert_many(gen, ordered=False, bypass_document_validation=True) inserted += len(res.inserted_ids) except InvalidOperation: pass # update query record for this sub-system self.update_query_record_system(dasquery, system, api, 'ok') if dasquery.qcache: # custom DASQuery cache self.update_das_expire(dasquery, expire_timestamp(dasquery.qcache)) def update_query_record_system(self, dasquery, system, api, status): "Update system status of dasquery in das.cache collection" spec = {'qhash': dasquery.qhash, 'das.system': system, 'das.api': api, 'das.record':record_codes('query_record')} udict = {'$set': {'das.status':status}} # print("### update_query_record", spec) doc=self.col.find_one_and_update(spec, udict, return_document=ReturnDocument.AFTER) # print(doc) def insert_query_record(self, dasquery, header): """ Insert query record into DAS cache.
""" # check presence of API record in a cache dasheader = header['das'] system = dasheader['system'] api = dasheader['api'] collection = 'cache' check_query = True expire = dasheader.get('expire', None) if expire: dasheader['expire'] = adjust_expire(expire) if not self.incache(dasquery, collection, system, api, check_query): msg = "query=%s, header=%s" % (dasquery, header) self.logger.debug(msg) q_record = dict(das=dasheader, query=dasquery.storage_query) q_record['das']['record'] = record_codes('query_record') q_record['das']['status'] = "requested" q_record['qhash'] = dasquery.qhash q_record['das']['ctime'] = [time.time()] res = self.col.insert_one(q_record) if not res: msg = 'unable to insert query record' print(dastimestamp('DAS ERROR '), dasquery, msg, ', will retry') time.sleep(1) res = self.col.insert(q_record) if not res: print(dastimestamp('DAS ERROR '), dasquery, msg) def generate_records(self, dasquery, results, header): """ Iterate over provided results, update records and yield them to next level (update_cache) """ self.logger.debug("(%s) store to cache" % dasquery) if not results: return dasheader = header['das'] expire = adjust_expire(dasheader['expire']) system = dasheader['system'] # DAS service names, e.g. combined services = dasheader['services'] # CMS services used to get data api = dasheader['api'] prim_key = header.get('prim_key', None) if not prim_key: # get primary key from a list of lookup keys which has the # following structure [{'api':[keys]}, {...}] lup_keys = header['lookup_keys'] lkeys = [l for i in lup_keys for k in i.values() for l in k] prim_key = lkeys[0] if 'summary' not in lkeys else 'summary' cond_keys = list(dasquery.mongo_query['spec'].keys()) # get API record id spec = {'qhash':dasquery.qhash, 'das.system':system, 'das.expire': {'$gt':time.time()}, 'das.record': record_codes('query_record')} counter = 0 rids = [str(r['_id']) for r in \ self.col.find(spec, ['_id'], **PYMONGO_OPTS)] if rids: if isinstance(results, list) or isinstance(results, GeneratorType): for item in results: counter += 1 if 'das' in item: expire = item.get('das').get('expire', expire) dasheader['expire'] = expire item['das'] = dict(expire=expire, primary_key=prim_key, condition_keys=cond_keys, instance=dasquery.instance, system=system, services=services, record=record_codes('data_record'), ts=time.time(), api=api) item['das_id'] = rids item['qhash'] = dasquery.qhash yield item else: print("\n\n ### results = ", str(results)) raise Exception('Provided results is not a list/generator type') if expire != dasheader['expire']: # update DAS records header['das']['expire'] = expire # update das record with new status status = 'Update DAS cache, %s API' % header['das']['api'][0] self.update_query_record(dasquery, status, header) msg = "\n%s yield %s rows" % (dasheader['system'], counter) self.logger.info(msg) def remove_from_cache(self, dasquery): """ Remove query from DAS cache. To do so, we retrieve API record and remove all data records from das.cache and das.merge """ records = self.col.find({'qhash':dasquery.qhash}, **PYMONGO_OPTS) id_list = [] for row in records: if row['_id'] not in id_list: id_list.append(row['_id']) spec = {'das_id':{'$in':id_list}} self.merge.remove(spec) self.merge.remove({'qhash':dasquery.qhash}) self.col.remove(spec) self.col.remove({'qhash':dasquery.qhash}) def clean_cache(self, collection=None): """ Clean expired docs in das.cache and das.merge. 
""" current_time = time.time() query = {'das.expire': { '$lt':current_time} } if not collection or collection == 'merge': self.merge.remove(query) if not collection or collection == 'cache': self.col.remove(query) def delete_cache(self): """ Delete all results in DAS cache/merge collection, including internal indexes. """ self.col.remove({}) try: self.col.drop_indexes() except: pass self.merge.remove({}) try: self.merge.drop_indexes() except: pass
class DASParserDB(object): """ Caching layer for the PLY parser. """ def __init__(self, config): self.verbose = config['verbose'] self.logger = PrintManager('DASParserDB', self.verbose) self.dburi = config['mongodb']['dburi'] self.dbname = config['parserdb']['dbname'] self.sizecap = config['parserdb'].get('sizecap', 5 * 1024 * 1024) self.colname = config['parserdb']['collname'] msg = "DASParserCache::__init__ %s@%s" % (self.dburi, self.dbname) self.logger.info(msg) self.create_db() def create_db(self): """ Create db collection """ conn = db_connection(self.dburi) dbn = conn[self.dbname] if self.colname not in dbn.collection_names(): dbn.create_collection(self.colname, capped=True, size=self.sizecap) col = dbn[self.colname] index_list = [('qhash', DESCENDING)] create_indexes(col, index_list) @property def col(self): "Collection object to MongoDB" conn = db_connection(self.dburi) dbn = conn[self.dbname] col = dbn[self.colname] return col def lookup_query(self, rawtext): """ Check the parser cache for a given rawtext query. Search is done with the qhash of this string. Returns a tuple (status, value) for the cases (PARSERCACHE_VALID, mongo_query) - valid query found (PARSERCACHE_INVALID, error) - error message for invalid query (PARSERCACHE_NOTFOUND, None) - not in the cache """ result = find_one(self.col, {'qhash':genkey(rawtext)}, \ fields=['query', 'error']) if result and result['query']: if self.verbose: self.logger.debug("DASParserCache: found valid %s->%s" %\ (rawtext, result['query'])) query = decode_mongo_query(result['query']) return (PARSERCACHE_VALID, query) elif result and result['error']: if self.verbose: self.logger.debug("DASParserCache: found invalid %s->%s" %\ (rawtext, result['error'])) return (PARSERCACHE_INVALID, result['error']) else: if self.verbose: self.logger.debug("DASParserCache: not found %s" %\ (rawtext)) return (PARSERCACHE_NOTFOUND, None) def insert_valid_query(self, rawtext, query): "Insert a query that was successfully transformed" self._insert_query(rawtext, query, None) def insert_invalid_query(self, rawtext, error): "Insert the error message for an invalid query" self._insert_query(rawtext, None, error) def _insert_query(self, rawtext, query, error): """Internal method to insert a query""" if self.verbose: self.logger.debug("DASParserCache: insert %s->%s/%s" %\ (rawtext, query, error)) # since MongoDB does not support insertion of $ sign in queries # we need to encode inserted query if query: encquery = encode_mongo_query(query) else: encquery = "" self.col.insert({ 'raw': rawtext, 'qhash': genkey(rawtext), 'query': encquery, 'error': str(error) })
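# ------------------------------------------------------------------
# The parser cache above lives in a capped MongoDB collection: once
# sizecap is reached, the oldest entries are silently evicted in
# insertion order. A collections.deque with maxlen reproduces the
# same FIFO-eviction semantics in memory (capped by record count
# rather than by bytes, which is only an approximation of sizecap).
from collections import deque

parser_cache = deque(maxlen=3)  # stands in for the sizecap limit
for raw in ['q1', 'q2', 'q3', 'q4']:
    parser_cache.append({'raw': raw, 'query': {}})
# q1 was evicted once the cap was reached
assert [d['raw'] for d in parser_cache] == ['q2', 'q3', 'q4']
# ------------------------------------------------------------------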
class QLManager(object): """ DAS QL manager. """ def __init__(self, config=None): if not config: config = das_readconfig() if not config.has_key('dasmapping'): config['dasmapping'] = DASMapping(config) if not config.has_key('dasanalytics'): config['dasanalytics'] = DASAnalytics(config) if not config['dasmapping'].check_maps(): msg = "No DAS maps found in MappingDB" raise Exception(msg) self.map = config['dasmapping'] self.analytics = config['dasanalytics'] self.dasservices = config['services'] self.daskeysmap = self.map.daskeys() self.operators = list(das_operators()) self.daskeys = list(das_special_keys()) self.verbose = config['verbose'] self.logger = PrintManager('QLManger', self.verbose) for val in self.daskeysmap.values(): for item in val: self.daskeys.append(item) parserdir = config['das']['parserdir'] self.dasply = DASPLY(parserdir, self.daskeys, self.dasservices, verbose=self.verbose) self.enabledb = config['parserdb']['enable'] if self.enabledb: self.parserdb = DASParserDB(config) def parse(self, query): """ Parse input query and return query in MongoDB form. Optionally parsed query can be written into analytics DB. """ mongo_query = self.mongo_query(query) self.convert2skeys(mongo_query) return mongo_query def add_to_analytics(self, query, mongo_query): "Add DAS query to analytics DB" self.analytics.add_query(query, mongo_query) def mongo_query(self, query): """ Return mongo query for provided input query """ # NOTE: somehow I need to keep build call just before using # PLY parser, otherwise it fails to parse. self.dasply.build() if self.verbose: msg = "input query='%s'" % query self.logger.debug(msg) self.dasply.test_lexer(query) if self.enabledb: status, value = self.parserdb.lookup_query(query) if status == PARSERCACHE_VALID and \ len(last_key_pattern.findall(query)) == 0: mongo_query = value elif status == PARSERCACHE_INVALID: raise Exception(value) else: try: ply_query = self.dasply.parser.parse(query) mongo_query = ply2mongo(ply_query) self.parserdb.insert_valid_query(query, mongo_query) except Exception as exp: self.parserdb.insert_invalid_query(query, exp) print "Input query=%s" % query raise exp else: try: ply_query = self.dasply.parser.parse(query) mongo_query = ply2mongo(ply_query) except Exception as exc: msg = "Fail to convert input query='%s' into MongoDB format" \ % query print_exc(msg, print_traceback=False) raise exc if set(mongo_query.keys()) & set(['fields', 'spec']) != \ set(['fields', 'spec']): raise Exception('Invalid MongoDB query %s' % mongo_query) if not mongo_query['fields'] and len(mongo_query['spec'].keys()) > 1: raise Exception(ambiguous_msg(query, mongo_query['spec'].keys())) for key, val in mongo_query['spec'].iteritems(): if isinstance(val, list): raise Exception(ambiguos_val_msg(query, key, val)) return mongo_query def convert2skeys(self, mongo_query): """ Convert DAS input keys into DAS selection keys. 
""" if not mongo_query['spec']: for key in mongo_query['fields']: for system in self.map.list_systems(): mapkey = self.map.find_mapkey(system, key) if mapkey: mongo_query['spec'][mapkey] = '*' return spec = mongo_query['spec'] to_replace = [] for key, val in spec.iteritems(): for system in self.map.list_systems(): mapkey = self.map.find_mapkey(system, key, val) if mapkey and mapkey != key and \ mongo_query['spec'].has_key(key): to_replace.append((key, mapkey)) continue for key, mapkey in to_replace: if mongo_query['spec'].has_key(key): mongo_query['spec'][mapkey] = mongo_query['spec'][key] del mongo_query['spec'][key] def services(self, query): """Find out DAS services to use for provided query""" skeys, cond = decompose(query) if not skeys: skeys = [] if isinstance(skeys, str): skeys = [skeys] slist = [] # look-up services from Mapping DB for key in skeys + [i for i in cond.keys()]: for service, keys in self.daskeysmap.iteritems(): if service not in self.dasservices: continue value = cond.get(key, None) daskeys = self.map.find_daskey(service, key, value) if set(keys) & set(daskeys) and service not in slist: slist.append(service) # look-up special key condition requested_system = query.get('system', None) if requested_system: if isinstance(requested_system, str): requested_system = [requested_system] return list( set(slist) & set(requested_system) ) return slist def service_apis_map(self, query): """ Find out which APIs correspond to provided query. Return a map of found services and their apis. """ skeys, cond = decompose(query) if not skeys: skeys = [] if isinstance(skeys, str): skeys = [skeys] adict = {} mapkeys = [key for key in cond.keys() if key not in das_special_keys()] services = self.services(query) for srv in services: alist = self.map.list_apis(srv) for api in alist: daskeys = self.map.api_info(api)['daskeys'] maps = [r['map'] for r in daskeys] if set(mapkeys) & set(maps) == set(mapkeys): if adict.has_key(srv): new_list = adict[srv] + [api] adict[srv] = list( set(new_list) ) else: adict[srv] = [api] return adict def params(self, query): """ Return dictionary of parameters to be used in DAS Core: selection keys, conditions and services. """ skeys, cond = decompose(query) services = [] for srv in self.services(query): if srv not in services: services.append(srv) return dict(selkeys=skeys, conditions=cond, services=services)
class DASMapping(object): """ This class manages DAS mapping DB. """ def __init__(self, config): self.verbose = config["verbose"] self.logger = PrintManager("DASMapping", self.verbose) self.services = config["services"] self.dburi = config["mongodb"]["dburi"] self.dbname = config["mappingdb"]["dbname"] self.colname = config["mappingdb"]["collname"] self.map_test = config.get("map_test", True) self.main_dbs = config["das"].get("main_dbs", "dbs") self.dbsinsts = config["das"].get("dbs_instances", []) msg = "%s@%s" % (self.dburi, self.dbname) self.logger.info(msg) self.init() self.on_reload = Event() # Monitoring thread which performs auto-reconnection to MongoDB thname = "mappingdb_monitor" sleep = 5 reload_time = config["mappingdb"].get("reload_time", 86400) reload_time_bad_maps = config["mappingdb"].get("reload_time_bad_maps", 120) start_new_thread( thname, db_monitor, (self.dburi, self.init, sleep, self.load_maps, reload_time, self.check_maps, reload_time_bad_maps), ) self.daskeyscache = {} # to be filled at run time self.systems = [] # to be filled at run time self.dasmapscache = {} # to be filled at run time self.keymap = {} # to be filled at run time self.presentationcache = {} # to be filled at run time self.reverse_presentation = {} # to be filled at run time self.notationcache = {} # to be filled at run time self.diffkeycache = {} # to be filled at run time self.apicache = {} # to be filled at run time self.dbs_global_url = None # to be determined at run time self.dbs_inst_names = None # to be determined at run time self.load_maps(notify=False) @property def col(self): "Return MongoDB collection object" conn = db_connection(self.dburi) dbc = conn[self.dbname] col = dbc[self.colname] return col # =============== # Management APIs # =============== def load_maps(self, notify=True): "Helper function to reload DAS maps" self.init_dasmapscache() self.init_notationcache() self.init_presentationcache() self.systems = None # re-initialize DAS system list self.list_systems() self.dbs_global_url = None # re-initialize DAS dbs global url self.dbs_url() self.dbs_inst_names = None # re-initialize DAS dbs instances self.dbs_instances() if notify: self.on_reload() def init_dasmapscache(self, records=[]): "Read DAS maps and initialize DAS API maps" if not records: spec = {"type": "service"} records = self.col.find(spec, exhaust=True) for row in records: if "urn" in row: api = row["urn"] srv = row["system"] for dmap in row["das_map"]: for key, val in dmap.iteritems(): if key == "pattern": pat = re.compile(val) dmap[key] = pat key = (row["system"], row["urn"]) self.dasmapscache[key] = row def init_notationcache(self): """ Initialize notation cache by reading notations. """ for system, notations in self.notations().iteritems(): for row in notations: key = system, row["api_output"] if key in self.notationcache: self.notationcache[key] += [(row["api"], row["rec_key"])] else: self.notationcache[key] = [(row["api"], row["rec_key"])] def init_presentationcache(self): """ Initialize presentation cache by reading presentation map. 
""" spec = {"type": "presentation"} data = find_one(self.col, spec) if data: self.presentationcache = data["presentation"] for daskey, uilist in self.presentationcache.iteritems(): for row in uilist: link = None if "link" in row: link = row["link"] if "diff" in row: self.diffkeycache[daskey] = row["diff"] tdict = {daskey: {"mapkey": row["das"], "link": link}} if row["ui"] in self.reverse_presentation: self.reverse_presentation[row["ui"]].update(tdict) else: self.reverse_presentation[row["ui"]] = {daskey: {"mapkey": row["das"], "link": link}} def das_presentation_map(self): "Read DAS presentation map" spec = {"type": "presentation"} data = find_one(self.col, spec) if data: for daskey, uilist in data.get("presentation", {}).iteritems(): for row in uilist: if "link" in row: yield row def init(self): """ Establish connection to MongoDB back-end and create DB. """ col = None try: conn = db_connection(self.dburi) if conn: dbc = conn[self.dbname] col = dbc[self.colname] # print "### DASMapping:init started successfully" except ConnectionFailure as _err: tstamp = dastimestamp("") thread = threading.current_thread() print "### MongoDB connection failure thread=%s, id=%s, time=%s" % (thread.name, thread.ident, tstamp) except Exception as exc: print_exc(exc) if col: index = [ ("type", DESCENDING), ("system", DESCENDING), ("urn", DESCENDING), ("das_map.das_key", DESCENDING), ("das_map.rec_key", DESCENDING), ("das_map.api_arg", DESCENDING), ] create_indexes(col, index) def delete_db(self): """ Delete mapping DB in MongoDB back-end. """ conn = db_connection(self.dburi) if conn: conn.drop_database(self.dbname) def delete_db_collection(self): """ Delete mapping DB collection in MongoDB. """ conn = db_connection(self.dburi) if conn: dbc = conn[self.dbname] dbc.drop_collection(self.colname) def check_maps(self): """ Check Mapping DB and return true/false based on its content """ if not self.map_test: return True # do not test DAS maps, useful for unit tests udict = defaultdict(int) ndict = defaultdict(int) pdict = defaultdict(int) adict = {} maps_hash = False for row in self.col.find(exhaust=True): check_map_record(row) if "urn" in row: udict[row["system"]] += 1 elif "notations" in row: ndict[row["system"]] += 1 elif "presentation" in row: pdict["presentation"] += 1 elif "arecord" in row: arec = row["arecord"] system = arec["system"] rec = {arec["type"]: arec["count"]} if system in adict: adict[system].update(rec) else: adict[system] = rec elif "verification_token" in row: maps_hash = row["verification_token"] # retrieve uri/notation/presentation maps ulist = [] nlist = [] for system in adict.keys(): if "uri" in adict[system]: ulist.append(adict[system]["uri"] == udict[system]) nlist.append(adict[system]["notations"] == ndict[system]) status_umap = sum(ulist) == len(ulist) status_nmap = sum(nlist) == len(nlist) status_pmap = adict.get("presentation", {}).get("presentation", 0) == 1 # verify completeness of maps calc_token = verification_token(self.col.find(exhaust=True)) status_complete = maps_hash and maps_hash == calc_token if self.verbose: print "### DAS map status, umap=%s, nmap=%s, pmap=%s, complete=%s" % ( status_umap, status_nmap, status_pmap, status_complete, ) if not status_complete: print "### DAS map hash do not match, got=%s calculated=%s" % (maps_hash, calc_token) # multiply statuses as a result of this map check return status_umap * status_nmap * status_pmap * status_complete def remove(self, spec): """ Remove record in DAS Mapping DB for provided Mongo spec. 
""" self.col.remove(spec) def add(self, record): """ Add new record into mapping DB. Example of URI record .. doctest:: { system:dbs, urn : listBlocks, url : "http://a.b.com/api" params : [{"apiversion":1_2_2, se:"*"}] lookup : block das_map: [ {"das_key":"block", "rec_key":"block.name"}, {"das_key":"site", "rec_key":"site.name", "api_arg":"se", "pattern":"^T[0-3]_}, ] } Example of notation record: .. doctest:: notations: [ {"api_output" : "storage_element_name", "rec_key":"site", "api": ""}, ] """ msg = "record=%s" % record self.logger.debug(msg) self.col.insert(record) self.init_dasmapscache([record]) # ================== # Informational APIs # ================== def dbs_global_instance(self, system=None): "Retrive from mapping DB DBS url and extract DBS instance" if not system: system = self.main_dbs url = self.dbs_url(system) return get_dbs_instance(url) def dbs_url(self, system=None): "Retrive from mapping DB DBS url" if not system: system = self.main_dbs systems = self.list_systems() dbses = set(["dbs", "dbs3"]) if dbses & set(systems) != dbses: # use caching only when we operate with single DBS if self.dbs_global_url: return self.dbs_global_url url = None for srv in systems: if srv == system: apis = self.list_apis(srv) url = self.api_info(srv, apis[0])["url"] url = parse_dbs_url(srv, url) self.dbs_global_url = url return url return url def dbs_instances(self, system=None): "Retrive from mapping DB DBS instances" # use dbs istances from the config if self.dbsinsts and not system: return self.dbsinsts # default dbs if not system: system = self.main_dbs systems = self.list_systems() dbses = set(["dbs", "dbs3"]) if dbses & set(systems) != dbses: # use caching only when we operate with single DBS if self.dbs_inst_names: return self.dbs_inst_names insts = [] for srv in systems: if srv == system: apis = self.list_apis(srv) insts = self.api_info(srv, apis[0])["instances"] self.dbs_inst_names = insts return insts return insts def list_systems(self): """ List all DAS systems. """ if not self.systems: spec = {"type": "service", "system": {"$ne": None}} gen = (row["system"] for row in self.col.find(spec, ["system"], exhaust=True)) self.systems = list(set(gen2list(gen)) & set(self.services)) return self.systems def list_apis(self, system=None): """ List all APIs. """ if self.apicache and system in self.apicache: return self.apicache[system] spec = {"type": "service", "urn": {"$ne": None}} if system: spec["system"] = system gen = (row["urn"] for row in self.col.find(spec, ["urn"], exhaust=True)) self.apicache[system] = gen2list(gen) return self.apicache[system] def api_info(self, srv, api_name): """ Return full API info record. """ return self.dasmapscache[(srv, api_name)] def relational_keys(self, system1, system2): """ Return a list of relational keys between provided systems """ for system, keys in self.daskeys().iteritems(): if system == system1: keys1 = keys if system == system2: keys2 = keys return list(set(keys1) & set(keys2)) def daskeys(self, das_system=None): """ Return a dict with all known DAS keys. 
""" if das_system in self.daskeyscache: return self.daskeyscache[das_system] spec = {"type": "service", "system": {"$ne": None}} if das_system: spec = {"system": das_system} gen = (row["system"] for row in self.col.find(spec, ["system"], exhaust=True)) gen = [r for r in gen] kdict = {} for system in gen: spec = {"system": system, "urn": {"$ne": None}} keys = [] for row in self.col.find(spec, exhaust=True): for entry in row["das_map"]: if entry["das_key"] not in keys: keys.append(entry["das_key"]) kdict[system] = keys # cache it self.daskeyscache[das_system] = kdict return kdict # ============ # Look-up APIs # ============ def api_lkeys(self, das_system, api): """ Return DAS lookup keys for given das system and api """ entry = self.dasmapscache[(das_system, api)] skeys = entry["lookup"].split(",") return skeys def primary_key(self, das_system, urn): """ Return DAS primary key for provided system and urn. The DAS primary key is a first entry in *lookup* attribute of DAS API record. """ spec = {"system": das_system, "urn": urn} record = find_one(self.col, spec) if not record: return None pkey = record["lookup"] if pkey.find(",") != -1: pkey = pkey.split(",")[0] return pkey def primary_mapkey(self, das_system, urn): """ Return DAS primary map key for provided system and urn. For example, the file DAS key is mapped to file.name, so this API will return file.name """ spec = {"system": das_system, "urn": urn} record = find_one(self.col, spec) mapkey = [] for row in record["das_map"]: lkey = record["lookup"] if lkey.find(",") != -1: lkey = lkey.split(",")[0] if row["das_key"] == lkey: return row["rec_key"] return mapkey def find_daskey(self, das_system, map_key, value=None): """ Find das key for given system and map key. """ msg = "system=%s\n" % das_system daskeys = [] for key, record in self.dasmapscache.iteritems(): srv, _urn = key if das_system != srv: continue for row in record["das_map"]: das_key = row["das_key"] rec_key = row["rec_key"] if rec_key != map_key: continue pat = row.get("pattern", None) if value: if pat: if pat.match(str(value)): daskeys.append(das_key) else: msg += "-- reject key=%s, val=%s, pat=%s\n" % (map_key, value, pat.pattern) self.logger.debug(msg) else: daskeys.append(das_key) else: daskeys.append(das_key) return daskeys def find_mapkey(self, das_system, das_key, value=None): """ Find map key for given system and das key. """ msg = "system=%s\n" % das_system for key, record in self.dasmapscache.iteritems(): srv, _urn = key if das_system != srv: continue for row in record["das_map"]: if row["das_key"] != das_key: continue rec_key = row["rec_key"] pat = row.get("pattern", None) if value: if pat: if pat.match(str(value)): return rec_key else: msg += "-- reject key=%s, val=%s, pat=%s\n" % (das_key, value, pat.pattern) self.logger.debug(msg) continue else: return rec_key else: return rec_key def mapkeys(self, daskey): """ Find all lookup keys (primary keys) for a given daskey """ if daskey in self.keymap: return self.keymap[daskey] spec = {"das_map.das_key": daskey} mapkeys = [] for row in self.col.find(spec, ["das_map"], exhaust=True): for kmap in row["das_map"]: if kmap["das_key"] == daskey and kmap["rec_key"] not in mapkeys: mapkeys.append(kmap["rec_key"]) self.keymap[daskey] = mapkeys return self.keymap[daskey] def find_apis(self, das_system, map_key): """ Find list of apis which correspond to provided system and das map key. 
""" spec = {"system": das_system, "das_map.rec_key": map_key} apilist = [] for row in self.col.find(spec, ["urn"], exhaust=True): if "urn" in row and row["urn"] not in apilist: apilist.append(row["urn"]) return apilist def find_system(self, key): """ Return system name for provided DAS key. """ spec = {"das_map.das_key": key} gen = (row["system"] for row in self.col.find(spec, ["system"], exhaust=True)) systems = [] for system in gen: if system not in systems: systems.append(system) systems.sort() return systems def lookup_keys(self, system, api, daskey=None, value=None): """ Returns lookup keys for given system and provided selection DAS key, e.g. block => block.name """ entry = self.dasmapscache.get((system, api), None) if not entry: return [] lkeys = entry.get("lookup", []).split(",") rkeys = [] if daskey in lkeys: for dmap in entry["das_map"]: rec_key = dmap["rec_key"] if daskey: if dmap["das_key"] == daskey: pat = dmap.get("pattern", None) if value: if pat.match(str(value)): rkeys.append(rec_key) else: if rec_key not in rkeys: rkeys.append(rec_key) else: rkeys.append(rec_key) return rkeys def api2das(self, system, api_input_name): """ Translates data-service API input parameter into DAS QL key, e.g. run_number => run. """ query = {"system": system, "das_map.api_arg": api_input_name} names = [] for adas in self.col.find(query, ["das_map"], exhaust=True): for row in adas["das_map"]: try: if "api_arg" in row: aparam = row["api_arg"] daskey = row["das_key"] if aparam == api_input_name and daskey not in names: names.append(daskey) except Exception, err: print "ERROR: look-up api_param/das_key in", row raise err return names
class DASAbstractService(object): """ Abstract class describing a DAS service. It is initialized with a name which is used to identify service parameters from the DAS configuration file. Those parameters are keys, verbosity level, URL of the data-service. """ def __init__(self, name, config): self.name = name try: self.verbose = config['verbose'] title = 'DASAbstractService_%s' % self.name self.logger = PrintManager(title, self.verbose) self.dasmapping = config['dasmapping'] self.write2cache = config.get('write_cache', True) self.multitask = config['das'].get('multitask', True) self.error_expire = config['das'].get('error_expire', 300) self.dbs_global = None # to be configured at run time self.dburi = config['mongodb']['dburi'] engine = config.get('engine', None) self.gfs = db_gridfs(self.dburi) except Exception as exc: print_exc(exc) raise Exception('failed to parse DAS config') # read key/cert info try: self.ckey, self.cert = get_key_cert() except Exception as exc: print_exc(exc) self.ckey = None self.cert = None if self.multitask: nworkers = config['das'].get('api_workers', 3) thr_weights = config['das'].get('thread_weights', []) for system_weight in thr_weights: system, weight = system_weight.split(':') if system == self.name: nworkers *= int(weight) # if engine: # thr_name = 'DASAbstractService:%s:PluginTaskManager' % self.name # self.taskmgr = PluginTaskManager(\ # engine, nworkers=nworkers, name=thr_name) # self.taskmgr.subscribe() # else: # thr_name = 'DASAbstractService:%s:TaskManager' % self.name # self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name) thr_name = 'DASAbstractService:%s:TaskManager' % self.name self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name) else: self.taskmgr = None self.map = {} # to be defined by data-service implementation self._keys = None # to be defined at run-time in self.keys self._params = None # to be defined at run-time in self.parameters self._notations = {} # to be defined at run-time in self.notations self.logger.info('initialized') # define internal cache manager to put 'raw' results into cache if 'rawcache' in config and config['rawcache']: self.localcache = config['rawcache'] else: msg = 'Undefined rawcache, please check your configuration' raise Exception(msg) def status(self): "Return status of the service" return self.taskmgr.status() def services(self): """ Return sub-systems used to retrieve data records. It is used in the dasheader call to set up the das.services field. This method can be overwritten in sub-classes, otherwise it returns a dict of the service name and the CMS systems used to retrieve data records. """ return {self.name: [self.name]} def version(self): """Return data-services version, should be implemented in sub-classes""" return '' def keys(self): """ Return service keys """ if self._keys: return self._keys srv_keys = [] for _api, params in self.map.items(): for key in params['keys']: if not key in srv_keys: srv_keys.append(key) self._keys = srv_keys return srv_keys def parameters(self): """ Return mapped service parameters """ if self._params: return self._params srv_params = [] for _api, params in self.map.items(): for key in params['params']: param_list = self.dasmapping.api2das(self.name, key) for par in param_list: if not par in srv_params: srv_params.append(par) self._params = srv_params return srv_params def notations(self): """ Return a map of system notations.
""" if self._notations: return self._notations for _, rows in self.dasmapping.notations(self.name).items(): for row in rows: api = row['api'] nmap = row['rec_key'] notation = row['api_output'] if api in self._notations: self._notations[api].update({notation: nmap}) else: self._notations[api] = {notation: nmap} return self._notations def getdata(self, url, params, expire, headers=None, post=None): """URL call wrapper""" if url.find('https:') != -1: return getdata(url, params, headers, expire, post, self.error_expire, self.verbose, self.ckey, self.cert, system=self.name) else: return getdata(url, params, headers, expire, post, self.error_expire, self.verbose, system=self.name) def call(self, dasquery): """ Invoke service API to execute given query. Return results as a collect list set. """ self.logger.info(dasquery) # check the cache for records with given query/system res = self.localcache.incache(dasquery, collection='cache', system=self.name) if res: msg = "found records in local cache" self.logger.info(msg) return # ask data-service api to get results, they'll be store them in # cache, so return at the end what we have in cache. self.api(dasquery) def write_to_cache(self, dasquery, expire, url, api, args, gen, ctime): """ Write provided result set into DAS cache. """ if not self.write2cache: return # before going to cache we should check/set possible misses, e.g. # primary key when error is thrown result = self.set_misses(dasquery, api, gen) # update the cache header = dasheader(self.name, dasquery, expire, api, url, services=self.services()) header['lookup_keys'] = self.lookup_keys(api) header['prim_key'] = self.dasmapping.primary_mapkey(self.name, api) header['ctime'] = ctime system = self.name self.localcache.update_cache(dasquery, result, header, system, api) msg = 'cache has been updated,\n' self.logger.debug(msg) def adjust_params(self, api, kwds, instance=None): """ Data-service specific parser to adjust parameters according to its specifications. For example, DQ service accepts a string of parameters, rather parameter set, while DBS2 can reuse some parameters for different API, e.g. I can use dataset path to pass to listPrimaryDatasets as primary_dataset pattern. """ pass def lookup_keys(self, api): """ Return look-up keys of data output for given data-service API. """ lkeys = self.dasmapping.lookup_keys(self.name, api) return [{api: lkeys}] def inspect_params(self, api, args): """ Perform API parameter inspection. Check if API accept a range of parameters, etc. """ for key, value in args.items(): if isinstance(value, dict): minval = None maxval = None for oper, val in value.items(): if oper == '$in': minval = int(val[0]) maxval = int(val[-1]) args[key] = range(minval, maxval) elif oper == '$lt': maxval = int(val) args[key] = maxval elif oper == '$lte': maxval = int(val) args[key] = maxval elif oper == '$gt': minval = int(val) args[key] = minval elif oper == '$gte': minval = int(val) args[key] = minval else: msg = '%s does not support operator %s' % (api, oper) raise Exception(msg) return args def get_notations(self, api): """Return notations used for given API""" notationmap = self.notations() if not notationmap: return {} notations = {} if '' in notationmap: notations = dict(notationmap['']) # notations applied to all APIs if api in notationmap: # overwrite the one for provided API notations.update(notationmap[api]) return notations def parser(self, dasquery, dformat, data, api): """ DAS data parser. Input parameters: - *query* input DAS query - *dformat* is a data format, e.g. 
XML, JSON - *data* is a data source, either file-like object or actual data - *api* is API name """ prim_key = self.dasmapping.primary_key(self.name, api) counter = 0 if dformat.lower() == 'xml': tags = self.dasmapping.api2daskey(self.name, api) gen = xml_parser(data, prim_key, tags) for row in gen: counter += 1 yield row elif dformat.lower() == 'json' or dformat.lower() == 'dasjson': gen = json_parser(data, self.logger) das_dict = {} for row in gen: if dformat.lower() == 'dasjson': for key, val in row.items(): if key != 'results': das_dict[key] = val row = row['results'] if isinstance(row, list): for item in row: if item: if prim_key in item: counter += 1 yield item else: counter += 1 yield {prim_key: item} else: if prim_key in row: counter += 1 yield row else: counter += 1 yield {prim_key: row} else: msg = 'Unsupported data format="%s", API="%s"' % (dformat, api) raise Exception(msg) msg = "api=%s, format=%s " % (api, dformat) msg += "prim_key=%s yield %s rows" % (prim_key, counter) self.logger.info(msg) def translator(self, api, genrows): """ Convert raw results into DAS records. """ prim_key = self.dasmapping.primary_key(self.name, api) count = 0 for row in genrows: row2das(self.dasmapping.notation2das, self.name, api, row) count += 1 # check for primary key existance, since it can be overriden # by row2das. For example DBS3 uses flat namespace, so we # override dataset=>name, while dataset still is a primary key if isinstance(row, list): yield {prim_key: row} elif prim_key in row: if prim_key in row[prim_key]: yield row[prim_key] # remapping may create nested dict else: yield row else: yield {prim_key: row} msg = "yield %s rows" % count self.logger.debug(msg) def set_misses(self, dasquery, api, genrows): """ Check and adjust DAS records wrt input query. If some of the DAS keys are missing, add it with its value to the DAS record. 
""" # look-up primary key prim_key = self.dasmapping.primary_key(self.name, api) # Scan all docs and store those whose size above MongoDB limit into # GridFS map_key = self.dasmapping.primary_mapkey(self.name, api) genrows = parse2gridfs(self.gfs, map_key, genrows, self.logger) spec = dasquery.mongo_query['spec'] row = next(genrows) ddict = DotDict(row) keys2adjust = [] for key in spec.keys(): val = ddict.get(key) if spec[key] != val and key not in keys2adjust: keys2adjust.append(key) msg = "adjust keys %s" % keys2adjust self.logger.debug(msg) count = 0 if keys2adjust: # adjust of the rows for row in yield_rows(row, genrows): ddict = DotDict(row) pval = ddict.get(map_key) if isinstance(pval, dict) and 'error' in pval: ddict[map_key] = '' ddict.update({prim_key: pval}) for key in keys2adjust: value = spec[key] existing_value = ddict.get(key) # the way to deal with proximity/patern/condition results if (isinstance(value, str) or isinstance(value, unicode))\ and value.find('*') != -1: # we got pattern if existing_value: value = existing_value elif isinstance(value, dict) or \ isinstance(value, list): # we got condition if existing_value: value = existing_value elif isinstance(value, dict) and \ '$in' in value: # we got a range {'$in': []} value = value['$in'] elif isinstance(value, dict) and \ '$lte' in value and '$gte' in value: # we got a between range value = [value['$gte'], value['$lte']] else: value = json.dumps(value) elif existing_value and value != existing_value: # we got proximity results if 'proximity' in ddict: proximity = DotDict({key: existing_value}) ddict['proximity'].update(proximity) else: proximity = DotDict({}) proximity[key] = existing_value ddict['proximity'] = proximity else: if existing_value: value = existing_value ddict[key] = value yield ddict count += 1 else: yield row for row in genrows: yield row count += 1 msg = "yield %s rows" % count self.logger.debug(msg) def api(self, dasquery): """ Data service api method, can be defined by data-service class. It parse input query and invoke appropriate data-service API call. All results are stored into the DAS cache along with api call inserted into Analytics DB. """ self.logger.info(dasquery) genrows = self.apimap(dasquery) if not genrows: return jobs = [] for url, api, args, dformat, expire in genrows: # insert DAS query record for given API header = dasheader(self.name, dasquery, expire, api, url) self.localcache.insert_query_record(dasquery, header) # fetch DAS data records if self.multitask: jobs.append(self.taskmgr.spawn(self.apicall, \ dasquery, url, api, args, dformat, expire)) else: self.apicall(dasquery, url, api, args, dformat, expire) if self.multitask: self.taskmgr.joinall(jobs) def apicall(self, dasquery, url, api, args, dformat, expire): """ Data service api method, can be defined by data-service class. It parse input query and invoke appropriate data-service API call. All results are stored into the DAS cache along with api call inserted into Analytics DB. We invoke explicitly close call for our datastream instead of using context manager since this method as well as getdata/parser can be overwritten by child classes. 
""" datastream = None try: args = self.inspect_params(api, args) time0 = time.time() headers = make_headers(dformat) datastream, expire = self.getdata(url, args, expire, headers) self.logger.info("%s expire %s" % (api, expire)) rawrows = self.parser(dasquery, dformat, datastream, api) dasrows = self.translator(api, rawrows) ctime = time.time() - time0 self.write_to_cache(dasquery, expire, url, api, args, dasrows, ctime) except Exception as exc: msg = 'Fail to process: url=%s, api=%s, args=%s' \ % (url, api, args) print(msg) print_exc(exc) close(datastream) def url_instance(self, url, _instance): """ Virtual method to adjust URL for a given instance, must be implemented in service classes """ return url def adjust_url(self, url, instance): """ Adjust data-service URL wrt provided instance, e.g. DBS carry several instances """ if instance: url = self.url_instance(url, instance) return url def apimap(self, dasquery): """ Analyze input query and yield url, api, args, format, expire for further processing. """ srv = self.name # get local copy to avoid threading issues cond = getarg(dasquery.mongo_query, 'spec', {}) instance = dasquery.mongo_query.get('instance', self.dbs_global) skeys = getarg(dasquery.mongo_query, 'fields', []) if not skeys: skeys = [] self.logger.info("\n") for api, value in self.map.items(): expire = value['expire'] iformat = value['format'] url = self.adjust_url(value['url'], instance) if not url: msg = '--- rejects API %s, no URL' % api self.logger.info(msg) continue args = dict(value['params']) # make new copy, since we'll adjust wild = value.get('wild_card', '*') found = 0 # check if input parameters are covered by API if not self.dasmapping.check_api_match(srv, api, cond): msg = '--- rejects API %s, does not cover input condition keys' \ % api self.logger.info(msg) continue # once we now that API covers input set of parameters we check # every input parameter for pattern matching for key, val in cond.items(): # check if keys from conditions are accepted by API # need to convert key (which is daskeys.map) into # input api parameter for apiparam in self.dasmapping.das2api(srv, api, key, val): if apiparam in args: args[apiparam] = val found += 1 # VK 20160708, wrong statement, it caused to pass # datasets API for query dataset in [path1, path2] # I'll leave block here until I test and verify that # commented out block will not cause other issues # # check the case when we only have single condition key # and it is the key we look-up # if not found and skeys == [k.split('.')[0] for k in cond.keys()]: # found = 1 # check if number of keys on cond and args are the same if len(cond.keys()) != found: msg = "--- reject API %s, not all condition keys are covered" \ % api self.logger.info(msg) msg = 'args=%s' % args self.logger.debug(msg) continue if not found: msg = "--- rejects API %s, parameters don't match" % api self.logger.info(msg) msg = 'args=%s' % args self.logger.debug(msg) continue self.adjust_params(api, args, instance) # delete args keys whose value is optional delete_keys(args, 'optional') # check that there is no "required" parameter left in args, # since such api will not work if 'required' in args.values(): msg = '--- rejects API %s, parameter is required' % api self.logger.info(msg) msg = 'args=%s' % args self.logger.debug(msg) continue # adjust pattern symbols in arguments if wild != '*': for key, val in args.items(): if isinstance(val, str) or isinstance(val, unicode): val = val.replace('*', wild) args[key] = val # compare query selection keys with API look-up 
keys
            api_lkeys = self.dasmapping.api_lkeys(srv, api)
            if set(api_lkeys) != set(skeys):
                msg = "--- rejects API %s, api_lkeys(%s)!=skeys(%s)" \
                    % (api, api_lkeys, skeys)
                self.logger.info(msg)
                continue
            msg = '+++ %s passes API %s' % (srv, api)
            self.logger.info(msg)
            msg = 'args=%s' % args
            self.logger.debug(msg)
            msg = "yield "
            msg += "system ***%s***, url=%s, api=%s, args=%s, format=%s, " \
                % (srv, url, api, args, iformat)
            msg += "expire=%s, wild_card=%s" % (expire, wild)
            self.logger.debug(msg)
            yield url, api, args, iformat, expire
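# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original DAS code): a stand-alone,
# simplified version of the apimap matching rules above. It shows why an API
# is rejected when the query condition keys are not fully covered, or when a
# 'required' parameter is left unfilled. The api_map layout and the function
# name are hypothetical, invented for this example only.
# ---------------------------------------------------------------------------
def _sketch_api_match(api_map, spec):
    """
    Return (accepted, args) for a single API given a query spec.

    api_map: {'params': {default API arguments}, 'das2api': {das_key: api_arg}}
    spec   : {das_key: value} condition keys taken from the DAS query
    """
    args = dict(api_map['params'])  # fresh copy, as apimap does
    found = 0
    for key, val in spec.items():
        apiparam = api_map['das2api'].get(key)
        if apiparam in args:
            args[apiparam] = val
            found += 1
    if found != len(spec):           # some condition keys are not covered
        return False, {}
    if 'required' in args.values():  # a mandatory argument was never filled
        return False, {}
    return True, args

# Example (hypothetical map): a dataset pattern is accepted, while a query
# carrying an extra, unmapped condition key would be rejected.
# _sketch_api_match({'params': {'dataset': 'required', 'detail': True},
#                    'das2api': {'dataset': 'dataset'}},
#                   {'dataset': '/ZMM*/*/*'})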
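# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original DAS code): how the
# inspect_params operator translation above behaves. The helper below is a
# hypothetical stand-in that only handles the '$in', '$gte' and '$lte'
# branches, mirroring their endpoint conventions.
# ---------------------------------------------------------------------------
def _sketch_translate_operator(value):
    "Translate a MongoDB-style operator dict into a plain API argument."
    if not isinstance(value, dict):
        return value
    if '$in' in value:
        # rebuild the integer range from the first/last $in values,
        # endpoint-exclusive like range() in inspect_params
        vals = value['$in']
        return list(range(int(vals[0]), int(vals[-1])))
    if '$gte' in value:
        return int(value['$gte'])
    if '$lte' in value:
        return int(value['$lte'])
    raise Exception('unsupported operator in %s' % value)

# _sketch_translate_operator({'$in': [1, 5]})    -> [1, 2, 3, 4]
# _sketch_translate_operator({'$gte': 201522})   -> 201522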
class DASMapping(object): """ This class manages DAS mapping DB. """ __cached_inst = None __cached_params = None def __new__(cls, config): """ creates a new instance of the class and cache it or return an existing instance if one exists (only when the params match). only the last instance is cached, but this simplifies the implementation as the param 'config' might be a complex unhashable object. """ # check if we can reuse an existing instance if cls.__cached_inst and cls.__cached_params == config: if config['verbose']: print("DASMapping::__new__: returning a cached instance") return cls.__cached_inst # otherwise create and initialize a new instance if config['verbose']: print("DASMapping::__new__: creating a new instance") self = object.__new__(cls) self.verbose = config['verbose'] self.logger = PrintManager('DASMapping', self.verbose) self.services = config['services'] self.dburi = config['mongodb']['dburi'] self.dbname = config['mappingdb']['dbname'] self.colname = config['mappingdb']['collname'] self.map_test = config.get('map_test', True) self.main_dbs = config['das'].get('main_dbs', 'dbs3') self.dbsinsts = config['das'].get('dbs_instances', []) msg = "%s@%s" % (self.dburi, self.dbname) self.logger.info(msg) self.das_son_manipulator = DAS_SONManipulator() index = [('type', DESCENDING),\ ('system', DESCENDING),\ ('urn', DESCENDING),\ ('das_map.das_key', DESCENDING),\ ('das_map.rec_key', DESCENDING),\ ('das_map.api_arg', DESCENDING),\ ] create_indexes(self.col, index) self.daskeyscache = {} # to be filled at run time self.systems = [] # to be filled at run time self.dasmapscache = {} # to be filled at run time self.keymap = {} # to be filled at run time self.presentationcache = {} # to be filled at run time self.reverse_presentation = {} # to be filled at run time self.notationcache = {} # to be filled at run time self.diffkeycache = {} # to be filled at run time self.apicache = {} # to be filled at run time self.dbs_global_url = None # to be determined at run time self.dbs_inst_names = None # to be determined at run time self.load_maps() # cache the instance and return it DASMapping.__cached_inst = self DASMapping.__cached_params = config return self @property def col(self): "col property provides access to DAS mapping collection" conn = db_connection(self.dburi) mdb = conn[self.dbname] colnames = mdb.collection_names() if not colnames or self.colname not in colnames: try: mdb.create_collection(self.colname) except OperationFailure: pass mdb.add_son_manipulator(self.das_son_manipulator) return mdb[self.colname] # =============== # Management APIs # =============== def load_maps(self): "Helper function to reload DAS maps" self.init_dasmapscache() self.init_notationcache() self.init_presentationcache() self.systems = None # re-initialize DAS system list self.list_systems() self.dbs_global_url = None # re-initialize DAS dbs global url self.dbs_url() self.dbs_inst_names = None # re-initialize DAS dbs instances self.dbs_instances() def init_dasmapscache(self, records=None): "Read DAS maps and initialize DAS API maps" if not records: spec = {'type':'service'} records = self.col.find(spec, **PYMONGO_OPTS) for row in records: if 'urn' in row: for dmap in row['das_map']: for key, val in dmap.items(): if key == 'pattern': pat = re.compile(val) dmap[key] = pat key = (row['system'], row['urn']) self.dasmapscache[key] = row def init_notationcache(self): """ Initialize notation cache by reading notations. 
""" for system, notations in self.notations().items(): for row in notations: key = system, row['api_output'] if key in self.notationcache: self.notationcache[key] += [ (row['api'], row['rec_key']) ] else: self.notationcache[key] = [ (row['api'], row['rec_key']) ] def init_presentationcache(self): """ Initialize presentation cache by reading presentation map. """ spec = {'type':'presentation'} data = find_one(self.col, spec) if data: self.presentationcache = data['presentation'] for daskey, uilist in self.presentationcache.items(): for row in uilist: link = None if 'link' in row: link = row['link'] if 'diff' in row: self.diffkeycache[daskey] = row['diff'] tdict = {daskey : {'mapkey': row['das'], 'link': link}} if row['ui'] in self.reverse_presentation: self.reverse_presentation[row['ui']].update(tdict) else: self.reverse_presentation[row['ui']] = \ {daskey : {'mapkey': row['das'], 'link': link}} def das_presentation_map(self): "Read DAS presentation map" spec = {'type':'presentation'} data = find_one(self.col, spec) if data: for _, uilist in data.get('presentation', {}).items(): for row in uilist: if 'link' in row: yield row def delete_db(self): """ Delete mapping DB in MongoDB back-end. """ conn = db_connection(self.dburi) if conn: conn.drop_database(self.dbname) def delete_db_collection(self): """ Delete mapping DB collection in MongoDB. """ conn = db_connection(self.dburi) if conn: dbc = conn[self.dbname] dbc.drop_collection(self.colname) def check_maps(self): """ Check Mapping DB and return true/false based on its content """ if not self.map_test: return True # do not test DAS maps, useful for unit tests udict = defaultdict(int) ndict = defaultdict(int) pdict = defaultdict(int) adict = {} maps_hash = False for row in self.col.find(**PYMONGO_OPTS): check_map_record(row) if 'urn' in row: udict[row['system']] += 1 elif 'notations' in row: ndict[row['system']] += 1 elif 'presentation' in row: pdict['presentation'] += 1 elif 'arecord' in row: arec = row['arecord'] system = arec['system'] rec = {arec['type']:arec['count']} if system in adict: adict[system].update(rec) else: adict[system] = rec elif 'verification_token' in row: maps_hash = row['verification_token'] # retrieve uri/notation/presentation maps ulist = [] nlist = [] for system in adict.keys(): if 'uri' in adict[system]: ulist.append(adict[system]['uri'] == udict[system]) nlist.append(adict[system]['notations'] == ndict[system]) status_umap = sum(ulist) == len(ulist) status_nmap = sum(nlist) == len(nlist) status_pmap = adict.get('presentation', {}).get('presentation', 0) == 1 # verify completeness of maps calc_token = verification_token(self.col.find(**PYMONGO_OPTS)) status_complete = maps_hash and maps_hash == calc_token if self.verbose: print("### DAS map status, umap=%s, nmap=%s, pmap=%s, complete=%s" \ % (status_umap, status_nmap, status_pmap, status_complete)) if not status_complete: print("### DAS map hash do not match, got=%s calculated=%s" \ % (maps_hash, calc_token)) # multiply statuses as a result of this map check return status_umap*status_nmap*status_pmap*status_complete def remove(self, spec): """ Remove record in DAS Mapping DB for provided Mongo spec. """ self.col.remove(spec) def add(self, record): """ Add new record into mapping DB. Example of URI record .. 
doctest:: { system:dbs, urn : listBlocks, url : "http://a.b.com/api" params : [{"apiversion":1_2_2, se:"*"}] lookup : block das_map: [ {"das_key":"block", "rec_key":"block.name"}, {"das_key":"site", "rec_key":"site.name", "api_arg":"se", "pattern":"^T[0-3]_}, ] } Example of notation record: .. doctest:: notations: [ {"api_output" : "storage_element_name", "rec_key":"site", "api": ""}, ] """ msg = 'record=%s' % record self.logger.debug(msg) self.col.insert(record) self.init_dasmapscache([record]) # ================== # Informational APIs # ================== def dbs_global_instance(self, system=None): "Retrive from mapping DB DBS url and extract DBS instance" if not system: system = self.main_dbs url = self.dbs_url(system) return get_dbs_instance(url) def dbs_url(self, system=None): "Retrive from mapping DB DBS url" if not system: system = self.main_dbs systems = self.list_systems() dbses = set(['dbs3']) if dbses & set(systems) != dbses: # use caching only when we operate with single DBS if self.dbs_global_url: return self.dbs_global_url url = None for srv in systems: if srv == system: apis = self.list_apis(srv) url = self.api_info(srv, apis[0])['url'] url = parse_dbs_url(srv, url) self.dbs_global_url = url return url return url def dbs_instances(self, system=None): "Retrive from mapping DB DBS instances" # use dbs istances from the config if self.dbsinsts and not system: return self.dbsinsts # default dbs if not system: system = self.main_dbs systems = self.list_systems() dbses = set(['dbs3']) if dbses & set(systems) != dbses: # use caching only when we operate with single DBS if self.dbs_inst_names: return self.dbs_inst_names insts = [] dbs_global_inst = self.dbs_global_instance(system) if system == 'dbs3' and dbs_global_inst: dbs_namespace = dbs_global_inst.split('/')[0] else: dbs_namespace = None for srv in systems: if srv == system: apis = self.list_apis(srv) insts = self.api_info(srv, apis[0])['instances'] if dbs_namespace: insts = [d for d in insts if d.startswith(dbs_namespace)] self.dbs_inst_names = insts return insts return insts def list_systems(self): """ List all DAS systems. """ if not self.systems: spec = { 'type': 'service', 'system' : { '$ne' : None } } gen = (row['system'] \ for row in self.col.find(spec, ['system'], **PYMONGO_OPTS)) self.systems = list( set(gen2list(gen)) & set(self.services) ) return self.systems def list_apis(self, system=None): """ List all APIs. """ if self.apicache and system in self.apicache: return self.apicache[system] spec = { 'type': 'service', 'urn' : { '$ne' : None } } if system: spec['system'] = system gen = (row['urn'] \ for row in self.col.find(spec, ['urn'], **PYMONGO_OPTS)) self.apicache[system] = gen2list(gen) return self.apicache[system] def api_info(self, srv, api_name): """ Return full API info record. """ return self.dasmapscache[(srv, api_name)] def relational_keys(self, system1, system2): """ Return a list of relational keys between provided systems """ for system, keys in self.daskeys().items(): if system == system1: keys1 = keys if system == system2: keys2 = keys return list( set(keys1) & set(keys2) ) def daskeys(self, das_system=None): """ Return a dict with all known DAS keys. 
""" if das_system in self.daskeyscache: return self.daskeyscache[das_system] spec = { 'type': 'service', 'system' : { '$ne' : None } } if das_system: spec = { 'system' : das_system } gen = (row['system'] \ for row in self.col.find(spec, ['system'], **PYMONGO_OPTS)) gen = [r for r in gen] kdict = {} for system in gen: spec = {'system':system, 'urn':{'$ne':None}} keys = [] for row in self.col.find(spec, **PYMONGO_OPTS): for entry in row['das_map']: if entry['das_key'] not in keys: keys.append(entry['das_key']) kdict[system] = keys # cache it self.daskeyscache[das_system] = kdict return kdict # ============ # Look-up APIs # ============ def api_lkeys(self, das_system, api): """ Return DAS lookup keys for given das system and api """ entry = self.dasmapscache[(das_system, api)] skeys = entry['lookup'].split(',') return skeys def primary_key(self, das_system, urn): """ Return DAS primary key for provided system and urn. The DAS primary key is a first entry in *lookup* attribute of DAS API record. """ spec = {'system':das_system, 'urn':urn} record = find_one(self.col, spec) if not record: return None pkey = record['lookup'] if pkey.find(',') != -1: pkey = pkey.split(',')[0] return pkey def primary_mapkey(self, das_system, urn): """ Return DAS primary map key for provided system and urn. For example, the file DAS key is mapped to file.name, so this API will return file.name """ spec = {'system':das_system, 'urn':urn} record = find_one(self.col, spec) mapkey = [] for row in record['das_map']: lkey = record['lookup'] if lkey.find(',') != -1: lkey = lkey.split(',')[0] if row['das_key'] == lkey: return row['rec_key'] return mapkey def find_daskey(self, das_system, map_key, value=None): """ Find das key for given system and map key. """ msg = 'system=%s\n' % das_system daskeys = [] for key, record in self.dasmapscache.items(): srv, _ = key if das_system != srv: continue for row in record['das_map']: das_key = row['das_key'] rec_key = row['rec_key'] if rec_key != map_key: continue pat = row.get('pattern', None) if value: if pat: if pat.match(str(value)): daskeys.append(das_key) else: msg += '-- reject key=%s, val=%s, pat=%s\n'\ % (map_key, value, pat.pattern) self.logger.debug(msg) else: daskeys.append(das_key) else: daskeys.append(das_key) return daskeys def find_mapkey(self, das_system, das_key, value=None): """ Find map key for given system and das key. """ msg = 'system=%s\n' % das_system for key, record in self.dasmapscache.items(): srv, _ = key if das_system != srv: continue for row in record['das_map']: if row['das_key'] != das_key: continue rec_key = row['rec_key'] pat = row.get('pattern', None) if value: if pat: if pat.match(str(value)): return rec_key else: msg += '-- reject key=%s, val=%s, pat=%s\n'\ % (das_key, value, pat.pattern) self.logger.debug(msg) continue else: return rec_key else: return rec_key def mapkeys(self, daskey): """ Find all lookup keys (primary keys) for a given daskey """ if daskey in self.keymap: return self.keymap[daskey] spec = {'das_map.das_key' : daskey} mapkeys = [] for row in self.col.find(spec, ['das_map'], **PYMONGO_OPTS): for kmap in row['das_map']: if kmap['das_key'] == daskey and \ kmap['rec_key'] not in mapkeys: mapkeys.append(kmap['rec_key']) self.keymap[daskey] = mapkeys return self.keymap[daskey] def find_apis(self, das_system, map_key): """ Find list of apis which correspond to provided system and das map key. 
""" spec = { 'system' : das_system, 'das_map.rec_key': map_key } apilist = [] for row in self.col.find(spec, ['urn'], **PYMONGO_OPTS): if 'urn' in row and row['urn'] not in apilist: apilist.append(row['urn']) return apilist def find_system(self, key): """ Return system name for provided DAS key. """ spec = { 'das_map.das_key' : key } gen = (row['system'] \ for row in self.col.find(spec, ['system'], **PYMONGO_OPTS)) systems = [] for system in gen: if system not in systems: systems.append(system) systems.sort() return systems def lookup_keys(self, system, api, daskey=None, value=None): """ Returns lookup keys for given system and provided selection DAS key, e.g. block => block.name """ entry = self.dasmapscache.get((system, api), None) if not entry: return [] lkeys = entry.get('lookup', []).split(',') rkeys = [] if daskey in lkeys: for dmap in entry['das_map']: rec_key = dmap['rec_key'] if daskey: if dmap['das_key'] == daskey: pat = dmap.get('pattern', None) if value: if pat.match(str(value)): rkeys.append(rec_key) else: if rec_key not in rkeys: rkeys.append(rec_key) else: rkeys.append(rec_key) return rkeys def api2das(self, system, api_input_name): """ Translates data-service API input parameter into DAS QL key, e.g. run_number => run. """ query = {'system':system, 'das_map.api_arg' : api_input_name} names = [] for adas in self.col.find(query, ['das_map'], **PYMONGO_OPTS): for row in adas['das_map']: try: if 'api_arg' in row: aparam = row['api_arg'] daskey = row['das_key'] if aparam == api_input_name and daskey not in names: names.append(daskey) except Exception as err: print("ERROR: look-up api_param/das_key in", row) raise err return names def check_api_match(self, system, api, icond): "Check if given API covers condition parameters" entry = self.dasmapscache.get((system, api), None) if not entry: return False ikeys = [k.split('.')[0] for k in icond.keys()] dkeys = [] for row in entry.get('das_map', []): if 'api_arg' in row: das_key = row['das_key'] dkeys.append(das_key) else: dkeys.append(row['das_key']) if set(ikeys) & set(dkeys) == set(ikeys): return True return False def das2api(self, system, api, rec_key, value=None): """ Translates DAS record key into data-service API input parameter, e.g. run.number => run_number """ entry = self.dasmapscache.get((system, api), None) names = [] if not entry: return [rec_key] for row in entry.get('das_map', []): if 'api_arg' in row: api_param = row['api_arg'] pat = row.get('pattern', None) if row['rec_key'] != rec_key: continue if value and pat: if isinstance(value, dict): if pat.match(json.dumps(value.values()[0])): if api_param not in names: names.append(api_param) if pat.match(str(value)): if api_param not in names: names.append(api_param) else: if api_param not in names: names.append(api_param) else: names.append(row['rec_key']) return names def notations(self, system=None): """ Return DAS notation map. """ notationmap = {} spec = {'type':'notation'} if system: spec['system'] = system for item in self.col.find(spec, **PYMONGO_OPTS): notationmap[item['system']] = item['notations'] return notationmap def notation2das(self, system, api_param, api=""): """ Translates data-service API parameter name into DAS name, e.g. run_number=run. In case when api_param is not presented in DB just return it back. 
""" if not self.notationcache: self.init_notationcache() name = api_param if (system, api_param) in self.notationcache: for item in self.notationcache[(system, api_param)]: _api, das_name = item if _api: if _api == api: name = das_name break else: # valid for all API names name = das_name return name def api2daskey(self, system, api): """ Returns list of DAS keys which cover provided data-service API """ spec = {'system':system, 'urn':api} keys = [] for row in self.col.find(spec, **PYMONGO_OPTS): for entry in row['das_map']: keys.append(entry['das_key']) return keys def servicemap(self, system): """ Constructs data-service map, e.g. .. doctest:: {api: {keys:[list of DAS keys], params: args, url:url, format:ext, expire:exp} } """ spec = {'system':system, 'urn':{'$ne':None}} smap = {} for row in self.col.find(spec, **PYMONGO_OPTS): url = row['url'] exp = row['expire'] ext = row['format'] api = row['urn'] lookup = row['lookup'] wild = row.get('wild_card', '*') ckey = row.get('ckey') cert = row.get('cert') services = row.get('services', '') keys = [] for entry in row['das_map']: keys.append(entry['das_key']) params = dict(row['params']) smap[api] = dict(keys=keys, params=params, url=url, expire=exp,\ format=ext, wild_card=wild, ckey=ckey, cert=cert,\ services=services, lookup=lookup) return smap def presentation(self, daskey): """ Return web UI presentation keys for provided DAS keyword. For example once asked for block we present block.name, block.size, etc. """ if daskey in self.presentationcache: return self.presentationcache[daskey] return [daskey] def daskey_from_presentation(self, uikey): """ Return triplet (DAS key, DAS access key, link) associated with provided UI key. """ if uikey in self.reverse_presentation: return self.reverse_presentation[uikey] def diff_keys(self, daskey): """ Return diff keys for provided DAS key. """ if daskey in self.diffkeycache: return self.diffkeycache[daskey] return [] def inputvalues_uris(self): """ Return the info on how to fetch the list of allowed input values for certain commonly used input fields (from enabled DAS systems only) """ uris = [] for row in self.col.find({'type': 'input_values'}, **PYMONGO_OPTS): # check that system is active if row['system'] not in self.services: continue uris.extend(row['input_values']) return uris
class DASMongocache(object): """ DAS cache based MongoDB. """ def __init__(self, config): self.config = config self.emptyset_expire = \ expire_timestamp(config['das'].get('emptyset_expire', 5)) self.dburi = config['mongodb']['dburi'] self.cache_size = config['mongodb']['bulkupdate_size'] self.dbname = config['dasdb']['dbname'] self.verbose = config['verbose'] self.logger = PrintManager('DASMongocache', self.verbose) self.mapping = config['dasmapping'] self.logging = config['dasdb'].get('logging', False) self.rec_ttl = config['dasdb'].get('record_ttl', 24 * 60 * 60) self.del_ttl = config['dasdb'].get('delta_ttl', 60) self.cleanup_del_ttl = config['dasdb'].get('cleanup_delta_ttl', 3600) self.retry = config['dasdb'].get('retry', 3) self.das_son_manipulator = DAS_SONManipulator() # Initialize MongoDB connection self.col_ = self.config['dasdb']['cachecollection'] self.mrcol_ = self.config['dasdb']['mrcollection'] self.merge_ = self.config['dasdb']['mergecollection'] self.gfs = db_gridfs(self.dburi) msg = "%s@%s" % (self.dburi, self.dbname) self.logger.info(msg) # ensure that we have the following indexes common_idx = [ ('file.name', DESCENDING), ('dataset.name', DESCENDING), ('block.name', DESCENDING), ('run.run_number', DESCENDING), ] index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING), ('das.system', ASCENDING), ('qhash', DESCENDING), ('das.record', ASCENDING)] create_indexes(self.col, index_list + common_idx) index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING), ('qhash', DESCENDING), ('das.record', ASCENDING), ('das.ts', ASCENDING)] create_indexes(self.merge, index_list) # NOTE: I found that creating index in merge collection leads to # MongoDB error when records contains multiple arrays on indexed # keys. For example, when we query file,run,lumi both file and run # are arrays in MongoDB. In this case the final sort in MongoDB # bark with the following message: # cannot sort with keys that are parallel arrays # it looks like that there is no fix for that yet # see # http://stackoverflow.com/questions/6516725/how-do-i-index-two-arrays-in-mongodb # therefore I temporary disabled create_indexes call on merge # collection which was used to have index to ease final sort, # especially in a case when a lot of records correspond to inital # query, e.g. file records. # On another hand, the most common use case where sort fails is # getting file records, and I can add one compound key to ease sort # but I can't add another compound key on array field, e.g. 
run common_idx = [[('qhash', DESCENDING), ('file.name', DESCENDING)]] create_indexes(self.merge, index_list + common_idx) # thread which clean-up DAS collections thname = 'mongocache_cleanup' cols = [ config['dasdb']['cachecollection'], config['dasdb']['mrcollection'], config['dasdb']['mergecollection'] ] @property def col(self): "col property provides access to DAS cache collection" conn = db_connection(self.dburi) mdb = conn[self.dbname] colnames = mdb.collection_names() if not colnames or self.col_ not in colnames: try: mdb.create_collection(self.col_) except OperationFailure: pass mdb.add_son_manipulator(self.das_son_manipulator) return mdb[self.col_] @property def merge(self): "merge property provides access to DAS merge collection" conn = db_connection(self.dburi) mdb = conn[self.dbname] colnames = mdb.collection_names() if not colnames or self.merge_ not in colnames: try: mdb.create_collection(self.merge_) except OperationFailure: pass mdb.add_son_manipulator(self.das_son_manipulator) return mdb[self.merge_] @property def mrcol(self): "mrcol property provides access to DAS map-reduce collection" conn = db_connection(self.dburi) mdb = conn[self.dbname] mdb.add_son_manipulator(self.das_son_manipulator) return mdb[self.mrcol_] def get_dataset_hashes(self, dasquery): "Get dataset hashes from DBS database" spec = dasquery.mongo_query.get('spec', {}) inst = dasquery.instance conn = db_connection(self.dburi) if spec and inst: dataset = spec.get('dataset.name', None) if dataset: if dataset.find('*') != -1: cond = {'dataset': re.compile(dataset.replace('*', '.*'))} else: cond = {'dataset': dataset} for row in conn['dbs'][inst].find(cond): if 'qhash' in row: yield row['qhash'] def check_datasets(self, dasquery): "Check dataset presence in DAS cache for given das query" hashes = [r for r in self.get_dataset_hashes(dasquery)] if hashes: spec = {'qhash': {'$in': hashes}} if len(hashes) == self.merge.find(spec, **PYMONGO_OPTS).count(): dasquery._hashes = hashes def get_superset_keys(self, key, value): """ This is a special-case version of similar_keys, intended for analysers that want to quickly find possible superset queries of a simple query of the form key=value. """ msg = "%s=%s" % (key, value) self.logger.debug(msg) cond = {'query.spec.key': key} for row in self.col.find(cond, **PYMONGO_OPTS): mongo_query = decode_mongo_query(row['query']) for thiskey, thisvalue in mongo_query.items(): if thiskey == key: if fnmatch.fnmatch(value, thisvalue): yield thisvalue def get_fields(self, dasquery): "Prepare fields to extract from MongoDB" fields = dasquery.mongo_query.get('fields', []) if fields and 'records' in fields: fields = None # look-up all records filters = dasquery.filters cond = {} if filters: new_fields = [] for dasfilter in filters: if dasfilter == 'unique': continue if fields and dasfilter not in fields and \ dasfilter not in new_fields: if dasfilter.find('=') == -1 and dasfilter.find('<') == -1\ and dasfilter.find('>') == -1: new_fields.append(dasfilter) else: cond = parse_filters(dasquery.mongo_query) if not new_fields and fields: new_fields = list(fields) return new_fields, cond return fields, cond def remove_expired(self, dasquery, collection): """ Remove expired records from DAS cache. We need to perform this operation very carefullly since we don't use transation and on-going commits can invoke this method (see das_core.py). 
Therefore we use MongoDB $or operator to wipe out queries which match DASQuery hash and already expired or queries which lived in cache more then rec_ttl config parameter. The later operation just prevent DAS cache from growing. """ conn = db_connection(self.dburi) mdb = conn[self.dbname] mdb.add_son_manipulator(self.das_son_manipulator) col = mdb[collection] # use additional delta to check data record expiration # we add this delta to ensure that there is no records close to # current timestamp which may expire during request processing spec = { 'qhash': dasquery.qhash, 'das.expire': { '$lt': time.time() + self.del_ttl } } col.delete_many(spec) def check_services(self, dasquery): """ Check if DAS cache contains DAS records with service response for given query. """ das_rec = self.find(dasquery) if not das_rec: return False if 'das' not in das_rec: return False if 'services' not in das_rec['das']: return False spec = { 'qhash': dasquery.qhash, 'das.system': { '$ne': 'das' }, 'das.expire': { '$gt': time.time() } } nres = self.col.find(spec, **PYMONGO_OPTS).count() if nres: return True return False def find(self, dasquery): """ Find provided query in DAS cache. """ cond = { 'qhash': dasquery.qhash, 'das.system': 'das', 'das.expire': { '$gt': time.time() } } return find_one(self.col, cond) def find_specs(self, dasquery, system='das'): """ Check if cache has query whose specs are identical to provided query. Return all matches. """ if dasquery.hashes: cond = {'qhash': {'$in': dasquery.hashes}} else: cond = {'qhash': dasquery.qhash} if system: cond.update({'das.system': system}) cond.update({'das.expire': {'$gt': time.time()}}) return self.col.find(cond, **PYMONGO_OPTS) def get_das_ids(self, dasquery): """ Return list of DAS ids associated with given query """ das_ids = [] try: das_ids = \ [r['_id'] for r in self.col.find_specs(dasquery, system='')] except: pass return das_ids def update_das_expire(self, dasquery, timestamp): "Update timestamp of all DAS data records for given query" nval = {'$set': {'das.expire': timestamp}} spec = {'qhash': dasquery.qhash} self.col.update_many(spec, nval) self.merge.update_many(spec, nval) def das_record(self, dasquery): "Retrieve DAS record for given query" cond = {'qhash': dasquery.qhash, 'das.expire': {'$gt': time.time()}} return find_one(self.col, cond) def find_records(self, das_id): " Return all the records matching a given das_id" return self.col.find({'das_id': das_id}, **PYMONGO_OPTS) def is_error_in_records(self, dasquery, collection='cache'): "Scan DAS cache for error records and return true or not" if collection == 'cache': results = self.col.find({'qhash': dasquery.qhash}, **PYMONGO_OPTS) else: results = self.merge.find({'qhash': dasquery.qhash}, **PYMONGO_OPTS) error = None reason = None for row in results: if 'error' in row: error = row.get('error') reason = row.get('reason', '') break return error, reason def add_to_record(self, dasquery, info, system=None): "Add to existing DAS record provided info" if system: self.col.update_one( { 'query': dasquery.storage_query, 'das.system': system }, {'$set': info}, upsert=True) else: self.col.update_one({'query': dasquery.storage_query}, {'$set': info}, upsert=True) def find_min_expire(self, dasquery): """Find minimal expire timestamp across all records for given DAS query""" spec = {'qhash': dasquery.qhash} min_expire = 2 * time.time() # upper bound, will update for rec in self.col.find(spec, **PYMONGO_OPTS): if 'das' in rec and 'expire' in rec['das']: estamp = rec['das']['expire'] if min_expire > 
estamp: min_expire = estamp return long(min_expire) def find_query_record(self, dasquery): "Find DAS query records and return them to the caller" spec = { 'qhash': dasquery.qhash, 'das.record': record_codes('query_record') } return self.col.find(spec, **PYMONGO_OPTS) def update_query_record(self, dasquery, status, header=None, reason=None): "Update DAS record for provided query" ctime = time.time() das_spec = {'qhash': dasquery.qhash, 'das.system': 'das'} min_expire = self.find_min_expire(dasquery) if header: system = header['das']['system'] sts = header['das']['status'] expire = header['das']['expire'] spec = {'qhash': dasquery.qhash, 'das.system': system} new_expire = None for rec in self.col.find(spec, **PYMONGO_OPTS): if 'das' in rec and 'expire' in rec['das']: if rec['das']['expire'] > expire: new_expire = expire ndict = {'das.expire': expire, 'das.status': status} cdict = {'das.ctime': ctime} udict = {'$set': ndict, '$push': cdict} oid = ObjectId(rec['_id']) self.col.update_one({'_id': oid}, udict) if new_expire: udict = { '$set': { 'das.expire': new_expire }, '$push': { 'das.ctime': ctime } } self.col.update_one(das_spec, udict) else: udict = { '$set': { 'das.status': status, 'das.expire': min_expire }, '$push': { 'das.ctime': ctime } } self.col.update_one(das_spec, udict) if reason: udict = {'$set': {'das.reason': reason}} self.col.update_one(das_spec, udict) # align all expire timestamps when we recieve ok status if status == 'ok': udict = {'$set': {'das.expire': min_expire}} self.col.update_one(das_spec, udict) def apilist(self, dasquery): "Return list of apis for given dasquery" spec = { 'qhash': dasquery.qhash, 'das.record': record_codes('query_record') } apis = [] for row in self.col.find(spec, ['das.api'], **PYMONGO_OPTS): try: apis += row['das']['api'] except Exception as _err: pass return apis def incache(self, dasquery, collection='merge', system=None, api=None, query_record=False): """ Check if we have query results in cache, otherwise return null. Please note, input parameter query means MongoDB query, please consult MongoDB API for more details, http://api.mongodb.org/python/ """ if query_record: record = record_codes('query_record') else: record = spec4data_records() spec = { 'qhash': dasquery.qhash, 'das.record': record, 'das.expire': { '$gt': time.time() } } if system: spec.update({'das.system': system}) if api: spec.update({'das.api': api}) conn = db_connection(self.dburi) mdb = conn[self.dbname] mdb.add_son_manipulator(self.das_son_manipulator) col = mdb[collection] res = col.find(spec, **PYMONGO_OPTS).count() msg = "(%s, coll=%s) found %s results" % (dasquery, collection, res) self.logger.info(msg) if res: return True return False def nresults(self, dasquery, collection='merge'): """Return number of results for given query.""" if dasquery.aggregators: return len(dasquery.aggregators) # Distinguish 2 use cases, unique filter and general query # in first one we should count only unique records, in later # we can rely on DB count() method. Pleas keep in mind that # usage of fields in find doesn't account for counting, since it # is a view over records found with spec, so we don't need to use it. 
fields, filter_cond = self.get_fields(dasquery) if not fields: spec = dasquery.mongo_query.get('spec', {}) elif dasquery.hashes: spec = { 'qhash': { '$in': dasquery.hashes }, 'das.record': spec4data_records() } else: spec = {'qhash': dasquery.qhash, 'das.record': spec4data_records()} if filter_cond: spec.update(filter_cond) conn = db_connection(self.dburi) mdb = conn[self.dbname] mdb.add_son_manipulator(self.das_son_manipulator) col = mdb[collection] if dasquery.unique_filter: skeys = self.mongo_sort_keys(collection, dasquery) if skeys: gen = col.find(spec, **PYMONGO_OPTS).sort(skeys) else: gen = col.find(spec, **PYMONGO_OPTS) res = len([r for r in unique_filter(gen)]) else: res = col.find(spec, **PYMONGO_OPTS).count() if not res: # double check that this is really the case time.sleep(1) res = col.find(spec, **PYMONGO_OPTS).count() msg = "%s" % res self.logger.info(msg) return res def mongo_sort_keys(self, collection, dasquery): """ Find list of sort keys for a given DAS query. Check existing indexes and either use fields or spec keys to find them out. Return list of mongo sort keys in a form of (key, order). """ # try to get sort keys all the time to get ordered list of # docs which allow unique_filter to apply afterwards fields = dasquery.mongo_query.get('fields') spec = dasquery.mongo_query.get('spec') skeys = dasquery.sortkeys mongo_skeys = [] if skeys: for key in skeys: if key.find('-') != -1: # reverse order, e.g. desc mongo_skeys.append((key.replace('-', ''), DESCENDING)) else: mongo_skeys.append((key, ASCENDING)) else: existing_idx = [i for i in self.existing_indexes(collection)] if fields: lkeys = [] for key in fields: for mkey in self.mapping.mapkeys(key): if mkey not in lkeys: lkeys.append(mkey) else: lkeys = list(spec.keys()) keys = [k for k in lkeys \ if k.find('das') == -1 and k.find('_id') == -1 and \ k in existing_idx] mongo_skeys = [(k, ASCENDING) for k in keys] return mongo_skeys def existing_indexes(self, collection='merge'): """ Get list of existing indexes in DB. They are returned by index_information API in the following for: .. doctest:: {u'_id_': {u'key': [(u'_id', 1)], u'v': 0}, u'das.expire_1': {u'key': [(u'das.expire', 1)], u'v': 0}, ... u'tier.name_-1': {u'key': [(u'tier.name', -1)], u'v': 0}} """ conn = db_connection(self.dburi) mdb = conn[self.dbname] mdb.add_son_manipulator(self.das_son_manipulator) col = mdb[collection] for val in col.index_information().values(): for idx in val['key']: yield idx[0] # index name def get_records(self, coll, spec, fields, skeys, idx, limit, unique=False): "Generator to get records from MongoDB." 
try: conn = db_connection(self.dburi) mdb = conn[self.dbname] mdb.add_son_manipulator(self.das_son_manipulator) col = mdb[coll] nres = col.find(spec, **PYMONGO_OPTS).count() if nres == 1 or nres <= limit: limit = 0 if limit: res = col.find(spec, fields, sort=skeys, skip=idx, limit=limit) else: res = col.find(spec, fields, sort=skeys, **PYMONGO_OPTS) if unique: res = unique_filter(res) for row in res: yield row except Exception as exp: print_exc(exp) row = {'exception': str(exp)} res = [] yield row def get_from_cache(self, dasquery, idx=0, limit=0, collection='merge'): "Generator which retrieves results from the cache" if dasquery.service_apis_map(): # valid DAS query result = self.get_das_records(dasquery, idx, limit, collection) for row in result: yield row else: # pure MongoDB query fields = dasquery.mongo_query.get('fields', []) if fields == None: fields = [] spec = dasquery.mongo_query.get('spec', {}) if dasquery.filters: if not fields: fields = [] fields += dasquery.filters pkeys = [k.split('.')[0] for k in fields] fields += das_record_keys() if 'records' in dasquery.query: fields = None # special case for DAS 'records' keyword skeys = self.mongo_sort_keys(collection, dasquery) result = self.get_records(collection, spec, fields, skeys, \ idx, limit, dasquery.unique_filter) for row in result: if dasquery.filters: if pkeys and set(pkeys) & set(row.keys()): yield row else: yield row def get_das_records(self, dasquery, idx=0, limit=0, collection='merge'): "Generator which retrieves DAS records from the cache" msg = "(%s, %s, %s, coll=%s)" % (dasquery, idx, limit, collection) self.logger.info(msg) idx = int(idx) fields, filter_cond = self.get_fields(dasquery) if fields == None: fields = [] if not fields: spec = dasquery.mongo_query.get('spec', {}) elif dasquery.hashes: spec = { 'qhash': { '$in': dasquery.hashes }, 'das.record': spec4data_records() } else: spec = {'qhash': dasquery.qhash, 'das.record': spec4data_records()} if filter_cond: spec.update(filter_cond) if 'records' in dasquery.query: fields = None # retrieve all fields for records DAS query else: # be sure to extract das internal keys fields += das_record_keys() # try to get sort keys all the time to get ordered list of # docs which allow unique_filter to apply afterwards skeys = self.mongo_sort_keys(collection, dasquery) res = self.get_records(collection, spec, fields, skeys, \ idx, limit, dasquery.unique_filter) counter = 0 for row in res: counter += 1 yield row msg = 'qhash %s, found %s record(s) in %s collection' \ % (dasquery.qhash, counter, collection) print(dastimestamp('DAS INFO '), msg) if counter: msg = "yield %s record(s)" % counter self.logger.info(msg) # if no raw records were yield we look-up possible error records # and reset timestamp for record with system:['das'] if not counter: spec = {'qhash': dasquery.qhash} nrec = self.col.find(spec, **PYMONGO_OPTS).count() if nrec: msg = "for query %s, found %s non-result record(s)" \ % (dasquery, nrec) print(dastimestamp('DAS WARNING'), msg) for rec in self.col.find(spec, **PYMONGO_OPTS): if 'query' in rec: print(dastimestamp('DAS das record'), rec) self.update_das_expire(dasquery, etstamp()) def map_reduce(self, mr_input, dasquery, collection='merge'): """ Wrapper around _map_reduce to allow sequential map/reduce operations, e.g. map/reduce out of map/reduce. mr_input is either alias name or list of alias names for map/reduce functions. Input dasquery which is applied to first iteration of map/reduce functions. """ # NOTE: I need to revisit mapreduce. 
spec = dasquery.mongo_query['spec'] if not isinstance(mr_input, list): mrlist = [mr_input] else: mrlist = mr_input conn = db_connection(self.dburi) mdb = conn[self.dbname] mdb.add_son_manipulator(self.das_son_manipulator) coll = mdb[collection] for mapreduce in mrlist: if mapreduce == mrlist[0]: cond = spec else: cond = None coll = self._map_reduce(coll, mapreduce, cond) for row in coll.find(): yield row def _map_reduce(self, coll, mapreduce, spec=None): """ Perform map/reduce operation over DAS cache using provided collection, mapreduce name and optional conditions. """ self.logger.debug("(%s, %s)" % (mapreduce, spec)) record = find_one(self.mrcol, {'name': mapreduce}) if not record: raise Exception("Map/reduce function '%s' not found" % mapreduce) fmap = record['map'] freduce = record['reduce'] if spec: result = coll.map_reduce(Code(fmap), Code(freduce), query=spec) else: result = coll.map_reduce(Code(fmap), Code(freduce)) msg = "found %s records in %s" % (result.count(), result.name) self.logger.info(msg) self.logger.debug(fmap) self.logger.debug(freduce) return result def get_map_reduce(self, name=None): """ Return definition of map/reduce functions for provided name or gives full list. """ spec = {} if name: spec = {'name': name} result = self.mrcol.find(spec, **PYMONGO_OPTS) for row in result: yield row def merge_records(self, dasquery, attempt=0): """ Merge DAS records for provided query. We perform the following steps: 1. get all queries from das.cache by ordering them by primary key 2. run aggregtor function to merge neighbors 3. insert records into das.merge """ ### TMP for asyncio # time.sleep(attempt+3) # pymongo 3.2 don't yet flush in time # remove any entries in merge collection for this query self.merge.delete_many({'qhash': dasquery.qhash}) # proceed self.logger.debug(dasquery) id_list = [] expire = 9999999999 # future # get all API records for given DAS query spec = { 'qhash': dasquery.qhash, 'das.expire': { '$gt': time.time() }, 'das.record': record_codes('query_record') } records = self.col.find(spec, **PYMONGO_OPTS) for row in records: # find smallest expire timestamp to be used by aggregator rexpire = row.get('das', {}).get('expire', expire) if rexpire < expire: expire = rexpire if row['_id'] not in id_list: id_list.append(row['_id']) inserted = 0 lookup_keys = set() fields = dasquery.mongo_query.get('fields') if not fields: # Mongo fields = [] for key in fields: for pkey in self.mapping.mapkeys(key): lookup_keys.add(pkey) for pkey in lookup_keys: skey = [(pkey, DESCENDING)] # lookup all service records spec = {'das_id': {'$in': id_list}, 'das.primary_key': pkey} if self.verbose: nrec = self.col.find(spec, **PYMONGO_OPTS).sort(skey).count() msg = "merging %s records, for %s key" % (nrec, pkey) else: msg = "merging records, for %s key" % pkey self.logger.debug(msg) # use exhaust=False since we process all records in aggregator # and it can be delay in processing records = self.col.find(spec, **PYMONGO_NOEXHAUST).sort(skey) # aggregate all records agen = aggregator(dasquery, records, expire) # diff aggregated records gen = das_diff(agen, self.mapping.diff_keys(pkey.split('.')[0])) # insert all records into das.merge using bulk insert size = self.cache_size try: res = self.merge.insert_many(gen) inserted += len(res.inserted_ids) except InvalidDocument as exp: print(dastimestamp('DAS WARNING'), 'InvalidDocument during merge', str(exp)) msg = "Caught bson error: " + str(exp) self.logger.info(msg) records = self.col.find(spec, **PYMONGO_OPTS).sort(skey) gen = 
aggregator(dasquery, records, expire) genrows = parse2gridfs(self.gfs, pkey, gen, self.logger) das_dict = { 'das': { 'expire': expire, 'das.record': record_codes('gridfs_record'), 'primary_key': [k for k in lookup_keys], 'system': ['gridfs'] }, 'qhash': dasquery.qhash, 'cache_id': [], 'das_id': id_list } for row in genrows: row.update(das_dict) self.merge.insert(row) except InvalidOperation as exp: pass except DuplicateKeyError as err: print(dastimestamp('DAS WARNING'), 'DuplicateKeyError during merge') if not isinstance(gen, list): raise err status = 'fail' if inserted: status = 'ok' elif not lookup_keys: # we get query w/o fields msg = 'qhash %s, no lookup_keys' % dasquery.qhash print(dastimestamp('DAS WARNING'), msg) status = 'ok' else: # we didn't merge anything, it is DB look-up failure msg = 'qhash %s, did not insert into das.merge, attempt %s' \ % (dasquery.qhash, attempt) print(dastimestamp('DAS WARNING'), msg) empty_expire = etstamp() lkeys = list(lookup_keys) das = dict(expire=empty_expire, primary_key=lkeys[0], condition_keys=lkeys, instance=dasquery.instance, system=['das'], services=dasquery.services, record=record_codes('empty_record'), ts=time.time(), api=[]) empty_record = { 'das': das, 'qhash': dasquery.qhash, 'cache_id': [], 'das_id': id_list } for key in lkeys: empty_record.update({key.split('.')[0]: []}) for key, val in dasquery.mongo_query['spec'].items(): if key.find('.') == -1: empty_record[key] = [] else: # it is compound key, e.g. site.name newkey, newval = convert_dot_notation(key, val) empty_record[newkey] = adjust_mongo_keyvalue(newval) self.merge.insert(empty_record) # update DAS records (both meta and data ones, by using qhash) nval = {'$set': {'das.expire': empty_expire}} spec = {'qhash': dasquery.qhash} self.col.update_many(spec, nval) return status def update_cache(self, dasquery, results, header, system, api): """ Insert results into cache. Use bulk insert controller by self.cache_size. Upon completion ensure indexies. """ # update results records in DAS cache gen = self.generate_records(dasquery, results, header) inserted = 0 # bulk insert try: res = self.col.insert_many(gen, ordered=False, bypass_document_validation=True) inserted += len(res.inserted_ids) except InvalidOperation: pass # update query record for this sub-system self.update_query_record_system(dasquery, system, api, 'ok') if dasquery.qcache: # custom DASQuery cache self.update_das_expire(dasquery, expire_timestamp(dasquery.qcache)) def update_query_record_system(self, dasquery, system, api, status): "Update system status of dasquery in das.cache collection" spec = { 'qhash': dasquery.qhash, 'das.system': system, 'das.api': api, 'das.record': record_codes('query_record') } udict = {'$set': {'das.status': status}} # print("### update_query_record", spec) doc = self.col.find_one_and_update( spec, udict, return_document=ReturnDocument.AFTER) # print(doc) def insert_query_record(self, dasquery, header): """ Insert query record into DAS cache. 
""" # check presence of API record in a cache dasheader = header['das'] system = dasheader['system'] api = dasheader['api'] collection = 'cache' check_query = True expire = dasheader.get('expire', None) if expire: dasheader['expire'] = adjust_expire(expire) if not self.incache(dasquery, collection, system, api, check_query): msg = "query=%s, header=%s" % (dasquery, header) self.logger.debug(msg) q_record = dict(das=dasheader, query=dasquery.storage_query) q_record['das']['record'] = record_codes('query_record') q_record['das']['status'] = "requested" q_record['qhash'] = dasquery.qhash q_record['das']['ctime'] = [time.time()] res = self.col.insert_one(q_record) if not res: msg = 'unable to insert query record' print(dastimestamp('DAS ERROR '), dasquery, msg, ', will retry') time.sleep(1) res = self.col.insert(q_record) if not res: print(dastimestamp('DAS ERROR '), dasquery, msg) def generate_records(self, dasquery, results, header): """ Iterate over provided results, update records and yield them to next level (update_cache) """ self.logger.debug("(%s) store to cache" % dasquery) if not results: return dasheader = header['das'] expire = adjust_expire(dasheader['expire']) system = dasheader['system'] # DAS service names, e.g. combined services = dasheader['services'] # CMS services used to get data api = dasheader['api'] prim_key = header.get('prim_key', None) if not prim_key: # get primary key from a list of lookup keys which has the # following structure [{'api':[keys]}, {...}] lup_keys = header['lookup_keys'] lkeys = [l for i in lup_keys for k in i.values() for l in k] prim_key = lkeys[0] if 'summary' not in lkeys else 'summary' cond_keys = list(dasquery.mongo_query['spec'].keys()) # get API record id spec = { 'qhash': dasquery.qhash, 'das.system': system, 'das.expire': { '$gt': time.time() }, 'das.record': record_codes('query_record') } counter = 0 rids = [str(r['_id']) for r in \ self.col.find(spec, ['_id'], **PYMONGO_OPTS)] if rids: if isinstance(results, list) or isinstance(results, GeneratorType): for item in results: counter += 1 if 'das' in item: expire = item.get('das').get('expire', expire) dasheader['expire'] = expire item['das'] = dict(expire=expire, primary_key=prim_key, condition_keys=cond_keys, instance=dasquery.instance, system=system, services=services, record=record_codes('data_record'), ts=time.time(), api=api) item['das_id'] = rids item['qhash'] = dasquery.qhash yield item else: print("\n\n ### results = ", str(results)) raise Exception( 'Provided results is not a list/generator type') if expire != dasheader['expire']: # update DAS records header['das']['expire'] = expire # update das record with new status status = 'Update DAS cache, %s API' % header['das']['api'][0] self.update_query_record(dasquery, status, header) msg = "\n%s yield %s rows" % (dasheader['system'], counter) self.logger.info(msg) def remove_from_cache(self, dasquery): """ Remove query from DAS cache. To do so, we retrieve API record and remove all data records from das.cache and das.merge """ records = self.col.find({'qhash': dasquery.qhash}, **PYMONGO_OPTS) id_list = [] for row in records: if row['_id'] not in id_list: id_list.append(row['_id']) spec = {'das_id': {'$in': id_list}} self.merge.remove(spec) self.merge.remove({'qhash': dasquery.qhash}) self.col.remove(spec) self.col.remove({'qhash': dasquery.qhash}) def clean_cache(self, collection=None): """ Clean expired docs in das.cache and das.merge. 
""" current_time = time.time() query = {'das.expire': {'$lt': current_time}} if not collection or collection == 'merge': self.merge.remove(query) if not collection or collection == 'cache': self.col.remove(query) def delete_cache(self): """ Delete all results in DAS cache/merge collection, including internal indexes. """ self.col.remove({}) try: self.col.drop_indexes() except: pass self.merge.remove({}) try: self.merge.drop_indexes() except: pass