def test_task_manager(self):
    """Test task manager"""
    expect = [idx for idx in range(self.size)]
    mypool = TaskManager()
    tasks = []
    for idx in expect:
        tasks.append(mypool.spawn(worker, idx, self.data))
    mypool.joinall(tasks)
    result = [idx for idx in self.data]
    self.assertEqual(result, expect)
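# --- test fixture sketch (not part of the original test module) ---
# The tests in this file rely on a module-level ``worker`` callable and on
# ``self.size``/``self.data`` fixtures defined elsewhere. A minimal sketch of
# what they could look like is given below; the use of multiprocessing.Array
# and the concrete setUp values are assumptions, not the original fixtures.
from multiprocessing import Array

def worker(idx, shared_data):
    "Toy worker: store its own index into the corresponding array slot"
    shared_data[idx] = idx

# and a setUp along the lines of:
#     self.size = 100
#     self.data = Array('i', self.size)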
class Maintainer(object):
    "Maintainer keeps data records in the DAS cache alive"
    def __init__(self, config):
        self.sleep = config.get('sleep', 5)
        pattern = {'das.system': 'dbs', 'das.primary_key': 'dataset.name'}
        self.pattern = config.get('query_pattern', pattern)
        nworkers = int(config.get('nworkers', 10))
        name = config.get('name', 'dataset_keeper')
        dasconfig = das_readconfig()
        debug = False
        self.dascore = DASCore(config=dasconfig, nores=True, debug=debug)
        self.taskmgr = TaskManager(nworkers=nworkers, name=name)
        self.conn = db_connection(dasconfig['mongodb']['dburi'])

    def check_records(self):
        "Check and return list of DAS records which require update"
        for row in self.conn['das']['merge'].find():
            if 'qhash' not in row:
                continue
            spec = {'qhash': row['qhash'], 'das.system': 'das'}
            for rec in self.conn['das']['cache'].find(spec):
                if 'query' in rec:
                    expire = rec['das']['expire']
                    if expire < time.time() or \
                        abs(expire-time.time()) < self.sleep:
                        yield DASQuery(rec['query']), expire

    def update(self):
        """
        Update DAS cache:

            - get list of expired or near-expire DAS records
            - store them into onhold set
            - loop over onhold set and invoke expired queries
            - sleep and repeat.
        """
        add_to_analytics = False
        onhold = {}
        while True:
            jobs = []
            for query, expire in self.check_records():
                if query not in onhold:
                    onhold[query] = expire
            for query, expire in onhold.items():
                if expire < time.time():
                    print "update %s at %s" % (query, time.time())
                    jobs.append(self.taskmgr.spawn(\
                        self.dascore.call, query, add_to_analytics))
                    del onhold[query]
            self.taskmgr.joinall(jobs)
            time.sleep(self.sleep)
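# --- usage sketch (not part of the original module) ---
# A minimal example of how the Maintainer above might be driven; the config
# keys mirror the defaults read in __init__ and the values are illustrative.
if __name__ == '__main__':
    config = {'sleep': 60, 'nworkers': 10, 'name': 'dataset_keeper'}
    keeper = Maintainer(config)
    keeper.update()  # loops forever: collect expired records, re-issue queries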
def test_priority_task_manager(self):
    """Test priority task manager"""
    data = [idx for idx in xrange(0, 100)]
    shared_data = Array('i', len(data))
    mypool = TaskManager(qtype='PriorityQueue')
    tasks = []
    for idx in data:
        if idx % 2:
            tasks.append(mypool.spawn(worker, idx, shared_data, uid=1))
        else:
            tasks.append(mypool.spawn(worker, idx, shared_data, uid=2))
    mypool.joinall(tasks)
    result = [idx for idx in shared_data]
    self.assertEqual(result, data)
def test_priority_task_manager(self):
    """Test priority task manager"""
    data = [idx for idx in range(0, 30)]
    shared_data = Array('i', len(data))
    mypool = TaskManager(qtype='PriorityQueue', qfreq=10)
    tasks = []
    for idx in data:
        if idx % 2:
            tasks.append(mypool.spawn(worker, idx, shared_data, uid=1))
        else:
            tasks.append(mypool.spawn(worker, idx, shared_data, uid=2))
    mypool.joinall(tasks)
    result = [idx for idx in shared_data]
    self.assertEqual(result, data)
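# --- uid sketch (not part of the original tests) ---
# Both priority tests tag every task with a ``uid``; presumably the
# PriorityQueue-backed TaskManager uses it (together with qfreq) to balance
# work between requestors. A hedged sketch of how a caller might derive such
# uids from an arbitrary requestor identity; the hashing scheme below is an
# assumption for illustration, not DAS code.
import hashlib

def make_uid(requestor):
    "Map a requestor identity (e.g. a user DN) onto a small integer uid"
    return int(hashlib.md5(requestor.encode('utf-8')).hexdigest(), 16) % 100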
class Populator(object):
    """
    This class populates the DAS cache with data.
    The run method accepts a list of DAS queries.
    """
    def __init__(self, config):
        nworkers = int(config.get('nworkers', 10))
        name = config.get('name', 'dataset_populator')
        dasconfig = das_readconfig()
        debug = False
        self.dascore = DASCore(config=dasconfig, nores=True, debug=debug)
        self.taskmgr = TaskManager(nworkers=nworkers, name=name)

    def run(self, queries):
        "Run task manager with given queries"
        jobs = []
        add_to_analytics = False
        for query in queries:
            jobs.append(self.taskmgr.spawn(\
                self.dascore.call, DASQuery(query), add_to_analytics))
        self.taskmgr.joinall(jobs)
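# --- usage sketch (not part of the original module) ---
# A minimal example of feeding the Populator above; the DAS query strings and
# the config values are illustrative only.
if __name__ == '__main__':
    queries = ['dataset=/ZMM*/*/*', 'file dataset=/a/b/c']
    Populator({'nworkers': 5, 'name': 'dataset_populator'}).run(queries)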
class DASCore(object): """ DAS core class. """ def __init__(self, config=None, debug=0, nores=False, logger=None, engine=None, multitask=True): if config: dasconfig = config else: dasconfig = das_readconfig() verbose = dasconfig['verbose'] self.stdout = debug if isinstance(debug, int): self.verbose = debug dasconfig['verbose'] = debug else: self.verbose = verbose das_timer('DASCore::init', self.verbose) self.operators = das_operators() # set noresults option self.noresults = False if nores: dasconfig['write_cache'] = True self.noresults = nores self.multitask = dasconfig['das'].get('multitask', True) if debug or self.verbose: self.multitask = False # in verbose mode do not use multitask dasconfig['das']['multitask'] = False if not multitask: # explicitly call DASCore ctor, e.g. in analytics self.multitask = False dasconfig['das']['multitask'] = False dasconfig['engine'] = engine if self.multitask: nworkers = dasconfig['das'].get('core_workers', 5) if engine: thr_name = 'DASCore:PluginTaskManager' self.taskmgr = PluginTaskManager(\ engine, nworkers=nworkers, name=thr_name) self.taskmgr.subscribe() else: thr_name = 'DASCore:TaskManager' self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name) else: self.taskmgr = None if logger: self.logger = logger else: self.logger = PrintManager('DASCore', self.verbose) # define Mapping/Analytics/Parser in this order since Parser depends # on first two dasmapping = DASMapping(dasconfig) dasconfig['dasmapping'] = dasmapping self.mapping = dasmapping self.analytics = DASAnalytics(dasconfig) dasconfig['dasanalytics'] = self.analytics self.keylearning = DASKeyLearning(dasconfig) dasconfig['keylearning'] = self.keylearning # init DAS cache self.rawcache = DASMongocache(dasconfig) dasconfig['rawcache'] = self.rawcache # plug-in architecture: loop over registered data-services in # dasconfig; load appropriate module/class; register data # service with DASCore. 
self.systems = dasmapping.list_systems() # pointer to the DAS top level directory dasroot = '/'.join(__file__.split('/')[:-3]) for name in self.systems: try: klass = 'DAS/services/%s/%s_service.py' \ % (name, name) srvfile = os.path.join(dasroot, klass) with file(srvfile) as srvclass: for line in srvclass: if line.find('(DASAbstractService)') != -1: klass = line.split('(DASAbstractService)')[0] klass = klass.split('class ')[-1] break mname = 'DAS.services.%s.%s_service' % (name, name) module = __import__(mname, fromlist=[klass]) obj = getattr(module, klass)(dasconfig) setattr(self, name, obj) SERVICES[name] = obj except IOError as err: if debug > 1: # we have virtual services, so IOError can be correct print_exc(err) try: mname = 'DAS.services.generic_service' module = __import__(mname, fromlist=['GenericService']) obj = module.GenericService(name, dasconfig) setattr(self, name, obj) except Exception as exc: print_exc(exc) msg = "Unable to load %s data-service plugin" % name raise Exception(msg) except Exception as exc: print_exc(exc) msg = "Unable to load %s data-service plugin" % name raise Exception(msg) # loop over systems and get system keys, add mapping keys to final list self.service_keys = {} self.service_parameters = {} for name in self.systems: skeys = getattr(self, name).keys() self.service_keys[getattr(self, name).name] = skeys sparams = getattr(self, name).parameters() self.service_parameters[getattr(self, name).name] = sparams self.service_keys['special'] = das_special_keys() self.dasconfig = dasconfig das_timer('DASCore::init', self.verbose) def keys(self): """ Return map of data service keys """ return self.service_keys def das_keys(self): """ Return map of data service keys """ _keys = ['records'] for values in self.service_keys.values(): for key in values: if key not in _keys: _keys.append(key) return _keys def result(self, query, idx=0, limit=None): """ Get results either from cache or from explicit call """ self.logger.info('input query=%s' % query) results = [] dasquery = DASQuery(query) dasquery.add_to_analytics() query = dasquery.mongo_query # check if we have any service which cover the query # otherwise decompose it into list of queries service_map = dasquery.service_apis_map() if not service_map: msg = 'no APIs found to answer input query, will decompose it' self.logger.info(msg) skeys = query['fields'] if not skeys: skeys = [] for key in skeys: newquery = DASQuery(dict(fields=[key], spec=query['spec'])) self.call(newquery) # process query else: self.call(dasquery) # process query # lookup provided query in a cache if not self.noresults: results = self.get_from_cache(dasquery, idx, limit) return results def remove_from_cache(self, dasquery): """ Delete in cache entries about input query """ self.rawcache.remove_from_cache(dasquery) def get_status(self, dasquery): """ Look-up status of provided query in a cache. Return status of the query request and its hash. 
""" status = None error = None reason = None for col in ['merge', 'cache']: self.rawcache.remove_expired(dasquery, col) if dasquery and 'fields' in dasquery.mongo_query: fields = dasquery.mongo_query['fields'] if fields and isinstance(fields, list) and 'queries' in fields: return 'ok', error, reason record = self.rawcache.find(dasquery) error, reason = self.rawcache.is_error_in_records(dasquery) try: if record and 'das' in record and 'status' in record['das']: status = record['das']['status'] if not error: error = record['das'].get('error', error) if not reason: reason = record['das'].get('reason', reason) return status, error, reason except Exception as exc: print_exc(exc) status = error = reason = None self.rawcache.remove_from_cache(dasquery) return status, error, reason def worker(self, srv, dasquery): """Main worker function which calls data-srv call function""" self.logger.info('##### %s ######\n' % srv) das_timer(srv, self.verbose) getattr(getattr(self, srv), 'call')(dasquery) das_timer(srv, self.verbose) def insert_query_records(self, dasquery): """ Insert DAS query records into DAS cache and return list of services which will answer this query """ services = dasquery.services self.logger.info('Potential services = %s' % services) if not services: msg = 'No data-services for query %s' % dasquery msg += 'mongo_query: %s' % dasquery.mongo_query msg += 'params: %s' % dasquery.params() print dastimestamp('DAS WARNING '), msg # get list of URI which can answer this query ack_services = [] for srv in services: gen = [t for t in getattr(getattr(self, srv), 'apimap')(dasquery)] for url, api, args, iformat, expire in gen: header = dasheader(srv, dasquery, expire, api, url, ctime=0) self.rawcache.insert_query_record(dasquery, header) if srv not in ack_services: ack_services.append(srv) if not ack_services: ack_services = services if dasquery.query.find('records ') != -1: srv_status = True # skip DAS queries w/ records request expire = 2*60 # 2 minutes, it should be overwriten by data-srv header = dasheader("das", dasquery, expire, api='das_core', services=dict(das=ack_services)) header['lookup_keys'] = [] self.rawcache.insert_query_record(dasquery, header) das_timer('das_record', self.verbose) return ack_services def call(self, query, add_to_analytics=True, **kwds): """ Top level DAS api which execute a given query using underlying data-services. It follows the following steps: - parse input query - identify data-sercices based on selection keys and where clause conditions - construct DAS workflow and execute data-service API calls. At this step individual data-services store results into DAS cache. Return status 0/1 depending on success of the calls, can be used by workers on cache server. kwds is provided for compatibility with web layer, e.g. it may invoke this method with additional pid parameter. """ def update_das_query(dasquery, status, reason=None): "Update DAS query record with given status and reason" self.rawcache.update_query_record(dasquery, status, reason=reason) self.rawcache.add_to_record(\ dasquery, {'das.timer': get_das_timer()}, system='das') # make sure that das record is updated, we use 7 iteration which # sum up into 1 minute to cover default syncdelay value of mongo # server (in a future it would be better to find programatically # this syncdelay value, but it seems pymongo driver does not # provide any API for it. 
for idx in xrange(1, 7): spec = {'qhash':dasquery.qhash, 'das.system':['das']} res = self.rawcache.col.find_one(spec) if res: dbstatus = res.get('das', {}).get('status', None) if dbstatus == status: break msg = 'qhash %s, das.status=%s, status=%s, wait for update' \ % (dasquery.qhash, dbstatus, status) print dastimestamp('DAS WARNING'), msg time.sleep(idx*idx) self.rawcache.update_query_record(dasquery, status, reason=reason) self.logger.info('input query=%s' % query) das_timer('DASCore::call', self.verbose) if isinstance(query, object) and hasattr(query, '__class__')\ and query.__class__.__name__ == 'DASQuery': dasquery = query else: dasquery = DASQuery(query) for col in ['merge', 'cache']: self.rawcache.remove_expired(dasquery, col) if add_to_analytics: dasquery.add_to_analytics() query = dasquery.mongo_query spec = query.get('spec') fields = query.get('fields') if fields == ['records']: msg = 'look-up all records in cache' self.logger.info(msg) return 'in cache' if spec == dict(records='*'): self.logger.info("look-up everything in cache") return 'in cache' for record in self.rawcache.find_specs(dasquery): status = record['das']['status'] msg = 'found query %s in cache, status=%s\n' \ % (record['query'], status) self.logger.info(msg) print dastimestamp('DAS INFO'), msg return status self.logger.info(dasquery) das_timer('das_record', self.verbose) services = self.insert_query_records(dasquery) if not services: msg = 'unable to locate data-services to fulfill this request' msg += ', will iterate over all registered services' print dastimestamp('DAS WARNING '), dasquery, msg services = dasquery.services if dasquery.services else self.systems try: if self.multitask: jobs = [] for srv in sorted(services): jobs.append(self.taskmgr.spawn(self.worker, srv, dasquery)) self.taskmgr.joinall(jobs) else: for srv in services: self.worker(srv, dasquery) except Exception as exc: print_exc(exc) return 'fail' self.logger.info('\n##### merging ######\n') update_das_query(dasquery, 'merging') das_timer('merge', self.verbose) self.rawcache.merge_records(dasquery) das_timer('merge', self.verbose) # check if we have service records and properly setup status self.logger.info('\n##### check services ######\n') das_services = self.rawcache.check_services(dasquery) reason = '' status = 'ok' if not das_services: if 'records' in dasquery.query: status = 'ok' # keep status ok for 'records' queries else: reason = 'no data records found in DAS cache' status = 'fail' print dastimestamp('DAS ERROR '), dasquery, reason update_das_query(dasquery, status, reason) das_timer('DASCore::call', self.verbose) return status def processing_time(self, dasquery): "Look-up and return DAS query processing time" query_record = self.rawcache.find(dasquery) if query_record: das = query_record.get('das', None) if isinstance(das, dict): ctime = das.get('ctime', []) if ctime: return ctime[-1]-ctime[0] return None def nresults(self, dasquery, coll='merge'): """ Return total number of results (count) for provided query Code should match body of get_from_cache method. 
""" fields = dasquery.mongo_query.get('fields', None) if dasquery.mapreduce: result = self.rawcache.map_reduce(dasquery.mapreduce, dasquery) return len([1 for _ in result]) elif dasquery.aggregators: return len(dasquery.aggregators) elif isinstance(fields, list) and 'queries' in fields: return len([1 for _ in self.get_queries(dasquery)]) return self.rawcache.nresults(dasquery, coll) def apilist(self, dasquery): "Return list of APIs answer given das query" return self.rawcache.apilist(dasquery) def incache(self, dasquery, coll='merge'): """ Answer the question if given query in DAS cache or not """ return self.rawcache.incache(dasquery, collection=coll) def get_from_cache(self, dasquery, idx=0, limit=0, collection='merge'): """ Look-up results from the merge cache and yield them for further processing. """ das_timer('DASCore::get_from_cache', self.verbose) msg = 'col=%s, query=%s, idx=%s, limit=%s'\ % (collection, dasquery, idx, limit) self.logger.info(msg) fields = dasquery.mongo_query.get('fields', None) if dasquery.mapreduce: res = self.rawcache.map_reduce(dasquery.mapreduce, dasquery) elif dasquery.aggregators: # extract das information from rawcache rows = self.rawcache.get_from_cache(\ dasquery, collection=collection) first = rows.next() sinfo = das_sinfo(first) # to perform aggregation we need: # - loop over all aggregator functions # - loop over all data-services # - loop over all APIs within a data-services # the code below does that, it applies aggregator # to selected (based on key/srv/api) records res = [] _id = 0 time0 = time.time() expire = 300 # min expire for func, key in dasquery.aggregators: afunc = getattr(das_aggregator, 'das_%s' % func) found = False for srv, apis, in sinfo.items(): for api in apis: rows = self.rawcache.get_from_cache(\ dasquery, collection=collection) gen = api_rows(rows, api) data = afunc(key, gen) ctime = time.time() - time0 das = dasheader(srv, dasquery, expire, api=api, ctime=ctime) if isinstance(data, dict) and data['value'] != 'N/A': aggr = {'_id':_id, 'function': func, 'key': key, 'result': data} aggr.update(das) res.append(aggr) _id += 1 found = True if not found: # when we got nothing add empty result record empty = {'value':'N/A'} ctime = time.time() - time0 das = dasheader('das', dasquery, expire, api='das_core', ctime=ctime) rec = {'_id':0, 'function':func, 'key':key, 'result':empty} rec.update(das) res.append(rec) elif isinstance(fields, list) and 'queries' in fields: res = itertools.islice(self.get_queries(dasquery), idx, idx+limit) else: res = self.rawcache.get_from_cache(dasquery, idx, limit, \ collection=collection) for row in res: fix_times(row) yield row das_timer('DASCore::get_from_cache', self.verbose) def get_queries(self, dasquery): """ Look-up (popular) queries in DAS analytics/logging db """ das_timer('DASCore::get_queries', self.verbose) fields = dasquery.mongo_query.get('fields') spec = dasquery.mongo_query.get('spec') if 'popular' in fields: res = self.analytics.get_popular_queries(spec) else: datestamp = spec.get('date') if isinstance(datestamp, dict): value = datestamp.get('$in') res = \ self.analytics.list_queries(after=value[0], before=value[1]) elif isinstance(datestamp, int): res = self.analytics.list_queries(after=datestamp) elif not datestamp: res = self.analytics.list_queries() else: msg = 'Unsupported date value: %s' % datestamp raise Exception(msg) for row in res: rid = row.pop('_id') yield dict(das_query=row, _id=rid) das_timer('DASCore::get_queries', self.verbose)
class DASAbstractService(object): """ Abstract class describing DAS service. It initialized with a name which is used to identify service parameters from DAS configuration file. Those parameters are keys, verbosity level, URL of the data-service. """ def __init__(self, name, config): self.name = name try: self.verbose = config['verbose'] title = 'DASAbstactService_%s' % self.name self.logger = PrintManager(title, self.verbose) self.dasmapping = config['dasmapping'] self.write2cache = config.get('write_cache', True) self.multitask = config['das'].get('multitask', True) self.error_expire = config['das'].get('error_expire', 300) self.dbs_global = None # to be configured at run time self.dburi = config['mongodb']['dburi'] engine = config.get('engine', None) self.gfs = db_gridfs(self.dburi) except Exception as exc: print_exc(exc) raise Exception('fail to parse DAS config') # read key/cert info try: self.ckey, self.cert = get_key_cert() except Exception as exc: print_exc(exc) self.ckey = None self.cert = None if self.multitask: nworkers = config['das'].get('api_workers', 3) thr_weights = config['das'].get('thread_weights', []) for system_weight in thr_weights: system, weight = system_weight.split(':') if system == self.name: nworkers *= int(weight) if engine: thr_name = 'DASAbstractService:%s:PluginTaskManager' % self.name self.taskmgr = PluginTaskManager(\ engine, nworkers=nworkers, name=thr_name) self.taskmgr.subscribe() else: thr_name = 'DASAbstractService:%s:TaskManager' % self.name self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name) else: self.taskmgr = None self.map = {} # to be defined by data-service implementation self._keys = None # to be defined at run-time in self.keys self._params = None # to be defined at run-time in self.parameters self._notations = {} # to be defined at run-time in self.notations self.logger.info('initialized') # define internal cache manager to put 'raw' results into cache if 'rawcache' in config and config['rawcache']: self.localcache = config['rawcache'] else: msg = 'Undefined rawcache, please check your configuration' raise Exception(msg) def services(self): """ Return sub-subsystems used to retrieve data records. It is used in dasheader call to setup das.services field. This method can be overwritten in sub-classes, otherwise returns dict of service name and CMS systems used to retrieve data records. """ return {self.name:[self.name]} def version(self): """Return data-services version, should be implemented in sub-classes""" return '' def keys(self): """ Return service keys """ if self._keys: return self._keys srv_keys = [] for _api, params in self.map.items(): for key in params['keys']: if not key in srv_keys: srv_keys.append(key) self._keys = srv_keys return srv_keys def parameters(self): """ Return mapped service parameters """ if self._params: return self._params srv_params = [] for _api, params in self.map.items(): for key in params['params']: param_list = self.dasmapping.api2das(self.name, key) for par in param_list: if not par in srv_params: srv_params.append(par) self._params = srv_params return srv_params def notations(self): """ Return a map of system notations. 
""" if self._notations: return self._notations for _, rows in self.dasmapping.notations(self.name).items(): for row in rows: api = row['api'] nmap = row['rec_key'] notation = row['api_output'] if api in self._notations: self._notations[api].update({notation:nmap}) else: self._notations[api] = {notation:nmap} return self._notations def getdata(self, url, params, expire, headers=None, post=None): """URL call wrapper""" if url.find('https:') != -1: return getdata(url, params, headers, expire, post, self.error_expire, self.verbose, self.ckey, self.cert, system=self.name) else: return getdata(url, params, headers, expire, post, self.error_expire, self.verbose, system=self.name) def call(self, dasquery): """ Invoke service API to execute given query. Return results as a collect list set. """ self.logger.info(dasquery) # check the cache for records with given query/system res = self.localcache.incache(dasquery, collection='cache', system=self.name) if res: msg = "found records in local cache" self.logger.info(msg) return # ask data-service api to get results, they'll be store them in # cache, so return at the end what we have in cache. self.api(dasquery) def write_to_cache(self, dasquery, expire, url, api, args, gen, ctime): """ Write provided result set into DAS cache. """ if not self.write2cache: return # before going to cache we should check/set possible misses, e.g. # primary key when error is thrown result = self.set_misses(dasquery, api, gen) # update the cache header = dasheader(self.name, dasquery, expire, api, url, services=self.services()) header['lookup_keys'] = self.lookup_keys(api) header['prim_key'] = self.dasmapping.primary_mapkey(self.name, api) header['ctime'] = ctime self.localcache.update_cache(dasquery, result, header) msg = 'cache has been updated,\n' self.logger.debug(msg) def adjust_params(self, api, kwds, instance=None): """ Data-service specific parser to adjust parameters according to its specifications. For example, DQ service accepts a string of parameters, rather parameter set, while DBS2 can reuse some parameters for different API, e.g. I can use dataset path to pass to listPrimaryDatasets as primary_dataset pattern. """ pass def lookup_keys(self, api): """ Return look-up keys of data output for given data-service API. """ lkeys = self.dasmapping.lookup_keys(self.name, api) return [{api:lkeys}] def inspect_params(self, api, args): """ Perform API parameter inspection. Check if API accept a range of parameters, etc. """ for key, value in args.items(): if isinstance(value, dict): minval = None maxval = None for oper, val in value.items(): if oper == '$in': minval = int(val[0]) maxval = int(val[-1]) args[key] = range(minval, maxval) elif oper == '$lt': maxval = int(val) args[key] = maxval elif oper == '$lte': maxval = int(val) args[key] = maxval elif oper == '$gt': minval = int(val) args[key] = minval elif oper == '$gte': minval = int(val) args[key] = minval else: msg = '%s does not support operator %s' % (api, oper) raise Exception(msg) return args def get_notations(self, api): """Return notations used for given API""" notationmap = self.notations() if not notationmap: return {} notations = {} if '' in notationmap: notations = dict(notationmap['']) # notations applied to all APIs if api in notationmap: # overwrite the one for provided API notations.update(notationmap[api]) return notations def parser(self, dasquery, dformat, data, api): """ DAS data parser. Input parameters: - *query* input DAS query - *dformat* is a data format, e.g. 
XML, JSON - *data* is a data source, either file-like object or actual data - *api* is API name """ prim_key = self.dasmapping.primary_key(self.name, api) counter = 0 if dformat.lower() == 'xml': tags = self.dasmapping.api2daskey(self.name, api) gen = xml_parser(data, prim_key, tags) for row in gen: counter += 1 yield row elif dformat.lower() == 'json' or dformat.lower() == 'dasjson': gen = json_parser(data, self.logger) das_dict = {} for row in gen: if dformat.lower() == 'dasjson': for key, val in row.items(): if key != 'results': das_dict[key] = val row = row['results'] if isinstance(row, list): for item in row: if item: if prim_key in item: counter += 1 yield item else: counter += 1 yield {prim_key:item} else: if prim_key in row: counter += 1 yield row else: counter += 1 yield {prim_key:row} else: msg = 'Unsupported data format="%s", API="%s"' % (dformat, api) raise Exception(msg) msg = "api=%s, format=%s " % (api, dformat) msg += "prim_key=%s yield %s rows" % (prim_key, counter) self.logger.info(msg) def translator(self, api, genrows): """ Convert raw results into DAS records. """ prim_key = self.dasmapping.primary_key(self.name, api) count = 0 for row in genrows: row2das(self.dasmapping.notation2das, self.name, api, row) count += 1 # check for primary key existance, since it can be overriden # by row2das. For example DBS3 uses flat namespace, so we # override dataset=>name, while dataset still is a primary key if isinstance(row, list): yield {prim_key:row} elif prim_key in row: if prim_key in row[prim_key]: yield row[prim_key] # remapping may create nested dict else: yield row else: yield {prim_key:row} msg = "yield %s rows" % count self.logger.debug(msg) def set_misses(self, dasquery, api, genrows): """ Check and adjust DAS records wrt input query. If some of the DAS keys are missing, add it with its value to the DAS record. 
""" # look-up primary key prim_key = self.dasmapping.primary_key(self.name, api) # Scan all docs and store those whose size above MongoDB limit into # GridFS map_key = self.dasmapping.primary_mapkey(self.name, api) genrows = parse2gridfs(self.gfs, map_key, genrows, self.logger) spec = dasquery.mongo_query['spec'] row = next(genrows) ddict = DotDict(row) keys2adjust = [] for key in spec.keys(): val = ddict.get(key) if spec[key] != val and key not in keys2adjust: keys2adjust.append(key) msg = "adjust keys %s" % keys2adjust self.logger.debug(msg) count = 0 if keys2adjust: # adjust of the rows for row in yield_rows(row, genrows): ddict = DotDict(row) pval = ddict.get(map_key) if isinstance(pval, dict) and 'error' in pval: ddict[map_key] = '' ddict.update({prim_key: pval}) for key in keys2adjust: value = spec[key] existing_value = ddict.get(key) # the way to deal with proximity/patern/condition results if (isinstance(value, str) or isinstance(value, unicode))\ and value.find('*') != -1: # we got pattern if existing_value: value = existing_value elif isinstance(value, dict) or \ isinstance(value, list): # we got condition if existing_value: value = existing_value elif isinstance(value, dict) and \ '$in' in value: # we got a range {'$in': []} value = value['$in'] elif isinstance(value, dict) and \ '$lte' in value and '$gte' in value: # we got a between range value = [value['$gte'], value['$lte']] else: value = json.dumps(value) elif existing_value and value != existing_value: # we got proximity results if 'proximity' in ddict: proximity = DotDict({key:existing_value}) ddict['proximity'].update(proximity) else: proximity = DotDict({}) proximity[key] = existing_value ddict['proximity'] = proximity else: if existing_value: value = existing_value ddict[key] = value yield ddict count += 1 else: yield row for row in genrows: yield row count += 1 msg = "yield %s rows" % count self.logger.debug(msg) def api(self, dasquery): """ Data service api method, can be defined by data-service class. It parse input query and invoke appropriate data-service API call. All results are stored into the DAS cache along with api call inserted into Analytics DB. """ self.logger.info(dasquery) genrows = self.apimap(dasquery) if not genrows: return jobs = [] for url, api, args, dformat, expire in genrows: # insert DAS query record for given API header = dasheader(self.name, dasquery, expire, api, url) self.localcache.insert_query_record(dasquery, header) # fetch DAS data records if self.multitask: jobs.append(self.taskmgr.spawn(self.apicall, \ dasquery, url, api, args, dformat, expire)) else: self.apicall(dasquery, url, api, args, dformat, expire) if self.multitask: self.taskmgr.joinall(jobs) def apicall(self, dasquery, url, api, args, dformat, expire): """ Data service api method, can be defined by data-service class. It parse input query and invoke appropriate data-service API call. All results are stored into the DAS cache along with api call inserted into Analytics DB. We invoke explicitly close call for our datastream instead of using context manager since this method as well as getdata/parser can be overwritten by child classes. 
""" datastream = None try: args = self.inspect_params(api, args) time0 = time.time() headers = make_headers(dformat) datastream, expire = self.getdata(url, args, expire, headers) self.logger.info("%s expire %s" % (api, expire)) rawrows = self.parser(dasquery, dformat, datastream, api) dasrows = self.translator(api, rawrows) ctime = time.time() - time0 self.write_to_cache(dasquery, expire, url, api, args, dasrows, ctime) except Exception as exc: msg = 'Fail to process: url=%s, api=%s, args=%s' \ % (url, api, args) print(msg) print_exc(exc) close(datastream) def url_instance(self, url, _instance): """ Virtual method to adjust URL for a given instance, must be implemented in service classes """ return url def adjust_url(self, url, instance): """ Adjust data-service URL wrt provided instance, e.g. DBS carry several instances """ if instance: url = self.url_instance(url, instance) return url def apimap(self, dasquery): """ Analyze input query and yield url, api, args, format, expire for further processing. """ srv = self.name # get local copy to avoid threading issues cond = getarg(dasquery.mongo_query, 'spec', {}) instance = dasquery.mongo_query.get('instance', self.dbs_global) skeys = getarg(dasquery.mongo_query, 'fields', []) if not skeys: skeys = [] self.logger.info("\n") for api, value in self.map.items(): expire = value['expire'] iformat = value['format'] url = self.adjust_url(value['url'], instance) if not url: msg = '--- rejects API %s, no URL' % api self.logger.info(msg) continue args = dict(value['params']) # make new copy, since we'll adjust wild = value.get('wild_card', '*') found = 0 # check if input parameters are covered by API if not self.dasmapping.check_api_match(srv, api, cond): msg = '--- rejects API %s, does not cover input condition keys' \ % api self.logger.info(msg) continue # once we now that API covers input set of parameters we check # every input parameter for pattern matching for key, val in cond.items(): # check if keys from conditions are accepted by API # need to convert key (which is daskeys.map) into # input api parameter for apiparam in self.dasmapping.das2api(srv, api, key, val): if apiparam in args: args[apiparam] = val found += 1 # VK 20160708, wrong statement, it caused to pass # datasets API for query dataset in [path1, path2] # I'll leave block here until I test and verify that # commented out block will not cause other issues # # check the case when we only have single condition key # and it is the key we look-up # if not found and skeys == [k.split('.')[0] for k in cond.keys()]: # found = 1 # check if number of keys on cond and args are the same if len(cond.keys()) != found: msg = "--- reject API %s, not all condition keys are covered" \ % api self.logger.info(msg) msg = 'args=%s' % args self.logger.debug(msg) continue if not found: msg = "--- rejects API %s, parameters don't match" % api self.logger.info(msg) msg = 'args=%s' % args self.logger.debug(msg) continue self.adjust_params(api, args, instance) # delete args keys whose value is optional delete_keys(args, 'optional') # check that there is no "required" parameter left in args, # since such api will not work if 'required' in args.values(): msg = '--- rejects API %s, parameter is required' % api self.logger.info(msg) msg = 'args=%s' % args self.logger.debug(msg) continue # adjust pattern symbols in arguments if wild != '*': for key, val in args.items(): if isinstance(val, str) or isinstance(val, unicode): val = val.replace('*', wild) args[key] = val # compare query selection keys with API look-up 
keys api_lkeys = self.dasmapping.api_lkeys(srv, api) if set(api_lkeys) != set(skeys): msg = "--- rejects API %s, api_lkeys(%s)!=skeys(%s)"\ % (api, api_lkeys, skeys) self.logger.info(msg) continue msg = '+++ %s passes API %s' % (srv, api) self.logger.info(msg) msg = 'args=%s' % args self.logger.debug(msg) msg = "yield " msg += "system ***%s***, url=%s, api=%s, args=%s, format=%s, " \ % (srv, url, api, args, iformat) msg += "expire=%s, wild_card=%s" \ % (expire, wild) self.logger.debug(msg) yield url, api, args, iformat, expire
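# --- schematic data-service plugin (not part of the original module) ---
# DASAbstractService above expects concrete services to fill ``self.map`` with
# one entry per API (keys, params, url, expire, format, optional wild_card)
# and, where needed, to override hooks such as adjust_params or url_instance.
# The service name, URL and parameters below are invented for illustration and
# do not correspond to a real CMS data-service.
class ExampleService(DASAbstractService):
    "Schematic DAS data-service plugin"
    def __init__(self, config):
        DASAbstractService.__init__(self, 'example', config)
        self.map = {
            'example_api': {
                'keys': ['dataset'],
                'params': {'dataset': 'required', 'detail': 'optional'},
                'url': 'https://example.org/api/datasets',
                'expire': 3600,
                'format': 'JSON',
            }
        }

    def adjust_params(self, api, kwds, instance=None):
        "Example hook: strip trailing wildcards the hypothetical API rejects"
        if 'dataset' in kwds and isinstance(kwds['dataset'], str):
            kwds['dataset'] = kwds['dataset'].rstrip('*')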
class DASCore(object): """ DAS core class. """ def __init__(self, config=None, debug=0, nores=False, logger=None, engine=None, multitask=True): if config: dasconfig = config else: dasconfig = das_readconfig() verbose = dasconfig['verbose'] self.stdout = debug if isinstance(debug, int) and debug: self.verbose = debug dasconfig['verbose'] = debug else: self.verbose = verbose das_timer('DASCore::init', self.verbose) self.operators = das_operators() self.collect_wait_time = dasconfig['das'].get('collect_wait_time', 120) # set noresults option self.noresults = False if nores: dasconfig['write_cache'] = True self.noresults = nores self.init_expire = dasconfig['das'].get('init_expire', 5 * 60) self.multitask = dasconfig['das'].get('multitask', True) if debug or self.verbose: self.multitask = False # in verbose mode do not use multitask dasconfig['das']['multitask'] = False if not multitask: # explicitly call DASCore ctor self.multitask = False dasconfig['das']['multitask'] = False dasconfig['engine'] = engine if self.multitask: nworkers = dasconfig['das'].get('core_workers', 5) # if engine: # thr_name = 'DASCore:PluginTaskManager' # self.taskmgr = PluginTaskManager(\ # engine, nworkers=nworkers, name=thr_name) # self.taskmgr.subscribe() # else: # thr_name = 'DASCore:TaskManager' # self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name) thr_name = 'DASCore:TaskManager' self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name) else: self.taskmgr = None if logger: self.logger = logger else: self.logger = PrintManager('DASCore', self.verbose) # define Mapping/Analytics/Parser in this order since Parser depends # on first two dasmapping = DASMapping(dasconfig) dasconfig['dasmapping'] = dasmapping self.mapping = dasmapping self.keylearning = DASKeyLearning(dasconfig) dasconfig['keylearning'] = self.keylearning # init DAS cache self.rawcache = DASMongocache(dasconfig) dasconfig['rawcache'] = self.rawcache # plug-in architecture: loop over registered data-services in # dasconfig; load appropriate module/class; register data # service with DASCore. 
self.systems = dasmapping.list_systems() # pointer to the DAS top level directory dasroot = '/'.join(__file__.split('/')[:-3]) for name in self.systems: try: klass = 'DAS/services/%s/%s_service.py' \ % (name, name) srvfile = os.path.join(dasroot, klass) with open(srvfile) as srvclass: for line in srvclass: if line.find('(DASAbstractService)') != -1: klass = line.split('(DASAbstractService)')[0] klass = klass.split('class ')[-1] break mname = 'DAS.services.%s.%s_service' % (name, name) module = __import__(mname, fromlist=[klass]) obj = getattr(module, klass)(dasconfig) setattr(self, name, obj) except IOError as err: if debug > 1: # we have virtual services, so IOError can be correct print_exc(err) try: mname = 'DAS.services.generic_service' module = __import__(mname, fromlist=['GenericService']) obj = module.GenericService(name, dasconfig) setattr(self, name, obj) except Exception as exc: print_exc(exc) msg = "Unable to load %s data-service plugin" % name raise Exception(msg) except Exception as exc: print_exc(exc) msg = "Unable to load %s data-service plugin" % name raise Exception(msg) # loop over systems and get system keys, add mapping keys to final list self.service_keys = {} self.service_parameters = {} for name in self.systems: skeys = list(getattr(self, name).keys()) self.service_keys[getattr(self, name).name] = skeys sparams = getattr(self, name).parameters() self.service_parameters[getattr(self, name).name] = sparams self.service_keys['special'] = das_special_keys() self.dasconfig = dasconfig das_timer('DASCore::init', self.verbose) def keys(self): """ Return map of data service keys """ return self.service_keys def das_keys(self): """ Return map of data service keys """ _keys = ['records'] for values in self.service_keys.values(): for key in values: if key not in _keys: _keys.append(key) return _keys def result(self, query, idx=0, limit=None): """ Get results either from cache or from explicit call """ self.logger.info('input query=%s' % query) results = [] dasquery = DASQuery(query) query = dasquery.mongo_query # check if we have any service which cover the query # otherwise decompose it into list of queries service_map = dasquery.service_apis_map() if not service_map: msg = 'no APIs found to answer input query, will decompose it' self.logger.info(msg) skeys = query['fields'] if not skeys: skeys = [] for key in skeys: newquery = DASQuery(dict(fields=[key], spec=query['spec'])) self.call(newquery) # process query else: self.call(dasquery) # process query # lookup provided query in a cache if not self.noresults: results = self.get_from_cache(dasquery, idx, limit) return results def remove_from_cache(self, dasquery): """ Delete in cache entries about input query """ self.rawcache.remove_from_cache(dasquery) def get_status(self, dasquery): """ Look-up status of provided query in a cache. Return status of the query request and its hash. 
""" status = None error = None reason = None if dasquery and 'fields' in dasquery.mongo_query: fields = dasquery.mongo_query['fields'] if fields and isinstance(fields, list) and 'queries' in fields: return 'ok', error, reason record = self.rawcache.find(dasquery) error, reason = self.rawcache.is_error_in_records(dasquery) try: if record and 'das' in record and 'status' in record['das']: status = record['das']['status'] if not error: error = record['das'].get('error', error) if not reason: reason = record['das'].get('reason', reason) return status, error, reason except Exception as exc: print_exc(exc) status = error = reason = None self.rawcache.remove_from_cache(dasquery) return status, error, reason def status(self): "Return status of given service" sdict = {'das': self.taskmgr.status()} for srv in sorted(self.systems): sdict[srv] = getattr(getattr(self, srv), 'status')() return sdict def worker(self, srv, dasquery): """Main worker function which calls data-srv call function""" self.logger.info('##### %s ######\n' % srv) das_timer(srv, self.verbose) getattr(getattr(self, srv), 'call')(dasquery) das_timer(srv, self.verbose) def insert_query_records(self, dasquery): """ Insert DAS query records into DAS cache and return list of services which will answer this query """ services = dasquery.services self.logger.info('Potential services = %s' % services) if not services: msg = 'No data-services for query %s' % dasquery msg += 'mongo_query: %s' % dasquery.mongo_query msg += 'params: %s' % dasquery.params() print(dastimestamp('DAS WARNING '), msg) # get list of URI which can answer this query ack_services = [] for srv in services: gen = [t for t in getattr(getattr(self, srv), 'apimap')(dasquery)] for url, api, args, iformat, expire in gen: header = dasheader(srv, dasquery, expire, api, url, ctime=0) self.rawcache.insert_query_record(dasquery, header) if srv not in ack_services: ack_services.append(srv) if not ack_services: ack_services = services if dasquery.query.find('records ') != -1: srv_status = True # skip DAS queries w/ records request # create das record with initial expire tstamp expire = time.time() + self.init_expire header = dasheader("das", dasquery, expire, api='das_core', services=dict(das=ack_services)) header['lookup_keys'] = [] self.rawcache.insert_query_record(dasquery, header) das_timer('das_record', self.verbose) return ack_services def call(self, query, **kwds): """ Top level DAS api which execute a given query using underlying data-services. It follows the following steps: - parse input query - identify data-sercices based on selection keys and where clause conditions - construct DAS workflow and execute data-service API calls. At this step individual data-services store results into DAS cache. Return status 0/1 depending on success of the calls, can be used by workers on cache server. kwds is provided for compatibility with web layer, e.g. it may invoke this method with additional pid parameter. 
""" def update_das_query(dasquery, status, reason=None): "Update DAS query record with given status and reason" self.rawcache.update_query_record(dasquery, status, reason=reason) self.rawcache.add_to_record(\ dasquery, {'das.timer': get_das_timer()}, system='das') self.logger.info('input query=%s' % query) das_timer('DASCore::call', self.verbose) if isinstance(query, object) and hasattr(query, '__class__')\ and query.__class__.__name__ == 'DASQuery': dasquery = query else: dasquery = DASQuery(query) for col in ['merge', 'cache']: self.rawcache.remove_expired(dasquery, col) query = dasquery.mongo_query spec = query.get('spec') fields = query.get('fields') if fields == ['records']: msg = 'look-up all records in cache' self.logger.info(msg) return 'in cache' if spec == dict(records='*'): self.logger.info("look-up everything in cache") return 'in cache' for record in self.rawcache.find_specs(dasquery): status = record['das']['status'] msg = 'found query %s in cache, status=%s\n' \ % (record['query'], status) self.logger.info(msg) print(dastimestamp('DAS INFO'), msg) return status self.logger.info(dasquery) das_timer('das_record', self.verbose) services = self.insert_query_records(dasquery) if not services: msg = 'unable to locate data-services to fulfill this request' msg += ', will iterate over all registered services' print(dastimestamp('DAS WARNING '), dasquery, msg) services = dasquery.services if dasquery.services else self.systems try: if self.multitask: jobs = [] for srv in sorted(services): jobs.append(self.taskmgr.spawn(self.worker, srv, dasquery)) self.taskmgr.joinall(jobs) else: for srv in services: self.worker(srv, dasquery) except Exception as exc: print_exc(exc) return 'fail' self.logger.info('\n##### merging ######\n') update_das_query(dasquery, 'merging') das_timer('merge', self.verbose) # check that all query record statuses are ok, i.e. we did insert records # this status is set by self.rawcache.update_cache for idx in range(self.collect_wait_time): records = self.rawcache.find_query_record(dasquery) statuses = [] for row in records: system = row['das']['system'] status = row['das']['status'] self.logger.info("### query record status %s %s %s" % (dasquery.qhash, system, status)) statuses.append(status) all_statuses = sorted(list(set(statuses))) # at this point we're looking that all services will have 'ok' and das status will be 'merging' if len(all_statuses) == 2 and all_statuses == ['merging', 'ok']: break time.sleep(1) # now we can merge records status = self.rawcache.merge_records(dasquery) das_timer('merge', self.verbose) # check if we have service records and properly setup status self.logger.info('\n##### check services ######\n') das_services = self.rawcache.check_services(dasquery) reason = '' status = 'ok' if not das_services: if 'records' in dasquery.query: status = 'ok' # keep status ok for 'records' queries else: reason = 'no data records found in DAS cache' status = 'fail' print(dastimestamp('DAS ERROR '), dasquery, reason) update_das_query(dasquery, status, reason) das_timer('DASCore::call', self.verbose) return status def processing_time(self, dasquery): "Look-up and return DAS query processing time" query_record = self.rawcache.find(dasquery) if query_record: das = query_record.get('das', None) if isinstance(das, dict): ctime = das.get('ctime', []) if ctime: return ctime[-1] - ctime[0] return None def nresults(self, dasquery, coll='merge'): """ Return total number of results (count) for provided query Code should match body of get_from_cache method. 
""" fields = dasquery.mongo_query.get('fields', None) if dasquery.mapreduce: result = self.rawcache.map_reduce(dasquery.mapreduce, dasquery) return len([1 for _ in result]) elif dasquery.aggregators: return len(dasquery.aggregators) return self.rawcache.nresults(dasquery, coll) def apilist(self, dasquery): "Return list of APIs answer given das query" return self.rawcache.apilist(dasquery) def incache(self, dasquery, coll='merge'): """ Answer the question if given query in DAS cache or not """ return self.rawcache.incache(dasquery, collection=coll) def get_from_cache(self, dasquery, idx=0, limit=0, collection='merge'): """ Look-up results from the merge cache and yield them for further processing. """ das_timer('DASCore::get_from_cache', self.verbose) msg = 'col=%s, query=%s, idx=%s, limit=%s'\ % (collection, dasquery, idx, limit) self.logger.info(msg) fields = dasquery.mongo_query.get('fields', None) if dasquery.mapreduce: res = self.rawcache.map_reduce(dasquery.mapreduce, dasquery) elif dasquery.aggregators: # extract das information from rawcache rows = self.rawcache.get_from_cache(\ dasquery, collection=collection) first = next(rows) sinfo = das_sinfo(first) # to perform aggregation we need: # - loop over all aggregator functions # - loop over all data-services # - loop over all APIs within a data-services # the code below does that, it applies aggregator # to selected (based on key/srv/api) records res = [] _id = 0 time0 = time.time() expire = 300 # min expire for func, key in dasquery.aggregators: afunc = getattr(das_aggregator, 'das_%s' % func) found = False for srv, apis, in sinfo.items(): for api in apis: rows = self.rawcache.get_from_cache(\ dasquery, collection=collection) gen = api_rows(rows, api) data = afunc(key, gen) ctime = time.time() - time0 das = dasheader(srv, dasquery, expire, api=api, ctime=ctime) if isinstance(data, dict) and data['value'] != 'N/A': aggr = { '_id': _id, 'function': func, 'key': key, 'result': data } aggr.update(das) res.append(aggr) _id += 1 found = True if not found: # when we got nothing add empty result record empty = {'value': 'N/A'} ctime = time.time() - time0 das = dasheader('das', dasquery, expire, api='das_core', ctime=ctime) rec = { '_id': 0, 'function': func, 'key': key, 'result': empty } rec.update(das) res.append(rec) else: res = self.rawcache.get_from_cache(dasquery, idx, limit, \ collection=collection) # we assume that all records from single query will have # identical structure, therefore it will be sufficient to update # keylearning DB only with first record count = 0 for row in res: if not count: self.keylearning.add_record(dasquery, row) fix_times(row) yield row count += 1 das_timer('DASCore::get_from_cache', self.verbose)
class DASCore(object): """ DAS core class. """ def __init__(self, config=None, debug=None, nores=False, logger=None, engine=None, multitask=True): if config: dasconfig = config else: dasconfig = das_readconfig() verbose = dasconfig['verbose'] self.stdout = debug if isinstance(debug, int): self.verbose = debug dasconfig['verbose'] = debug else: self.verbose = verbose das_timer('DASCore::init', self.verbose) self.operators = das_operators() # set noresults option self.noresults = False if nores: dasconfig['write_cache'] = True self.noresults = nores self.multitask = dasconfig['das'].get('multitask', True) if debug or self.verbose: self.multitask = False # in verbose mode do not use multitask dasconfig['das']['multitask'] = False if not multitask: # explicitly call DASCore ctor, e.g. in analytics self.multitask = False dasconfig['das']['multitask'] = False dasconfig['engine'] = engine if self.multitask: nworkers = dasconfig['das'].get('core_workers', 5) if engine: thr_name = 'DASCore:PluginTaskManager' self.taskmgr = PluginTaskManager(\ engine, nworkers=nworkers, name=thr_name) self.taskmgr.subscribe() else: thr_name = 'DASCore:TaskManager' self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name) else: self.taskmgr = None if logger: self.logger = logger else: self.logger = PrintManager('DASCore', self.verbose) # define Mapping/Analytics/Parser in this order since Parser depends # on first two dasmapping = DASMapping(dasconfig) dasconfig['dasmapping'] = dasmapping self.mapping = dasmapping self.analytics = DASAnalytics(dasconfig) dasconfig['dasanalytics'] = self.analytics self.mongoparser = ql_manager(dasconfig) dasconfig['mongoparser'] = self.mongoparser self.keylearning = DASKeyLearning(dasconfig) dasconfig['keylearning'] = self.keylearning # init DAS cache self.rawcache = DASMongocache(dasconfig) dasconfig['rawcache'] = self.rawcache # plug-in architecture: loop over registered data-services in # dasconfig; load appropriate module/class; register data # service with DASCore. 
self.systems = dasmapping.list_systems() # pointer to the DAS top level directory dasroot = '/'.join(__file__.split('/')[:-3]) for name in self.systems: try: klass = 'DAS/services/%s/%s_service.py' \ % (name, name) srvfile = os.path.join(dasroot, klass) with file(srvfile) as srvclass: for line in srvclass: if line.find('(DASAbstractService)') != -1: klass = line.split('(DASAbstractService)')[0] klass = klass.split('class ')[-1] break mname = 'DAS.services.%s.%s_service' % (name, name) module = __import__(mname, fromlist=[klass]) obj = getattr(module, klass)(dasconfig) setattr(self, name, obj) SERVICES[name] = obj except IOError as err: if debug > 1: # we have virtual services, so IOError can be correct print_exc(err) try: mname = 'DAS.services.generic_service' module = __import__(mname, fromlist=['GenericService']) obj = module.GenericService(name, dasconfig) setattr(self, name, obj) except Exception as exc: print_exc(exc) msg = "Unable to load %s data-service plugin" % name raise Exception(msg) except Exception as exc: print_exc(exc) msg = "Unable to load %s data-service plugin" % name raise Exception(msg) # loop over systems and get system keys, add mapping keys to final list self.service_keys = {} self.service_parameters = {} for name in self.systems: skeys = getattr(self, name).keys() self.service_keys[getattr(self, name).name] = skeys sparams = getattr(self, name).parameters() self.service_parameters[getattr(self, name).name] = sparams self.service_keys['special'] = das_special_keys() self.dasconfig = dasconfig das_timer('DASCore::init', self.verbose) def keys(self): """ Return map of data service keys """ return self.service_keys def das_keys(self): """ Return map of data service keys """ _keys = ['records'] for values in self.service_keys.values(): for key in values: if key not in _keys: _keys.append(key) return _keys def result(self, query, idx=0, limit=None): """ Get results either from cache or from explicit call """ self.logger.info('input query=%s' % query) results = [] dasquery = DASQuery(query, mongoparser=self.mongoparser) dasquery.add_to_analytics() query = dasquery.mongo_query # check if we have any service which cover the query # otherwise decompose it into list of queries service_map = dasquery.service_apis_map() if not service_map: msg = 'no APIs found to answer input query, will decompose it' self.logger.info(msg) skeys = query['fields'] if not skeys: skeys = [] for key in skeys: newquery = DASQuery(dict(fields=[key], spec=query['spec']), mongoparser=self.mongoparser) self.call(newquery) # process query else: self.call(dasquery) # process query # lookup provided query in a cache if not self.noresults: results = self.get_from_cache(dasquery, idx, limit) return results def remove_from_cache(self, dasquery): """ Delete in cache entries about input query """ self.rawcache.remove_from_cache(dasquery) def get_status(self, dasquery): """ Look-up status of provided query in a cache. Return status of the query request and its hash. 
""" if dasquery and dasquery.mongo_query.has_key('fields'): fields = dasquery.mongo_query['fields'] if fields and isinstance(fields, list) and 'queries' in fields: return 'ok', dasquery.qhash status = 0 record = self.rawcache.find(dasquery) try: if record and record.has_key('das') and \ record['das'].has_key('status'): status = record['das']['status'] return status, record['qhash'] except: pass similar_dasquery = self.rawcache.similar_queries(dasquery) if similar_dasquery: record = self.rawcache.find(similar_dasquery) if record and record.has_key('das') and \ record['das'].has_key('status'): similar_query_status = record['das']['status'] return similar_query_status, record['qhash'] return status, 0 def worker(self, srv, dasquery): """Main worker function which calls data-srv call function""" self.logger.info('##### %s ######\n' % srv) das_timer(srv, self.verbose) getattr(getattr(self, srv), 'call')(dasquery) das_timer(srv, self.verbose) def call(self, query, add_to_analytics=True, **kwds): """ Top level DAS api which execute a given query using underlying data-services. It follows the following steps: - parse input query - identify data-sercices based on selection keys and where clause conditions - construct DAS workflow and execute data-service API calls. At this step individual data-services store results into DAS cache. Return status 0/1 depending on success of the calls, can be used by workers on cache server. kwds is provided for compatibility with web layer, e.g. it may invoke this method with additional pid parameter. """ self.logger.info('input query=%s' % query) das_timer('DASCore::call', self.verbose) services = [] if isinstance(query, object) and hasattr(query, '__class__')\ and query.__class__.__name__ == 'DASQuery': dasquery = query else: dasquery = DASQuery(query, mongoparser=self.mongoparser) if add_to_analytics: dasquery.add_to_analytics() query = dasquery.mongo_query if dasquery.mongo_query.has_key('system'): system = query['system'] if isinstance(system, str) or isinstance(system, unicode): services = [system] elif isinstance(system, list): services = system else: msg = 'Unsupported system=%s type=%s in DAS query' \ % (system, type(system)) raise Exception(msg) spec = query.get('spec') fields = query.get('fields') if fields == ['records']: msg = 'look-up all records in cache' self.logger.info(msg) return 'in cache' if spec == dict(records='*'): self.logger.info("look-up everything in cache") return 'in cache' for record in self.rawcache.find_specs(dasquery): status = record['das']['status'] msg = 'found query %s in cache, status=%s\n' \ % (record['query'], status) self.logger.info(msg) return status similar_dasquery = self.rawcache.similar_queries(dasquery) if similar_dasquery: for record in self.rawcache.find_specs(similar_dasquery): if record: try: status = record['das']['status'] except: status = 'N/A' msg = 'Fail to look-up das.status, record=%s' % record self.logger.info(msg) msg = 'found SIMILAR query in cache,' msg += 'query=%s, status=%s\n' % (record['query'], status) self.logger.info(msg) return status self.logger.info(dasquery) params = dasquery.params() if not services: services = params['services'] self.logger.info('services = %s' % services) das_timer('das_record', self.verbose) # initial expire tstamp 1 day (long enough to be overwriten by data-srv) expire = expire_timestamp(time.time()+1*24*60*60) header = dasheader("das", dasquery, expire) header['lookup_keys'] = [] self.rawcache.insert_query_record(dasquery, header) das_timer('das_record', self.verbose) 
try: if self.multitask: jobs = [] for srv in services: jobs.append(self.taskmgr.spawn(self.worker, srv, dasquery)) self.taskmgr.joinall(jobs) else: for srv in services: self.worker(srv, dasquery) except Exception as exc: print_exc(exc) return 'fail' self.logger.info('\n##### merging ######\n') self.rawcache.update_query_record(dasquery, 'merging') das_timer('merge', self.verbose) self.rawcache.merge_records(dasquery) das_timer('merge', self.verbose) self.rawcache.update_query_record(dasquery, 'ok') self.rawcache.add_to_record(\ dasquery, {'das.timer': get_das_timer()}, system='das') das_timer('DASCore::call', self.verbose) return 'ok' def nresults(self, dasquery, coll='merge'): """ Return total number of results (count) for provided query Code should match body of get_from_cache method. """ fields = dasquery.mongo_query.get('fields', None) if dasquery.mapreduce: result = self.rawcache.map_reduce(dasquery.mapreduce, dasquery) return len([1 for _ in result]) elif dasquery.aggregators: return len(dasquery.aggregators) elif isinstance(fields, list) and 'queries' in fields: return len([1 for _ in self.get_queries(dasquery)]) return self.rawcache.nresults(dasquery, coll) def incache(self, dasquery, coll='merge'): """ Answer the question if given query in DAS cache or not """ return self.rawcache.incache(dasquery, collection=coll) def get_from_cache(self, dasquery, idx=0, limit=0, collection='merge'): """ Look-up results from the merge cache and yield them for further processing. """ das_timer('DASCore::get_from_cache', self.verbose) msg = 'col=%s, query=%s, idx=%s, limit=%s'\ % (collection, dasquery, idx, limit) self.logger.info(msg) fields = dasquery.mongo_query.get('fields', None) if dasquery.mapreduce: res = self.rawcache.map_reduce(dasquery.mapreduce, dasquery) elif dasquery.aggregators: res = [] _id = 0 for func, key in dasquery.aggregators: rows = self.rawcache.get_from_cache(\ dasquery, collection=collection) data = getattr(das_aggregator, 'das_%s' % func)(key, rows) res += \ [{'_id':_id, 'function': func, 'key': key, 'result': data}] _id += 1 elif isinstance(fields, list) and 'queries' in fields: res = itertools.islice(self.get_queries(dasquery), idx, idx+limit) else: res = self.rawcache.get_from_cache(dasquery, idx, limit, \ collection=collection) for row in res: fix_times(row) yield row das_timer('DASCore::get_from_cache', self.verbose) def get_queries(self, dasquery): """ Look-up (popular) queries in DAS analytics/logging db """ das_timer('DASCore::get_queries', self.verbose) fields = dasquery.mongo_query.get('fields') spec = dasquery.mongo_query.get('spec') if 'popular' in fields: res = self.analytics.get_popular_queries(spec) else: datestamp = spec.get('date') if isinstance(datestamp, dict): value = datestamp.get('$in') res = \ self.analytics.list_queries(after=value[0], before=value[1]) elif isinstance(datestamp, int): res = self.analytics.list_queries(after=datestamp) elif not datestamp: res = self.analytics.list_queries() else: msg = 'Unsupported date value: %s' % datestamp raise Exception(msg) for row in res: rid = row.pop('_id') yield dict(das_query=row, _id=rid) das_timer('DASCore::get_queries', self.verbose)
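# --- analytics look-up sketch (not part of the original module) ---
# A hedged example of the 'queries' look-up path in the DASCore variant above:
# build a mongo-style query with a date spec and stream recent DAS queries out
# of the analytics DB. The one-day window and the way the DASQuery is built
# from a dict follow patterns visible in the code but are otherwise
# illustrative; module-level imports (time, DASQuery) are assumed.
core = DASCore()
last_day = int(time.time()) - 24 * 60 * 60
mongo_query = {'fields': ['queries'], 'spec': {'date': last_day}}
dasquery = DASQuery(mongo_query, mongoparser=core.mongoparser)
for row in core.get_from_cache(dasquery, idx=0, limit=10):
    print row['das_query']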
class DASCore(object): """ DAS core class. """ def __init__(self, config=None, debug=0, nores=False, logger=None, engine=None, multitask=True): if config: dasconfig = config else: dasconfig = das_readconfig() verbose = dasconfig["verbose"] self.stdout = debug if isinstance(debug, int): self.verbose = debug dasconfig["verbose"] = debug else: self.verbose = verbose das_timer("DASCore::init", self.verbose) self.operators = das_operators() # set noresults option self.noresults = False if nores: dasconfig["write_cache"] = True self.noresults = nores self.multitask = dasconfig["das"].get("multitask", True) if debug or self.verbose: self.multitask = False # in verbose mode do not use multitask dasconfig["das"]["multitask"] = False if not multitask: # explicitly call DASCore ctor self.multitask = False dasconfig["das"]["multitask"] = False dasconfig["engine"] = engine if self.multitask: nworkers = dasconfig["das"].get("core_workers", 5) if engine: thr_name = "DASCore:PluginTaskManager" self.taskmgr = PluginTaskManager(engine, nworkers=nworkers, name=thr_name) self.taskmgr.subscribe() else: thr_name = "DASCore:TaskManager" self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name) else: self.taskmgr = None if logger: self.logger = logger else: self.logger = PrintManager("DASCore", self.verbose) # define Mapping/Analytics/Parser in this order since Parser depends # on first two dasmapping = DASMapping(dasconfig) dasconfig["dasmapping"] = dasmapping self.mapping = dasmapping self.keylearning = DASKeyLearning(dasconfig) dasconfig["keylearning"] = self.keylearning # init DAS cache self.rawcache = DASMongocache(dasconfig) dasconfig["rawcache"] = self.rawcache # plug-in architecture: loop over registered data-services in # dasconfig; load appropriate module/class; register data # service with DASCore. 
self.systems = dasmapping.list_systems() # pointer to the DAS top level directory dasroot = "/".join(__file__.split("/")[:-3]) for name in self.systems: try: klass = "DAS/services/%s/%s_service.py" % (name, name) srvfile = os.path.join(dasroot, klass) with open(srvfile) as srvclass: for line in srvclass: if line.find("(DASAbstractService)") != -1: klass = line.split("(DASAbstractService)")[0] klass = klass.split("class ")[-1] break mname = "DAS.services.%s.%s_service" % (name, name) module = __import__(mname, fromlist=[klass]) obj = getattr(module, klass)(dasconfig) setattr(self, name, obj) except IOError as err: if debug > 1: # we have virtual services, so IOError can be correct print_exc(err) try: mname = "DAS.services.generic_service" module = __import__(mname, fromlist=["GenericService"]) obj = module.GenericService(name, dasconfig) setattr(self, name, obj) except Exception as exc: print_exc(exc) msg = "Unable to load %s data-service plugin" % name raise Exception(msg) except Exception as exc: print_exc(exc) msg = "Unable to load %s data-service plugin" % name raise Exception(msg) # loop over systems and get system keys, add mapping keys to final list self.service_keys = {} self.service_parameters = {} for name in self.systems: skeys = list(getattr(self, name).keys()) self.service_keys[getattr(self, name).name] = skeys sparams = getattr(self, name).parameters() self.service_parameters[getattr(self, name).name] = sparams self.service_keys["special"] = das_special_keys() self.dasconfig = dasconfig das_timer("DASCore::init", self.verbose) def keys(self): """ Return map of data service keys """ return self.service_keys def das_keys(self): """ Return map of data service keys """ _keys = ["records"] for values in self.service_keys.values(): for key in values: if key not in _keys: _keys.append(key) return _keys def result(self, query, idx=0, limit=None): """ Get results either from cache or from explicit call """ self.logger.info("input query=%s" % query) results = [] dasquery = DASQuery(query) query = dasquery.mongo_query # check if we have any service which cover the query # otherwise decompose it into list of queries service_map = dasquery.service_apis_map() if not service_map: msg = "no APIs found to answer input query, will decompose it" self.logger.info(msg) skeys = query["fields"] if not skeys: skeys = [] for key in skeys: newquery = DASQuery(dict(fields=[key], spec=query["spec"])) self.call(newquery) # process query else: self.call(dasquery) # process query # lookup provided query in a cache if not self.noresults: results = self.get_from_cache(dasquery, idx, limit) return results def remove_from_cache(self, dasquery): """ Delete in cache entries about input query """ self.rawcache.remove_from_cache(dasquery) def get_status(self, dasquery): """ Look-up status of provided query in a cache. Return status of the query request and its hash. 
""" status = None error = None reason = None if dasquery and "fields" in dasquery.mongo_query: fields = dasquery.mongo_query["fields"] if fields and isinstance(fields, list) and "queries" in fields: return "ok", error, reason record = self.rawcache.find(dasquery) error, reason = self.rawcache.is_error_in_records(dasquery) try: if record and "das" in record and "status" in record["das"]: status = record["das"]["status"] if not error: error = record["das"].get("error", error) if not reason: reason = record["das"].get("reason", reason) return status, error, reason except Exception as exc: print_exc(exc) status = error = reason = None self.rawcache.remove_from_cache(dasquery) return status, error, reason def worker(self, srv, dasquery): """Main worker function which calls data-srv call function""" self.logger.info("##### %s ######\n" % srv) das_timer(srv, self.verbose) getattr(getattr(self, srv), "call")(dasquery) das_timer(srv, self.verbose) def insert_query_records(self, dasquery): """ Insert DAS query records into DAS cache and return list of services which will answer this query """ services = dasquery.services self.logger.info("Potential services = %s" % services) if not services: msg = "No data-services for query %s" % dasquery msg += "mongo_query: %s" % dasquery.mongo_query msg += "params: %s" % dasquery.params() print(dastimestamp("DAS WARNING "), msg) # get list of URI which can answer this query ack_services = [] for srv in services: gen = [t for t in getattr(getattr(self, srv), "apimap")(dasquery)] for url, api, args, iformat, expire in gen: header = dasheader(srv, dasquery, expire, api, url, ctime=0) self.rawcache.insert_query_record(dasquery, header) if srv not in ack_services: ack_services.append(srv) if not ack_services: ack_services = services if dasquery.query.find("records ") != -1: srv_status = True # skip DAS queries w/ records request # create das record with initial expire tstamp 2 min in a future # it should be sufficient for processing data-srv records expire = time.time() + 2 * 60 header = dasheader("das", dasquery, expire, api="das_core", services=dict(das=ack_services)) header["lookup_keys"] = [] self.rawcache.insert_query_record(dasquery, header) das_timer("das_record", self.verbose) return ack_services def call(self, query, **kwds): """ Top level DAS api which execute a given query using underlying data-services. It follows the following steps: - parse input query - identify data-sercices based on selection keys and where clause conditions - construct DAS workflow and execute data-service API calls. At this step individual data-services store results into DAS cache. Return status 0/1 depending on success of the calls, can be used by workers on cache server. kwds is provided for compatibility with web layer, e.g. it may invoke this method with additional pid parameter. """ def update_das_query(dasquery, status, reason=None): "Update DAS query record with given status and reason" self.rawcache.update_query_record(dasquery, status, reason=reason) self.rawcache.add_to_record(dasquery, {"das.timer": get_das_timer()}, system="das") # make sure that das record is updated, we use 7 iteration which # sum up into 1 minute to cover default syncdelay value of mongo # server (in a future it would be better to find programatically # this syncdelay value, but it seems pymongo driver does not # provide any API for it. 
for idx in range(0, 7): spec = {"qhash": dasquery.qhash, "das.system": ["das"]} res = self.rawcache.col.find_one(spec) if res: dbstatus = res.get("das", {}).get("status", None) if dbstatus == status: break msg = "qhash %s, das.status=%s, status=%s, wait for update" % (dasquery.qhash, dbstatus, status) print(dastimestamp("DAS WARNING"), msg) self.rawcache.update_query_record(dasquery, status, reason=reason) time.sleep(idx * idx) self.logger.info("input query=%s" % query) das_timer("DASCore::call", self.verbose) if isinstance(query, object) and hasattr(query, "__class__") and query.__class__.__name__ == "DASQuery": dasquery = query else: dasquery = DASQuery(query) for col in ["merge", "cache"]: self.rawcache.remove_expired(dasquery, col) query = dasquery.mongo_query spec = query.get("spec") fields = query.get("fields") if fields == ["records"]: msg = "look-up all records in cache" self.logger.info(msg) return "in cache" if spec == dict(records="*"): self.logger.info("look-up everything in cache") return "in cache" for record in self.rawcache.find_specs(dasquery): status = record["das"]["status"] msg = "found query %s in cache, status=%s\n" % (record["query"], status) self.logger.info(msg) print(dastimestamp("DAS INFO"), msg) return status self.logger.info(dasquery) das_timer("das_record", self.verbose) services = self.insert_query_records(dasquery) if not services: msg = "unable to locate data-services to fulfill this request" msg += ", will iterate over all registered services" print(dastimestamp("DAS WARNING "), dasquery, msg) services = dasquery.services if dasquery.services else self.systems try: if self.multitask: jobs = [] for srv in sorted(services): jobs.append(self.taskmgr.spawn(self.worker, srv, dasquery)) self.taskmgr.joinall(jobs) else: for srv in services: self.worker(srv, dasquery) except Exception as exc: print_exc(exc) return "fail" self.logger.info("\n##### merging ######\n") update_das_query(dasquery, "merging") das_timer("merge", self.verbose) for attempt in range(0, 4): # try couple of times to avoid DB problems time.sleep(attempt) status = self.rawcache.merge_records(dasquery, attempt) if status == "ok": break das_timer("merge", self.verbose) # check if we have service records and properly setup status self.logger.info("\n##### check services ######\n") das_services = self.rawcache.check_services(dasquery) reason = "" status = "ok" if not das_services: if "records" in dasquery.query: status = "ok" # keep status ok for 'records' queries else: reason = "no data records found in DAS cache" status = "fail" print(dastimestamp("DAS ERROR "), dasquery, reason) update_das_query(dasquery, status, reason) das_timer("DASCore::call", self.verbose) return status def processing_time(self, dasquery): "Look-up and return DAS query processing time" query_record = self.rawcache.find(dasquery) if query_record: das = query_record.get("das", None) if isinstance(das, dict): ctime = das.get("ctime", []) if ctime: return ctime[-1] - ctime[0] return None def nresults(self, dasquery, coll="merge"): """ Return total number of results (count) for provided query Code should match body of get_from_cache method. 
""" fields = dasquery.mongo_query.get("fields", None) if dasquery.mapreduce: result = self.rawcache.map_reduce(dasquery.mapreduce, dasquery) return len([1 for _ in result]) elif dasquery.aggregators: return len(dasquery.aggregators) return self.rawcache.nresults(dasquery, coll) def apilist(self, dasquery): "Return list of APIs answer given das query" return self.rawcache.apilist(dasquery) def incache(self, dasquery, coll="merge"): """ Answer the question if given query in DAS cache or not """ return self.rawcache.incache(dasquery, collection=coll) def get_from_cache(self, dasquery, idx=0, limit=0, collection="merge"): """ Look-up results from the merge cache and yield them for further processing. """ das_timer("DASCore::get_from_cache", self.verbose) msg = "col=%s, query=%s, idx=%s, limit=%s" % (collection, dasquery, idx, limit) self.logger.info(msg) fields = dasquery.mongo_query.get("fields", None) if dasquery.mapreduce: res = self.rawcache.map_reduce(dasquery.mapreduce, dasquery) elif dasquery.aggregators: # extract das information from rawcache rows = self.rawcache.get_from_cache(dasquery, collection=collection) first = next(rows) sinfo = das_sinfo(first) # to perform aggregation we need: # - loop over all aggregator functions # - loop over all data-services # - loop over all APIs within a data-services # the code below does that, it applies aggregator # to selected (based on key/srv/api) records res = [] _id = 0 time0 = time.time() expire = 300 # min expire for func, key in dasquery.aggregators: afunc = getattr(das_aggregator, "das_%s" % func) found = False for srv, apis in sinfo.items(): for api in apis: rows = self.rawcache.get_from_cache(dasquery, collection=collection) gen = api_rows(rows, api) data = afunc(key, gen) ctime = time.time() - time0 das = dasheader(srv, dasquery, expire, api=api, ctime=ctime) if isinstance(data, dict) and data["value"] != "N/A": aggr = {"_id": _id, "function": func, "key": key, "result": data} aggr.update(das) res.append(aggr) _id += 1 found = True if not found: # when we got nothing add empty result record empty = {"value": "N/A"} ctime = time.time() - time0 das = dasheader("das", dasquery, expire, api="das_core", ctime=ctime) rec = {"_id": 0, "function": func, "key": key, "result": empty} rec.update(das) res.append(rec) else: res = self.rawcache.get_from_cache(dasquery, idx, limit, collection=collection) # we assume that all records from single query will have # identical structure, therefore it will be sufficient to update # keylearning DB only with first record count = 0 for row in res: if not count: self.keylearning.add_record(dasquery, row) fix_times(row) yield row count += 1 das_timer("DASCore::get_from_cache", self.verbose)
class DASAbstractService(object): """ Abstract class describing DAS service. It initialized with a name which is used to identify service parameters from DAS configuration file. Those parameters are keys, verbosity level, URL of the data-service. """ def __init__(self, name, config): self.name = name try: self.verbose = config['verbose'] title = 'DASAbstactService_%s' % self.name self.logger = PrintManager(title, self.verbose) self.dasmapping = config['dasmapping'] self.write2cache = config.get('write_cache', True) self.multitask = config['das'].get('multitask', True) self.error_expire = config['das'].get('error_expire', 300) self.dbs_global = None # to be configured at run time self.dburi = config['mongodb']['dburi'] engine = config.get('engine', None) self.gfs = db_gridfs(self.dburi) except Exception as exc: print_exc(exc) raise Exception('fail to parse DAS config') # read key/cert info try: self.ckey, self.cert = get_key_cert() except Exception as exc: print_exc(exc) self.ckey = None self.cert = None if self.multitask: nworkers = config['das'].get('api_workers', 3) thr_weights = config['das'].get('thread_weights', []) for system_weight in thr_weights: system, weight = system_weight.split(':') if system == self.name: nworkers *= int(weight) # if engine: # thr_name = 'DASAbstractService:%s:PluginTaskManager' % self.name # self.taskmgr = PluginTaskManager(\ # engine, nworkers=nworkers, name=thr_name) # self.taskmgr.subscribe() # else: # thr_name = 'DASAbstractService:%s:TaskManager' % self.name # self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name) thr_name = 'DASAbstractService:%s:TaskManager' % self.name self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name) else: self.taskmgr = None self.map = {} # to be defined by data-service implementation self._keys = None # to be defined at run-time in self.keys self._params = None # to be defined at run-time in self.parameters self._notations = {} # to be defined at run-time in self.notations self.logger.info('initialized') # define internal cache manager to put 'raw' results into cache if 'rawcache' in config and config['rawcache']: self.localcache = config['rawcache'] else: msg = 'Undefined rawcache, please check your configuration' raise Exception(msg) def status(self): "Return status of the service" return self.taskmgr.status() def services(self): """ Return sub-subsystems used to retrieve data records. It is used in dasheader call to setup das.services field. This method can be overwritten in sub-classes, otherwise returns dict of service name and CMS systems used to retrieve data records. """ return {self.name: [self.name]} def version(self): """Return data-services version, should be implemented in sub-classes""" return '' def keys(self): """ Return service keys """ if self._keys: return self._keys srv_keys = [] for _api, params in self.map.items(): for key in params['keys']: if not key in srv_keys: srv_keys.append(key) self._keys = srv_keys return srv_keys def parameters(self): """ Return mapped service parameters """ if self._params: return self._params srv_params = [] for _api, params in self.map.items(): for key in params['params']: param_list = self.dasmapping.api2das(self.name, key) for par in param_list: if not par in srv_params: srv_params.append(par) self._params = srv_params return srv_params def notations(self): """ Return a map of system notations. 
""" if self._notations: return self._notations for _, rows in self.dasmapping.notations(self.name).items(): for row in rows: api = row['api'] nmap = row['rec_key'] notation = row['api_output'] if api in self._notations: self._notations[api].update({notation: nmap}) else: self._notations[api] = {notation: nmap} return self._notations def getdata(self, url, params, expire, headers=None, post=None): """URL call wrapper""" if url.find('https:') != -1: return getdata(url, params, headers, expire, post, self.error_expire, self.verbose, self.ckey, self.cert, system=self.name) else: return getdata(url, params, headers, expire, post, self.error_expire, self.verbose, system=self.name) def call(self, dasquery): """ Invoke service API to execute given query. Return results as a collect list set. """ self.logger.info(dasquery) # check the cache for records with given query/system res = self.localcache.incache(dasquery, collection='cache', system=self.name) if res: msg = "found records in local cache" self.logger.info(msg) return # ask data-service api to get results, they'll be store them in # cache, so return at the end what we have in cache. self.api(dasquery) def write_to_cache(self, dasquery, expire, url, api, args, gen, ctime): """ Write provided result set into DAS cache. """ if not self.write2cache: return # before going to cache we should check/set possible misses, e.g. # primary key when error is thrown result = self.set_misses(dasquery, api, gen) # update the cache header = dasheader(self.name, dasquery, expire, api, url, services=self.services()) header['lookup_keys'] = self.lookup_keys(api) header['prim_key'] = self.dasmapping.primary_mapkey(self.name, api) header['ctime'] = ctime system = self.name self.localcache.update_cache(dasquery, result, header, system, api) msg = 'cache has been updated,\n' self.logger.debug(msg) def adjust_params(self, api, kwds, instance=None): """ Data-service specific parser to adjust parameters according to its specifications. For example, DQ service accepts a string of parameters, rather parameter set, while DBS2 can reuse some parameters for different API, e.g. I can use dataset path to pass to listPrimaryDatasets as primary_dataset pattern. """ pass def lookup_keys(self, api): """ Return look-up keys of data output for given data-service API. """ lkeys = self.dasmapping.lookup_keys(self.name, api) return [{api: lkeys}] def inspect_params(self, api, args): """ Perform API parameter inspection. Check if API accept a range of parameters, etc. """ for key, value in args.items(): if isinstance(value, dict): minval = None maxval = None for oper, val in value.items(): if oper == '$in': minval = int(val[0]) maxval = int(val[-1]) args[key] = range(minval, maxval) elif oper == '$lt': maxval = int(val) args[key] = maxval elif oper == '$lte': maxval = int(val) args[key] = maxval elif oper == '$gt': minval = int(val) args[key] = minval elif oper == '$gte': minval = int(val) args[key] = minval else: msg = '%s does not support operator %s' % (api, oper) raise Exception(msg) return args def get_notations(self, api): """Return notations used for given API""" notationmap = self.notations() if not notationmap: return {} notations = {} if '' in notationmap: notations = dict(notationmap['']) # notations applied to all APIs if api in notationmap: # overwrite the one for provided API notations.update(notationmap[api]) return notations def parser(self, dasquery, dformat, data, api): """ DAS data parser. Input parameters: - *query* input DAS query - *dformat* is a data format, e.g. 
        XML, JSON
        - *data* is a data source, either file-like object or actual data
        - *api* is API name
        """
        prim_key = self.dasmapping.primary_key(self.name, api)
        counter = 0
        if dformat.lower() == 'xml':
            tags = self.dasmapping.api2daskey(self.name, api)
            gen = xml_parser(data, prim_key, tags)
            for row in gen:
                counter += 1
                yield row
        elif dformat.lower() == 'json' or dformat.lower() == 'dasjson':
            gen = json_parser(data, self.logger)
            das_dict = {}
            for row in gen:
                if dformat.lower() == 'dasjson':
                    for key, val in row.items():
                        if key != 'results':
                            das_dict[key] = val
                    row = row['results']
                if isinstance(row, list):
                    for item in row:
                        if item:
                            if prim_key in item:
                                counter += 1
                                yield item
                            else:
                                counter += 1
                                yield {prim_key: item}
                else:
                    if prim_key in row:
                        counter += 1
                        yield row
                    else:
                        counter += 1
                        yield {prim_key: row}
        else:
            msg = 'Unsupported data format="%s", API="%s"' % (dformat, api)
            raise Exception(msg)
        msg = "api=%s, format=%s " % (api, dformat)
        msg += "prim_key=%s yield %s rows" % (prim_key, counter)
        self.logger.info(msg)

    def translator(self, api, genrows):
        """
        Convert raw results into DAS records.
        """
        prim_key = self.dasmapping.primary_key(self.name, api)
        count = 0
        for row in genrows:
            row2das(self.dasmapping.notation2das, self.name, api, row)
            count += 1
            # check for primary key existence, since it can be overridden
            # by row2das. For example DBS3 uses a flat namespace, so we
            # override dataset=>name, while dataset still is a primary key
            if isinstance(row, list):
                yield {prim_key: row}
            elif prim_key in row:
                if prim_key in row[prim_key]:
                    yield row[prim_key] # remapping may create nested dict
                else:
                    yield row
            else:
                yield {prim_key: row}
        msg = "yield %s rows" % count
        self.logger.debug(msg)

    def set_misses(self, dasquery, api, genrows):
        """
        Check and adjust DAS records wrt input query. If some of the DAS
        keys are missing, add them with their values to the DAS record.
""" # look-up primary key prim_key = self.dasmapping.primary_key(self.name, api) # Scan all docs and store those whose size above MongoDB limit into # GridFS map_key = self.dasmapping.primary_mapkey(self.name, api) genrows = parse2gridfs(self.gfs, map_key, genrows, self.logger) spec = dasquery.mongo_query['spec'] row = next(genrows) ddict = DotDict(row) keys2adjust = [] for key in spec.keys(): val = ddict.get(key) if spec[key] != val and key not in keys2adjust: keys2adjust.append(key) msg = "adjust keys %s" % keys2adjust self.logger.debug(msg) count = 0 if keys2adjust: # adjust of the rows for row in yield_rows(row, genrows): ddict = DotDict(row) pval = ddict.get(map_key) if isinstance(pval, dict) and 'error' in pval: ddict[map_key] = '' ddict.update({prim_key: pval}) for key in keys2adjust: value = spec[key] existing_value = ddict.get(key) # the way to deal with proximity/patern/condition results if (isinstance(value, str) or isinstance(value, unicode))\ and value.find('*') != -1: # we got pattern if existing_value: value = existing_value elif isinstance(value, dict) or \ isinstance(value, list): # we got condition if existing_value: value = existing_value elif isinstance(value, dict) and \ '$in' in value: # we got a range {'$in': []} value = value['$in'] elif isinstance(value, dict) and \ '$lte' in value and '$gte' in value: # we got a between range value = [value['$gte'], value['$lte']] else: value = json.dumps(value) elif existing_value and value != existing_value: # we got proximity results if 'proximity' in ddict: proximity = DotDict({key: existing_value}) ddict['proximity'].update(proximity) else: proximity = DotDict({}) proximity[key] = existing_value ddict['proximity'] = proximity else: if existing_value: value = existing_value ddict[key] = value yield ddict count += 1 else: yield row for row in genrows: yield row count += 1 msg = "yield %s rows" % count self.logger.debug(msg) def api(self, dasquery): """ Data service api method, can be defined by data-service class. It parse input query and invoke appropriate data-service API call. All results are stored into the DAS cache along with api call inserted into Analytics DB. """ self.logger.info(dasquery) genrows = self.apimap(dasquery) if not genrows: return jobs = [] for url, api, args, dformat, expire in genrows: # insert DAS query record for given API header = dasheader(self.name, dasquery, expire, api, url) self.localcache.insert_query_record(dasquery, header) # fetch DAS data records if self.multitask: jobs.append(self.taskmgr.spawn(self.apicall, \ dasquery, url, api, args, dformat, expire)) else: self.apicall(dasquery, url, api, args, dformat, expire) if self.multitask: self.taskmgr.joinall(jobs) def apicall(self, dasquery, url, api, args, dformat, expire): """ Data service api method, can be defined by data-service class. It parse input query and invoke appropriate data-service API call. All results are stored into the DAS cache along with api call inserted into Analytics DB. We invoke explicitly close call for our datastream instead of using context manager since this method as well as getdata/parser can be overwritten by child classes. 
""" datastream = None try: args = self.inspect_params(api, args) time0 = time.time() headers = make_headers(dformat) datastream, expire = self.getdata(url, args, expire, headers) self.logger.info("%s expire %s" % (api, expire)) rawrows = self.parser(dasquery, dformat, datastream, api) dasrows = self.translator(api, rawrows) ctime = time.time() - time0 self.write_to_cache(dasquery, expire, url, api, args, dasrows, ctime) except Exception as exc: msg = 'Fail to process: url=%s, api=%s, args=%s' \ % (url, api, args) print(msg) print_exc(exc) close(datastream) def url_instance(self, url, _instance): """ Virtual method to adjust URL for a given instance, must be implemented in service classes """ return url def adjust_url(self, url, instance): """ Adjust data-service URL wrt provided instance, e.g. DBS carry several instances """ if instance: url = self.url_instance(url, instance) return url def apimap(self, dasquery): """ Analyze input query and yield url, api, args, format, expire for further processing. """ srv = self.name # get local copy to avoid threading issues cond = getarg(dasquery.mongo_query, 'spec', {}) instance = dasquery.mongo_query.get('instance', self.dbs_global) skeys = getarg(dasquery.mongo_query, 'fields', []) if not skeys: skeys = [] self.logger.info("\n") for api, value in self.map.items(): expire = value['expire'] iformat = value['format'] url = self.adjust_url(value['url'], instance) if not url: msg = '--- rejects API %s, no URL' % api self.logger.info(msg) continue args = dict(value['params']) # make new copy, since we'll adjust wild = value.get('wild_card', '*') found = 0 # check if input parameters are covered by API if not self.dasmapping.check_api_match(srv, api, cond): msg = '--- rejects API %s, does not cover input condition keys' \ % api self.logger.info(msg) continue # once we now that API covers input set of parameters we check # every input parameter for pattern matching for key, val in cond.items(): # check if keys from conditions are accepted by API # need to convert key (which is daskeys.map) into # input api parameter for apiparam in self.dasmapping.das2api(srv, api, key, val): if apiparam in args: args[apiparam] = val found += 1 # VK 20160708, wrong statement, it caused to pass # datasets API for query dataset in [path1, path2] # I'll leave block here until I test and verify that # commented out block will not cause other issues # # check the case when we only have single condition key # and it is the key we look-up # if not found and skeys == [k.split('.')[0] for k in cond.keys()]: # found = 1 # check if number of keys on cond and args are the same if len(cond.keys()) != found: msg = "--- reject API %s, not all condition keys are covered" \ % api self.logger.info(msg) msg = 'args=%s' % args self.logger.debug(msg) continue if not found: msg = "--- rejects API %s, parameters don't match" % api self.logger.info(msg) msg = 'args=%s' % args self.logger.debug(msg) continue self.adjust_params(api, args, instance) # delete args keys whose value is optional delete_keys(args, 'optional') # check that there is no "required" parameter left in args, # since such api will not work if 'required' in args.values(): msg = '--- rejects API %s, parameter is required' % api self.logger.info(msg) msg = 'args=%s' % args self.logger.debug(msg) continue # adjust pattern symbols in arguments if wild != '*': for key, val in args.items(): if isinstance(val, str) or isinstance(val, unicode): val = val.replace('*', wild) args[key] = val # compare query selection keys with API look-up 
            # keys
            api_lkeys = self.dasmapping.api_lkeys(srv, api)
            if set(api_lkeys) != set(skeys):
                msg = "--- rejects API %s, api_lkeys(%s)!=skeys(%s)" \
                        % (api, api_lkeys, skeys)
                self.logger.info(msg)
                continue
            msg = '+++ %s passes API %s' % (srv, api)
            self.logger.info(msg)
            msg = 'args=%s' % args
            self.logger.debug(msg)
            msg = "yield "
            msg += "system ***%s***, url=%s, api=%s, args=%s, format=%s, " \
                % (srv, url, api, args, iformat)
            msg += "expire=%s, wild_card=%s" \
                % (expire, wild)
            self.logger.debug(msg)
            yield url, api, args, iformat, expire
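# --- example: condition-operator translation (editor's sketch) -----------
# inspect_params above rewrites MongoDB-style operators found in API
# arguments into plain values an API can accept: '$in' becomes an explicit
# range, '$lt'/'$lte' a maximum, '$gt'/'$gte' a minimum. A simplified
# standalone version of that translation:
def translate_operators(args):
    "Flatten {'$in'/'$lt'/'$lte'/'$gt'/'$gte': ...} conditions into API values"
    for key, value in list(args.items()):
        if not isinstance(value, dict):
            continue
        for oper, val in value.items():
            if oper == '$in':
                args[key] = range(int(val[0]), int(val[-1]))
            elif oper in ('$lt', '$lte'):
                args[key] = int(val)
            elif oper in ('$gt', '$gte'):
                args[key] = int(val)
            else:
                raise Exception('unsupported operator %s' % oper)
    return args

# usage: translate_operators({'run': {'$in': [1, 5]}, 'lumi': {'$lte': 100}})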
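# --- example: per-API notation maps (editor's sketch) --------------------
# get_notations above merges notations registered under the empty API name
# (applied to every API) with the ones registered for a concrete API, the
# latter taking precedence. A minimal standalone version of that merge:
def merge_notations(notationmap, api):
    "Combine global ('') notations with API-specific overrides"
    notations = {}
    if '' in notationmap:
        notations = dict(notationmap[''])   # copy, applied to all APIs
    if api in notationmap:
        notations.update(notationmap[api])  # API-specific entries win
    return notations

# usage: merge_notations({'': {'size': 'file.size'},
#                         'listFiles': {'nevents': 'file.nevents'}}, 'listFiles')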
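# --- example: API selection by condition coverage (editor's sketch) ------
# apimap above accepts an API only when every condition key of the query can
# be mapped onto one of the API parameters; otherwise the API is rejected and
# the next one is tried. A simplified standalone sketch of that coverage
# check, where das2api is a hypothetical dict mapping a DAS condition key to
# the API parameter names it may populate (the real code asks DASMapping):
def covers_query(api_params, cond, das2api):
    "Return filled API arguments, or None when a condition key is not covered"
    args = dict(api_params)      # work on a copy, as apimap does
    found = 0
    for key, val in cond.items():
        for apiparam in das2api.get(key, []):
            if apiparam in args:
                args[apiparam] = val
                found += 1
    if found != len(cond):       # not all condition keys are covered
        return None
    return args

# usage: covers_query({'dataset': 'required'}, {'dataset.name': '/A/B/C'},
#                     {'dataset.name': ['dataset']})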