Beispiel #1
0
 def setUp(self):
     """
     Create the DASAnalytics manager used by the tests.

     A deep copy of the DAS configuration is taken so the shared
     configuration is not modified, and its 'analyticsdb' section is
     replaced by one that targets the 'test_analytics' database.
     """
     verbosity = 0
     self.db = 'test_analytics.db'
     settings = deepcopy(das_readconfig())
     # reuse the MongoDB URI of the regular configuration
     mongo_uri = settings['mongodb']['dburi']
     settings['logger'] = PrintManager('TestDASAnalytics', verbose=verbosity)
     settings['verbose'] = verbosity
     settings['analyticsdb'] = {
         'dburi': mongo_uri,
         'history': 5184000, # presumably seconds (= 60 days) -- TODO confirm
         'dbname': 'test_analytics',
         'collname': 'db',
     }
     self.mgr = DASAnalytics(settings)
Beispiel #2
0
    def setUp(self):
        """
        Prepare the fixtures shared by the tests (two sample queries,
        the parser and the stripped operator list).
        unittest invokes setUp before every test method.
        """
        self.i1 = "find dataset, run, bfield where site = T2 and admin=VK and storage=castor"
        self.i2 = "  find dataset, run where (run=1 or run=2) and storage=castor or site = T2"

        verbosity = 0
        settings = das_readconfig()
        settings['logger'] = DASLogger(verbose=verbosity, stdout=verbosity)
        settings['verbose'] = verbosity
        # location of the mapping DB used by the test
        for option, value in (('mapping_dbhost', 'localhost'),
                              ('mapping_dbport', 27017),
                              ('mapping_dbname', 'mapping')):
            settings[option] = value
        # the mapping and analytics helpers are stored in the configuration
        # before the parser is built from it
        settings['dasmapping'] = DASMapping(settings)
        settings['dasanalytics'] = DASAnalytics(settings)
        self.parser = MongoParser(settings)
        # operators with surrounding white space removed
        self.operators = []
        for oper in DAS_OPERATORS:
            self.operators.append(oper.strip())
Beispiel #3
0
    def __init__(self, config=None):
        """
        Set up the mapping, analytics and (optional) parser DB helpers.

        config is the DAS configuration dictionary; when it is empty or
        omitted the configuration is read via das_readconfig().
        An Exception is raised when the mapping DB reports no DAS maps.
        """
        config = config or das_readconfig()
        self.dasmapping = DASMapping(config)
        if not self.dasmapping.check_maps():
            raise Exception("No DAS maps found in MappingDB")
        self.analytics = DASAnalytics(config)
        self.dasservices = config['services']
        self.daskeysmap = self.dasmapping.daskeys()
        self.operators = list(das_operators())
        self.daskeys = list(das_special_keys())
        self.verbose = config['verbose']
        self.logger = PrintManager('QLManger', self.verbose)
        # flatten the per-entry key lists of the mapping into self.daskeys
        for names in self.daskeysmap.values():
            self.daskeys.extend(names)
        self.parserdir = config['das']['parserdir']

        # the parser DB is only instantiated when enabled in configuration
        self.enabledb = config['parserdb']['enable']
        if self.enabledb:
            self.parserdb = DASParserDB(config)
Beispiel #4
0
    def __init__(self, config=None, debug=0,
                nores=False, logger=None, engine=None, multitask=True):
        """
        Initialise DAS core: configuration, task manager, logger,
        mapping/analytics/key-learning/cache helpers and one service
        object per system listed by the mapping DB.

        :param config: DAS configuration dict; read via das_readconfig()
            when empty or omitted. NOTE: the dict is updated in place
            ('verbose', 'engine', 'dasmapping', 'dasanalytics',
            'keylearning', 'rawcache', ... keys are written).
        :param debug: when it is an int it overrides config['verbose'];
            any true value also switches multitasking off.
        :param nores: when true it is stored as self.noresults and
            config['write_cache'] is set to True.
        :param logger: logger to use; a PrintManager is created when
            omitted.
        :param engine: stored under config['engine'] and handed to
            PluginTaskManager when multitasking is enabled.
        :param multitask: pass False to force single-task mode.
        :raises Exception: when a data-service plugin cannot be loaded.
        """
        if  config:
            dasconfig = config
        else:
            dasconfig = das_readconfig()
        verbose       = dasconfig['verbose']
        self.stdout   = debug
        if  isinstance(debug, int):
            self.verbose = debug
            dasconfig['verbose'] = debug
        else:
            self.verbose = verbose
        das_timer('DASCore::init', self.verbose)
        self.operators = das_operators()

        # set noresults option
        self.noresults = False
        if  nores:
            dasconfig['write_cache'] = True
            self.noresults = nores

        self.multitask = dasconfig['das'].get('multitask', True)
        if  debug or self.verbose:
            self.multitask = False # in verbose mode do not use multitask
            dasconfig['das']['multitask'] = False
        if  not multitask: # explicitly call DASCore ctor, e.g. in analytics
            self.multitask = False
            dasconfig['das']['multitask'] = False
        dasconfig['engine'] = engine
        if  self.multitask:
            nworkers = dasconfig['das'].get('core_workers', 5)
            if  engine:
                thr_name = 'DASCore:PluginTaskManager'
                self.taskmgr = PluginTaskManager(\
                        engine, nworkers=nworkers, name=thr_name)
                self.taskmgr.subscribe()
            else:
                thr_name = 'DASCore:TaskManager'
                self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        if  logger:
            self.logger = logger
        else:
            self.logger = PrintManager('DASCore', self.verbose)

        # define Mapping/Analytics/Parser in this order since Parser depends
        # on first two
        dasmapping = DASMapping(dasconfig)
        dasconfig['dasmapping'] = dasmapping
        self.mapping = dasmapping

        self.analytics = DASAnalytics(dasconfig)
        dasconfig['dasanalytics'] = self.analytics

        self.keylearning = DASKeyLearning(dasconfig)
        dasconfig['keylearning'] = self.keylearning

        # init DAS cache
        self.rawcache = DASMongocache(dasconfig)
        dasconfig['rawcache'] = self.rawcache

        # plug-in architecture: loop over registered data-services in
        # dasconfig; load appropriate module/class; register data
        # service with DASCore.
        self.systems = dasmapping.list_systems()
        # pointer to the DAS top level directory
        dasroot = '/'.join(__file__.split('/')[:-3])
        for name in self.systems:
            try:
                # klass first holds the relative path of the service module
                # and is then replaced by the name of the class found in it
                klass  = 'DAS/services/%s/%s_service.py' \
                    % (name, name)
                srvfile = os.path.join(dasroot, klass)
                # open() instead of the Python 2 only file() constructor
                with open(srvfile) as srvclass:
                    for line in srvclass:
                        # the service class is the one which derives
                        # from DASAbstractService
                        if  line.find('(DASAbstractService)') != -1:
                            klass = line.split('(DASAbstractService)')[0]
                            klass = klass.split('class ')[-1]
                            break
                mname  = 'DAS.services.%s.%s_service' % (name, name)
                module = __import__(mname, fromlist=[klass])
                obj = getattr(module, klass)(dasconfig)
                setattr(self, name, obj)
                SERVICES[name] = obj
            except IOError as err:
                if  debug > 1:
                    # we have virtual services, so IOError can be correct
                    print_exc(err)
                # no service file on disk: fall back to the generic service
                # (note that it is not registered in SERVICES)
                try:
                    mname  = 'DAS.services.generic_service'
                    module = __import__(mname, fromlist=['GenericService'])
                    obj    = module.GenericService(name, dasconfig)
                    setattr(self, name, obj)
                except Exception as exc:
                    print_exc(exc)
                    msg = "Unable to load %s data-service plugin" % name
                    raise Exception(msg)
            except Exception as exc:
                print_exc(exc)
                msg = "Unable to load %s data-service plugin" % name
                raise Exception(msg)

        # loop over systems and get system keys, add mapping keys to final list
        self.service_keys = {}
        self.service_parameters = {}
        for name in self.systems:
            service = getattr(self, name)
            self.service_keys[service.name] = service.keys()
            self.service_parameters[service.name] = service.parameters()

        self.service_keys['special'] = das_special_keys()
        self.dasconfig = dasconfig
        das_timer('DASCore::init', self.verbose)
Beispiel #5
0
class DASCore(object):
    """
    DAS core class.

    It owns the mapping, analytics, key-learning and raw-cache helpers and
    one service object per registered data-service. The `call` method runs
    a query through the services, `get_from_cache` yields stored results.
    """
    def __init__(self, config=None, debug=0,
                nores=False, logger=None, engine=None, multitask=True):
        """
        Initialise DAS core: configuration, task manager, logger,
        mapping/analytics/key-learning/cache helpers and one service
        object per system listed by the mapping DB.

        :param config: DAS configuration dict; read via das_readconfig()
            when empty or omitted. NOTE: the dict is updated in place
            ('verbose', 'engine', 'dasmapping', 'dasanalytics',
            'keylearning', 'rawcache', ... keys are written).
        :param debug: when it is an int it overrides config['verbose'];
            any true value also switches multitasking off.
        :param nores: when true it is stored as self.noresults and
            config['write_cache'] is set to True.
        :param logger: logger to use; a PrintManager is created when
            omitted.
        :param engine: stored under config['engine'] and handed to
            PluginTaskManager when multitasking is enabled.
        :param multitask: pass False to force single-task mode.
        :raises Exception: when a data-service plugin cannot be loaded.
        """
        if  config:
            dasconfig = config
        else:
            dasconfig = das_readconfig()
        verbose       = dasconfig['verbose']
        self.stdout   = debug
        if  isinstance(debug, int):
            self.verbose = debug
            dasconfig['verbose'] = debug
        else:
            self.verbose = verbose
        das_timer('DASCore::init', self.verbose)
        self.operators = das_operators()

        # set noresults option
        self.noresults = False
        if  nores:
            dasconfig['write_cache'] = True
            self.noresults = nores

        self.multitask = dasconfig['das'].get('multitask', True)
        if  debug or self.verbose:
            self.multitask = False # in verbose mode do not use multitask
            dasconfig['das']['multitask'] = False
        if  not multitask: # explicitly call DASCore ctor, e.g. in analytics
            self.multitask = False
            dasconfig['das']['multitask'] = False
        dasconfig['engine'] = engine
        if  self.multitask:
            nworkers = dasconfig['das'].get('core_workers', 5)
            if  engine:
                thr_name = 'DASCore:PluginTaskManager'
                self.taskmgr = PluginTaskManager(\
                        engine, nworkers=nworkers, name=thr_name)
                self.taskmgr.subscribe()
            else:
                thr_name = 'DASCore:TaskManager'
                self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        if  logger:
            self.logger = logger
        else:
            self.logger = PrintManager('DASCore', self.verbose)

        # define Mapping/Analytics/Parser in this order since Parser depends
        # on first two
        dasmapping = DASMapping(dasconfig)
        dasconfig['dasmapping'] = dasmapping
        self.mapping = dasmapping

        self.analytics = DASAnalytics(dasconfig)
        dasconfig['dasanalytics'] = self.analytics

        self.keylearning = DASKeyLearning(dasconfig)
        dasconfig['keylearning'] = self.keylearning

        # init DAS cache
        self.rawcache = DASMongocache(dasconfig)
        dasconfig['rawcache'] = self.rawcache

        # plug-in architecture: loop over registered data-services in
        # dasconfig; load appropriate module/class; register data
        # service with DASCore.
        self.systems = dasmapping.list_systems()
        # pointer to the DAS top level directory
        dasroot = '/'.join(__file__.split('/')[:-3])
        for name in self.systems:
            try:
                # klass first holds the relative path of the service module
                # and is then replaced by the name of the class found in it
                klass  = 'DAS/services/%s/%s_service.py' \
                    % (name, name)
                srvfile = os.path.join(dasroot, klass)
                # open() instead of the Python 2 only file() constructor
                with open(srvfile) as srvclass:
                    for line in srvclass:
                        # the service class is the one which derives
                        # from DASAbstractService
                        if  line.find('(DASAbstractService)') != -1:
                            klass = line.split('(DASAbstractService)')[0]
                            klass = klass.split('class ')[-1]
                            break
                mname  = 'DAS.services.%s.%s_service' % (name, name)
                module = __import__(mname, fromlist=[klass])
                obj = getattr(module, klass)(dasconfig)
                setattr(self, name, obj)
                SERVICES[name] = obj
            except IOError as err:
                if  debug > 1:
                    # we have virtual services, so IOError can be correct
                    print_exc(err)
                # no service file on disk: fall back to the generic service
                # (note that it is not registered in SERVICES)
                try:
                    mname  = 'DAS.services.generic_service'
                    module = __import__(mname, fromlist=['GenericService'])
                    obj    = module.GenericService(name, dasconfig)
                    setattr(self, name, obj)
                except Exception as exc:
                    print_exc(exc)
                    msg = "Unable to load %s data-service plugin" % name
                    raise Exception(msg)
            except Exception as exc:
                print_exc(exc)
                msg = "Unable to load %s data-service plugin" % name
                raise Exception(msg)

        # loop over systems and get system keys, add mapping keys to final list
        self.service_keys = {}
        self.service_parameters = {}
        for name in self.systems:
            service = getattr(self, name)
            self.service_keys[service.name] = service.keys()
            self.service_parameters[service.name] = service.parameters()

        self.service_keys['special'] = das_special_keys()
        self.dasconfig = dasconfig
        das_timer('DASCore::init', self.verbose)

    def keys(self):
        """
        Return map of data service keys
        """
        return self.service_keys

    def das_keys(self):
        """
        Return a flat list of the distinct keys of all data services.
        The list starts with 'records' followed by the service keys in
        the order they are first seen.
        """
        _keys = ['records']
        for values in self.service_keys.values():
            for key in values:
                if  key not in _keys:
                    _keys.append(key)
        return _keys

    def result(self, query, idx=0, limit=None):
        """
        Get results either from cache or from explicit call.

        The query is executed via `call`; when no API covers it, it is
        decomposed into one query per selection field. Returns the
        generator produced by `get_from_cache`, or an empty list when
        the noresults option is set.
        """
        self.logger.info('input query=%s' % query)
        results = []
        dasquery = DASQuery(query)
        # NOTE(review): call() below also invokes add_to_analytics() by
        # default, so the query may be registered twice -- confirm intent
        dasquery.add_to_analytics()
        query    = dasquery.mongo_query
        # check if we have any service which cover the query
        # otherwise decompose it into list of queries
        service_map = dasquery.service_apis_map()
        if  not service_map:
            msg  = 'no APIs found to answer input query, will decompose it'
            self.logger.info(msg)
            skeys = query['fields']
            if  not skeys:
                skeys = []
            for key in skeys:
                newquery = DASQuery(dict(fields=[key], spec=query['spec']))
                self.call(newquery) # process query
        else:
            self.call(dasquery) # process query

        # lookup provided query in a cache
        if  not self.noresults:
            results = self.get_from_cache(dasquery, idx, limit)
        return results

    def remove_from_cache(self, dasquery):
        """
        Delete in cache entries about input query
        """
        self.rawcache.remove_from_cache(dasquery)

    def get_status(self, dasquery):
        """
        Look-up status of provided query in a cache.
        Return a (status, error, reason) tuple; status is None when no
        usable das record is found. Queries whose fields contain
        'queries' are always reported as 'ok'.
        """
        status = None
        error  = None
        reason = None
        # expired records are purged before the look-up
        for col in ['merge', 'cache']:
            self.rawcache.remove_expired(dasquery, col)
        if  dasquery and 'fields' in dasquery.mongo_query:
            fields = dasquery.mongo_query['fields']
            if  fields and isinstance(fields, list) and 'queries' in fields:
                return 'ok', error, reason
        record = self.rawcache.find(dasquery)
        error, reason = self.rawcache.is_error_in_records(dasquery)
        try:
            if  record and 'das' in record and 'status' in record['das']:
                status = record['das']['status']
                # values found in data records take precedence over the
                # ones stored in the das record
                if  not error:
                    error = record['das'].get('error', error)
                if  not reason:
                    reason = record['das'].get('reason', reason)
                return status, error, reason
        except Exception as exc:
            # unreadable record: report nothing and drop it from the cache
            print_exc(exc)
            status = error = reason = None
            self.rawcache.remove_from_cache(dasquery)
        return status, error, reason

    def worker(self, srv, dasquery):
        """Main worker function which calls data-srv call function"""
        self.logger.info('##### %s ######\n' % srv)
        das_timer(srv, self.verbose)
        getattr(self, srv).call(dasquery)
        das_timer(srv, self.verbose)

    def insert_query_records(self, dasquery):
        """
        Insert DAS query records into DAS cache and return list of services
        which will answer this query.

        One record is inserted per API reported by the `apimap` method of
        each service of the query, plus one 'das' record. The returned
        list holds the services which reported at least one API, or all
        services of the query when none did.
        """
        services = dasquery.services
        self.logger.info('Potential services = %s' % services)
        if  not services:
            msg  = 'No data-services for query %s' % dasquery
            msg += ', mongo_query: %s' % dasquery.mongo_query
            msg += ', params: %s' % dasquery.params()
            print('%s %s' % (dastimestamp('DAS WARNING '), msg))

        # get list of URI which can answer this query
        ack_services = []
        for srv in services:
            # the API list is fully collected before any record is inserted
            apis = list(getattr(self, srv).apimap(dasquery))
            for url, api, _args, _iformat, expire in apis:
                header = dasheader(srv, dasquery, expire, api, url, ctime=0)
                self.rawcache.insert_query_record(dasquery, header)
                if  srv not in ack_services:
                    ack_services.append(srv)
        if  not ack_services:
            ack_services = services
        expire = 2*60 # 2 minutes, it should be overwriten by data-srv
        header = dasheader("das", dasquery, expire, api='das_core',
                services=dict(das=ack_services))
        header['lookup_keys'] = []
        self.rawcache.insert_query_record(dasquery, header)
        # pairs with the das_timer('das_record', ...) call made in call()
        das_timer('das_record', self.verbose)
        return ack_services

    def call(self, query, add_to_analytics=True, **kwds):
        """
        Top level DAS api which execute a given query using underlying
        data-services. It follows the following steps:

            - parse input query
            - identify data-sercices based on selection keys
              and where clause conditions
            - construct DAS workflow and execute data-service
              API calls. At this step individual
              data-services store results into DAS cache.

        Return a status string: 'in cache' for records look-ups, the
        status stored in the cache when the query is already there,
        'fail' when a service call raised or no data records were found,
        'ok' otherwise. It can be used by workers on cache server.

        kwds is provided for compatibility with web layer, e.g. it
        may invoke this method with additional pid parameter; it is not
        used here.
        """
        def update_das_query(dasquery, status, reason=None):
            "Update DAS query record with given status and reason"
            self.rawcache.update_query_record(dasquery, status, reason=reason)
            self.rawcache.add_to_record(\
                    dasquery, {'das.timer': get_das_timer()}, system='das')
            # make sure that das record is updated: re-read it up to 6
            # times, sleeping idx*idx seconds in between (1+4+9+16+25+36,
            # i.e. 91 seconds at most) to cover default syncdelay value of
            # mongo server (in a future it would be better to find
            # programatically this syncdelay value, but it seems pymongo
            # driver does not provide any API for it.
            for idx in range(1, 7):
                spec = {'qhash':dasquery.qhash, 'das.system':['das']}
                res = self.rawcache.col.find_one(spec)
                if  res:
                    dbstatus = res.get('das', {}).get('status', None)
                    if  dbstatus == status:
                        break
                    msg = 'qhash %s, das.status=%s, status=%s, wait for update' \
                            % (dasquery.qhash, dbstatus, status)
                    print('%s %s' % (dastimestamp('DAS WARNING'), msg))
                time.sleep(idx*idx)
                self.rawcache.update_query_record(dasquery, status, reason=reason)

        self.logger.info('input query=%s' % query)
        das_timer('DASCore::call', self.verbose)
        # DASQuery objects are recognised by class name, anything else is
        # converted into a DASQuery
        if  isinstance(query, object) and hasattr(query, '__class__')\
            and query.__class__.__name__ == 'DASQuery':
            dasquery = query
        else:
            dasquery = DASQuery(query)
        for col in ['merge', 'cache']:
            self.rawcache.remove_expired(dasquery, col)
        if  add_to_analytics:
            dasquery.add_to_analytics()
        query  = dasquery.mongo_query
        spec   = query.get('spec')
        fields = query.get('fields')
        if  fields == ['records']:
            msg = 'look-up all records in cache'
            self.logger.info(msg)
            return 'in cache'
        if  spec == dict(records='*'):
            self.logger.info("look-up everything in cache")
            return 'in cache'
        # the first matching record found in cache decides the outcome
        for record in self.rawcache.find_specs(dasquery):
            status = record['das']['status']
            msg = 'found query %s in cache, status=%s\n' \
                        % (record['query'], status)
            self.logger.info(msg)
            print('%s %s' % (dastimestamp('DAS INFO'), msg))
            return status

        self.logger.info(dasquery)
        das_timer('das_record', self.verbose)
        services = self.insert_query_records(dasquery)
        if  not services:
            msg = 'unable to locate data-services to fulfill this request'
            msg += ', will iterate over all registered services'
            print('%s %s %s' \
                    % (dastimestamp('DAS WARNING '), dasquery, msg))
            services = dasquery.services if dasquery.services else self.systems
        try:
            if  self.multitask:
                jobs = []
                for srv in sorted(services):
                    jobs.append(self.taskmgr.spawn(self.worker, srv, dasquery))
                self.taskmgr.joinall(jobs)
            else:
                for srv in services:
                    self.worker(srv, dasquery)
        except Exception as exc:
            print_exc(exc)
            return 'fail'
        self.logger.info('\n##### merging ######\n')
        update_das_query(dasquery, 'merging')
        das_timer('merge', self.verbose)
        self.rawcache.merge_records(dasquery)
        das_timer('merge', self.verbose)
        # check if we have service records and properly setup status
        self.logger.info('\n##### check services ######\n')
        das_services = self.rawcache.check_services(dasquery)
        reason = ''
        status = 'ok'
        if  not das_services:
            if  'records' in dasquery.query:
                status = 'ok' # keep status ok for 'records' queries
            else:
                reason = 'no data records found in DAS cache'
                status = 'fail'
                print('%s %s %s' \
                        % (dastimestamp('DAS ERROR '), dasquery, reason))
        update_das_query(dasquery, status, reason)
        das_timer('DASCore::call', self.verbose)
        return status

    def processing_time(self, dasquery):
        """
        Look-up and return DAS query processing time, i.e. the difference
        between the last and the first entry of the 'ctime' list of the
        das record; None when that information is not available.
        """
        query_record = self.rawcache.find(dasquery)
        if  query_record:
            das = query_record.get('das', None)
            if  isinstance(das, dict):
                ctime = das.get('ctime', [])
                if  ctime:
                    return ctime[-1]-ctime[0]
        return None

    def nresults(self, dasquery, coll='merge'):
        """
        Return total number of results (count) for provided query
        Code should match body of get_from_cache method.
        """
        fields = dasquery.mongo_query.get('fields', None)
        if  dasquery.mapreduce:
            result = self.rawcache.map_reduce(dasquery.mapreduce, dasquery)
            return len([1 for _ in result])
        elif dasquery.aggregators:
            return len(dasquery.aggregators)
        elif isinstance(fields, list) and 'queries' in fields:
            return len([1 for _ in self.get_queries(dasquery)])
        return self.rawcache.nresults(dasquery, coll)

    def apilist(self, dasquery):
        "Return list of APIs answer given das query"
        return self.rawcache.apilist(dasquery)

    def incache(self, dasquery, coll='merge'):
        """
        Answer the question if given query in DAS cache or not
        """
        return self.rawcache.incache(dasquery, collection=coll)

    def get_from_cache(self, dasquery, idx=0, limit=0, collection='merge'):
        """
        Look-up results from the merge cache and yield them for
        further processing.

        Depending on the query the rows come from a map/reduce call,
        from aggregator functions applied to cached rows, from the
        analytics DB ('queries' field) or from the raw cache itself.
        idx and limit are used by the last two sources only; limit=None
        means no upper bound for 'queries' look-ups. fix_times is
        applied to every row before it is yielded.
        """
        das_timer('DASCore::get_from_cache', self.verbose)
        msg = 'col=%s, query=%s, idx=%s, limit=%s'\
                % (collection, dasquery, idx, limit)
        self.logger.info(msg)

        fields  = dasquery.mongo_query.get('fields', None)

        if  dasquery.mapreduce:
            res = self.rawcache.map_reduce(dasquery.mapreduce, dasquery)
        elif dasquery.aggregators:
            # extract das information from rawcache
            rows  = self.rawcache.get_from_cache(\
                    dasquery, collection=collection)
            try:
                first = next(rows)
            except StopIteration:
                # nothing in cache: end this generator explicitly, same
                # outcome as before but without leaking StopIteration
                # out of the generator body (an error under PEP 479)
                return
            sinfo = das_sinfo(first)
            # to perform aggregation we need:
            # - loop over all aggregator functions
            # - loop over all data-services
            # - loop over all APIs within a data-services
            # the code below does that, it applies aggregator
            # to selected (based on key/srv/api) records
            res = []
            _id = 0
            time0  = time.time()
            expire = 300 # min expire
            for func, key in dasquery.aggregators:
                afunc = getattr(das_aggregator, 'das_%s' % func)
                found = False
                for srv, apis in sinfo.items():
                    for api in apis:
                        # rows are re-read from the cache for every API
                        rows  = self.rawcache.get_from_cache(\
                                dasquery, collection=collection)
                        gen   = api_rows(rows, api)
                        data  = afunc(key, gen)
                        ctime = time.time() - time0
                        das   = dasheader(srv, dasquery, expire, api=api,
                                ctime=ctime)
                        if  isinstance(data, dict) and data['value'] != 'N/A':
                            aggr = {'_id':_id, 'function': func,
                                    'key': key, 'result': data}
                            aggr.update(das)
                            res.append(aggr)
                            _id += 1
                            found = True
                if  not found: # when we got nothing add empty result record
                    empty = {'value':'N/A'}
                    ctime = time.time() - time0
                    das = dasheader('das', dasquery, expire, api='das_core',
                            ctime=ctime)
                    rec = {'_id':0, 'function':func, 'key':key, 'result':empty}
                    rec.update(das)
                    res.append(rec)
        elif isinstance(fields, list) and 'queries' in fields:
            # result() forwards limit=None; idx+None would raise TypeError
            # while islice treats stop=None as "no upper bound"
            # NOTE(review): limit=0 (the default) yields no rows here --
            # confirm whether 0 should mean "no limit" as well
            stop = None if limit is None else idx + limit
            res = itertools.islice(self.get_queries(dasquery), idx, stop)
        else:
            res = self.rawcache.get_from_cache(dasquery, idx, limit, \
                    collection=collection)
        for row in res:
            fix_times(row)
            yield row
        das_timer('DASCore::get_from_cache', self.verbose)

    def get_queries(self, dasquery):
        """
        Look-up (popular) queries in DAS analytics/logging db and yield
        them as dicts with 'das_query' and '_id' keys.

        The 'date' value of the query spec selects the time window: a
        dict is read as {'$in': [after, before]}, an int as the lower
        bound, an empty value means no restriction. Any other value
        raises an Exception.
        """
        das_timer('DASCore::get_queries', self.verbose)
        fields = dasquery.mongo_query.get('fields')
        spec   = dasquery.mongo_query.get('spec')
        if  'popular' in fields:
            res = self.analytics.get_popular_queries(spec)
        else:
            datestamp = spec.get('date')
            if  isinstance(datestamp, dict):
                value = datestamp.get('$in')
                res = \
                self.analytics.list_queries(after=value[0], before=value[1])
            elif isinstance(datestamp, int):
                res = self.analytics.list_queries(after=datestamp)
            elif not datestamp:
                res = self.analytics.list_queries()
            else:
                msg = 'Unsupported date value: %s' % datestamp
                raise Exception(msg)
        for row in res:
            rid = row.pop('_id')
            yield dict(das_query=row, _id=rid)
        das_timer('DASCore::get_queries', self.verbose)
Beispiel #6
0
class testDASAnalytics(unittest.TestCase):
    """
    Unit tests exercising the DASAnalytics (analytics DB) manager.
    """
    def setUp(self):
        """
        Create the DASAnalytics manager used by the tests from a deep
        copy of the DAS configuration whose 'analyticsdb' section targets
        the 'test_analytics' database.
        """
        verbosity = 0
        self.db = 'test_analytics.db'
        settings = deepcopy(das_readconfig())
        # reuse the MongoDB URI of the regular configuration
        mongo_uri = settings['mongodb']['dburi']
        settings['logger'] = PrintManager('TestDASAnalytics', verbose=verbosity)
        settings['verbose'] = verbosity
        settings['analyticsdb'] = {
            'dburi': mongo_uri,
            'history': 5184000,
            'dbname': 'test_analytics',
            'collname': 'db',
        }
        self.mgr = DASAnalytics(settings)

    def tearDown(self):
        """Drop the analytics DB once a test has finished"""
        self.mgr.delete_db()

    def test_api(self):
        """Check the API bookkeeping methods of the analytics manager"""
        mgr = self.mgr
        # start from a freshly created DB
        mgr.delete_db()
        mgr.create_db()

        das_query = 'find block'

        # register one DBS API call ...
        dbs_api = 'listBlocks'
        dbs_params = {
            'apiversion': 'DBS_2_0_8',
            'block_name': '*',
            'storage_element_name': '*',
            'user_type': 'NORMAL',
        }
        mgr.add_api('dbs', das_query, dbs_api, dbs_params)

        # ... and one PhEDEx API call for the same query
        phedex_api = 'blockReplicas'
        phedex_params = {'node': '*', 'se': '*', 'block': '*'}
        mgr.add_api('phedex', das_query, phedex_api, phedex_params)

        systems = sorted(mgr.list_systems())
        self.assertEqual(['dbs', 'phedex'], systems)

        self.assertEqual([dbs_api], mgr.list_apis('dbs'))

        # the counter is expected to be 1 after registering the same
        # DBS API a second time
        mgr.add_api('dbs', das_query, dbs_api, dbs_params)
        self.assertEqual(1, mgr.api_counter(dbs_api))

        # and to become 2 once update is called for the query
        mgr.update('dbs', das_query)
        self.assertEqual(2, mgr.api_counter(dbs_api))

        self.assertEqual([phedex_params], mgr.api_params(phedex_api))
Beispiel #7
0
class DASCore(object):
    """
    DAS core class.
    """
    def __init__(self, config=None, debug=None, 
                nores=False, logger=None, engine=None, multitask=True):
        """
        Set up DAS core: configuration, task manager, logger, mapping,
        analytics, query parser, key learning, raw cache and one plug-in
        object per data-service known to the mapping.

        config    -- DAS configuration dictionary, read via
                     das_readconfig() when not provided.
                     NOTE: the dictionary is updated in place.
        debug     -- when it is an int it overrides config['verbose'];
                     it is always stored as self.stdout
        nores     -- when set, config['write_cache'] becomes True and
                     the value is stored as self.noresults
        logger    -- logger object to use instead of a new PrintManager
        engine    -- stored as config['engine']; in multitask mode it is
                     passed to PluginTaskManager
        multitask -- pass False to switch multitask mode off

        Raise Exception when a data-service plug-in cannot be loaded.
        """
        if  config:
            dasconfig = config
        else:
            dasconfig = das_readconfig()
        verbose       = dasconfig['verbose']
        self.stdout   = debug
        # integer debug value takes precedence over configured verbosity
        if  isinstance(debug, int):
            self.verbose = debug
            dasconfig['verbose'] = debug
        else:
            self.verbose = verbose
        das_timer('DASCore::init', self.verbose)
        self.operators = das_operators()

        # set noresults option
        self.noresults = False
        if  nores:
            dasconfig['write_cache'] = True
            self.noresults = nores

        # multitask mode comes from configuration, but it is switched off
        # (in this object and in the configuration) by debug/verbose mode
        # or by explicit multitask=False argument
        self.multitask = dasconfig['das'].get('multitask', True)
        if  debug or self.verbose:
            self.multitask = False # in verbose mode do not use multitask
            dasconfig['das']['multitask'] = False
        if  not multitask: # explicitly call DASCore ctor, e.g. in analytics
            self.multitask = False
            dasconfig['das']['multitask'] = False
        dasconfig['engine'] = engine
        if  self.multitask:
            nworkers = dasconfig['das'].get('core_workers', 5)
            if  engine:
                thr_name = 'DASCore:PluginTaskManager'
                self.taskmgr = PluginTaskManager(\
                        engine, nworkers=nworkers, name=thr_name)
                self.taskmgr.subscribe()
            else:
                thr_name = 'DASCore:TaskManager'
                self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        if  logger:
            self.logger = logger
        else:
            self.logger = PrintManager('DASCore', self.verbose)

        # define Mapping/Analytics/Parser in this order since Parser depends
        # on first two
        # every object created below is also stored in dasconfig, which is
        # passed to the constructors of the objects that follow
        dasmapping = DASMapping(dasconfig)
        dasconfig['dasmapping'] = dasmapping
        self.mapping = dasmapping

        self.analytics = DASAnalytics(dasconfig)
        dasconfig['dasanalytics'] = self.analytics

        self.mongoparser = ql_manager(dasconfig)
        dasconfig['mongoparser'] = self.mongoparser

        self.keylearning = DASKeyLearning(dasconfig)
        dasconfig['keylearning'] = self.keylearning

        # init DAS cache
        self.rawcache = DASMongocache(dasconfig)
        dasconfig['rawcache'] = self.rawcache

        # plug-in architecture: loop over registered data-services in
        # dasconfig; load appropriate module/class; register data
        # service with DASCore.
        self.systems = dasmapping.list_systems()
        # pointer to the DAS top level directory
        dasroot = '/'.join(__file__.split('/')[:-3])
        for name in self.systems:
            try:
                klass  = 'DAS/services/%s/%s_service.py' \
                    % (name, name)
                srvfile = os.path.join(dasroot, klass)
                # scan service source file for the class derived from
                # DASAbstractService and take its name; klass keeps the
                # file path if no such line is found
                with file(srvfile) as srvclass:
                    for line in srvclass:
                        if  line.find('(DASAbstractService)') != -1:
                            klass = line.split('(DASAbstractService)')[0]
                            klass = klass.split('class ')[-1] 
                            break
                mname  = 'DAS.services.%s.%s_service' % (name, name)
                module = __import__(mname, fromlist=[klass])
                obj = getattr(module, klass)(dasconfig)
                # service object becomes an attribute named after the system
                setattr(self, name, obj)
                SERVICES[name] = obj
            except IOError as err:
                if  debug > 1:
                    # we have virtual services, so IOError can be correct
                    print_exc(err)
                # no service file: fall back to GenericService
                # (this branch does not add the object to SERVICES)
                try:
                    mname  = 'DAS.services.generic_service'
                    module = __import__(mname, fromlist=['GenericService'])
                    obj    = module.GenericService(name, dasconfig)
                    setattr(self, name, obj)
                except Exception as exc:
                    print_exc(exc)
                    msg = "Unable to load %s data-service plugin" % name
                    raise Exception(msg)
            except Exception as exc:
                print_exc(exc)
                msg = "Unable to load %s data-service plugin" % name
                raise Exception(msg)

        # loop over systems and get system keys, add mapping keys to final list
        # (both maps are keyed by the name attribute of the service object)
        self.service_keys = {}
        self.service_parameters = {}
        for name in self.systems: 
            skeys = getattr(self, name).keys()
            self.service_keys[getattr(self, name).name] = skeys
            sparams = getattr(self, name).parameters()
            self.service_parameters[getattr(self, name).name] = sparams

        self.service_keys['special'] = das_special_keys()
        self.dasconfig = dasconfig
        das_timer('DASCore::init', self.verbose)

    def keys(self):
        """
        Return map of data service keys
        """
        return self.service_keys

    def das_keys(self):
        """
        Return map of data service keys
        """
        _keys = ['records']
        for values in self.service_keys.values():
            for key in values:
                if  key not in _keys:
                    _keys.append(key)
        return _keys

    def result(self, query, idx=0, limit=None):
        """
        Get results either from cache or from explicit call.

        The query is registered in analytics DB and processed via call
        method; when no API covers the whole query it is processed
        key by key. Afterwards, unless noresults option is set, the
        results are looked up in cache.

        query -- input DAS query
        idx   -- index of the first result to retrieve
        limit -- number of results, passed as is to get_from_cache

        Return output of get_from_cache or an empty list if noresults
        option is set.
        """
        self.logger.info('input query=%s' % query)
        results = []
        dasquery = DASQuery(query, mongoparser=self.mongoparser)
        dasquery.add_to_analytics()
        query    = dasquery.mongo_query
        # check if we have any service which cover the query
        # otherwise decompose it into list of queries
        service_map = dasquery.service_apis_map()
        if  not service_map:
            msg  = 'no APIs found to answer input query, will decompose it'
            self.logger.info(msg)
            for key in query['fields'] or []:
                newquery = DASQuery(dict(fields=[key], spec=query['spec']),
                                        mongoparser=self.mongoparser)
                self.call(newquery) # process query
        else:
            # the query is already registered in analytics DB above; do not
            # let call register the very same query for a second time
            self.call(dasquery, add_to_analytics=False) # process query

        # lookup provided query in a cache
        if  not self.noresults:
            results = self.get_from_cache(dasquery, idx, limit)
        return results

    def remove_from_cache(self, dasquery):
        """
        Delete in cache entries about input query
        """
        self.rawcache.remove_from_cache(dasquery)

    def get_status(self, dasquery):
        """
        Look-up status of provided query in a cache.
        Return status of the query request and its hash.
        """
        if  dasquery and dasquery.mongo_query.has_key('fields'):
            fields = dasquery.mongo_query['fields']
            if  fields and isinstance(fields, list) and 'queries' in fields:
                return 'ok', dasquery.qhash
        status = 0
        record = self.rawcache.find(dasquery)
        try:
            if  record and record.has_key('das') and \
                record['das'].has_key('status'):
                status = record['das']['status']
                return status, record['qhash']
        except:
            pass

        similar_dasquery = self.rawcache.similar_queries(dasquery)
        if  similar_dasquery:
            record = self.rawcache.find(similar_dasquery)
            if  record and record.has_key('das') and \
                record['das'].has_key('status'):
                similar_query_status = record['das']['status']
                return similar_query_status, record['qhash']
        return status, 0

    def worker(self, srv, dasquery):
        """
        Invoke call method of data-service srv with given DAS query;
        the invocation is enclosed by a pair of das_timer calls.
        """
        self.logger.info('##### %s ######\n' % srv)
        das_timer(srv, self.verbose)
        service = getattr(self, srv)
        service.call(dasquery)
        das_timer(srv, self.verbose)

    def call(self, query, add_to_analytics=True, **kwds):
        """
        Top level DAS api which executes a given query using underlying
        data-services. It follows these steps:

            - parse input query (unless a DASQuery object is given)
            - identify data-services, either from system part of
              the query or from the query parameters
            - stop if the query or a similar one is already found
              in DAS cache
            - construct DAS workflow and execute data-service
              API calls. At this step individual
              data-services store results into DAS cache.
            - merge the results

        Return status string which can be used by workers on cache
        server: 'in cache' for look-up of cache records, das.status
        of the record found in cache for the query (or a similar one),
        'fail' if a data-service call raised an exception and 'ok'
        otherwise.

        add_to_analytics tells if the query should be registered in
        analytics DB.

        kwds is provided for compatibility with web layer, e.g. it
        may invoke this method with additional pid parameter.

        Raise Exception if system part of the query is neither a string
        nor a list.
        """
        self.logger.info('input query=%s' % query)
        das_timer('DASCore::call', self.verbose)
        services = []
        # DASQuery objects are recognized by the name of their class,
        # any other input is parsed into a new DASQuery
        if  hasattr(query, '__class__') \
            and query.__class__.__name__ == 'DASQuery':
            dasquery = query
        else:
            dasquery = DASQuery(query, mongoparser=self.mongoparser)
        if  add_to_analytics:
            dasquery.add_to_analytics()
        query = dasquery.mongo_query
        if  'system' in query:
            system = query['system']
            if  isinstance(system, basestring):
                services = [system]
            elif isinstance(system, list):
                services = system
            else:
                msg = 'Unsupported system=%s type=%s in DAS query' \
                        % (system, type(system))
                raise Exception(msg)
        spec   = query.get('spec')
        fields = query.get('fields')
        if  fields == ['records']:
            msg = 'look-up all records in cache'
            self.logger.info(msg)
            return 'in cache'
        if  spec == dict(records='*'):
            self.logger.info("look-up everything in cache")
            return 'in cache'
        # the first record found for the query defines the outcome
        for record in self.rawcache.find_specs(dasquery):
            status = record['das']['status']
            msg = 'found query %s in cache, status=%s\n' \
                        % (record['query'], status)
            self.logger.info(msg)
            return status
        similar_dasquery = self.rawcache.similar_queries(dasquery)
        if  similar_dasquery:
            for record in self.rawcache.find_specs(similar_dasquery):
                if  not record:
                    # an empty record has neither status nor query to
                    # report, look at the next one
                    continue
                try:
                    status = record['das']['status']
                except (KeyError, TypeError):
                    status = 'N/A'
                    msg = 'Fail to look-up das.status, record=%s' % record
                    self.logger.info(msg)
                msg  = 'found SIMILAR query in cache,'
                msg += 'query=%s, status=%s\n' % (record['query'], status)
                self.logger.info(msg)
                return status

        self.logger.info(dasquery)
        params = dasquery.params()
        if  not services:
            services = params['services']
        self.logger.info('services = %s' % services)
        das_timer('das_record', self.verbose)
        # initial expire tstamp 1 day (long enough to be overwriten by data-srv)
        expire = expire_timestamp(time.time()+1*24*60*60)
        header = dasheader("das", dasquery, expire)
        header['lookup_keys'] = []
        self.rawcache.insert_query_record(dasquery, header)
        das_timer('das_record', self.verbose)
        try:
            if  self.multitask:
                jobs = [self.taskmgr.spawn(self.worker, srv, dasquery) \
                        for srv in services]
                self.taskmgr.joinall(jobs)
            else:
                for srv in services:
                    self.worker(srv, dasquery)
        except Exception as exc:
            print_exc(exc)
            return 'fail'
        self.logger.info('\n##### merging ######\n')
        self.rawcache.update_query_record(dasquery, 'merging')
        das_timer('merge', self.verbose)
        self.rawcache.merge_records(dasquery)
        das_timer('merge', self.verbose)
        self.rawcache.update_query_record(dasquery, 'ok')
        self.rawcache.add_to_record(\
                dasquery, {'das.timer': get_das_timer()}, system='das')
        das_timer('DASCore::call', self.verbose)
        return 'ok'

    def nresults(self, dasquery, coll='merge'):
        """
        Return total number of results (count) for provided query
        Code should match body of get_from_cache method.
        """
        fields = dasquery.mongo_query.get('fields', None)
        if  dasquery.mapreduce:
            result = self.rawcache.map_reduce(dasquery.mapreduce, dasquery)
            return len([1 for _ in result])
        elif dasquery.aggregators:
            return len(dasquery.aggregators)
        elif isinstance(fields, list) and 'queries' in fields:
            return len([1 for _ in self.get_queries(dasquery)])
        return self.rawcache.nresults(dasquery, coll)

    def incache(self, dasquery, coll='merge'):
        """
        Answer the question if given query in DAS cache or not
        """
        return self.rawcache.incache(dasquery, collection=coll)

    def get_from_cache(self, dasquery, idx=0, limit=0, collection='merge'):
        """
        Look-up results from the merge cache and yield them for
        further processing.

        Depending on the query the rows come from map/reduce call of
        raw cache, from DAS aggregator functions, from analytics
        (queries look-up, see get_queries method) or from raw cache
        itself. Every row goes through fix_times before it is yielded.

        idx        -- index of the first row
        limit      -- number of rows; for queries look-up 0 or None
                      means no limit
        collection -- name of cache collection to use
        """
        das_timer('DASCore::get_from_cache', self.verbose)
        msg = 'col=%s, query=%s, idx=%s, limit=%s'\
                % (collection, dasquery, idx, limit)
        self.logger.info(msg)

        fields  = dasquery.mongo_query.get('fields', None)

        if  dasquery.mapreduce:
            res = self.rawcache.map_reduce(dasquery.mapreduce, dasquery)
        elif dasquery.aggregators:
            res = []
            for _id, (func, key) in enumerate(dasquery.aggregators):
                # rows are requested anew for every aggregator function
                rows = self.rawcache.get_from_cache(\
                        dasquery, collection=collection)
                data = getattr(das_aggregator, 'das_%s' % func)(key, rows)
                res.append(\
                {'_id':_id, 'function': func, 'key': key, 'result': data})
        elif isinstance(fields, list) and 'queries' in fields:
            # result method passes limit=None and default limit is 0;
            # neither should fail or cut off all rows, so treat them as
            # "no limit" (NOTE(review): presumably the same convention
            # as in raw cache look-up -- confirm)
            start = idx or 0
            stop  = start + limit if limit else None
            res = itertools.islice(self.get_queries(dasquery), start, stop)
        else:
            res = self.rawcache.get_from_cache(dasquery, idx, limit, \
                    collection=collection)
        for row in res:
            fix_times(row)
            yield row
        das_timer('DASCore::get_from_cache', self.verbose)

    def get_queries(self, dasquery):
        """
        Look-up (popular) queries in DAS analytics/logging db and yield
        them as dict(das_query=<record without _id>, _id=<record _id>).

        If 'popular' is among the query fields, popular queries are
        requested for the query spec. Otherwise queries are listed
        according to date value of the spec:
            - dict with $in key holding two values: (after, before) range
            - int: queries after given value
            - no date: all queries

        Raise Exception for any other date value.
        """
        das_timer('DASCore::get_queries', self.verbose)
        # missing fields/spec are treated as empty ones
        fields = dasquery.mongo_query.get('fields') or []
        spec   = dasquery.mongo_query.get('spec')
        if  'popular' in fields:
            res = self.analytics.get_popular_queries(spec)
        else:
            datestamp = spec.get('date') if spec else None
            if  isinstance(datestamp, dict):
                value = datestamp.get('$in')
                # date range must provide both of its boundaries
                if  not isinstance(value, (list, tuple)) or len(value) < 2:
                    msg = 'Unsupported date value: %s' % datestamp
                    raise Exception(msg)
                res = \
                self.analytics.list_queries(after=value[0], before=value[1])
            elif isinstance(datestamp, int):
                res = self.analytics.list_queries(after=datestamp)
            elif not datestamp:
                res = self.analytics.list_queries()
            else:
                msg = 'Unsupported date value: %s' % datestamp
                raise Exception(msg)
        for row in res:
            rid = row.pop('_id')
            yield dict(das_query=row, _id=rid)
        das_timer('DASCore::get_queries', self.verbose)
Beispiel #8
0
class QLManager(object):
    """
    DAS QL manager.
    """
    def __init__(self, config=None):
        """
        Initialize QL manager from DAS configuration, which is read
        via das_readconfig when it is not provided.
        Raise Exception if DAS mapping reports that it has no maps.
        """
        if  not config:
            config = das_readconfig()
        mapping = DASMapping(config)
        self.dasmapping = mapping
        if  not mapping.check_maps():
            raise Exception("No DAS maps found in MappingDB")
        self.analytics   = DASAnalytics(config)
        self.dasservices = config['services']
        self.daskeysmap  = mapping.daskeys()
        self.operators   = list(das_operators())
        self.daskeys     = list(das_special_keys())
        self.verbose     = config['verbose']
        self.logger      = PrintManager('QLManger', self.verbose)
        # DAS keys are special keys followed by all keys of DAS keys map
        for mapped_keys in self.daskeysmap.values():
            self.daskeys.extend(mapped_keys)
        self.parserdir = config['das']['parserdir']

        # parserdb attribute is only defined when parser DB is enabled
        self.enabledb = config['parserdb']['enable']
        if  self.enabledb:
            self.parserdb = DASParserDB(config)

    def parse(self, query):
        """
        Parse input query and return it in MongoDB form, with its
        keys converted in place by convert2skeys method.
        """
        parsed = self.mongo_query(query)
        self.convert2skeys(parsed)
        return parsed

    def add_to_analytics(self, query, mongo_query):
        "Pass DAS query and its mongo form to add_query of analytics DB"
        analytics = self.analytics
        analytics.add_query(query, mongo_query)

    def get_ply_query(self, query):
        """
        Return ply object for given query, i.e. the outcome of
        ply_parse_query call made with DAS keys, DAS services,
        parser directory and verbosity level of this manager.
        """
        return ply_parse_query(query, self.daskeys, self.dasservices,
                    self.parserdir, self.verbose)

    def mongo_query(self, query):
        """
        Return mongo query for provided input query.

        When parser DB is enabled it is consulted first: a valid cached
        query is used as is (unless the input query matches
        last_key_pattern), otherwise the query is parsed and, upon
        success, stored in parser DB. In all remaining cases the query
        is parsed from scratch.

        Raise Exception if the query cannot be parsed, if the outcome
        lacks 'fields' or 'spec' parts, or if the query is ambiguous.
        """
        mongo_query = None
        if  self.verbose:
            ply_output(query, self.daskeys, self.dasservices,
                    self.parserdir, self.verbose)
        parse_again = True
        if  self.enabledb:
            status, value = self.parserdb.lookup_query(query)
            if  status == PARSERCACHE_VALID and \
                not last_key_pattern.findall(query):
                mongo_query = value
                parse_again = False
            elif status == PARSERCACHE_INVALID:
                # we unable to find query in parserdb, so will parse again
                parse_again = True
            else:
                ply_query = self.get_ply_query(query)
                if  ply_query:
                    try:
                        mongo_query = ply2mongo(ply_query)
                        parse_again = False
                    except Exception:
                        # conversion failed, query is parsed again below
                        msg = "Fail in ply2mongo, query=%s, ply_query=%s" \
                                % (query, ply_query)
                        print(msg)
                    else:
                        # store only successfully converted queries, a
                        # failed conversion must not be cached as valid
                        try:
                            self.parserdb.insert_valid_query(\
                                    query, mongo_query)
                        except Exception as exc:
                            msg = \
                            "Fail to insert into parserdb, exception=%s" \
                                    % str(exc)
                            print_exc(msg, print_traceback=True)
        if  parse_again:
            try:
                ply_query   = self.get_ply_query(query)
                mongo_query = ply2mongo(ply_query)
            except Exception:
                msg = "Fail to parse query='%s'" % query
                print_exc(msg, print_traceback=False)
                raise # re-raise with original traceback
        if  not mongo_query or \
            not set(['fields', 'spec']).issubset(mongo_query.keys()):
            raise Exception('Invalid MongoDB query %s' % mongo_query)
        spec = mongo_query['spec']
        if  not mongo_query['fields'] and len(spec.keys()) > 1:
            raise Exception(ambiguous_msg(query, spec.keys()))
        for key, val in spec.items():
            if  isinstance(val, list):
                raise Exception(ambiguos_val_msg(query, key, val))
        return mongo_query

    def convert2skeys(self, mongo_query):
        """
        Convert DAS input keys into DAS selection keys.
        The spec part of mongo_query is modified in place:
            - if spec is empty, every map key found (by any system)
              for the query fields is added with '*' value
            - otherwise every spec key is replaced by the map key of
              the first system which maps it to a different key.
        """
        systems = self.dasmapping.list_systems()
        spec = mongo_query['spec']
        if  not spec:
            # fields may be missing, treat them as an empty list
            for key in mongo_query['fields'] or []:
                for system in systems:
                    mapkey = self.dasmapping.find_mapkey(system, key)
                    if  mapkey:
                        spec[mapkey] = '*'
            return
        # collect replacements first, spec can not be changed while
        # we loop over it
        to_replace = []
        for key, val in spec.items():
            for system in systems:
                mapkey = self.dasmapping.find_mapkey(system, key, val)
                if  mapkey and mapkey != key:
                    to_replace.append((key, mapkey))
                    break # the first system which maps the key wins
        for key, mapkey in to_replace:
            if  key in spec:
                spec[mapkey] = spec.pop(key)
        
    def services(self, query):
        """
        Return list of DAS services to use for provided query.
        A service is chosen when it is among configured DAS services
        and its keys overlap with DAS keys found for a selection or a
        condition key of the query. When the query names system(s),
        only the chosen services among them are returned.
        """
        skeys, cond = decompose(query)
        if  not skeys:
            skeys = []
        elif isinstance(skeys, str):
            skeys = [skeys]
        chosen = []
        # look-up services from Mapping DB
        for key in skeys + list(cond.keys()):
            value = cond.get(key, None)
            for service, keys in self.daskeysmap.items():
                if  service not in self.dasservices:
                    continue
                daskeys = self.dasmapping.find_daskey(service, key, value)
                if  not set(keys) & set(daskeys):
                    continue
                if  service not in chosen:
                    chosen.append(service)
        # look-up special key condition
        requested_system = query.get('system', None)
        if  not requested_system:
            return chosen
        if  isinstance(requested_system, basestring):
            requested_system = [requested_system]
        return list( set(chosen) & set(requested_system) )

    def service_apis_map(self, query):
        """
        Return dictionary of services and their APIs which correspond
        to provided query. An API is taken when the record keys of its
        DAS map cover all condition keys of the query, special DAS
        keys aside.
        """
        skeys, cond = decompose(query)
        if  not skeys:
            skeys = []
        elif isinstance(skeys, str):
            skeys = [skeys]
        mapkeys = [key for key in cond.keys() if key not in das_special_keys()]
        found = {}
        for srv in self.services(query):
            for api in self.dasmapping.list_apis(srv):
                dasmap = self.dasmapping.api_info(srv, api)['das_map']
                rec_keys = [row['rec_key'] for row in dasmap]
                if  set(mapkeys) & set(rec_keys) != set(mapkeys):
                    continue
                if  srv not in found:
                    found[srv] = [api]
                else:
                    new_list = found[srv] + [api]
                    found[srv] = list( set(new_list) )
        return found

    def params(self, query):
        """
        Return dictionary of parameters for DAS Core with the
        following keys: selkeys (selection keys), conditions and
        services (list of services without duplicates).
        """
        skeys, cond = decompose(query)
        unique = []
        for srv in self.services(query):
            if  srv in unique:
                continue
            unique.append(srv)
        return dict(selkeys=skeys, conditions=cond, services=unique)