Example #1
    def get_queries(self, dasquery):
        """
        Look-up (popular) queries in DAS analytics/logging db
        """
        das_timer('DASCore::get_queries', self.verbose)
        fields = dasquery.mongo_query.get('fields')
        spec   = dasquery.mongo_query.get('spec')
        if  'popular' in fields:
            res = self.analytics.get_popular_queries(spec)
        else:
            datestamp = spec.get('date')
            if  isinstance(datestamp, dict):
                value = datestamp.get('$in')
                res = \
                self.analytics.list_queries(after=value[0], before=value[1])
            elif isinstance(datestamp, int):
                res = self.analytics.list_queries(after=datestamp)
            elif not datestamp:
                res = self.analytics.list_queries()
            else:
                msg = 'Unsupported date value: %s' % datestamp
                raise Exception(msg)
        for row in res:
            rid = row.pop('_id')
            yield dict(das_query=row, _id=rid)
        das_timer('DASCore::get_queries', self.verbose)
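
A minimal consumption sketch for the generator above; `core` (a configured DASCore instance) and `dasquery` (a DASQuery selecting 'queries') are placeholder names, not part of the original code.

# Hypothetical usage; `core` and `dasquery` are assumed to exist.
for rec in core.get_queries(dasquery):
    # each yielded dict carries the original record id plus the query body
    print('%s %s' % (rec['_id'], rec['das_query']))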
Example #2
    def get_from_cache(self, dasquery, idx=0, limit=0, collection='merge'):
        """
        Look-up results from the merge cache and yield them for
        further processing.
        """
        das_timer('DASCore::get_from_cache', self.verbose)
        msg = 'col=%s, query=%s, idx=%s, limit=%s'\
                % (collection, dasquery, idx, limit)
        self.logger.info(msg)

        fields  = dasquery.mongo_query.get('fields', None)

        if  dasquery.mapreduce:
            res = self.rawcache.map_reduce(dasquery.mapreduce, dasquery)
        elif dasquery.aggregators:
            res = []
            _id = 0
            for func, key in dasquery.aggregators:
                rows = self.rawcache.get_from_cache(\
                        dasquery, collection=collection)
                data = getattr(das_aggregator, 'das_%s' % func)(key, rows)
                res += \
                [{'_id':_id, 'function': func, 'key': key, 'result': data}]
                _id += 1
        elif isinstance(fields, list) and 'queries' in fields:
            res = itertools.islice(self.get_queries(dasquery), idx, idx+limit)
        else:
            res = self.rawcache.get_from_cache(dasquery, idx, limit, \
                    collection=collection)
        for row in res:
            fix_times(row)
            yield row
        das_timer('DASCore::get_from_cache', self.verbose)
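
A hedged paging sketch for get_from_cache, again with placeholder `core` and `dasquery` objects; idx/limit are used here to walk the merge collection in pages of 20.

# Hypothetical paging loop; `core` and `dasquery` are placeholders.
idx, limit = 0, 20
while True:
    rows = list(core.get_from_cache(dasquery, idx=idx, limit=limit))
    if not rows:
        break
    for row in rows:
        print(row)
    idx += limit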
Example #3
    def insert_query_records(self, dasquery):
        """
        Insert DAS query records into DAS cache and return list of services
        which will answer this query
        """
        services = dasquery.services
        self.logger.info("Potential services = %s" % services)
        if not services:
            msg = "No data-services for query %s" % dasquery
            msg += "mongo_query: %s" % dasquery.mongo_query
            msg += "params: %s" % dasquery.params()
            print(dastimestamp("DAS WARNING "), msg)

        # get list of URI which can answer this query
        ack_services = []
        for srv in services:
            gen = [t for t in getattr(getattr(self, srv), "apimap")(dasquery)]
            for url, api, args, iformat, expire in gen:
                header = dasheader(srv, dasquery, expire, api, url, ctime=0)
                self.rawcache.insert_query_record(dasquery, header)
                if srv not in ack_services:
                    ack_services.append(srv)
        if not ack_services:
            ack_services = services
        if dasquery.query.find("records ") != -1:
            srv_status = True  # skip DAS queries w/ records request
        # create das record with an initial expire tstamp 2 min in the future;
        # it should be sufficient for processing data-srv records
        expire = time.time() + 2 * 60
        header = dasheader("das", dasquery, expire, api="das_core", services=dict(das=ack_services))
        header["lookup_keys"] = []
        self.rawcache.insert_query_record(dasquery, header)
        das_timer("das_record", self.verbose)
        return ack_services
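
A hedged sketch of how the returned service list is typically consumed, mirroring the pattern used by DASCore.call further down; `core` and `dasquery` are placeholders.

# Placeholder sketch: register the query, then dispatch one worker per
# acknowledged data-service, as DASCore.call does.
services = core.insert_query_records(dasquery)
for srv in sorted(services):
    core.worker(srv, dasquery)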
Example #4
    def insert_query_records(self, dasquery):
        """
        Insert DAS query records into DAS cache and return list of services
        which will answer this query
        """
        services = dasquery.services
        self.logger.info('Potential services = %s' % services)
        if  not services:
            msg  = 'No data-services for query %s' % dasquery
            msg += ' mongo_query: %s' % dasquery.mongo_query
            msg += ' params: %s' % dasquery.params()
            print dastimestamp('DAS WARNING '), msg

        # get list of URI which can answer this query
        ack_services = []
        for srv in services:
            gen = [t for t in getattr(getattr(self, srv), 'apimap')(dasquery)]
            for url, api, args, iformat, expire in gen:
                header = dasheader(srv, dasquery, expire, api, url, ctime=0)
                self.rawcache.insert_query_record(dasquery, header)
                if  srv not in ack_services:
                    ack_services.append(srv)
        if  not ack_services:
            ack_services = services
        if  dasquery.query.find('records ') != -1:
            srv_status = True # skip DAS queries w/ records request
        expire = 2*60 # 2 minutes, it should be overwritten by data-srv
        header = dasheader("das", dasquery, expire, api='das_core',
                services=dict(das=ack_services))
        header['lookup_keys'] = []
        self.rawcache.insert_query_record(dasquery, header)
        das_timer('das_record', self.verbose)
        return ack_services
Example #5
def getdata_pycurl(url,
                   params,
                   headers=None,
                   expire=3600,
                   post=None,
                   error_expire=300,
                   verbose=0,
                   ckey=None,
                   cert=None,
                   doseq=True,
                   system=None):
    "Fetch data via pycurl library"
    contact = 'data-service.'
    if system:
        contact = system + ' ' + contact
    if isinstance(params, dict):
        timer_key = '%s?%s' % (url, urllib.urlencode(params, doseq=True))
    else:
        timer_key = '%s?%s' % (url, params)
    das_timer(timer_key, verbose)
    handler = REQUEST_HANDLER
    try:
        data, expire = handler.getdata(url, params, headers, expire, post, \
                    error_expire, verbose, ckey, cert, doseq)
    except urllib2.HTTPError as httperror:
        msg  = 'urllib2.HTTPError, system=%s, url=%s, args=%s, headers=%s' \
                    % (system, url, json.dumps(params), json.dumps(headers))
        data = {
            'error': 'Received HTTP error from %s data-service' % contact,
            'reason': msg,
            'ts': time.time()
        }
        try:
            reason = extract_http_error(httperror.read())
            data.update({'reason': reason, 'request': msg})
            # TODO: err variable did not exist in this function!
            msg += '\n' + reason
        except Exception as exp:
            data.update({'httperror': None})
            msg += '\n' + str(exp)
        print(dastimestamp('getdata_pycurl'), msg)
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    except Exception as exp:
        msg  = 'HTTPError, system=%s, url=%s, args=%s, headers=%s' \
                    % (system, url, json.dumps(params), json.dumps(headers))
        print(dastimestamp('getdata_pycurl'), msg + '\n' + str(exp))
        data = {
            'error': 'Received generic error from %s data-service' % contact,
            'reason': msg,
            'ts': time.time()
        }
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    das_timer(timer_key, verbose)
    return data, expire
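
On failure the function returns a JSON-encoded error document (keys 'error', 'reason', 'ts') instead of the service payload; below is a small self-contained helper a caller might use to tell the two apart. The helper name is invented and not part of the DAS API.

import json

def is_error_payload(data):
    "Hypothetical helper: return the decoded error dict, or None for real data."
    if not isinstance(data, str):
        return None
    try:
        doc = json.loads(data)
    except ValueError:
        return None
    if isinstance(doc, dict) and 'error' in doc and 'reason' in doc:
        return doc
    return None

# usage sketch:
#   data, expire = getdata_pycurl(url, params)
#   error = is_error_payload(data)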
Example #6
def getdata_pycurl(
    url,
    params,
    headers=None,
    expire=3600,
    post=None,
    error_expire=300,
    verbose=0,
    ckey=None,
    cert=None,
    doseq=True,
    system=None,
):
    "Fetch data via pycurl library"
    contact = "data-service."
    if system:
        contact = system + " " + contact
    timer_key = "%s?%s" % (url, urllib.urlencode(params, doseq=True))
    das_timer(timer_key, verbose)
    handler = REQUEST_HANDLER
    try:
        data, expire = handler.getdata(url, params, headers, expire, post, error_expire, verbose, ckey, cert, doseq)
    except urllib2.HTTPError as httperror:
        msg = "urllib2.HTTPError, system=%s, url=%s, args=%s, headers=%s" % (
            system,
            url,
            json.dumps(params),
            json.dumps(headers),
        )
        data = {"error": "Unable to contact %s" % contact, "reason": msg, "ts": time.time()}
        try:
            reason = extract_http_error(httperror.read())
            data.update({"reason": reason, "request": msg})
            msg += "\n" + reason
        except Exception as exp:
            data.update({"httperror": None})
            msg += "\n" + str(exp)
        print dastimestamp("getdata_pycurl"), msg
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    except Exception as exp:
        msg = "HTTPError, system=%s, url=%s, args=%s, headers=%s" % (
            system,
            url,
            json.dumps(params),
            json.dumps(headers),
        )
        print dastimestamp("getdata_pycurl"), msg + "\n" + str(exp)
        data = {"error": "Unable to contact %s" % contact, "reason": msg, "ts": time.time()}
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    das_timer(timer_key, verbose)
    return data, expire
Example #7
def getdata_pycurl(url, params, headers=None, expire=3600, post=None,
    error_expire=300, verbose=0, ckey=None, cert=None, doseq=True, system=None):
    "Fetch data via pycurl library"
    contact = 'data-service.'
    if  system:
        contact = system + ' ' + contact
    if  isinstance(params, dict):
        timer_key = '%s?%s' % (url, urllib.urlencode(params, doseq=True))
    else:
        timer_key = '%s?%s' % (url, params)
    das_timer(timer_key, verbose)
    handler = REQUEST_HANDLER
    try:
        data, expire = handler.getdata(url, params, headers, expire, post, \
                    error_expire, verbose, ckey, cert, doseq)
    except urllib2.HTTPError as httperror:
        msg  = 'urllib2.HTTPError, system=%s, url=%s, args=%s, headers=%s' \
                    % (system, url, json.dumps(params), json.dumps(headers))
        data = {'error': 'Received HTTP error from %s data-service' % contact,
                'reason': msg, 'ts':time.time()}
        try:
            reason = extract_http_error(httperror.read())
            data.update({'reason': reason, 'request': msg})
            # TODO: err variable did not exist in this function!
            msg += '\n' + reason
        except Exception as exp:
            data.update({'httperror': None})
            msg += '\n' + str(exp)
        print(dastimestamp('getdata_pycurl'), msg)
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    except Exception as exp:
        msg  = 'HTTPError, system=%s, url=%s, args=%s, headers=%s' \
                    % (system, url, json.dumps(params), json.dumps(headers))
        print(dastimestamp('getdata_pycurl'), msg + '\n' + str(exp))
        data = {'error': 'Received generic error from %s data-service' % contact,
                'reason': msg, 'ts':time.time()}
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    das_timer(timer_key, verbose)
    return data, expire
Example #8
    def insert_query_records(self, dasquery):
        """
        Insert DAS query records into DAS cache and return list of services
        which will answer this query
        """
        services = dasquery.services
        self.logger.info('Potential services = %s' % services)
        if not services:
            msg = 'No data-services for query %s' % dasquery
            msg += ' mongo_query: %s' % dasquery.mongo_query
            msg += ' params: %s' % dasquery.params()
            print(dastimestamp('DAS WARNING '), msg)

        # get list of URI which can answer this query
        ack_services = []
        for srv in services:
            gen = [t for t in getattr(getattr(self, srv), 'apimap')(dasquery)]
            for url, api, args, iformat, expire in gen:
                header = dasheader(srv, dasquery, expire, api, url, ctime=0)
                self.rawcache.insert_query_record(dasquery, header)
                if srv not in ack_services:
                    ack_services.append(srv)
        if not ack_services:
            ack_services = services
        if dasquery.query.find('records ') != -1:
            srv_status = True  # skip DAS queries w/ records request
        # create das record with initial expire tstamp
        expire = time.time() + self.init_expire
        header = dasheader("das",
                           dasquery,
                           expire,
                           api='das_core',
                           services=dict(das=ack_services))
        header['lookup_keys'] = []
        self.rawcache.insert_query_record(dasquery, header)
        das_timer('das_record', self.verbose)
        return ack_services
Example #9
    def worker(self, srv, dasquery):
        """Main worker function which calls data-srv call function"""
        self.logger.info('##### %s ######\n' % srv)
        das_timer(srv, self.verbose)
        getattr(getattr(self, srv), 'call')(dasquery)
        das_timer(srv, self.verbose)
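
The worker resolves both the data-service object and its call method dynamically via getattr; the following self-contained toy example illustrates that dispatch pattern (all class and attribute names here are invented).

class ToyService(object):
    "Stand-in for a registered data-service plugin."
    def call(self, dasquery):
        print('processing %s' % dasquery)

class ToyCore(object):
    def __init__(self):
        self.dbs = ToyService()   # attribute name doubles as the service name

    def worker(self, srv, dasquery):
        # same double-getattr dispatch as DASCore.worker above
        getattr(getattr(self, srv), 'call')(dasquery)

ToyCore().worker('dbs', 'file dataset=/a/b/c')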
Example #10
def getdata_urllib(url,
                   params,
                   headers=None,
                   expire=3600,
                   post=None,
                   error_expire=300,
                   verbose=0,
                   ckey=None,
                   cert=None,
                   doseq=True,
                   system=None,
                   tstamp=None):
    """
    Invoke URL call and retrieve data from data-service based
    on provided URL and set of parameters. Use post=True to
    invoke POST request.
    """
    contact = 'data-service.'
    if system:
        contact = system + ' ' + contact
    timer_key = '%s?%s' % (url, urllib.urlencode(params, doseq=True))
    das_timer(timer_key, verbose)
    encoded_data = urllib.urlencode(params, doseq=doseq)
    if not post:
        url = url + '?' + encoded_data
    if not headers:
        headers = {}
    if tstamp and 'If-Modified-Since' not in headers.keys():
        headers['If-Modified-Since'] = http_timestamp(tstamp)
    if verbose:
        print('+++ getdata, url=%s, headers=%s' % (url, headers))
    req = urllib2.Request(url)
    for key, val in headers.items():
        req.add_header(key, val)
    if verbose > 1:
        handler = urllib2.HTTPHandler(debuglevel=1)
        opener = urllib2.build_opener(handler)
        urllib2.install_opener(opener)
    if ckey and cert:
        handler = HTTPSClientAuthHandler(ckey, cert, verbose)
        opener = urllib2.build_opener(handler)
        urllib2.install_opener(opener)
    try:
        time0 = time.time()
        if post:
            data = urllib2.urlopen(req, encoded_data)
        else:
            data = urllib2.urlopen(req)
        data_srv_time = time.time() - time0
        info = data.info()
        code = data.getcode()
        if verbose > 1:
            print("+++ response code:", code)
            print("+++ response info\n", info)
        try:  # get HTTP header and look for Expires
            e_time = expire_timestamp(\
                info.__dict__['dict']['expires'])
            if e_time < expire_timestamp(data_srv_time):
                expire = max(e_time, expire_timestamp(expire))
            elif e_time > time.time():
                expire = e_time
        except Exception as _exp:
            pass
    except urllib2.HTTPError as httperror:
        msg  = 'HTTPError, url=%s, args=%s, headers=%s' \
                    % (url, params, headers)
        data = {
            'error': 'Received HTTP error from %s data-service' % contact,
            'reason': msg
        }
        try:
            err = '%s %s' % (contact, extract_http_error(httperror.read()))
            data.update({'error': err})
            msg += '\n' + err
        except Exception as exp:
            data.update({'httperror': None})
            msg += '\n' + str(exp)
        print(msg)
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    except Exception as exp:
        msg  = 'HTTPError, url=%s, args=%s, headers=%s' \
                    % (url, params, headers)
        print(msg + '\n' + str(exp))
        data = {
            'error': 'Received generic error from %s data-service' % contact,
            'reason': msg
        }
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    das_timer(timer_key, verbose)
    return data, expire
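
A hedged usage sketch: passing tstamp turns the request into a conditional GET through the If-Modified-Since header added above. The endpoint and parameters are placeholders, and a Python 2 environment with the DAS helpers importable is assumed.

import time

url = 'https://example.org/api'          # placeholder endpoint
params = {'dataset': '/a/b/c'}           # placeholder parameters
one_hour_ago = time.time() - 3600
# data is a file-like response on success or a JSON error string on failure
data, expire = getdata_urllib(url, params, tstamp=one_hour_ago)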
Example #11
    def call(self, query, **kwds):
        """
        Top-level DAS API which executes a given query using the underlying
        data-services. It follows these steps:

            - parse the input query
            - identify data-services based on selection keys
              and where-clause conditions
            - construct the DAS workflow and execute data-service
              API calls. At this step individual
              data-services store results into the DAS cache.

        Return status 0/1 depending on success of the calls; it can be
        used by workers on the cache server.

        kwds is provided for compatibility with the web layer, e.g. it
        may invoke this method with an additional pid parameter.
        """

        def update_das_query(dasquery, status, reason=None):
            "Update DAS query record with given status and reason"
            self.rawcache.update_query_record(dasquery, status, reason=reason)
            self.rawcache.add_to_record(dasquery, {"das.timer": get_das_timer()}, system="das")
            # make sure that the das record is updated; we use up to 7 iterations,
            # which sum up to about 1 minute to cover the default syncdelay value
            # of the mongo server (in the future it would be better to determine
            # this syncdelay value programmatically, but it seems the pymongo
            # driver does not provide any API for it).
            for idx in range(0, 7):
                spec = {"qhash": dasquery.qhash, "das.system": ["das"]}
                res = self.rawcache.col.find_one(spec)
                if res:
                    dbstatus = res.get("das", {}).get("status", None)
                    if dbstatus == status:
                        break
                    msg = "qhash %s, das.status=%s, status=%s, wait for update" % (dasquery.qhash, dbstatus, status)
                    print(dastimestamp("DAS WARNING"), msg)
                self.rawcache.update_query_record(dasquery, status, reason=reason)
                time.sleep(idx * idx)

        self.logger.info("input query=%s" % query)
        das_timer("DASCore::call", self.verbose)
        if isinstance(query, object) and hasattr(query, "__class__") and query.__class__.__name__ == "DASQuery":
            dasquery = query
        else:
            dasquery = DASQuery(query)
        for col in ["merge", "cache"]:
            self.rawcache.remove_expired(dasquery, col)
        query = dasquery.mongo_query
        spec = query.get("spec")
        fields = query.get("fields")
        if fields == ["records"]:
            msg = "look-up all records in cache"
            self.logger.info(msg)
            return "in cache"
        if spec == dict(records="*"):
            self.logger.info("look-up everything in cache")
            return "in cache"
        for record in self.rawcache.find_specs(dasquery):
            status = record["das"]["status"]
            msg = "found query %s in cache, status=%s\n" % (record["query"], status)
            self.logger.info(msg)
            print(dastimestamp("DAS INFO"), msg)
            return status

        self.logger.info(dasquery)
        das_timer("das_record", self.verbose)
        services = self.insert_query_records(dasquery)
        if not services:
            msg = "unable to locate data-services to fulfill this request"
            msg += ", will iterate over all registered services"
            print(dastimestamp("DAS WARNING "), dasquery, msg)
            services = dasquery.services if dasquery.services else self.systems
        try:
            if self.multitask:
                jobs = []
                for srv in sorted(services):
                    jobs.append(self.taskmgr.spawn(self.worker, srv, dasquery))
                self.taskmgr.joinall(jobs)
            else:
                for srv in services:
                    self.worker(srv, dasquery)
        except Exception as exc:
            print_exc(exc)
            return "fail"
        self.logger.info("\n##### merging ######\n")
        update_das_query(dasquery, "merging")
        das_timer("merge", self.verbose)
        for attempt in range(0, 4):  # try couple of times to avoid DB problems
            time.sleep(attempt)
            status = self.rawcache.merge_records(dasquery, attempt)
            if status == "ok":
                break
        das_timer("merge", self.verbose)
        # check if we have service records and properly setup status
        self.logger.info("\n##### check services ######\n")
        das_services = self.rawcache.check_services(dasquery)
        reason = ""
        status = "ok"
        if not das_services:
            if "records" in dasquery.query:
                status = "ok"  # keep status ok for 'records' queries
            else:
                reason = "no data records found in DAS cache"
                status = "fail"
                print(dastimestamp("DAS ERROR "), dasquery, reason)
        update_das_query(dasquery, status, reason)
        das_timer("DASCore::call", self.verbose)
        return status
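
A hedged end-to-end sketch tying call and get_from_cache together; the import paths are assumptions about the DAS package layout, and a configured MongoDB backend is required for this to run.

# Assumed import paths; adjust to the actual DAS package layout.
from DAS.core.das_core import DASCore
from DAS.core.das_query import DASQuery

core = DASCore()
dasquery = DASQuery('dataset=/a/b/c')    # placeholder DAS query
if core.call(dasquery) == 'ok':
    for row in core.get_from_cache(dasquery, idx=0, limit=10):
        print(row)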
Example #12
def getdata_urllib(url, params, headers=None, expire=3600, post=None,
    error_expire=300, verbose=0, ckey=None, cert=None, doseq=True, system=None,
    tstamp=None):
    """
    Invoke URL call and retrieve data from data-service based
    on provided URL and set of parameters. Use post=True to
    invoke POST request.
    """
    contact = 'data-service.'
    if  system:
        contact = system + ' ' + contact
    timer_key = '%s?%s' % (url, urllib.urlencode(params, doseq=True))
    das_timer(timer_key, verbose)
    encoded_data = urllib.urlencode(params, doseq=doseq)
    if  not post:
        url = url + '?' + encoded_data
    if  not headers:
        headers = {}
    if  tstamp and 'If-Modified-Since' not in headers.keys():
        headers['If-Modified-Since'] = http_timestamp(tstamp)
    if  verbose:
        print('+++ getdata, url=%s, headers=%s' % (url, headers))
    req = urllib2.Request(url)
    for key, val in headers.items():
        req.add_header(key, val)
    if  verbose > 1:
        handler = urllib2.HTTPHandler(debuglevel=1)
        opener  = urllib2.build_opener(handler)
        urllib2.install_opener(opener)
    if  ckey and cert:
        handler = HTTPSClientAuthHandler(ckey, cert, verbose)
        opener  = urllib2.build_opener(handler)
        urllib2.install_opener(opener)
    try:
        time0 = time.time()
        if  post:
            data = urllib2.urlopen(req, encoded_data)
        else:
            data = urllib2.urlopen(req)
        data_srv_time = time.time()-time0
        info = data.info()
        code = data.getcode()
        if  verbose > 1:
            print("+++ response code:", code)
            print("+++ response info\n", info)
        try: # get HTTP header and look for Expires
            e_time = expire_timestamp(\
                info.__dict__['dict']['expires'])
            if  e_time < expire_timestamp(data_srv_time):
                expire = max(e_time, expire_timestamp(expire))
            elif e_time > time.time():
                expire = e_time
        except Exception as _exp:
            pass
    except urllib2.HTTPError as httperror:
        msg  = 'HTTPError, url=%s, args=%s, headers=%s' \
                    % (url, params, headers)
        data = {'error': 'Received HTTP error from %s data-service' % contact,
                'reason': msg}
        try:
            err  = '%s %s' % (contact, extract_http_error(httperror.read()))
            data.update({'error':err})
            msg += '\n' + err
        except Exception as exp:
            data.update({'httperror': None})
            msg += '\n' + str(exp)
        print(msg)
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    except Exception as exp:
        msg  = 'HTTPError, url=%s, args=%s, headers=%s' \
                    % (url, params, headers)
        print(msg + '\n' + str(exp))
        data = {'error': 'Received generic error from %s data-service' % contact,
                'reason': msg}
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    das_timer(timer_key, verbose)
    return data, expire
Example #13
    def call(self, query, **kwds):
        """
        Top-level DAS API which executes a given query using the underlying
        data-services. It follows these steps:

            - parse the input query
            - identify data-services based on selection keys
              and where-clause conditions
            - construct the DAS workflow and execute data-service
              API calls. At this step individual
              data-services store results into the DAS cache.

        Return status 0/1 depending on success of the calls; it can be
        used by workers on the cache server.

        kwds is provided for compatibility with the web layer, e.g. it
        may invoke this method with an additional pid parameter.
        """
        def update_das_query(dasquery, status, reason=None):
            "Update DAS query record with given status and reason"
            self.rawcache.update_query_record(dasquery, status, reason=reason)
            self.rawcache.add_to_record(\
                    dasquery, {'das.timer': get_das_timer()}, system='das')

        self.logger.info('input query=%s' % query)
        das_timer('DASCore::call', self.verbose)
        if  isinstance(query, object) and hasattr(query, '__class__')\
            and query.__class__.__name__ == 'DASQuery':
            dasquery = query
        else:
            dasquery = DASQuery(query)
        for col in ['merge', 'cache']:
            self.rawcache.remove_expired(dasquery, col)
        query  = dasquery.mongo_query
        spec   = query.get('spec')
        fields = query.get('fields')
        if  fields == ['records']:
            msg = 'look-up all records in cache'
            self.logger.info(msg)
            return 'in cache'
        if  spec == dict(records='*'):
            self.logger.info("look-up everything in cache")
            return 'in cache'
        for record in self.rawcache.find_specs(dasquery):
            status = record['das']['status']
            msg = 'found query %s in cache, status=%s\n' \
                        % (record['query'], status)
            self.logger.info(msg)
            print(dastimestamp('DAS INFO'), msg)
            return status

        self.logger.info(dasquery)
        das_timer('das_record', self.verbose)
        services = self.insert_query_records(dasquery)
        if  not services:
            msg = 'unable to locate data-services to fulfill this request'
            msg += ', will iterate over all registered services'
            print(dastimestamp('DAS WARNING '), dasquery, msg)
            services = dasquery.services if dasquery.services else self.systems
        try:
            if  self.multitask:
                jobs = []
                for srv in sorted(services):
                    jobs.append(self.taskmgr.spawn(self.worker, srv, dasquery))
                self.taskmgr.joinall(jobs)
            else:
                for srv in services:
                    self.worker(srv, dasquery)
        except Exception as exc:
            print_exc(exc)
            return 'fail'
        self.logger.info('\n##### merging ######\n')
        update_das_query(dasquery, 'merging')
        das_timer('merge', self.verbose)

        # check that all query record statuses are ok, i.e. we did insert records
        # this status is set by self.rawcache.update_cache
        for idx in range(self.collect_wait_time):
            records = self.rawcache.find_query_record(dasquery)
            statuses = []
            for row in records:
                system = row['das']['system']
                status = row['das']['status']
                self.logger.info("### query record status %s %s %s" % (dasquery.qhash, system, status))
                statuses.append(status)
            all_statuses = sorted(list(set(statuses)))
            # at this point we expect all services to have 'ok' and the das status to be 'merging'
            if len(all_statuses) == 2 and all_statuses == ['merging', 'ok']:
                break
            time.sleep(1)

        # now we can merge records
        status = self.rawcache.merge_records(dasquery)
        das_timer('merge', self.verbose)
        # check if we have service records and properly setup status
        self.logger.info('\n##### check services ######\n')
        das_services = self.rawcache.check_services(dasquery)
        reason = ''
        status = 'ok'
        if  not das_services:
            if  'records' in dasquery.query:
                status = 'ok' # keep status ok for 'records' queries
            else:
                reason = 'no data records found in DAS cache'
                status = 'fail'
                print(dastimestamp('DAS ERROR '), dasquery, reason)
        update_das_query(dasquery, status, reason)
        das_timer('DASCore::call', self.verbose)
        return status
Example #14
    def __init__(self,
                 config=None,
                 debug=0,
                 nores=False,
                 logger=None,
                 engine=None,
                 multitask=True):
        if config:
            dasconfig = config
        else:
            dasconfig = das_readconfig()
        verbose = dasconfig['verbose']
        self.stdout = debug
        if isinstance(debug, int) and debug:
            self.verbose = debug
            dasconfig['verbose'] = debug
        else:
            self.verbose = verbose
        das_timer('DASCore::init', self.verbose)
        self.operators = das_operators()
        self.collect_wait_time = dasconfig['das'].get('collect_wait_time', 120)

        # set noresults option
        self.noresults = False
        if nores:
            dasconfig['write_cache'] = True
            self.noresults = nores

        self.init_expire = dasconfig['das'].get('init_expire', 5 * 60)
        self.multitask = dasconfig['das'].get('multitask', True)
        if debug or self.verbose:
            self.multitask = False  # in verbose mode do not use multitask
            dasconfig['das']['multitask'] = False
        if not multitask:  # explicitly call DASCore ctor
            self.multitask = False
            dasconfig['das']['multitask'] = False
        dasconfig['engine'] = engine
        if self.multitask:
            nworkers = dasconfig['das'].get('core_workers', 5)
            #             if  engine:
            #                 thr_name = 'DASCore:PluginTaskManager'
            #                 self.taskmgr = PluginTaskManager(\
            #                         engine, nworkers=nworkers, name=thr_name)
            #                 self.taskmgr.subscribe()
            #             else:
            #                 thr_name = 'DASCore:TaskManager'
            #                 self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
            thr_name = 'DASCore:TaskManager'
            self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        if logger:
            self.logger = logger
        else:
            self.logger = PrintManager('DASCore', self.verbose)

        # define Mapping/Analytics/Parser in this order since Parser depends
        # on first two
        dasmapping = DASMapping(dasconfig)
        dasconfig['dasmapping'] = dasmapping
        self.mapping = dasmapping

        self.keylearning = DASKeyLearning(dasconfig)
        dasconfig['keylearning'] = self.keylearning

        # init DAS cache
        self.rawcache = DASMongocache(dasconfig)
        dasconfig['rawcache'] = self.rawcache

        # plug-in architecture: loop over registered data-services in
        # dasconfig; load appropriate module/class; register data
        # service with DASCore.
        self.systems = dasmapping.list_systems()
        # pointer to the DAS top level directory
        dasroot = '/'.join(__file__.split('/')[:-3])
        for name in self.systems:
            try:
                klass  = 'DAS/services/%s/%s_service.py' \
                    % (name, name)
                srvfile = os.path.join(dasroot, klass)
                with open(srvfile) as srvclass:
                    for line in srvclass:
                        if line.find('(DASAbstractService)') != -1:
                            klass = line.split('(DASAbstractService)')[0]
                            klass = klass.split('class ')[-1]
                            break
                mname = 'DAS.services.%s.%s_service' % (name, name)
                module = __import__(mname, fromlist=[klass])
                obj = getattr(module, klass)(dasconfig)
                setattr(self, name, obj)
            except IOError as err:
                if debug > 1:
                    # we have virtual services, so IOError can be correct
                    print_exc(err)
                try:
                    mname = 'DAS.services.generic_service'
                    module = __import__(mname, fromlist=['GenericService'])
                    obj = module.GenericService(name, dasconfig)
                    setattr(self, name, obj)
                except Exception as exc:
                    print_exc(exc)
                    msg = "Unable to load %s data-service plugin" % name
                    raise Exception(msg)
            except Exception as exc:
                print_exc(exc)
                msg = "Unable to load %s data-service plugin" % name
                raise Exception(msg)

        # loop over systems and get system keys, add mapping keys to final list
        self.service_keys = {}
        self.service_parameters = {}
        for name in self.systems:
            skeys = list(getattr(self, name).keys())
            self.service_keys[getattr(self, name).name] = skeys
            sparams = getattr(self, name).parameters()
            self.service_parameters[getattr(self, name).name] = sparams

        self.service_keys['special'] = das_special_keys()
        self.dasconfig = dasconfig
        das_timer('DASCore::init', self.verbose)
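
A minimal configuration sketch limited to the keys the constructor above actually reads; the values are illustrative defaults, and a real das_readconfig() result carries many more settings (mapping, cache and service sections).

# Illustrative subset of the configuration consumed by DASCore.__init__.
dasconfig = {
    'verbose': 0,
    'write_cache': True,   # the ctor forces this to True when nores=True
    'das': {
        'multitask': True,
        'core_workers': 5,
        'collect_wait_time': 120,
        'init_expire': 5 * 60,
    },
}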
Example #15
    def get_from_cache(self, dasquery, idx=0, limit=0, collection="merge"):
        """
        Look-up results from the merge cache and yield them for
        further processing.
        """
        das_timer("DASCore::get_from_cache", self.verbose)
        msg = "col=%s, query=%s, idx=%s, limit=%s" % (collection, dasquery, idx, limit)
        self.logger.info(msg)

        fields = dasquery.mongo_query.get("fields", None)

        if dasquery.mapreduce:
            res = self.rawcache.map_reduce(dasquery.mapreduce, dasquery)
        elif dasquery.aggregators:
            # extract das information from rawcache
            rows = self.rawcache.get_from_cache(dasquery, collection=collection)
            first = next(rows)
            sinfo = das_sinfo(first)
            # to perform aggregation we need:
            # - loop over all aggregator functions
            # - loop over all data-services
            # - loop over all APIs within a data-services
            # the code below does that, it applies aggregator
            # to selected (based on key/srv/api) records
            res = []
            _id = 0
            time0 = time.time()
            expire = 300  # min expire
            for func, key in dasquery.aggregators:
                afunc = getattr(das_aggregator, "das_%s" % func)
                found = False
                for srv, apis in sinfo.items():
                    for api in apis:
                        rows = self.rawcache.get_from_cache(dasquery, collection=collection)
                        gen = api_rows(rows, api)
                        data = afunc(key, gen)
                        ctime = time.time() - time0
                        das = dasheader(srv, dasquery, expire, api=api, ctime=ctime)
                        if isinstance(data, dict) and data["value"] != "N/A":
                            aggr = {"_id": _id, "function": func, "key": key, "result": data}
                            aggr.update(das)
                            res.append(aggr)
                            _id += 1
                            found = True
                if not found:  # when we got nothing add empty result record
                    empty = {"value": "N/A"}
                    ctime = time.time() - time0
                    das = dasheader("das", dasquery, expire, api="das_core", ctime=ctime)
                    rec = {"_id": 0, "function": func, "key": key, "result": empty}
                    rec.update(das)
                    res.append(rec)
        else:
            res = self.rawcache.get_from_cache(dasquery, idx, limit, collection=collection)
        # we assume that all records from single query will have
        # identical structure, therefore it will be sufficient to update
        # keylearning DB only with first record
        count = 0
        for row in res:
            if not count:
                self.keylearning.add_record(dasquery, row)
            fix_times(row)
            yield row
            count += 1
        das_timer("DASCore::get_from_cache", self.verbose)
Example #16
    def call(self, query, add_to_analytics=True, **kwds):
        """
        Top-level DAS API which executes a given query using the underlying
        data-services. It follows these steps:

            - parse the input query
            - identify data-services based on selection keys
              and where-clause conditions
            - construct the DAS workflow and execute data-service
              API calls. At this step individual
              data-services store results into the DAS cache.

        Return status 0/1 depending on success of the calls; it can be
        used by workers on the cache server.

        kwds is provided for compatibility with the web layer, e.g. it
        may invoke this method with an additional pid parameter.
        """
        def update_das_query(dasquery, status, reason=None):
            "Update DAS query record with given status and reason"
            self.rawcache.update_query_record(dasquery, status, reason=reason)
            self.rawcache.add_to_record(\
                    dasquery, {'das.timer': get_das_timer()}, system='das')
            # make sure that the das record is updated; we use up to 7 iterations,
            # which sum up to about 1 minute to cover the default syncdelay value
            # of the mongo server (in the future it would be better to determine
            # this syncdelay value programmatically, but it seems the pymongo
            # driver does not provide any API for it).
            for idx in xrange(1, 7):
                spec = {'qhash':dasquery.qhash, 'das.system':['das']}
                res = self.rawcache.col.find_one(spec)
                if  res:
                    dbstatus = res.get('das', {}).get('status', None)
                    if  dbstatus == status:
                        break
                    msg = 'qhash %s, das.status=%s, status=%s, wait for update' \
                            % (dasquery.qhash, dbstatus, status)
                    print dastimestamp('DAS WARNING'), msg
                time.sleep(idx*idx)
                self.rawcache.update_query_record(dasquery, status, reason=reason)

        self.logger.info('input query=%s' % query)
        das_timer('DASCore::call', self.verbose)
        if  isinstance(query, object) and hasattr(query, '__class__')\
            and query.__class__.__name__ == 'DASQuery':
            dasquery = query
        else:
            dasquery = DASQuery(query)
        for col in ['merge', 'cache']:
            self.rawcache.remove_expired(dasquery, col)
        if  add_to_analytics:
            dasquery.add_to_analytics()
        query  = dasquery.mongo_query
        spec   = query.get('spec')
        fields = query.get('fields')
        if  fields == ['records']:
            msg = 'look-up all records in cache'
            self.logger.info(msg)
            return 'in cache'
        if  spec == dict(records='*'):
            self.logger.info("look-up everything in cache")
            return 'in cache'
        for record in self.rawcache.find_specs(dasquery):
            status = record['das']['status']
            msg = 'found query %s in cache, status=%s\n' \
                        % (record['query'], status)
            self.logger.info(msg)
            print dastimestamp('DAS INFO'), msg
            return status

        self.logger.info(dasquery)
        das_timer('das_record', self.verbose)
        services = self.insert_query_records(dasquery)
        if  not services:
            msg = 'unable to locate data-services to fulfill this request'
            msg += ', will iterate over all registered services'
            print dastimestamp('DAS WARNING '), dasquery, msg
            services = dasquery.services if dasquery.services else self.systems
        try:
            if  self.multitask:
                jobs = []
                for srv in sorted(services):
                    jobs.append(self.taskmgr.spawn(self.worker, srv, dasquery))
                self.taskmgr.joinall(jobs)
            else:
                for srv in services:
                    self.worker(srv, dasquery)
        except Exception as exc:
            print_exc(exc)
            return 'fail'
        self.logger.info('\n##### merging ######\n')
        update_das_query(dasquery, 'merging')
        das_timer('merge', self.verbose)
        self.rawcache.merge_records(dasquery)
        das_timer('merge', self.verbose)
        # check if we have service records and properly setup status
        self.logger.info('\n##### check services ######\n')
        das_services = self.rawcache.check_services(dasquery)
        reason = ''
        status = 'ok'
        if  not das_services:
            if  'records' in dasquery.query:
                status = 'ok' # keep status ok for 'records' queries
            else:
                reason = 'no data records found in DAS cache'
                status = 'fail'
                print dastimestamp('DAS ERROR '), dasquery, reason
        update_das_query(dasquery, status, reason)
        das_timer('DASCore::call', self.verbose)
        return status
Example #17
    def worker(self, srv, dasquery):
        """Main worker function which calls data-srv call function"""
        self.logger.info('##### %s ######\n' % srv)
        das_timer(srv, self.verbose)
        getattr(getattr(self, srv), 'call')(dasquery)
        das_timer(srv, self.verbose)
Example #18
def getdata_urllib(
    url,
    params,
    headers=None,
    expire=3600,
    post=None,
    error_expire=300,
    verbose=0,
    ckey=None,
    cert=None,
    doseq=True,
    system=None,
):
    """
    Invoke URL call and retrieve data from data-service based
    on provided URL and set of parameters. Use post=True to
    invoke POST request.
    """
    contact = "data-service."
    if system:
        contact = system + " " + contact
    timer_key = "%s?%s" % (url, urllib.urlencode(params, doseq=True))
    das_timer(timer_key, verbose)
    encoded_data = urllib.urlencode(params, doseq=doseq)
    if not post:
        url = url + "?" + encoded_data
    if not headers:
        headers = {}
    if verbose:
        print "+++ getdata, url=%s, headers=%s" % (url, headers)
    req = urllib2.Request(url)
    for key, val in headers.iteritems():
        req.add_header(key, val)
    if verbose > 1:
        handler = urllib2.HTTPHandler(debuglevel=1)
        opener = urllib2.build_opener(handler)
        urllib2.install_opener(opener)
    if ckey and cert:
        handler = HTTPSClientAuthHandler(ckey, cert, verbose)
        opener = urllib2.build_opener(handler)
        urllib2.install_opener(opener)
    try:
        time0 = time.time()
        if post:
            data = urllib2.urlopen(req, encoded_data)
        else:
            data = urllib2.urlopen(req)
        data_srv_time = time.time() - time0
        try:  # get HTTP header and look for Expires
            e_time = expire_timestamp(data.info().__dict__["dict"]["expires"])
            if e_time < expire_timestamp(data_srv_time):
                expire = max(e_time, expire_timestamp(expire))
            elif e_time > time.time():
                expire = e_time
        except Exception as _exp:
            pass
    except urllib2.HTTPError as httperror:
        msg = "HTTPError, url=%s, args=%s, headers=%s" % (url, params, headers)
        data = {"error": "Unable to contact %s" % contact, "reason": msg}
        try:
            err = "%s %s" % (contact, extract_http_error(httperror.read()))
            data.update({"error": err})
            msg += "\n" + err
        except Exception as exp:
            data.update({"httperror": None})
            msg += "\n" + str(exp)
        print msg
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    except Exception as exp:
        msg = "HTTPError, url=%s, args=%s, headers=%s" % (url, params, headers)
        print msg + "\n" + str(exp)
        data = {"error": "Unable to contact %s" % contact, "reason": msg}
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    das_timer(timer_key, verbose)
    return data, expire
Example #19
    def call(self, query, add_to_analytics=True, **kwds):
        """
        Top-level DAS API which executes a given query using the underlying
        data-services. It follows these steps:

            - parse the input query
            - identify data-services based on selection keys
              and where-clause conditions
            - construct the DAS workflow and execute data-service
              API calls. At this step individual
              data-services store results into the DAS cache.

        Return status 0/1 depending on success of the calls; it can be
        used by workers on the cache server.

        kwds is provided for compatibility with the web layer, e.g. it
        may invoke this method with an additional pid parameter.
        """
        self.logger.info('input query=%s' % query)
        das_timer('DASCore::call', self.verbose)
        services = []
        if  isinstance(query, object) and hasattr(query, '__class__')\
            and query.__class__.__name__ == 'DASQuery':
            dasquery = query
        else:
            dasquery = DASQuery(query, mongoparser=self.mongoparser)
        if  add_to_analytics:
            dasquery.add_to_analytics()
        query = dasquery.mongo_query
        if  dasquery.mongo_query.has_key('system'):
            system = query['system']
            if  isinstance(system, str) or isinstance(system, unicode):
                services = [system]
            elif isinstance(system, list):
                services = system
            else:
                msg = 'Unsupported system=%s type=%s in DAS query' \
                        % (system, type(system))
                raise Exception(msg)
        spec   = query.get('spec')
        fields = query.get('fields')
        if  fields == ['records']:
            msg = 'look-up all records in cache'
            self.logger.info(msg)
            return 'in cache'
        if  spec == dict(records='*'):
            self.logger.info("look-up everything in cache")
            return 'in cache'
        for record in self.rawcache.find_specs(dasquery):
            status = record['das']['status']
            msg = 'found query %s in cache, status=%s\n' \
                        % (record['query'], status)
            self.logger.info(msg)
            return status
        similar_dasquery = self.rawcache.similar_queries(dasquery)
        if  similar_dasquery:
            for record in self.rawcache.find_specs(similar_dasquery):
                if  record:
                    try:
                        status = record['das']['status']
                    except:
                        status = 'N/A'
                        msg = 'Fail to look-up das.status, record=%s' % record
                        self.logger.info(msg)
                msg  = 'found SIMILAR query in cache,'
                msg += 'query=%s, status=%s\n' % (record['query'], status)
                self.logger.info(msg)
                return status

        self.logger.info(dasquery)
        params = dasquery.params()
        if  not services:
            services = params['services']
        self.logger.info('services = %s' % services)
        das_timer('das_record', self.verbose)
        # initial expire tstamp 1 day (long enough to be overwritten by data-srv)
        expire = expire_timestamp(time.time()+1*24*60*60)
        header = dasheader("das", dasquery, expire)
        header['lookup_keys'] = []
        self.rawcache.insert_query_record(dasquery, header)
        das_timer('das_record', self.verbose)
        try:
            if  self.multitask:
                jobs = []
                for srv in services:
                    jobs.append(self.taskmgr.spawn(self.worker, srv, dasquery))
                self.taskmgr.joinall(jobs)
            else:
                for srv in services:
                    self.worker(srv, dasquery)
        except Exception as exc:
            print_exc(exc)
            return 'fail'
        self.logger.info('\n##### merging ######\n')
        self.rawcache.update_query_record(dasquery, 'merging')
        das_timer('merge', self.verbose)
        self.rawcache.merge_records(dasquery)
        das_timer('merge', self.verbose)
        self.rawcache.update_query_record(dasquery, 'ok')
        self.rawcache.add_to_record(\
                dasquery, {'das.timer': get_das_timer()}, system='das')
        das_timer('DASCore::call', self.verbose)
        return 'ok'
Example #20
    def get_from_cache(self, dasquery, idx=0, limit=0, collection='merge'):
        """
        Look-up results from the merge cache and yield them for
        further processing.
        """
        das_timer('DASCore::get_from_cache', self.verbose)
        msg = 'col=%s, query=%s, idx=%s, limit=%s'\
                % (collection, dasquery, idx, limit)
        self.logger.info(msg)

        fields  = dasquery.mongo_query.get('fields', None)

        if  dasquery.mapreduce:
            res = self.rawcache.map_reduce(dasquery.mapreduce, dasquery)
        elif dasquery.aggregators:
            # extract das information from rawcache
            rows  = self.rawcache.get_from_cache(\
                    dasquery, collection=collection)
            first = rows.next()
            sinfo = das_sinfo(first)
            # to perform aggregation we need:
            # - loop over all aggregator functions
            # - loop over all data-services
            # - loop over all APIs within a data-services
            # the code below does that, it applies aggregator
            # to selected (based on key/srv/api) records
            res = []
            _id = 0
            time0  = time.time()
            expire = 300 # min expire
            for func, key in dasquery.aggregators:
                afunc = getattr(das_aggregator, 'das_%s' % func)
                found = False
                for srv, apis in sinfo.items():
                    for api in apis:
                        rows  = self.rawcache.get_from_cache(\
                                dasquery, collection=collection)
                        gen   = api_rows(rows, api)
                        data  = afunc(key, gen)
                        ctime = time.time() - time0
                        das   = dasheader(srv, dasquery, expire, api=api,
                                ctime=ctime)
                        if  isinstance(data, dict) and data['value'] != 'N/A':
                            aggr = {'_id':_id, 'function': func,
                                    'key': key, 'result': data}
                            aggr.update(das)
                            res.append(aggr)
                            _id += 1
                            found = True
                if  not found: # when we got nothing add empty result record
                    empty = {'value':'N/A'}
                    ctime = time.time() - time0
                    das = dasheader('das', dasquery, expire, api='das_core',
                            ctime=ctime)
                    rec = {'_id':0, 'function':func, 'key':key, 'result':empty}
                    rec.update(das)
                    res.append(rec)
        elif isinstance(fields, list) and 'queries' in fields:
            res = itertools.islice(self.get_queries(dasquery), idx, idx+limit)
        else:
            res = self.rawcache.get_from_cache(dasquery, idx, limit, \
                    collection=collection)
        for row in res:
            fix_times(row)
            yield row
        das_timer('DASCore::get_from_cache', self.verbose)
Example #21
    def call(self, query, **kwds):
        """
        Top-level DAS API which executes a given query using the underlying
        data-services. It follows these steps:

            - parse the input query
            - identify data-services based on selection keys
              and where-clause conditions
            - construct the DAS workflow and execute data-service
              API calls. At this step individual
              data-services store results into the DAS cache.

        Return status 0/1 depending on success of the calls; it can be
        used by workers on the cache server.

        kwds is provided for compatibility with the web layer, e.g. it
        may invoke this method with an additional pid parameter.
        """
        def update_das_query(dasquery, status, reason=None):
            "Update DAS query record with given status and reason"
            self.rawcache.update_query_record(dasquery, status, reason=reason)
            self.rawcache.add_to_record(\
                    dasquery, {'das.timer': get_das_timer()}, system='das')

        self.logger.info('input query=%s' % query)
        das_timer('DASCore::call', self.verbose)
        if  isinstance(query, object) and hasattr(query, '__class__')\
            and query.__class__.__name__ == 'DASQuery':
            dasquery = query
        else:
            dasquery = DASQuery(query)
        for col in ['merge', 'cache']:
            self.rawcache.remove_expired(dasquery, col)
        query = dasquery.mongo_query
        spec = query.get('spec')
        fields = query.get('fields')
        if fields == ['records']:
            msg = 'look-up all records in cache'
            self.logger.info(msg)
            return 'in cache'
        if spec == dict(records='*'):
            self.logger.info("look-up everything in cache")
            return 'in cache'
        for record in self.rawcache.find_specs(dasquery):
            status = record['das']['status']
            msg = 'found query %s in cache, status=%s\n' \
                        % (record['query'], status)
            self.logger.info(msg)
            print(dastimestamp('DAS INFO'), msg)
            return status

        self.logger.info(dasquery)
        das_timer('das_record', self.verbose)
        services = self.insert_query_records(dasquery)
        if not services:
            msg = 'unable to locate data-services to fulfill this request'
            msg += ', will iterate over all registered services'
            print(dastimestamp('DAS WARNING '), dasquery, msg)
            services = dasquery.services if dasquery.services else self.systems
        try:
            if self.multitask:
                jobs = []
                for srv in sorted(services):
                    jobs.append(self.taskmgr.spawn(self.worker, srv, dasquery))
                self.taskmgr.joinall(jobs)
            else:
                for srv in services:
                    self.worker(srv, dasquery)
        except Exception as exc:
            print_exc(exc)
            return 'fail'
        self.logger.info('\n##### merging ######\n')
        update_das_query(dasquery, 'merging')
        das_timer('merge', self.verbose)

        # check that all query record statuses are 'ok', i.e. every service did
        # insert its records; this status is set by self.rawcache.update_cache
        for idx in range(self.collect_wait_time):
            records = self.rawcache.find_query_record(dasquery)
            statuses = []
            for row in records:
                system = row['das']['system']
                status = row['das']['status']
                self.logger.info("### query record status %s %s %s" %
                                 (dasquery.qhash, system, status))
                statuses.append(status)
            all_statuses = sorted(list(set(statuses)))
            # at this point we expect every service record to be 'ok' and the
            # das query record to be 'merging'
            if all_statuses == ['merging', 'ok']:
                break
            time.sleep(1)

        # now we can merge records
        status = self.rawcache.merge_records(dasquery)
        das_timer('merge', self.verbose)
        # check if we have service records and properly setup status
        self.logger.info('\n##### check services ######\n')
        das_services = self.rawcache.check_services(dasquery)
        reason = ''
        status = 'ok'
        if not das_services:
            if 'records' in dasquery.query:
                status = 'ok'  # keep status ok for 'records' queries
            else:
                reason = 'no data records found in DAS cache'
                status = 'fail'
                print(dastimestamp('DAS ERROR '), dasquery, reason)
        update_das_query(dasquery, status, reason)
        das_timer('DASCore::call', self.verbose)
        return status
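The workflow above can be exercised end to end: call() populates and merges the cache and returns a status string, after which results are read back with get_from_cache(). The sketch below is illustrative only; module paths, constructor arguments and the query text are assumptions.

# Hedged usage sketch: drive a query through DASCore.call and read back the
# merged results. Module paths, constructor arguments and the query text are
# assumptions; a configured DAS/MongoDB backend is required to actually run.
from DAS.core.das_core import DASCore
from DAS.core.das_query import DASQuery

core = DASCore(multitask=False)            # single-threaded, as in verbose mode
dasquery = DASQuery('dataset dataset=/ZMM*/*/*')   # hypothetical DAS-QL query
status = core.call(dasquery)               # e.g. 'ok', 'fail' or 'in cache'
if status == 'ok':
    for row in core.get_from_cache(dasquery, idx=0, limit=10):
        print(row)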
Example #22
0
    def __init__(self, config=None, debug=0,
                nores=False, logger=None, engine=None, multitask=True):
        if  config:
            dasconfig = config
        else:
            dasconfig = das_readconfig()
        verbose       = dasconfig['verbose']
        self.stdout   = debug
        if  isinstance(debug, int):
            self.verbose = debug
            dasconfig['verbose'] = debug
        else:
            self.verbose = verbose
        das_timer('DASCore::init', self.verbose)
        self.operators = das_operators()

        # set noresults option
        self.noresults = False
        if  nores:
            dasconfig['write_cache'] = True
            self.noresults = nores

        self.multitask = dasconfig['das'].get('multitask', True)
        if  debug or self.verbose:
            self.multitask = False # in verbose mode do not use multitask
            dasconfig['das']['multitask'] = False
        if  not multitask: # explicitly call DASCore ctor, e.g. in analytics
            self.multitask = False
            dasconfig['das']['multitask'] = False
        dasconfig['engine'] = engine
        if  self.multitask:
            nworkers = dasconfig['das'].get('core_workers', 5)
            if  engine:
                thr_name = 'DASCore:PluginTaskManager'
                self.taskmgr = PluginTaskManager(\
                        engine, nworkers=nworkers, name=thr_name)
                self.taskmgr.subscribe()
            else:
                thr_name = 'DASCore:TaskManager'
                self.taskmgr = TaskManager(nworkers=nworkers, name=thr_name)
        else:
            self.taskmgr = None

        if  logger:
            self.logger = logger
        else:
            self.logger = PrintManager('DASCore', self.verbose)

        # define Mapping/Analytics/Parser in this order since Parser depends
        # on first two
        dasmapping = DASMapping(dasconfig)
        dasconfig['dasmapping'] = dasmapping
        self.mapping = dasmapping

        self.analytics = DASAnalytics(dasconfig)
        dasconfig['dasanalytics'] = self.analytics

        self.keylearning = DASKeyLearning(dasconfig)
        dasconfig['keylearning'] = self.keylearning

        # init DAS cache
        self.rawcache = DASMongocache(dasconfig)
        dasconfig['rawcache'] = self.rawcache

        # plug-in architecture: loop over registered data-services in
        # dasconfig; load appropriate module/class; register data
        # service with DASCore.
        self.systems = dasmapping.list_systems()
        # pointer to the DAS top level directory
        dasroot = '/'.join(__file__.split('/')[:-3])
        for name in self.systems:
            try:
                klass  = 'DAS/services/%s/%s_service.py' \
                    % (name, name)
                srvfile = os.path.join(dasroot, klass)
                with open(srvfile) as srvclass:
                    for line in srvclass:
                        if  line.find('(DASAbstractService)') != -1:
                            klass = line.split('(DASAbstractService)')[0]
                            klass = klass.split('class ')[-1] 
                            break
                mname  = 'DAS.services.%s.%s_service' % (name, name)
                module = __import__(mname, fromlist=[klass])
                obj = getattr(module, klass)(dasconfig)
                setattr(self, name, obj)
                SERVICES[name] = obj
            except IOError as err:
                if  debug > 1:
                    # we have virtual services, so IOError can be correct
                    print_exc(err)
                try:
                    mname  = 'DAS.services.generic_service'
                    module = __import__(mname, fromlist=['GenericService'])
                    obj    = module.GenericService(name, dasconfig)
                    setattr(self, name, obj)
                except Exception as exc:
                    print_exc(exc)
                    msg = "Unable to load %s data-service plugin" % name
                    raise Exception(msg)
            except Exception as exc:
                print_exc(exc)
                msg = "Unable to load %s data-service plugin" % name
                raise Exception(msg)

        # loop over systems and get system keys, add mapping keys to final list
        self.service_keys = {}
        self.service_parameters = {}
        for name in self.systems: 
            skeys = getattr(self, name).keys()
            self.service_keys[getattr(self, name).name] = skeys
            sparams = getattr(self, name).parameters()
            self.service_parameters[getattr(self, name).name] = sparams

        self.service_keys['special'] = das_special_keys()
        self.dasconfig = dasconfig
        das_timer('DASCore::init', self.verbose)
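The plugin loop above derives the class name of each data-service by scanning DAS/services/<name>/<name>_service.py for the '(DASAbstractService)' pattern and then imports that module. A minimal plugin skeleton that such a scan would pick up might look like the sketch below; the file location, the base-class import path and the attribute names are assumptions.

# Hedged sketch of a data-service plugin the loader above could discover,
# e.g. saved as DAS/services/myservice/myservice_service.py (path assumed).
from DAS.services.abstract_service import DASAbstractService  # import path assumed

class MyService(DASAbstractService):
    """Minimal data-service plugin skeleton (illustrative only)"""
    def __init__(self, config):
        DASAbstractService.__init__(self, 'myservice', config)
        # service/API mappings would normally be looked up here; the
        # dasmapping attribute and servicemap call are assumptions
        self.map = self.dasmapping.servicemap(self.name)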
Example #23
0
    def get_from_cache(self, dasquery, idx=0, limit=0, collection='merge'):
        """
        Look-up results from the merge cache and yield them for
        further processing.
        """
        das_timer('DASCore::get_from_cache', self.verbose)
        msg = 'col=%s, query=%s, idx=%s, limit=%s'\
                % (collection, dasquery, idx, limit)
        self.logger.info(msg)

        fields = dasquery.mongo_query.get('fields', None)

        if dasquery.mapreduce:
            res = self.rawcache.map_reduce(dasquery.mapreduce, dasquery)
        elif dasquery.aggregators:
            # extract das information from rawcache
            rows  = self.rawcache.get_from_cache(\
                    dasquery, collection=collection)
            first = next(rows)
            sinfo = das_sinfo(first)
            # to perform aggregation we need:
            # - loop over all aggregator functions
            # - loop over all data-services
            # - loop over all APIs within a data-services
            # the code below does that, it applies aggregator
            # to selected (based on key/srv/api) records
            res = []
            _id = 0
            time0 = time.time()
            expire = 300  # min expire
            for func, key in dasquery.aggregators:
                afunc = getattr(das_aggregator, 'das_%s' % func)
                found = False
                for srv, apis in sinfo.items():
                    for api in apis:
                        rows  = self.rawcache.get_from_cache(\
                                dasquery, collection=collection)
                        gen = api_rows(rows, api)
                        data = afunc(key, gen)
                        ctime = time.time() - time0
                        das = dasheader(srv,
                                        dasquery,
                                        expire,
                                        api=api,
                                        ctime=ctime)
                        if isinstance(data, dict) and data['value'] != 'N/A':
                            aggr = {
                                '_id': _id,
                                'function': func,
                                'key': key,
                                'result': data
                            }
                            aggr.update(das)
                            res.append(aggr)
                            _id += 1
                            found = True
                if not found:  # if nothing was found, add an empty result record
                    empty = {'value': 'N/A'}
                    ctime = time.time() - time0
                    das = dasheader('das',
                                    dasquery,
                                    expire,
                                    api='das_core',
                                    ctime=ctime)
                    rec = {
                        '_id': 0,
                        'function': func,
                        'key': key,
                        'result': empty
                    }
                    rec.update(das)
                    res.append(rec)
        else:
            res = self.rawcache.get_from_cache(dasquery, idx, limit, \
                    collection=collection)
        # we assume that all records from single query will have
        # identical structure, therefore it will be sufficient to update
        # keylearning DB only with first record
        count = 0
        for row in res:
            if not count:
                self.keylearning.add_record(dasquery, row)
            fix_times(row)
            yield row
            count += 1
        das_timer('DASCore::get_from_cache', self.verbose)
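The aggregation branch relies on an api_rows helper to restrict cache rows to a single API before the aggregator runs; its implementation is not shown in this example. A sketch consistent with the dasheader records used above, assuming each row stores the producing APIs under row['das']['api'], could be:

# Hedged sketch of an api_rows-style filter; the row['das']['api'] layout
# is an assumption based on the dasheader records built above.
def api_rows(rows, api):
    "Yield only rows produced by the given data-service API"
    for row in rows:
        if api in row.get('das', {}).get('api', []):
            yield row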