Example #1
 def getdata_helper(self, url, params, expire, headers=None, post=None):
     "Helper function to get data from SiteDB or local cache"
     cname = url.split('/')[-1].replace('-', '_')
     col   = self.localcache.conn[self.name][cname]
     local = col.find_one({'expire':{'$gt':expire_timestamp(time.time())}})
     data  = None
     if  local:
         msg = 'SiteDBService reads from %s.%s' % (self.name, cname)
         self.logger.info(msg)
         try: # get data from local cache
             data = [r for r in col.find() if 'expire' not in r][0]
             del data['_id']
         except Exception as exc:
             print_exc(exc)
             data = {}
     if  not data or not local:
         headers = {'Accept':'application/json'}
         datastream, expire = getdata(\
                 url, params, headers, expire, post,
                 self.error_expire, self.verbose, self.ckey, self.cert,
                 system=self.name)
         try: # read data and write it to local cache
             data = json.load(datastream)
             datastream.close()
             col.remove()
             col.insert(data)
             col.insert({'expire':expire_timestamp(expire)})
         except Exception as exc:
             print_exc(exc)
     return data, expire
Example #2
 def getdata_helper(self, url, params, expire, headers=None, post=None):
     "Helper function to get data from SiteDB or local cache"
     cname = url.split('/')[-1].replace('-', '_')
     conn  = db_connection(self.dburi)
     col   = conn[self.name][cname]
     local = find_one(col, {'expire':{'$gt':expire_timestamp(time.time())}})
     data  = None
     if  local:
         msg = 'SiteDBService reads from %s.%s' % (self.name, cname)
         self.logger.info(msg)
         try: # get data from local cache
             data = [r for r in col.find() if 'expire' not in r][0]
             del data['_id']
         except Exception as exc:
             print_exc(exc)
             data = {}
     if  not data or not local:
         headers = {'Accept':'application/json'}
         datastream, expire = getdata(\
                 url, params, headers, expire, post,
                 self.error_expire, self.verbose, self.ckey, self.cert,
                 system=self.name)
         try: # read data and write it to local cache
             data = json.load(datastream)
             datastream.close()
             col.remove()
             col.insert(data)
             col.insert({'expire':expire_timestamp(expire)})
         except Exception as exc:
             print_exc(exc)
     return data, expire
Example #3
def dasheader(system, dasquery, expire, api=None, url=None, ctime=None,
        services=None):
    """
    Return DAS header (dict) wrt DAS specifications:

         - system represents DAS services, e.g. combined
         - dasquery is DASQuery representation
         - expire is expire timestamp of the record
         - api is data-service API name
         - url is data-service URL
         - ctime is current timestamp
         - services is a dict (or list of dicts) of CMS services that
           contributed to the data record, e.g. the combined service
           uses dbs and phedex
    """
    # tstamp must be integer in order for json encoder/decoder to
    # work properly, see utils/jsonwrapper/__init__.py
    tstamp = round(time.time())
    if  isinstance(system, basestring):
        system = [system]
    if  not api:
        dasdict = dict(system=system, ts=tstamp,
                    expire=expire_timestamp(expire),
                    status="requested")
    else:
        dasdict = dict(system=system, ts=tstamp,
                    url=[url], ctime=[ctime],
                    expire=expire_timestamp(expire), urn=[api],
                    api=[api], status="requested")
    if  services:
        if  isinstance(services, dict):
            services = [services]
        dasdict.update({"services": services})
    return dict(das=dasdict)
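A minimal usage sketch of the dasheader() defined above, assuming the function and its DAS-utils dependencies (expire_timestamp, the basestring shim) are importable; the argument values are illustrative, not taken from a real DAS session:

import time

# illustrative call; dasquery is not used inside dasheader() itself
header = dasheader('combined', dasquery=None, expire=time.time() + 3600,
                   api='combined_dataset4site', url='https://cmsweb.cern.ch',
                   ctime=0.2, services={'combined': ['dbs', 'phedex']})
print(header['das']['system'])    # ['combined']
print(header['das']['status'])    # 'requested'
print(header['das']['services'])  # [{'combined': ['dbs', 'phedex']}]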
Example #4
def getdata_pycurl(url,
                   params,
                   headers=None,
                   expire=3600,
                   post=None,
                   error_expire=300,
                   verbose=0,
                   ckey=None,
                   cert=None,
                   doseq=True,
                   system=None):
    "Fetch data via pycurl library"
    contact = 'data-service.'
    if system:
        contact = system + ' ' + contact
    if isinstance(params, dict):
        timer_key = '%s?%s' % (url, urllib.urlencode(params, doseq=True))
    else:
        timer_key = '%s?%s' % (url, params)
    das_timer(timer_key, verbose)
    handler = REQUEST_HANDLER
    try:
        data, expire = handler.getdata(url, params, headers, expire, post, \
                    error_expire, verbose, ckey, cert, doseq)
    except urllib2.HTTPError as httperror:
        msg  = 'urllib2.HTTPError, system=%s, url=%s, args=%s, headers=%s' \
                    % (system, url, json.dumps(params), json.dumps(headers))
        data = {
            'error': 'Received HTTP error from %s data-service' % contact,
            'reason': msg,
            'ts': time.time()
        }
        try:
            reason = extract_http_error(httperror.read())
            data.update({'reason': reason, 'request': msg})
            # TODO: err variable did not exist in this function!
            msg += '\n' + reason
        except Exception as exp:
            data.update({'httperror': None})
            msg += '\n' + str(exp)
        print(dastimestamp('getdata_pycurl'), msg)
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    except Exception as exp:
        msg  = 'HTTPError, system=%s, url=%s, args=%s, headers=%s' \
                    % (system, url, json.dumps(params), json.dumps(headers))
        print(dastimestamp('getdata_pycurl'), msg + '\n' + str(exp))
        data = {
            'error': 'Received generic error from %s data-service' % contact,
            'reason': msg,
            'ts': time.time()
        }
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    das_timer(timer_key, verbose)
    return data, expire
Example #5
def getdata_pycurl(
    url,
    params,
    headers=None,
    expire=3600,
    post=None,
    error_expire=300,
    verbose=0,
    ckey=None,
    cert=None,
    doseq=True,
    system=None,
):
    "Fetch data via pycurl library"
    contact = "data-service."
    if system:
        contact = system + " " + contact
    timer_key = "%s?%s" % (url, urllib.urlencode(params, doseq=True))
    das_timer(timer_key, verbose)
    handler = REQUEST_HANDLER
    try:
        data, expire = handler.getdata(url, params, headers, expire, post, error_expire, verbose, ckey, cert, doseq)
    except urllib2.HTTPError as httperror:
        msg = "urllib2.HTTPError, system=%s, url=%s, args=%s, headers=%s" % (
            system,
            url,
            json.dumps(params),
            json.dumps(headers),
        )
        data = {"error": "Unable to contact %s" % contact, "reason": msg, "ts": time.time()}
        try:
            reason = extract_http_error(httperror.read())
            data.update({"reason": reason, "request": msg})
            msg += "\n" + reason
        except Exception as exp:
            data.update({"httperror": None})
            msg += "\n" + str(exp)
        print dastimestamp("getdata_pycurl"), msg
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    except Exception as exp:
        msg = "HTTPError, system=%s, url=%s, args=%s, headers=%s" % (
            system,
            url,
            json.dumps(params),
            json.dumps(headers),
        )
        print dastimestamp("getdata_pycurl"), msg + "\n" + str(exp)
        data = {"error": "Unable to contact %s" % contact, "reason": msg, "ts": time.time()}
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    das_timer(timer_key, verbose)
    return data, expire
Example #6
    def getdata(self, url, params, headers=None, expire=3600, post=False,
                error_expire=300, verbose=0, ckey=None, cert=None, doseq=True):
        """Fetch data for given set of parameters"""
        time0     = time.time()
        thread    = threading.current_thread().ident
        if  post:
            cache = self.pcache
        else:
            cache = self.gcache
        if  thread in cache:
            curl  = cache.get(thread)
        else:
            curl  = pycurl.Curl()
            cache[thread] = curl
#        print "\n+++ getdata curl gcache", self.gcache.keys()
#        print "+++ getdata curl pcache", self.pcache.keys()
        bbuf, hbuf = self.set_opts(curl, url, params, headers,
                ckey, cert, verbose, post, doseq)
        curl.perform()

        http_header = hbuf.getvalue()

#        data = parse_body(bbuf.getvalue())
#        data = bbuf.getvalue() # read entire content
#        bbuf.flush()
        bbuf.seek(0) # use the file-descriptor interface: seek to the beginning of the stream
        data = bbuf # leave StringIO object, which will serve as file descriptor
        expire = get_expire(http_header, error_expire, verbose)
        hbuf.flush()

        # check for HTTP error
        http_code = curl.getinfo(pycurl.HTTP_CODE)

        # get HTTP status message and Expires
        http_expire  = ''
        http_msg = ''
        for item in http_header.splitlines():
            if  pat_http_msg.match(item):
                http_msg = item
            if  pat_expires.match(item):
                http_expire = item.split('Expires:')[-1].strip()
                e_time = expire_timestamp(http_expire)
                if  e_time < expire_timestamp(time0):
                    expire = max(e_time, expire_timestamp(expire))
                elif e_time > time.time():
                    expire = e_time

        if  http_code < 200 or http_code >= 300:
            effective_url = curl.getinfo(pycurl.EFFECTIVE_URL)
            raise HTTPError(effective_url, http_code, http_msg, \
                    http_header, data)
        return data, expire
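The pat_http_msg and pat_expires names used in the header loop above are module-level regular expressions in the original source; the following shapes are an assumption, inferred only from how the patterns are applied to header lines here:

import re

# assumed reconstructions; the real DAS definitions may differ
pat_http_msg = re.compile(r'^HTTP/\d\.\d\s+\d{3}')  # e.g. 'HTTP/1.1 200 OK'
pat_expires  = re.compile(r'^Expires:')             # e.g. 'Expires: Mon, ... GMT'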
Example #7
def dasheader(system, dasquery, expire, api=None, url=None, ctime=None):
    """
    Return DAS header (dict) wrt DAS specifications, see
    https://twiki.cern.ch/twiki/bin/view/CMS/DMWMDataAggregationService#DAS_data_service_compliance
    """
    if  not api:
        dasdict = dict(system=[system], timestamp=time.time(),
                    expire=expire_timestamp(expire),
                    status="requested")
    else:
        dasdict = dict(system=[system], timestamp=time.time(),
                    url=[url], ctime=[ctime],
                    expire=expire_timestamp(expire), urn=[api],
                    api=[api], status="requested")
    return dict(das=dasdict)
Example #8
    def update_cache(self, dasquery, results, header):
        """
        Insert results into the cache. Use bulk inserts controlled by
        self.cache_size. Upon completion ensure indexes.
        """
        # insert/check query record in DAS cache
        self.insert_query_record(dasquery, header)

        # update results records in DAS cache
        gen  = self.generate_records(dasquery, results, header)
        inserted = 0
        # bulk insert
        try:
            if  pymongo.version.startswith('3.'): # pymongo 3.X
                res = self.col.insert_many(gen, ordered=False, bypass_document_validation=True)
                inserted += len(res.inserted_ids)
            else:
                while True:
                    nres = self.col.insert(itertools.islice(gen, self.cache_size))
                    if  nres and isinstance(nres, list):
                        inserted += len(nres)
                    else:
                        break
        except InvalidOperation:
            pass

        if  dasquery.qcache: # custom DASQuery cache
            self.update_das_expire(dasquery, expire_timestamp(dasquery.qcache))
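The pymongo-2 branch above consumes the record generator in slices of self.cache_size; here is a standalone illustration of that itertools.islice pattern, using sample data only:

import itertools

def chunked(gen, size):
    "Yield successive lists of at most `size` items from a generator"
    while True:
        chunk = list(itertools.islice(gen, size))
        if not chunk:
            break
        yield chunk

for batch in chunked(iter(range(10)), 4):
    print(batch)  # [0, 1, 2, 3], then [4, 5, 6, 7], then [8, 9]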
Example #9
 def parser(self, query, dformat, source, api):
     """
     DBS3 data-service parser.
     """
     if  isinstance(source, GeneratorType):
         for row in source:
             yield row
         return
     for row in self.parser_helper(query, dformat, source, api):
         mod_time = get_modification_time(row)
         if  self.extended_expire:
             new_expire = expire_timestamp(self.extended_expire)
             if  mod_time and \
                 old_timestamp(mod_time, self.extended_threshold):
                 row.update({'das':{'expire': new_expire}})
             # filesummaries is a summary DBS API about a dataset; it
             # collects information about the number of files/blocks/events
             # for a given dataset and therefore will be merged with the
             # datasets API record. To make a proper merge with the extended
             # timestamp/threshold options I need to explicitly assign
             # das.expire=extended_timestamp, otherwise the merged record
             # will pick up the smallest expire between the filesummaries
             # and datasets records.
             if  api == 'filesummaries':
                 row.update({'das': {'expire': new_expire}})
         yield row
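get_modification_time() and old_timestamp() are imported from DAS utils in the original module; a hedged stub of old_timestamp() consistent with its use above, treating the threshold as seconds relative to now, might be:

import time

def old_timestamp(tstamp, threshold=0):
    "Assumed semantics: is tstamp older than now minus threshold seconds?"
    return tstamp < (time.time() - threshold)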
Example #10
def dasheader(system,
              dasquery,
              expire,
              api=None,
              url=None,
              ctime=None,
              services=None):
    """
    Return DAS header (dict) wrt DAS specifications:

         - system represents DAS services, e.g. combined
         - dasquery is DASQuery representation
         - expire is expire timestamp of the record
         - api is data-service API name
         - url is data-service URL
         - ctime is current timestamp
         - services is a dict (or list of dicts) of CMS services that
           contributed to the data record, e.g. the combined service
           uses dbs and phedex
    """
    # tstamp must be integer in order for json encoder/decoder to
    # work properly, see utils/jsonwrapper/__init__.py
    tstamp = round(time.time())
    if isinstance(system, basestring):
        system = [system]
    if not api:
        dasdict = dict(system=system,
                       ts=tstamp,
                       expire=expire_timestamp(expire),
                       status="requested")
    else:
        dasdict = dict(system=system,
                       ts=tstamp,
                       url=[url],
                       ctime=[ctime],
                       expire=expire_timestamp(expire),
                       urn=[api],
                       api=[api],
                       status="requested")
    if services:
        if isinstance(services, dict):
            services = [services]
        dasdict.update({"services": services})
    return dict(das=dasdict)
Example #11
def getdata_pycurl(url, params, headers=None, expire=3600, post=None,
    error_expire=300, verbose=0, ckey=None, cert=None, doseq=True, system=None):
    "Fetch data via pycurl library"
    contact = 'data-service.'
    if  system:
        contact = system + ' ' + contact
    if  isinstance(params, dict):
        timer_key = '%s?%s' % (url, urllib.urlencode(params, doseq=True))
    else:
        timer_key = '%s?%s' % (url, params)
    das_timer(timer_key, verbose)
    handler = REQUEST_HANDLER
    try:
        data, expire = handler.getdata(url, params, headers, expire, post, \
                    error_expire, verbose, ckey, cert, doseq)
    except urllib2.HTTPError as httperror:
        msg  = 'urllib2.HTTPError, system=%s, url=%s, args=%s, headers=%s' \
                    % (system, url, json.dumps(params), json.dumps(headers))
        data = {'error': 'Received HTTP error from %s data-service' % contact,
                'reason': msg, 'ts':time.time()}
        try:
            reason = extract_http_error(httperror.read())
            data.update({'reason': reason, 'request': msg})
            # TODO: err variable did not exist in this function!
            msg += '\n' + reason
        except Exception as exp:
            data.update({'httperror': None})
            msg += '\n' + str(exp)
        print(dastimestamp('getdata_pycurl'), msg)
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    except Exception as exp:
        msg  = 'HTTPError, system=%s, url=%s, args=%s, headers=%s' \
                    % (system, url, json.dumps(params), json.dumps(headers))
        print(dastimestamp('getdata_pycurl'), msg + '\n' + str(exp))
        data = {'error': 'Received generic error from %s data-service' % contact,
                'reason': msg, 'ts':time.time()}
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    das_timer(timer_key, verbose)
    return data, expire
Example #12
 def parser(self, dasquery, dformat, source, api):
     """
     DBS data-service parser.
     """
     for row in self.parser_helper(dasquery, dformat, source, api):
         if  self.extended_expire:
             new_expire = expire_timestamp(self.extended_expire)
             mod_time = get_modification_time(row)
             if  mod_time and \
                 old_timestamp(mod_time, self.extended_threshold):
                 row.update({'das':{'expire': new_expire}})
         yield row
Example #13
    def test_expire_timestamp(self):
        """Test expire_timestamp function"""
        result = expire_timestamp('Mon, 04 Oct 2010 18:57:42 GMT')
        expect = 1286218662
        self.assertEqual(result, expect)

        tstamp = time.time() + 10000
        result = expire_timestamp(tstamp)
        expect = tstamp
        self.assertEqual(result, expect)

        tstamp = long(time.time() + 10)
        result = long(expire_timestamp(10))
        expect = tstamp
        self.assertEqual(result, expect)

        expire = '900'
        result = long(expire_timestamp(expire))
#        expect = long(time.time()) + 900
        expect = 900
        self.assertEqual(result, expect)
Example #14
    def test_expire_timestamp(self):
        """Test expire_timestamp function"""
        result = expire_timestamp('Mon, 04 Oct 2010 18:57:42 GMT')
        expect = 1286218662
        self.assertEqual(result, expect)

        tstamp = time.time() + 10000
        result = expire_timestamp(tstamp)
        expect = tstamp
        self.assertEqual(result, expect)

        tstamp = long(time.time() + 10)
        result = long(expire_timestamp(10))
        expect = tstamp
        self.assertEqual(result, expect)

        expire = '900'
        tstamp = long(time.time() + 900)
        result = long(expire_timestamp(expire))
        expect = tstamp
        self.assertEqual(result, expect)
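For reference, here is a hedged reconstruction of expire_timestamp() inferred only from the assertions above; it matches Example #14 (Example #13 reflects an older revision where a numeric string was returned unchanged), and the real DAS implementation may differ:

import time
import calendar

def expire_timestamp(expire):
    "Convert an HTTP date, a TTL in seconds, or an absolute tstamp to epoch seconds"
    if isinstance(expire, str) and not expire.isdigit():
        # HTTP date, e.g. 'Mon, 04 Oct 2010 18:57:42 GMT'
        return calendar.timegm(time.strptime(expire, '%a, %d %b %Y %H:%M:%S %Z'))
    value = float(expire)
    if value < time.time():
        return time.time() + value  # small values act as a TTL from now
    return value                    # large values are absolute timestamps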
Example #15
def das_populator_helper(dasmgr, query, expire):
    """Process DAS query through DAS Core and sets new expire tstamp for it"""
    try:
        # To allow re-use of queries fed by the DAS populator
        # we need to ensure that an instance is present in the DAS query,
        # since the web interface does it by default.
        dasquery = dasmgr.adjust_query(query)
        if 'instance' not in dasquery:
            raise Exception('Supplied query does not have DBS instance')
        newts = expire_timestamp(expire)
        # process DAS query
        dasmgr.call(dasquery)
        # update DAS expire timestamp
        dasmgr.rawcache.update_das_expire(dasquery, newts)
        print("\n### DAS populator", query, dasquery, expire, newts)
    except Exception as exc:
        print_exc(exc)
Example #16
def das_populator_helper(dasmgr, query, expire):
    """Process DAS query through DAS Core and sets new expire tstamp for it"""
    try:
        # To allow re-use of queries fed by the DAS populator
        # we need to ensure that an instance is present in the DAS query,
        # since the web interface does it by default.
        dasquery = dasmgr.adjust_query(query)
        if  'instance' not in dasquery:
            raise Exception('Supplied query does not have DBS instance')
        newts = expire_timestamp(expire)
        # process DAS query
        dasmgr.call(dasquery)
        # update DAS expire timestamp
        dasmgr.rawcache.update_das_expire(dasquery, newts)
        print("\n### DAS populator", query, dasquery, expire, newts)
    except Exception as exc:
        print_exc(exc)
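A hypothetical invocation of the populator helper above; dasmgr stands for an already-configured DASCore instance and the query string is illustrative:

# the instance selector matters: the helper raises if the adjusted
# query carries no DBS instance
das_populator_helper(dasmgr, 'dataset=/ZMM*/*/* instance=prod/global',
                     expire=3600)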
Example #17
    def update_cache(self, dasquery, results, header, system, api):
        """
        Insert results into the cache. Use bulk inserts controlled by
        self.cache_size. Upon completion ensure indexes.
        """
        # update results records in DAS cache
        gen  = self.generate_records(dasquery, results, header)
        inserted = 0
        # bulk insert
        try:
            res = self.col.insert_many(gen, ordered=False, bypass_document_validation=True)
            inserted += len(res.inserted_ids)
        except InvalidOperation:
            pass

        # update query record for this sub-system
        self.update_query_record_system(dasquery, system, api, 'ok')

        if  dasquery.qcache: # custom DASQuery cache
            self.update_das_expire(dasquery, expire_timestamp(dasquery.qcache))
Example #18
 def insert_apicall(self, system, query, url, api, api_params, expire):
     """
     Remove obsolete apicall records and insert the provided
     information about the API call into the Analytics DB.
     Moved from AbstractService.

     Updated so that we do not end up with multiple records when
     performing forced updates (i.e., when the old record has not yet
     expired): we now look for an existing record with the same
     parameters (I'm hoping the fact that some of the variables are
     indexed will make this fast even though not all are) and, if it
     exists, just update the expiry. Otherwise we insert a new record.
     """
     msg = 'query=%s, url=%s, ' % (query, url)
     msg += 'api=%s, args=%s, expire=%s' % (api, api_params, expire)
     self.logger.debug(msg)
     expire = expire_timestamp(expire)
     query = encode_mongo_query(query)
     qhash = genkey(query)
     self.remove_expired()
     existing = self.col.find_one({'apicall.system':     system,
                                   'apicall.url':        url,
                                   'apicall.api':        api,
                                   'apicall.api_params': api_params,
                                   'apicall.qhash':      qhash})
     if existing:
         self.logger.debug("updating")
         self.col.update({'_id': existing['_id']},
                         {'$set':{'apicall.expire': expire}})
     else:
         self.col.insert({'apicall':{'api_params':   api_params,
                                     'url':          url,
                                     'api':          api,
                                     'system':       system,
                                     'expire':       expire,
                                     'qhash':        qhash}})
     index_list = [('apicall.url', DESCENDING),
                   ('apicall.api', DESCENDING),
                   ('qhash', DESCENDING)]
     create_indexes(self.col, index_list)
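The find-then-update-or-insert logic inside insert_apicall() above could also be collapsed into a single upsert; a sketch using the same legacy pymongo update() API that appears elsewhere in these examples:

# On upsert, MongoDB copies the equality fields of the spec into the new
# document, so this covers the insert branch of insert_apicall() as well.
self.col.update({'apicall.system':     system,
                 'apicall.url':        url,
                 'apicall.api':        api,
                 'apicall.api_params': api_params,
                 'apicall.qhash':      qhash},
                {'$set': {'apicall.expire': expire}},
                upsert=True)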
Example #19
    def update_cache(self, dasquery, results, header, system, api):
        """
        Insert results into the cache. Use bulk inserts controlled by
        self.cache_size. Upon completion ensure indexes.
        """
        # update results records in DAS cache
        gen = self.generate_records(dasquery, results, header)
        inserted = 0
        # bulk insert
        try:
            res = self.col.insert_many(gen,
                                       ordered=False,
                                       bypass_document_validation=True)
            inserted += len(res.inserted_ids)
        except InvalidOperation:
            pass

        # update query record for this sub-system
        self.update_query_record_system(dasquery, system, api, 'ok')

        if dasquery.qcache:  # custom DASQuery cache
            self.update_das_expire(dasquery, expire_timestamp(dasquery.qcache))
Example #20
    def __init__(self, config):
        self.emptyset_expire = expire_timestamp(\
            config['das'].get('emptyset_expire', 5))
        self.dburi   = config['mongodb']['dburi']
        self.cache_size = config['mongodb']['bulkupdate_size']
        self.dbname  = config['dasdb']['dbname']
        self.verbose = config['verbose']
        self.logger  = PrintManager('DASMongocache', self.verbose)
        self.mapping = config['dasmapping']

        self.conn    = db_connection(self.dburi)
        self.mdb     = self.conn[self.dbname]
        self.col     = self.mdb[config['dasdb']['cachecollection']]
        self.mrcol   = self.mdb[config['dasdb']['mrcollection']]
        self.merge   = self.mdb[config['dasdb']['mergecollection']]
        self.gfs     = db_gridfs(self.dburi)

        self.logdb   = DASLogdb(config)

        self.das_internal_keys = ['das_id', 'das', 'cache_id', 'qhash']

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        self.add_manipulator()

        # ensure that we have the following indexes
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('das.system', ASCENDING),
                      ('qhash', DESCENDING),
                      ('das.empty_record', ASCENDING)]
        create_indexes(self.col, index_list)
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('qhash', DESCENDING),
                      ('das.empty_record', ASCENDING), ('das.ts', ASCENDING)]
        create_indexes(self.merge, index_list)
Example #21
    def apicall(self, dasquery, url, api, args, dformat, expire):
        """
        A service worker. It parses the input query, invokes the service
        API and returns results in a list with the provided row.
        """
        # NOTE: I use a helper function since it is a 2-step process;
        # therefore the expire timestamp will not be changed, since the
        # helper function will yield results
        time0 = time.time()
        if  api == 'combined_dataset4site_release' or \
            api == 'combined_site4dataset':
            genrows = self.helper(url, api, args, expire)
        # here I call the service directly, which returns a proper
        # expire timestamp. Moreover, I use the HTTP header to look at
        # Expires and adjust my expire parameter accordingly
        if  api == 'combined_dataset4site':
            headers = {'Accept': 'application/json;text/json'}
            datastream, expire = self.getdata(url, args, expire, headers)
            try: # get HTTP header and look for Expires
                e_time = expire_timestamp(\
                    datastream.info().__dict__['dict']['expires'])
                if  e_time > time.time():
                    expire = e_time
            except:
                pass
            genrows = parse_data(datastream)

        # proceed with standard workflow
        dasrows = self.set_misses(dasquery, api, genrows)
        ctime   = time.time() - time0
        try:
            if  isinstance(url, dict):
                url = "combined: %s" % url.values()
            self.write_to_cache(dasquery, expire, url, api, args, dasrows, ctime)
        except Exception as exc:
            print_exc(exc)
Example #22
    def __init__(self, config):
        self.config  = config
        self.emptyset_expire = \
                expire_timestamp(config['das'].get('emptyset_expire', 5))
        self.dburi   = config['mongodb']['dburi']
        self.cache_size = config['mongodb']['bulkupdate_size']
        self.dbname  = config['dasdb']['dbname']
        self.verbose = config['verbose']
        self.logger  = PrintManager('DASMongocache', self.verbose)
        self.mapping = config['dasmapping']
        self.logging = config['dasdb'].get('logging', False)
        self.rec_ttl = config['dasdb'].get('record_ttl', 24*60*60)
        self.del_ttl = config['dasdb'].get('delta_ttl', 60)
        self.cleanup_del_ttl = config['dasdb'].get('cleanup_delta_ttl', 3600)
        self.retry   = config['dasdb'].get('retry', 3)
        self.das_son_manipulator = DAS_SONManipulator()

        # Initialize MongoDB connection
        self.col_    = self.config['dasdb']['cachecollection']
        self.mrcol_  = self.config['dasdb']['mrcollection']
        self.merge_  = self.config['dasdb']['mergecollection']
        self.gfs     = db_gridfs(self.dburi)

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        # ensure that we have the following indexes
        common_idx = [
                      ('file.name', DESCENDING),
                      ('dataset.name', DESCENDING),
                      ('block.name', DESCENDING),
                      ('run.run_number', DESCENDING),
                      ]
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('das.system', ASCENDING),
                      ('qhash', DESCENDING),
                      ('das.record', ASCENDING)]
        create_indexes(self.col, index_list + common_idx)
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('qhash', DESCENDING),
                      ('das.record', ASCENDING),
                      ('das.ts', ASCENDING)]
        create_indexes(self.merge, index_list)
        # NOTE: I found that creating an index on the merge collection
        # leads to a MongoDB error when records contain multiple arrays
        # on indexed keys. For example, when we query file,run,lumi both
        # file and run are arrays in MongoDB. In this case the final sort
        # in MongoDB barks with the following message:
        # cannot sort with keys that are parallel arrays
        # it looks like there is no fix for that yet, see
        # http://stackoverflow.com/questions/6516725/how-do-i-index-two-arrays-in-mongodb
        # therefore I temporarily disabled the create_indexes call on the
        # merge collection which was used to ease the final sort,
        # especially in the case when a lot of records correspond to the
        # initial query, e.g. file records.
        # On the other hand, the most common use case where sort fails is
        # getting file records, and I can add one compound key to ease
        # sort but I can't add another compound key on an array field,
        # e.g. run
        common_idx = [[('qhash', DESCENDING), ('file.name', DESCENDING)]]
        create_indexes(self.merge, index_list + common_idx)

        # thread which clean-up DAS collections
        thname = 'mongocache_cleanup'
        cols   = [config['dasdb']['cachecollection'],
                  config['dasdb']['mrcollection'],
                  config['dasdb']['mergecollection']]
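The parallel-array limitation described in the NOTE above can be reproduced in isolation; this sketch assumes a local MongoDB test instance and uses illustrative record shapes:

from pymongo import MongoClient, ASCENDING

col = MongoClient()['test']['parallel_arrays']
col.insert({'file': [{'name': 'a'}, {'name': 'b'}],
            'run':  [{'run_number': 1}, {'run_number': 2}]})
# a compound index over two array-valued fields fails with
# 'cannot index parallel arrays', and sorting on both fields fails with
# 'cannot sort with keys that are parallel arrays'
col.create_index([('file.name', ASCENDING), ('run.run_number', ASCENDING)])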
Example #23
    def getdata(self,
                url,
                params,
                headers=None,
                expire=3600,
                post=False,
                error_expire=300,
                verbose=0,
                ckey=None,
                cert=None,
                doseq=True):
        """Fetch data for given set of parameters"""
        time0 = time.time()
        thread = threading.current_thread().ident
        if post:
            cache = self.pcache
        else:
            cache = self.gcache
        if thread in cache:
            curl = cache.get(thread)
        else:
            curl = pycurl.Curl()
            cache[thread] = curl
#        print "\n+++ getdata curl gcache", self.gcache.keys()
#        print "+++ getdata curl pcache", self.pcache.keys()
        bbuf, hbuf = self.set_opts(curl, url, params, headers,\
                ckey, cert, verbose, post, doseq)
        curl.perform()

        if sys.version.startswith('3.'):
            http_header = hbuf.getvalue().decode('UTF-8')
        else:
            http_header = hbuf.getvalue()

#        data = parse_body(bbuf.getvalue())
#        data = bbuf.getvalue() # read entire content
#        bbuf.flush()

#         bbuf.seek(0) # to use the file descriptor, seek to the beginning of the stream
#         data = bbuf # leave StringIO object, which will serve as file descriptor

# will yield data as StringIO object, i.e. provide file object
        if sys.version.startswith('3.'):
            data = io.StringIO(bbuf.getvalue().decode('UTF-8'))
        else:
            bbuf.seek(0)  # use the file-descriptor interface: seek to the beginning of the stream
            data = bbuf  # leave StringIO object, which will serve as file descriptor

        expire = get_expire(http_header, error_expire, verbose)
        hbuf.flush()

        # check for HTTP error
        http_code = curl.getinfo(pycurl.HTTP_CODE)

        # get HTTP status message and Expires
        http_expire = ''
        http_msg = ''
        for item in http_header.splitlines():
            if pat_http_msg.match(item):
                http_msg = item
            if pat_expires.match(item):
                http_expire = item.split('Expires:')[-1].strip()
                e_time = expire_timestamp(http_expire)
                if e_time < expire_timestamp(time0):
                    expire = max(e_time, expire_timestamp(expire))
                elif e_time > time.time():
                    expire = e_time

        if http_code < 200 or http_code >= 300:
            effective_url = curl.getinfo(pycurl.EFFECTIVE_URL)
            raise HTTPError(effective_url, http_code, http_msg, \
                    http_header, data)
        return data, expire
Example #24
def getdata_urllib(url,
                   params,
                   headers=None,
                   expire=3600,
                   post=None,
                   error_expire=300,
                   verbose=0,
                   ckey=None,
                   cert=None,
                   doseq=True,
                   system=None,
                   tstamp=None):
    """
    Invoke a URL call and retrieve data from the data-service based
    on the provided URL and set of parameters. Use post=True to
    invoke a POST request.
    """
    contact = 'data-service.'
    if system:
        contact = system + ' ' + contact
    timer_key = '%s?%s' % (url, urllib.urlencode(params, doseq=True))
    das_timer(timer_key, verbose)
    encoded_data = urllib.urlencode(params, doseq=doseq)
    if not post:
        url = url + '?' + encoded_data
    if not headers:
        headers = {}
    if tstamp and 'If-Modified-Since' not in headers.keys():
        headers['If-Modified-Since'] = http_timestamp(tstamp)
    if verbose:
        print('+++ getdata, url=%s, headers=%s' % (url, headers))
    req = urllib2.Request(url)
    for key, val in headers.items():
        req.add_header(key, val)
    if verbose > 1:
        handler = urllib2.HTTPHandler(debuglevel=1)
        opener = urllib2.build_opener(handler)
        urllib2.install_opener(opener)
    if ckey and cert:
        handler = HTTPSClientAuthHandler(ckey, cert, verbose)
        opener = urllib2.build_opener(handler)
        urllib2.install_opener(opener)
    try:
        time0 = time.time()
        if post:
            data = urllib2.urlopen(req, encoded_data)
        else:
            data = urllib2.urlopen(req)
        data_srv_time = time.time() - time0
        info = data.info()
        code = data.getcode()
        if verbose > 1:
            print("+++ response code:", code)
            print("+++ response info\n", info)
        try:  # get HTTP header and look for Expires
            e_time = expire_timestamp(\
                info.__dict__['dict']['expires'])
            if e_time < expire_timestamp(data_srv_time):
                expire = max(e_time, expire_timestamp(expire))
            elif e_time > time.time():
                expire = e_time
        except Exception as _exp:
            pass
    except urllib2.HTTPError as httperror:
        msg  = 'HTTPError, url=%s, args=%s, headers=%s' \
                    % (url, params, headers)
        data = {
            'error': 'Received HTTP error from %s data-service' % contact,
            'reason': msg
        }
        try:
            err = '%s %s' % (contact, extract_http_error(httperror.read()))
            data.update({'error': err})
            msg += '\n' + err
        except Exception as exp:
            data.update({'httperror': None})
            msg += '\n' + str(exp)
        print(msg)
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    except Exception as exp:
        msg  = 'HTTPError, url=%s, args=%s, headers=%s' \
                    % (url, params, headers)
        print(msg + '\n' + str(exp))
        data = {
            'error': 'Received generic error from %s data-service' % contact,
            'reason': msg
        }
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    das_timer(timer_key, verbose)
    return data, expire
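A hedged call of getdata_urllib() above; the URL is a plausible SiteDB endpoint and the ckey/cert paths are placeholders for a real grid proxy:

import json

url = 'https://cmsweb.cern.ch/sitedb/data/prod/site-names'  # illustrative
data, expire = getdata_urllib(url, params={}, expire=3600,
                              ckey='/tmp/x509up_u1000',
                              cert='/tmp/x509up_u1000',
                              system='sitedb')
# on success data is a file-like urlopen response; on error a JSON string
print(json.load(data) if hasattr(data, 'read') else data)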
Example #25
    def call(self, query, add_to_analytics=True, **kwds):
        """
        Top level DAS api which executes a given query using underlying
        data-services. It follows these steps:

            - parse input query
            - identify data-services based on selection keys
              and where clause conditions
            - construct DAS workflow and execute data-service 
              API calls. At this step individual 
              data-services store results into DAS cache.

        Return status 0/1 depending on success of the calls; it can be
        used by workers on the cache server.

        kwds is provided for compatibility with web layer, e.g. it
        may invoke this method with additional pid parameter.
        """
        self.logger.info('input query=%s' % query)
        das_timer('DASCore::call', self.verbose)
        services = []
        if  isinstance(query, object) and hasattr(query, '__class__')\
            and query.__class__.__name__ == 'DASQuery':
            dasquery = query
        else:
            dasquery = DASQuery(query, mongoparser=self.mongoparser)
        if  add_to_analytics:
            dasquery.add_to_analytics()
        query = dasquery.mongo_query
        if  'system' in dasquery.mongo_query:
            system = query['system']
            if  isinstance(system, str) or isinstance(system, unicode):
                services = [system]
            elif isinstance(system, list):
                services = system
            else:
                msg = 'Unsupported system=%s type=%s in DAS query' \
                        % (system, type(system))
                raise Exception(msg)
        spec   = query.get('spec')
        fields = query.get('fields')
        if  fields == ['records']:
            msg = 'look-up all records in cache'
            self.logger.info(msg)
            return 'in cache'
        if  spec == dict(records='*'):
            self.logger.info("look-up everything in cache")
            return 'in cache'
        for record in self.rawcache.find_specs(dasquery):
            status = record['das']['status']
            msg = 'found query %s in cache, status=%s\n' \
                        % (record['query'], status)
            self.logger.info(msg)
            return status
        similar_dasquery = self.rawcache.similar_queries(dasquery)
        if  similar_dasquery:
            for record in self.rawcache.find_specs(similar_dasquery):
                if  record:
                    try:
                        status = record['das']['status']
                    except:
                        status = 'N/A'
                        msg = 'Fail to look-up das.status, record=%s' % record
                        self.logger.info(msg)
                msg  = 'found SIMILAR query in cache, '
                msg += 'query=%s, status=%s\n' % (record['query'], status)
                self.logger.info(msg)
                return status

        self.logger.info(dasquery)
        params = dasquery.params()
        if  not services:
            services = params['services']
        self.logger.info('services = %s' % services)
        das_timer('das_record', self.verbose)
        # initial expire tstamp 1 day (long enough to be overwritten by data-srv)
        expire = expire_timestamp(time.time()+1*24*60*60)
        header = dasheader("das", dasquery, expire)
        header['lookup_keys'] = []
        self.rawcache.insert_query_record(dasquery, header)
        das_timer('das_record', self.verbose)
        try:
            if  self.multitask:
                jobs = []
                for srv in services:
                    jobs.append(self.taskmgr.spawn(self.worker, srv, dasquery))
                self.taskmgr.joinall(jobs)
            else:
                for srv in services:
                    self.worker(srv, dasquery)
        except Exception as exc:
            print_exc(exc)
            return 'fail'
        self.logger.info('\n##### merging ######\n')
        self.rawcache.update_query_record(dasquery, 'merging')
        das_timer('merge', self.verbose)
        self.rawcache.merge_records(dasquery)
        das_timer('merge', self.verbose)
        self.rawcache.update_query_record(dasquery, 'ok')
        self.rawcache.add_to_record(\
                dasquery, {'das.timer': get_das_timer()}, system='das')
        das_timer('DASCore::call', self.verbose)
        return 'ok'
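A hypothetical top-level use of the call() method above; constructing and configuring DASCore is omitted and the query is illustrative:

# dasmgr is assumed to be a configured DASCore instance
status = dasmgr.call('dataset=/ZMM*/*/*')
print(status)  # 'ok' on success, 'fail' on errors, or a cached status string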
Example #26
def getdata_urllib(url, params, headers=None, expire=3600, post=None,
    error_expire=300, verbose=0, ckey=None, cert=None, doseq=True, system=None,
    tstamp=None):
    """
    Invoke a URL call and retrieve data from the data-service based
    on the provided URL and set of parameters. Use post=True to
    invoke a POST request.
    """
    contact = 'data-service.'
    if  system:
        contact = system + ' ' + contact
    timer_key = '%s?%s' % (url, urllib.urlencode(params, doseq=True))
    das_timer(timer_key, verbose)
    encoded_data = urllib.urlencode(params, doseq=doseq)
    if  not post:
        url = url + '?' + encoded_data
    if  not headers:
        headers = {}
    if  tstamp and 'If-Modified-Since' not in headers.keys():
        headers['If-Modified-Since'] = http_timestamp(tstamp)
    if  verbose:
        print('+++ getdata, url=%s, headers=%s' % (url, headers))
    req = urllib2.Request(url)
    for key, val in headers.items():
        req.add_header(key, val)
    if  verbose > 1:
        handler = urllib2.HTTPHandler(debuglevel=1)
        opener  = urllib2.build_opener(handler)
        urllib2.install_opener(opener)
    if  ckey and cert:
        handler = HTTPSClientAuthHandler(ckey, cert, verbose)
        opener  = urllib2.build_opener(handler)
        urllib2.install_opener(opener)
    try:
        time0 = time.time()
        if  post:
            data = urllib2.urlopen(req, encoded_data)
        else:
            data = urllib2.urlopen(req)
        data_srv_time = time.time()-time0
        info = data.info()
        code = data.getcode()
        if  verbose > 1:
            print("+++ response code:", code)
            print("+++ response info\n", info)
        try: # get HTTP header and look for Expires
            e_time = expire_timestamp(\
                info.__dict__['dict']['expires'])
            if  e_time < expire_timestamp(data_srv_time):
                expire = max(e_time, expire_timestamp(expire))
            elif e_time > time.time():
                expire = e_time
        except Exception as _exp:
            pass
    except urllib2.HTTPError as httperror:
        msg  = 'HTTPError, url=%s, args=%s, headers=%s' \
                    % (url, params, headers)
        data = {'error': 'Received HTTP error from %s data-service' % contact,
                'reason': msg}
        try:
            err  = '%s %s' % (contact, extract_http_error(httperror.read()))
            data.update({'error':err})
            msg += '\n' + err
        except Exception as exp:
            data.update({'httperror': None})
            msg += '\n' + str(exp)
        print(msg)
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    except Exception as exp:
        msg  = 'HTTPError, url=%s, args=%s, headers=%s' \
                    % (url, params, headers)
        print(msg + '\n' + str(exp))
        data = {'error': 'Received generic error from %s data-service' % contact,
                'reason': msg}
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    das_timer(timer_key, verbose)
    return data, expire
Example #27
    def __init__(self, config):
        self.config = config
        self.emptyset_expire = \
                expire_timestamp(config['das'].get('emptyset_expire', 5))
        self.dburi = config['mongodb']['dburi']
        self.cache_size = config['mongodb']['bulkupdate_size']
        self.dbname = config['dasdb']['dbname']
        self.verbose = config['verbose']
        self.logger = PrintManager('DASMongocache', self.verbose)
        self.mapping = config['dasmapping']
        self.logging = config['dasdb'].get('logging', False)
        self.rec_ttl = config['dasdb'].get('record_ttl', 24 * 60 * 60)
        self.del_ttl = config['dasdb'].get('delta_ttl', 60)
        self.cleanup_del_ttl = config['dasdb'].get('cleanup_delta_ttl', 3600)
        self.retry = config['dasdb'].get('retry', 3)
        self.das_son_manipulator = DAS_SONManipulator()

        # Initialize MongoDB connection
        self.col_ = self.config['dasdb']['cachecollection']
        self.mrcol_ = self.config['dasdb']['mrcollection']
        self.merge_ = self.config['dasdb']['mergecollection']
        self.gfs = db_gridfs(self.dburi)

        msg = "%s@%s" % (self.dburi, self.dbname)
        self.logger.info(msg)

        # ensure that we have the following indexes
        common_idx = [
            ('file.name', DESCENDING),
            ('dataset.name', DESCENDING),
            ('block.name', DESCENDING),
            ('run.run_number', DESCENDING),
        ]
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('das.system', ASCENDING), ('qhash', DESCENDING),
                      ('das.record', ASCENDING)]
        create_indexes(self.col, index_list + common_idx)
        index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                      ('qhash', DESCENDING), ('das.record', ASCENDING),
                      ('das.ts', ASCENDING)]
        create_indexes(self.merge, index_list)
        # NOTE: I found that creating an index on the merge collection
        # leads to a MongoDB error when records contain multiple arrays
        # on indexed keys. For example, when we query file,run,lumi both
        # file and run are arrays in MongoDB. In this case the final sort
        # in MongoDB barks with the following message:
        # cannot sort with keys that are parallel arrays
        # it looks like there is no fix for that yet, see
        # http://stackoverflow.com/questions/6516725/how-do-i-index-two-arrays-in-mongodb
        # therefore I temporarily disabled the create_indexes call on the
        # merge collection which was used to ease the final sort,
        # especially in the case when a lot of records correspond to the
        # initial query, e.g. file records.
        # On the other hand, the most common use case where sort fails is
        # getting file records, and I can add one compound key to ease
        # sort but I can't add another compound key on an array field,
        # e.g. run
        common_idx = [[('qhash', DESCENDING), ('file.name', DESCENDING)]]
        create_indexes(self.merge, index_list + common_idx)

        # thread which clean-up DAS collections
        thname = 'mongocache_cleanup'
        cols = [
            config['dasdb']['cachecollection'],
            config['dasdb']['mrcollection'], config['dasdb']['mergecollection']
        ]
Example #28
def getdata_urllib(
    url,
    params,
    headers=None,
    expire=3600,
    post=None,
    error_expire=300,
    verbose=0,
    ckey=None,
    cert=None,
    doseq=True,
    system=None,
):
    """
    Invoke a URL call and retrieve data from the data-service based
    on the provided URL and set of parameters. Use post=True to
    invoke a POST request.
    """
    contact = "data-service."
    if system:
        contact = system + " " + contact
    timer_key = "%s?%s" % (url, urllib.urlencode(params, doseq=True))
    das_timer(timer_key, verbose)
    encoded_data = urllib.urlencode(params, doseq=doseq)
    if not post:
        url = url + "?" + encoded_data
    if not headers:
        headers = {}
    if verbose:
        print "+++ getdata, url=%s, headers=%s" % (url, headers)
    req = urllib2.Request(url)
    for key, val in headers.iteritems():
        req.add_header(key, val)
    if verbose > 1:
        handler = urllib2.HTTPHandler(debuglevel=1)
        opener = urllib2.build_opener(handler)
        urllib2.install_opener(opener)
    if ckey and cert:
        handler = HTTPSClientAuthHandler(ckey, cert, verbose)
        opener = urllib2.build_opener(handler)
        urllib2.install_opener(opener)
    try:
        time0 = time.time()
        if post:
            data = urllib2.urlopen(req, encoded_data)
        else:
            data = urllib2.urlopen(req)
        data_srv_time = time.time() - time0
        try:  # get HTTP header and look for Expires
            e_time = expire_timestamp(data.info().__dict__["dict"]["expires"])
            if e_time < expire_timestamp(data_srv_time):
                expire = max(e_time, expire_timestamp(expire))
            elif e_time > time.time():
                expire = e_time
        except Exception as _exp:
            pass
    except urllib2.HTTPError as httperror:
        msg = "HTTPError, url=%s, args=%s, headers=%s" % (url, params, headers)
        data = {"error": "Unable to contact %s" % contact, "reason": msg}
        try:
            err = "%s %s" % (contact, extract_http_error(httperror.read()))
            data.update({"error": err})
            msg += "\n" + err
        except Exception as exp:
            data.update({"httperror": None})
            msg += "\n" + str(exp)
        print msg
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    except Exception as exp:
        msg = "HTTPError, url=%s, args=%s, headers=%s" % (url, params, headers)
        print msg + "\n" + str(exp)
        data = {"error": "Unable to contact %s" % contact, "reason": msg}
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    das_timer(timer_key, verbose)
    return data, expire