def getdata_helper(self, url, params, expire, headers=None, post=None):
    "Helper function to get data from SiteDB or local cache"
    cname = url.split('/')[-1].replace('-', '_')
    col = self.localcache.conn[self.name][cname]
    local = col.find_one({'expire':{'$gt':expire_timestamp(time.time())}})
    data = None
    if local:
        msg = 'SiteDBService reads from %s.%s' % (self.name, cname)
        self.logger.info(msg)
        try: # get data from local cache
            data = [r for r in col.find() if not r.has_key('expire')][0]
            del data['_id']
        except Exception as exc:
            print_exc(exc)
            data = {}
    if not data or not local:
        headers = {'Accept':'application/json'}
        datastream, expire = getdata(
                url, params, headers, expire, post,
                self.error_expire, self.verbose, self.ckey, self.cert,
                system=self.name)
        try: # read data and write it to local cache
            data = json.load(datastream)
            datastream.close()
            col.remove()
            col.insert(data)
            col.insert({'expire':expire_timestamp(expire)})
        except Exception as exc:
            print_exc(exc)
    return data, expire

def getdata_helper(self, url, params, expire, headers=None, post=None):
    "Helper function to get data from SiteDB or local cache"
    cname = url.split('/')[-1].replace('-', '_')
    conn = db_connection(self.dburi)
    col = conn[self.name][cname]
    local = find_one(col, {'expire':{'$gt':expire_timestamp(time.time())}})
    data = None
    if local:
        msg = 'SiteDBService reads from %s.%s' % (self.name, cname)
        self.logger.info(msg)
        try: # get data from local cache
            data = [r for r in col.find() if 'expire' not in r][0]
            del data['_id']
        except Exception as exc:
            print_exc(exc)
            data = {}
    if not data or not local:
        headers = {'Accept':'application/json'}
        datastream, expire = getdata(
                url, params, headers, expire, post,
                self.error_expire, self.verbose, self.ckey, self.cert,
                system=self.name)
        try: # read data and write it to local cache
            data = json.load(datastream)
            datastream.close()
            col.remove()
            col.insert(data)
            col.insert({'expire':expire_timestamp(expire)})
        except Exception as exc:
            print_exc(exc)
    return data, expire

def dasheader(system, dasquery, expire, api=None, url=None,
              ctime=None, services=None):
    """
    Return DAS header (dict) wrt DAS specifications:

        - system represents DAS services, e.g. combined
        - dasquery is DASQuery representation
        - expire is expire timestamp of the record
        - api is data-service API name
        - url is data-service URL
        - ctime is current timestamp
        - services is a dict (or list of dicts) of CMS services which
          contributed to the data record, e.g. the combined service uses
          dbs and phedex
    """
    # tstamp must be integer in order for json encoder/decoder to
    # work properly, see utils/jsonwrapper/__init__.py
    tstamp = round(time.time())
    if isinstance(system, basestring):
        system = [system]
    if not api:
        dasdict = dict(system=system, ts=tstamp,
                       expire=expire_timestamp(expire), status="requested")
    else:
        dasdict = dict(system=system, ts=tstamp, url=[url], ctime=[ctime],
                       expire=expire_timestamp(expire), urn=[api], api=[api],
                       status="requested")
    if services:
        if isinstance(services, dict):
            services = [services]
        dasdict.update({"services": services})
    return dict(das=dasdict)

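# Hypothetical illustration (not part of the DAS source) of the header
# structure the dasheader() call above produces when an API name and services
# are given; all concrete values below are made up for the example.
example_header = {
    'das': {
        'system': ['combined'],
        'ts': 1286218662,                        # rounded time.time()
        'url': ['https://cmsweb.cern.ch/...'],   # hypothetical data-service URL
        'ctime': [0.42],
        'expire': 1286222262,                    # expire_timestamp(expire)
        'urn': ['combined_dataset4site'],        # hypothetical API name
        'api': ['combined_dataset4site'],
        'status': 'requested',
        'services': [{'combined': ['dbs', 'phedex']}],
    }
}
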
def getdata_pycurl(url, params, headers=None, expire=3600, post=None,
                   error_expire=300, verbose=0, ckey=None, cert=None,
                   doseq=True, system=None):
    "Fetch data via pycurl library"
    contact = 'data-service.'
    if system:
        contact = system + ' ' + contact
    if isinstance(params, dict):
        timer_key = '%s?%s' % (url, urllib.urlencode(params, doseq=True))
    else:
        timer_key = '%s?%s' % (url, params)
    das_timer(timer_key, verbose)
    handler = REQUEST_HANDLER
    try:
        data, expire = handler.getdata(url, params, headers, expire, post, \
                error_expire, verbose, ckey, cert, doseq)
    except urllib2.HTTPError as httperror:
        msg = 'urllib2.HTTPError, system=%s, url=%s, args=%s, headers=%s' \
                % (system, url, json.dumps(params), json.dumps(headers))
        data = {
            'error': 'Received HTTP error from %s data-service' % contact,
            'reason': msg,
            'ts': time.time()
        }
        try:
            reason = extract_http_error(httperror.read())
            data.update({'reason': reason, 'request': msg})
            # TODO: the err variable does not exist in this function!
            msg += '\n' + reason
        except Exception as exp:
            data.update({'httperror': None})
            msg += '\n' + str(exp)
        print(dastimestamp('getdata_pycurl'), msg)
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    except Exception as exp:
        msg = 'HTTPError, system=%s, url=%s, args=%s, headers=%s' \
                % (system, url, json.dumps(params), json.dumps(headers))
        print(dastimestamp('getdata_pycurl'), msg + '\n' + str(exp))
        data = {
            'error': 'Received generic error from %s data-service' % contact,
            'reason': msg,
            'ts': time.time()
        }
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    das_timer(timer_key, verbose)
    return data, expire

def getdata_pycurl(
    url,
    params,
    headers=None,
    expire=3600,
    post=None,
    error_expire=300,
    verbose=0,
    ckey=None,
    cert=None,
    doseq=True,
    system=None,
):
    "Fetch data via pycurl library"
    contact = "data-service."
    if system:
        contact = system + " " + contact
    timer_key = "%s?%s" % (url, urllib.urlencode(params, doseq=True))
    das_timer(timer_key, verbose)
    handler = REQUEST_HANDLER
    try:
        data, expire = handler.getdata(url, params, headers, expire, post,
                                       error_expire, verbose, ckey, cert, doseq)
    except urllib2.HTTPError as httperror:
        msg = "urllib2.HTTPError, system=%s, url=%s, args=%s, headers=%s" % (
            system,
            url,
            json.dumps(params),
            json.dumps(headers),
        )
        data = {"error": "Unable to contact %s" % contact, "reason": msg, "ts": time.time()}
        try:
            reason = extract_http_error(httperror.read())
            data.update({"reason": reason, "request": msg})
            msg += "\n" + reason
        except Exception as exp:
            data.update({"httperror": None})
            msg += "\n" + str(exp)
        print dastimestamp("getdata_pycurl"), msg
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    except Exception as exp:
        msg = "HTTPError, system=%s, url=%s, args=%s, headers=%s" % (
            system,
            url,
            json.dumps(params),
            json.dumps(headers),
        )
        print dastimestamp("getdata_pycurl"), msg + "\n" + str(exp)
        data = {"error": "Unable to contact %s" % contact, "reason": msg, "ts": time.time()}
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    das_timer(timer_key, verbose)
    return data, expire

def getdata(self, url, params, headers=None, expire=3600, post=False,
            error_expire=300, verbose=0, ckey=None, cert=None, doseq=True):
    """Fetch data for given set of parameters"""
    time0 = time.time()
    thread = threading.current_thread().ident
    if post:
        cache = self.pcache
    else:
        cache = self.gcache
    if thread in cache:
        curl = cache.get(thread)
    else:
        curl = pycurl.Curl()
        cache[thread] = curl
    # print "\n+++ getdata curl gcache", self.gcache.keys()
    # print "+++ getdata curl pcache", self.pcache.keys()
    bbuf, hbuf = self.set_opts(curl, url, params, headers,
                               ckey, cert, verbose, post, doseq)
    curl.perform()
    http_header = hbuf.getvalue()
    # data = parse_body(bbuf.getvalue())
    # data = bbuf.getvalue() # read entire content
    # bbuf.flush()
    bbuf.seek(0) # to use it as a file descriptor, seek to the beginning of the stream
    data = bbuf # leave StringIO object, which will serve as file descriptor
    expire = get_expire(http_header, error_expire, verbose)
    hbuf.flush()
    # check for HTTP error
    http_code = curl.getinfo(pycurl.HTTP_CODE)
    # get HTTP status message and Expires
    http_expire = ''
    http_msg = ''
    for item in http_header.splitlines():
        if pat_http_msg.match(item):
            http_msg = item
        if pat_expires.match(item):
            http_expire = item.split('Expires:')[-1].strip()
            e_time = expire_timestamp(http_expire)
            if e_time < expire_timestamp(time0):
                expire = max(e_time, expire_timestamp(expire))
            elif e_time > time.time():
                expire = e_time
    if http_code < 200 or http_code >= 300:
        effective_url = curl.getinfo(pycurl.EFFECTIVE_URL)
        raise HTTPError(effective_url, http_code, http_msg,
                        http_header, data)
    return data, expire

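# Assumed sketch (not the DAS source) of the module-level patterns referenced
# above: regular expressions that pick the HTTP status line and the Expires
# header out of the raw response header block. The exact patterns in DAS may
# differ; these names are assumptions for illustration only.
import re

pat_http_msg = re.compile(r'^HTTP/\d\.\d\s+\d{3}')  # e.g. "HTTP/1.1 200 OK"
pat_expires = re.compile(r'^Expires:')              # e.g. "Expires: Mon, 04 Oct 2010 18:57:42 GMT"
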
def dasheader(system, dasquery, expire, api=None, url=None, ctime=None):
    """
    Return DAS header (dict) wrt DAS specifications, see
    https://twiki.cern.ch/twiki/bin/view/CMS/DMWMDataAggregationService#DAS_data_service_compliance
    """
    if not api:
        dasdict = dict(system=[system], timestamp=time.time(),
                       expire=expire_timestamp(expire), status="requested")
    else:
        dasdict = dict(system=[system], timestamp=time.time(), url=[url],
                       ctime=[ctime], expire=expire_timestamp(expire),
                       urn=[api], api=[api], status="requested")
    return dict(das=dasdict)

def update_cache(self, dasquery, results, header):
    """
    Insert results into cache. Use bulk insert controlled by
    self.cache_size. Upon completion ensure indexes.
    """
    # insert/check query record in DAS cache
    self.insert_query_record(dasquery, header)

    # update results records in DAS cache
    gen = self.generate_records(dasquery, results, header)
    inserted = 0
    # bulk insert
    try:
        if pymongo.version.startswith('3.'): # pymongo 3.X
            res = self.col.insert_many(gen, ordered=False,
                                       bypass_document_validation=True)
            inserted += len(res.inserted_ids)
        else:
            while True:
                nres = self.col.insert(itertools.islice(gen, self.cache_size))
                if nres and isinstance(nres, list):
                    inserted += len(nres)
                else:
                    break
    except InvalidOperation:
        pass
    if dasquery.qcache: # custom DASQuery cache
        self.update_das_expire(dasquery, expire_timestamp(dasquery.qcache))

def parser(self, query, dformat, source, api):
    """
    DBS3 data-service parser.
    """
    if isinstance(source, GeneratorType):
        for row in source:
            yield row
        return
    for row in self.parser_helper(query, dformat, source, api):
        mod_time = get_modification_time(row)
        if self.extended_expire:
            new_expire = expire_timestamp(self.extended_expire)
            if mod_time and \
                old_timestamp(mod_time, self.extended_threshold):
                row.update({'das':{'expire': new_expire}})
            # filesummaries is a summary DBS API about a dataset: it collects
            # information about the number of files/blocks/events for a given
            # dataset and therefore will be merged with the datasets API
            # record. To make a proper merge with extended timestamp/threshold
            # options I need to explicitly assign das.expire=extended_timestamp,
            # otherwise the merged record will pick up the smallest value
            # between the filesummaries and datasets records.
            if api == 'filesummaries':
                row.update({'das': {'expire': new_expire}})
        yield row

def getdata_pycurl(url, params, headers=None, expire=3600, post=None,
                   error_expire=300, verbose=0, ckey=None, cert=None,
                   doseq=True, system=None):
    "Fetch data via pycurl library"
    contact = 'data-service.'
    if system:
        contact = system + ' ' + contact
    if isinstance(params, dict):
        timer_key = '%s?%s' % (url, urllib.urlencode(params, doseq=True))
    else:
        timer_key = '%s?%s' % (url, params)
    das_timer(timer_key, verbose)
    handler = REQUEST_HANDLER
    try:
        data, expire = handler.getdata(url, params, headers, expire, post, \
                error_expire, verbose, ckey, cert, doseq)
    except urllib2.HTTPError as httperror:
        msg = 'urllib2.HTTPError, system=%s, url=%s, args=%s, headers=%s' \
                % (system, url, json.dumps(params), json.dumps(headers))
        data = {'error': 'Received HTTP error from %s data-service' % contact,
                'reason': msg, 'ts':time.time()}
        try:
            reason = extract_http_error(httperror.read())
            data.update({'reason': reason, 'request': msg})
            # TODO: the err variable does not exist in this function!
            msg += '\n' + reason
        except Exception as exp:
            data.update({'httperror': None})
            msg += '\n' + str(exp)
        print(dastimestamp('getdata_pycurl'), msg)
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    except Exception as exp:
        msg = 'HTTPError, system=%s, url=%s, args=%s, headers=%s' \
                % (system, url, json.dumps(params), json.dumps(headers))
        print(dastimestamp('getdata_pycurl'), msg + '\n' + str(exp))
        data = {'error': 'Received generic error from %s data-service' % contact,
                'reason': msg, 'ts':time.time()}
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    das_timer(timer_key, verbose)
    return data, expire

def parser(self, dasquery, dformat, source, api):
    """
    DBS data-service parser.
    """
    for row in self.parser_helper(dasquery, dformat, source, api):
        if self.extended_expire:
            new_expire = expire_timestamp(self.extended_expire)
            mod_time = get_modification_time(row)
            if mod_time and \
                old_timestamp(mod_time, self.extended_threshold):
                row.update({'das':{'expire': new_expire}})
        yield row

def test_expire_timestamp(self):
    """Test expire_timestamp function"""
    result = expire_timestamp('Mon, 04 Oct 2010 18:57:42 GMT')
    expect = 1286218662
    self.assertEqual(result, expect)

    tstamp = time.time() + 10000
    result = expire_timestamp(tstamp)
    expect = tstamp
    self.assertEqual(result, expect)

    tstamp = long(time.time() + 10)
    result = long(expire_timestamp(10))
    expect = tstamp
    self.assertEqual(result, expect)

    expire = '900'
    result = long(expire_timestamp(expire))
    # expect = long(time.time()) + 900
    expect = 900
    self.assertEqual(result, expect)

def test_expire_timestamp(self):
    """Test expire_timestamp function"""
    result = expire_timestamp('Mon, 04 Oct 2010 18:57:42 GMT')
    expect = 1286218662
    self.assertEqual(result, expect)

    tstamp = time.time() + 10000
    result = expire_timestamp(tstamp)
    expect = tstamp
    self.assertEqual(result, expect)

    tstamp = long(time.time() + 10)
    result = long(expire_timestamp(10))
    expect = tstamp
    self.assertEqual(result, expect)

    expire = '900'
    tstamp = long(time.time() + 900)
    result = long(expire_timestamp(expire))
    expect = tstamp
    self.assertEqual(result, expect)

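# A minimal sketch (an assumption, not the DAS implementation) of an
# expire_timestamp helper consistent with the test variant directly above:
# it accepts an HTTP date string, an absolute UNIX timestamp, or a relative
# offset in seconds, and always returns an absolute timestamp.
import time
from email.utils import mktime_tz, parsedate_tz

def expire_timestamp_sketch(expire):
    "Convert an expire input into an absolute UNIX timestamp (illustrative only)"
    if isinstance(expire, str):
        parsed = parsedate_tz(expire)
        if parsed:  # HTTP date, e.g. 'Mon, 04 Oct 2010 18:57:42 GMT'
            return mktime_tz(parsed)
        expire = float(expire)          # numeric string, e.g. '900'
    if expire < 365 * 24 * 60 * 60:     # small value => relative offset in seconds
        return time.time() + expire
    return expire                       # already an absolute timestamp
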
def das_populator_helper(dasmgr, query, expire):
    """Process DAS query through DAS Core and set a new expire tstamp for it"""
    try:
        # To allow re-use of queries fed by the DAS populator
        # we need to ensure that an instance is present in the DAS query,
        # since the web interface does it by default.
        dasquery = dasmgr.adjust_query(query)
        if 'instance' not in dasquery:
            raise Exception('Supplied query does not have DBS instance')
        newts = expire_timestamp(expire)
        # process DAS query
        dasmgr.call(dasquery)
        # update DAS expire timestamp
        dasmgr.rawcache.update_das_expire(dasquery, newts)
        print("\n### DAS populator", query, dasquery, expire, newts)
    except Exception as exc:
        print_exc(exc)

def update_cache(self, dasquery, results, header, system, api):
    """
    Insert results into cache. Use bulk insert controlled by
    self.cache_size. Upon completion ensure indexes.
    """
    # update results records in DAS cache
    gen = self.generate_records(dasquery, results, header)
    inserted = 0
    # bulk insert
    try:
        res = self.col.insert_many(gen, ordered=False,
                                   bypass_document_validation=True)
        inserted += len(res.inserted_ids)
    except InvalidOperation:
        pass
    # update query record for this sub-system
    self.update_query_record_system(dasquery, system, api, 'ok')
    if dasquery.qcache: # custom DASQuery cache
        self.update_das_expire(dasquery, expire_timestamp(dasquery.qcache))

def insert_apicall(self, system, query, url, api, api_params, expire):
    """
    Remove obsolete apicall records and insert the provided information
    about the API call into the Analytics DB.

    Moved from AbstractService.

    Updated so that we do not have multiple records when performing forced
    updates (i.e. the old record is not yet expired) - now look for an
    existing record with the same parameters (I'm hoping the fact that some
    of the variables are indexed will make this fast even though not all
    are), and if it exists just update the expiry. Otherwise insert a new
    record.
    """
    msg = 'query=%s, url=%s,' % (query, url)
    msg += 'api=%s, args=%s, expire=%s' % (api, api_params, expire)
    self.logger.debug(msg)
    expire = expire_timestamp(expire)
    query = encode_mongo_query(query)
    qhash = genkey(query)
    self.remove_expired()
    existing = self.col.find_one({'apicall.system': system,
                                  'apicall.url': url,
                                  'apicall.api': api,
                                  'apicall.api_params': api_params,
                                  'apicall.qhash': qhash})
    if existing:
        self.logger.debug("updating")
        self.col.update({'_id': existing['_id']},
                        {'$set': {'apicall.expire': expire}})
    else:
        self.col.insert({'apicall': {'api_params': api_params,
                                     'url': url,
                                     'api': api,
                                     'system': system,
                                     'expire': expire,
                                     'qhash': qhash}})
    index_list = [('apicall.url', DESCENDING),
                  ('apicall.api', DESCENDING),
                  ('qhash', DESCENDING)]
    create_indexes(self.col, index_list)

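# Hypothetical example (all values made up) of the apicall record shape that
# insert_apicall() above stores in the Analytics DB:
apicall_record = {
    'apicall': {
        'system': 'dbs',
        'url': 'https://cmsweb.cern.ch/dbs/...',      # hypothetical URL
        'api': 'filesummaries',
        'api_params': {'dataset': '/a/b/RAW'},        # hypothetical args
        'qhash': '77bbee2e8a33b5cbbc9bc7a2e1b4c5d6',  # genkey(query), made up
        'expire': 1286222262,                         # expire_timestamp(expire)
    }
}
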
def __init__(self, config):
    self.emptyset_expire = expire_timestamp(
            config['das'].get('emptyset_expire', 5))
    self.dburi = config['mongodb']['dburi']
    self.cache_size = config['mongodb']['bulkupdate_size']
    self.dbname = config['dasdb']['dbname']
    self.verbose = config['verbose']
    self.logger = PrintManager('DASMongocache', self.verbose)
    self.mapping = config['dasmapping']

    self.conn = db_connection(self.dburi)
    self.mdb = self.conn[self.dbname]
    self.col = self.mdb[config['dasdb']['cachecollection']]
    self.mrcol = self.mdb[config['dasdb']['mrcollection']]
    self.merge = self.mdb[config['dasdb']['mergecollection']]
    self.gfs = db_gridfs(self.dburi)
    self.logdb = DASLogdb(config)

    self.das_internal_keys = ['das_id', 'das', 'cache_id', 'qhash']

    msg = "%s@%s" % (self.dburi, self.dbname)
    self.logger.info(msg)

    self.add_manipulator()

    # ensure that we have the following indexes
    index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                  ('das.system', ASCENDING), ('qhash', DESCENDING),
                  ('das.empty_record', ASCENDING)]
    create_indexes(self.col, index_list)
    index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                  ('qhash', DESCENDING), ('das.empty_record', ASCENDING),
                  ('das.ts', ASCENDING)]
    create_indexes(self.merge, index_list)

def apicall(self, dasquery, url, api, args, dformat, expire):
    """
    A service worker. It parses the input query, invokes the service API
    and returns results in a list with the provided row.
    """
    # NOTE: I use a helper function since it is a 2-step process,
    # therefore the expire time stamp will not be changed, since
    # the helper function will yield results
    time0 = time.time()
    if api == 'combined_dataset4site_release' or \
            api == 'combined_site4dataset':
        genrows = self.helper(url, api, args, expire)
    # here I use directly the call to the service which returns
    # proper expire timestamp. Moreover I use HTTP header to look
    # at expires and adjust my expire parameter accordingly
    if api == 'combined_dataset4site':
        headers = {'Accept': 'application/json;text/json'}
        datastream, expire = self.getdata(url, args, expire, headers)
        try: # get HTTP header and look for Expires
            e_time = expire_timestamp(
                    datastream.info().__dict__['dict']['expires'])
            if e_time > time.time():
                expire = e_time
        except:
            pass
        genrows = parse_data(datastream)

    # proceed with standard workflow
    dasrows = self.set_misses(dasquery, api, genrows)
    ctime = time.time() - time0
    try:
        if isinstance(url, dict):
            url = "combined: %s" % url.values()
        self.write_to_cache(dasquery, expire, url, api, args,
                            dasrows, ctime)
    except Exception as exc:
        print_exc(exc)

def __init__(self, config):
    self.config = config
    self.emptyset_expire = \
            expire_timestamp(config['das'].get('emptyset_expire', 5))
    self.dburi = config['mongodb']['dburi']
    self.cache_size = config['mongodb']['bulkupdate_size']
    self.dbname = config['dasdb']['dbname']
    self.verbose = config['verbose']
    self.logger = PrintManager('DASMongocache', self.verbose)
    self.mapping = config['dasmapping']
    self.logging = config['dasdb'].get('logging', False)
    self.rec_ttl = config['dasdb'].get('record_ttl', 24*60*60)
    self.del_ttl = config['dasdb'].get('delta_ttl', 60)
    self.cleanup_del_ttl = config['dasdb'].get('cleanup_delta_ttl', 3600)
    self.retry = config['dasdb'].get('retry', 3)
    self.das_son_manipulator = DAS_SONManipulator()

    # Initialize MongoDB connection
    self.col_ = self.config['dasdb']['cachecollection']
    self.mrcol_ = self.config['dasdb']['mrcollection']
    self.merge_ = self.config['dasdb']['mergecollection']
    self.gfs = db_gridfs(self.dburi)

    msg = "%s@%s" % (self.dburi, self.dbname)
    self.logger.info(msg)

    # ensure that we have the following indexes
    common_idx = [
        ('file.name', DESCENDING),
        ('dataset.name', DESCENDING),
        ('block.name', DESCENDING),
        ('run.run_number', DESCENDING),
    ]
    index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                  ('das.system', ASCENDING), ('qhash', DESCENDING),
                  ('das.record', ASCENDING)]
    create_indexes(self.col, index_list + common_idx)
    index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                  ('qhash', DESCENDING), ('das.record', ASCENDING),
                  ('das.ts', ASCENDING)]
    create_indexes(self.merge, index_list)
    # NOTE: I found that creating an index on the merge collection leads to
    # a MongoDB error when records contain multiple arrays on indexed keys.
    # For example, when we query file,run,lumi both file and run are arrays
    # in MongoDB. In this case the final sort in MongoDB barks with the
    # following message:
    #     cannot sort with keys that are parallel arrays
    # It looks like there is no fix for that yet, see
    # http://stackoverflow.com/questions/6516725/how-do-i-index-two-arrays-in-mongodb
    # Therefore I temporarily disabled the create_indexes call on the merge
    # collection which was used to have an index to ease the final sort,
    # especially in a case when a lot of records correspond to the initial
    # query, e.g. file records.
    # On the other hand, the most common use case where sort fails is
    # getting file records, and I can add one compound key to ease the sort,
    # but I can't add another compound key on an array field, e.g. run.
    common_idx = [[('qhash', DESCENDING), ('file.name', DESCENDING)]]
    create_indexes(self.merge, index_list + common_idx)

    # thread which cleans up DAS collections
    thname = 'mongocache_cleanup'
    cols = [config['dasdb']['cachecollection'],
            config['dasdb']['mrcollection'],
            config['dasdb']['mergecollection']]

def getdata(self, url, params, headers=None, expire=3600, post=False,
            error_expire=300, verbose=0, ckey=None, cert=None, doseq=True):
    """Fetch data for given set of parameters"""
    time0 = time.time()
    thread = threading.current_thread().ident
    if post:
        cache = self.pcache
    else:
        cache = self.gcache
    if thread in cache:
        curl = cache.get(thread)
    else:
        curl = pycurl.Curl()
        cache[thread] = curl
    # print "\n+++ getdata curl gcache", self.gcache.keys()
    # print "+++ getdata curl pcache", self.pcache.keys()
    bbuf, hbuf = self.set_opts(curl, url, params, headers,
                               ckey, cert, verbose, post, doseq)
    curl.perform()
    if sys.version.startswith('3.'):
        http_header = hbuf.getvalue().decode('UTF-8')
    else:
        http_header = hbuf.getvalue()
    # data = parse_body(bbuf.getvalue())
    # data = bbuf.getvalue() # read entire content
    # bbuf.flush()
    # bbuf.seek(0) # to use it as a file descriptor, seek to the beginning of the stream
    # data = bbuf # leave StringIO object, which will serve as file descriptor
    # will yield data as StringIO object, i.e. provide file object
    if sys.version.startswith('3.'):
        data = io.StringIO(bbuf.getvalue().decode('UTF-8'))
    else:
        bbuf.seek(0) # to use it as a file descriptor, seek to the beginning of the stream
        data = bbuf # leave StringIO object, which will serve as file descriptor
    expire = get_expire(http_header, error_expire, verbose)
    hbuf.flush()
    # check for HTTP error
    http_code = curl.getinfo(pycurl.HTTP_CODE)
    # get HTTP status message and Expires
    http_expire = ''
    http_msg = ''
    for item in http_header.splitlines():
        if pat_http_msg.match(item):
            http_msg = item
        if pat_expires.match(item):
            http_expire = item.split('Expires:')[-1].strip()
            e_time = expire_timestamp(http_expire)
            if e_time < expire_timestamp(time0):
                expire = max(e_time, expire_timestamp(expire))
            elif e_time > time.time():
                expire = e_time
    if http_code < 200 or http_code >= 300:
        effective_url = curl.getinfo(pycurl.EFFECTIVE_URL)
        raise HTTPError(effective_url, http_code, http_msg,
                        http_header, data)
    return data, expire

def getdata_urllib(url, params, headers=None, expire=3600, post=None,
                   error_expire=300, verbose=0, ckey=None, cert=None,
                   doseq=True, system=None, tstamp=None):
    """
    Invoke URL call and retrieve data from data-service based on provided
    URL and set of parameters. Use post=True to invoke POST request.
    """
    contact = 'data-service.'
    if system:
        contact = system + ' ' + contact
    timer_key = '%s?%s' % (url, urllib.urlencode(params, doseq=True))
    das_timer(timer_key, verbose)
    encoded_data = urllib.urlencode(params, doseq=doseq)
    if not post:
        url = url + '?' + encoded_data
    if not headers:
        headers = {}
    if tstamp and 'If-Modified-Since' not in headers.keys():
        headers['If-Modified-Since'] = http_timestamp(tstamp)
    if verbose:
        print('+++ getdata, url=%s, headers=%s' % (url, headers))
    req = urllib2.Request(url)
    for key, val in headers.items():
        req.add_header(key, val)
    if verbose > 1:
        handler = urllib2.HTTPHandler(debuglevel=1)
        opener = urllib2.build_opener(handler)
        urllib2.install_opener(opener)
    if ckey and cert:
        handler = HTTPSClientAuthHandler(ckey, cert, verbose)
        opener = urllib2.build_opener(handler)
        urllib2.install_opener(opener)
    try:
        time0 = time.time()
        if post:
            data = urllib2.urlopen(req, encoded_data)
        else:
            data = urllib2.urlopen(req)
        data_srv_time = time.time() - time0
        info = data.info()
        code = data.getcode()
        if verbose > 1:
            print("+++ response code:", code)
            print("+++ response info\n", info)
        try: # get HTTP header and look for Expires
            e_time = expire_timestamp(
                    info.__dict__['dict']['expires'])
            if e_time < expire_timestamp(data_srv_time):
                expire = max(e_time, expire_timestamp(expire))
            elif e_time > time.time():
                expire = e_time
        except Exception as _exp:
            pass
    except urllib2.HTTPError as httperror:
        msg = 'HTTPError, url=%s, args=%s, headers=%s' \
                % (url, params, headers)
        data = {
            'error': 'Received HTTP error from %s data-service' % contact,
            'reason': msg
        }
        try:
            err = '%s %s' % (contact, extract_http_error(httperror.read()))
            data.update({'error': err})
            msg += '\n' + err
        except Exception as exp:
            data.update({'httperror': None})
            msg += '\n' + str(exp)
        print(msg)
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    except Exception as exp:
        msg = 'HTTPError, url=%s, args=%s, headers=%s' \
                % (url, params, headers)
        print(msg + '\n' + str(exp))
        data = {
            'error': 'Received generic error from %s data-service' % contact,
            'reason': msg
        }
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    das_timer(timer_key, verbose)
    return data, expire

def call(self, query, add_to_analytics=True, **kwds):
    """
    Top-level DAS API which executes a given query using the underlying
    data-services. It follows the following steps:

        - parse input query
        - identify data-services based on selection keys and where clause
          conditions
        - construct DAS workflow and execute data-service API calls.
          At this step individual data-services store results into DAS cache.

    Return status 0/1 depending on success of the calls; can be used by
    workers on the cache server. kwds is provided for compatibility with the
    web layer, e.g. it may invoke this method with an additional pid
    parameter.
    """
    self.logger.info('input query=%s' % query)
    das_timer('DASCore::call', self.verbose)
    services = []
    if isinstance(query, object) and hasattr(query, '__class__')\
            and query.__class__.__name__ == 'DASQuery':
        dasquery = query
    else:
        dasquery = DASQuery(query, mongoparser=self.mongoparser)
    if add_to_analytics:
        dasquery.add_to_analytics()
    query = dasquery.mongo_query
    if dasquery.mongo_query.has_key('system'):
        system = query['system']
        if isinstance(system, str) or isinstance(system, unicode):
            services = [system]
        elif isinstance(system, list):
            services = system
        else:
            msg = 'Unsupported system=%s type=%s in DAS query' \
                    % (system, type(system))
            raise Exception(msg)
    spec = query.get('spec')
    fields = query.get('fields')
    if fields == ['records']:
        msg = 'look-up all records in cache'
        self.logger.info(msg)
        return 'in cache'
    if spec == dict(records='*'):
        self.logger.info("look-up everything in cache")
        return 'in cache'
    for record in self.rawcache.find_specs(dasquery):
        status = record['das']['status']
        msg = 'found query %s in cache, status=%s\n' \
                % (record['query'], status)
        self.logger.info(msg)
        return status
    similar_dasquery = self.rawcache.similar_queries(dasquery)
    if similar_dasquery:
        for record in self.rawcache.find_specs(similar_dasquery):
            if record:
                try:
                    status = record['das']['status']
                except:
                    status = 'N/A'
                    msg = 'Fail to look-up das.status, record=%s' % record
                    self.logger.info(msg)
            msg = 'found SIMILAR query in cache,'
            msg += 'query=%s, status=%s\n' % (record['query'], status)
            self.logger.info(msg)
            return status
    self.logger.info(dasquery)
    params = dasquery.params()
    if not services:
        services = params['services']
    self.logger.info('services = %s' % services)
    das_timer('das_record', self.verbose)
    # initial expire tstamp 1 day (long enough to be overwritten by data-srv)
    expire = expire_timestamp(time.time()+1*24*60*60)
    header = dasheader("das", dasquery, expire)
    header['lookup_keys'] = []
    self.rawcache.insert_query_record(dasquery, header)
    das_timer('das_record', self.verbose)
    try:
        if self.multitask:
            jobs = []
            for srv in services:
                jobs.append(self.taskmgr.spawn(self.worker, srv, dasquery))
            self.taskmgr.joinall(jobs)
        else:
            for srv in services:
                self.worker(srv, dasquery)
    except Exception as exc:
        print_exc(exc)
        return 'fail'
    self.logger.info('\n##### merging ######\n')
    self.rawcache.update_query_record(dasquery, 'merging')
    das_timer('merge', self.verbose)
    self.rawcache.merge_records(dasquery)
    das_timer('merge', self.verbose)
    self.rawcache.update_query_record(dasquery, 'ok')
    self.rawcache.add_to_record(
            dasquery, {'das.timer': get_das_timer()}, system='das')
    das_timer('DASCore::call', self.verbose)
    return 'ok'

def getdata_urllib(url, params, headers=None, expire=3600, post=None,
                   error_expire=300, verbose=0, ckey=None, cert=None,
                   doseq=True, system=None, tstamp=None):
    """
    Invoke URL call and retrieve data from data-service based on provided
    URL and set of parameters. Use post=True to invoke POST request.
    """
    contact = 'data-service.'
    if system:
        contact = system + ' ' + contact
    timer_key = '%s?%s' % (url, urllib.urlencode(params, doseq=True))
    das_timer(timer_key, verbose)
    encoded_data = urllib.urlencode(params, doseq=doseq)
    if not post:
        url = url + '?' + encoded_data
    if not headers:
        headers = {}
    if tstamp and 'If-Modified-Since' not in headers.keys():
        headers['If-Modified-Since'] = http_timestamp(tstamp)
    if verbose:
        print('+++ getdata, url=%s, headers=%s' % (url, headers))
    req = urllib2.Request(url)
    for key, val in headers.items():
        req.add_header(key, val)
    if verbose > 1:
        handler = urllib2.HTTPHandler(debuglevel=1)
        opener = urllib2.build_opener(handler)
        urllib2.install_opener(opener)
    if ckey and cert:
        handler = HTTPSClientAuthHandler(ckey, cert, verbose)
        opener = urllib2.build_opener(handler)
        urllib2.install_opener(opener)
    try:
        time0 = time.time()
        if post:
            data = urllib2.urlopen(req, encoded_data)
        else:
            data = urllib2.urlopen(req)
        data_srv_time = time.time()-time0
        info = data.info()
        code = data.getcode()
        if verbose > 1:
            print("+++ response code:", code)
            print("+++ response info\n", info)
        try: # get HTTP header and look for Expires
            e_time = expire_timestamp(
                    info.__dict__['dict']['expires'])
            if e_time < expire_timestamp(data_srv_time):
                expire = max(e_time, expire_timestamp(expire))
            elif e_time > time.time():
                expire = e_time
        except Exception as _exp:
            pass
    except urllib2.HTTPError as httperror:
        msg = 'HTTPError, url=%s, args=%s, headers=%s' \
                % (url, params, headers)
        data = {'error': 'Received HTTP error from %s data-service' % contact,
                'reason': msg}
        try:
            err = '%s %s' % (contact, extract_http_error(httperror.read()))
            data.update({'error':err})
            msg += '\n' + err
        except Exception as exp:
            data.update({'httperror': None})
            msg += '\n' + str(exp)
        print(msg)
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    except Exception as exp:
        msg = 'HTTPError, url=%s, args=%s, headers=%s' \
                % (url, params, headers)
        print(msg + '\n' + str(exp))
        data = {'error': 'Received generic error from %s data-service' % contact,
                'reason': msg}
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    das_timer(timer_key, verbose)
    return data, expire

def __init__(self, config):
    self.config = config
    self.emptyset_expire = \
            expire_timestamp(config['das'].get('emptyset_expire', 5))
    self.dburi = config['mongodb']['dburi']
    self.cache_size = config['mongodb']['bulkupdate_size']
    self.dbname = config['dasdb']['dbname']
    self.verbose = config['verbose']
    self.logger = PrintManager('DASMongocache', self.verbose)
    self.mapping = config['dasmapping']
    self.logging = config['dasdb'].get('logging', False)
    self.rec_ttl = config['dasdb'].get('record_ttl', 24 * 60 * 60)
    self.del_ttl = config['dasdb'].get('delta_ttl', 60)
    self.cleanup_del_ttl = config['dasdb'].get('cleanup_delta_ttl', 3600)
    self.retry = config['dasdb'].get('retry', 3)
    self.das_son_manipulator = DAS_SONManipulator()

    # Initialize MongoDB connection
    self.col_ = self.config['dasdb']['cachecollection']
    self.mrcol_ = self.config['dasdb']['mrcollection']
    self.merge_ = self.config['dasdb']['mergecollection']
    self.gfs = db_gridfs(self.dburi)

    msg = "%s@%s" % (self.dburi, self.dbname)
    self.logger.info(msg)

    # ensure that we have the following indexes
    common_idx = [
        ('file.name', DESCENDING),
        ('dataset.name', DESCENDING),
        ('block.name', DESCENDING),
        ('run.run_number', DESCENDING),
    ]
    index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                  ('das.system', ASCENDING), ('qhash', DESCENDING),
                  ('das.record', ASCENDING)]
    create_indexes(self.col, index_list + common_idx)
    index_list = [('das.expire', ASCENDING), ('das_id', ASCENDING),
                  ('qhash', DESCENDING), ('das.record', ASCENDING),
                  ('das.ts', ASCENDING)]
    create_indexes(self.merge, index_list)
    # NOTE: I found that creating an index on the merge collection leads to
    # a MongoDB error when records contain multiple arrays on indexed keys.
    # For example, when we query file,run,lumi both file and run are arrays
    # in MongoDB. In this case the final sort in MongoDB barks with the
    # following message:
    #     cannot sort with keys that are parallel arrays
    # It looks like there is no fix for that yet, see
    # http://stackoverflow.com/questions/6516725/how-do-i-index-two-arrays-in-mongodb
    # Therefore I temporarily disabled the create_indexes call on the merge
    # collection which was used to have an index to ease the final sort,
    # especially in a case when a lot of records correspond to the initial
    # query, e.g. file records.
    # On the other hand, the most common use case where sort fails is
    # getting file records, and I can add one compound key to ease the sort,
    # but I can't add another compound key on an array field, e.g. run.
    common_idx = [[('qhash', DESCENDING), ('file.name', DESCENDING)]]
    create_indexes(self.merge, index_list + common_idx)

    # thread which cleans up DAS collections
    thname = 'mongocache_cleanup'
    cols = [
        config['dasdb']['cachecollection'],
        config['dasdb']['mrcollection'],
        config['dasdb']['mergecollection']
    ]

def getdata_urllib(
    url,
    params,
    headers=None,
    expire=3600,
    post=None,
    error_expire=300,
    verbose=0,
    ckey=None,
    cert=None,
    doseq=True,
    system=None,
):
    """
    Invoke URL call and retrieve data from data-service based on provided
    URL and set of parameters. Use post=True to invoke POST request.
    """
    contact = "data-service."
    if system:
        contact = system + " " + contact
    timer_key = "%s?%s" % (url, urllib.urlencode(params, doseq=True))
    das_timer(timer_key, verbose)
    encoded_data = urllib.urlencode(params, doseq=doseq)
    if not post:
        url = url + "?" + encoded_data
    if not headers:
        headers = {}
    if verbose:
        print "+++ getdata, url=%s, headers=%s" % (url, headers)
    req = urllib2.Request(url)
    for key, val in headers.iteritems():
        req.add_header(key, val)
    if verbose > 1:
        handler = urllib2.HTTPHandler(debuglevel=1)
        opener = urllib2.build_opener(handler)
        urllib2.install_opener(opener)
    if ckey and cert:
        handler = HTTPSClientAuthHandler(ckey, cert, verbose)
        opener = urllib2.build_opener(handler)
        urllib2.install_opener(opener)
    try:
        time0 = time.time()
        if post:
            data = urllib2.urlopen(req, encoded_data)
        else:
            data = urllib2.urlopen(req)
        data_srv_time = time.time() - time0
        try:
            # get HTTP header and look for Expires
            e_time = expire_timestamp(data.info().__dict__["dict"]["expires"])
            if e_time < expire_timestamp(data_srv_time):
                expire = max(e_time, expire_timestamp(expire))
            elif e_time > time.time():
                expire = e_time
        except Exception as _exp:
            pass
    except urllib2.HTTPError as httperror:
        msg = "HTTPError, url=%s, args=%s, headers=%s" % (url, params, headers)
        data = {"error": "Unable to contact %s" % contact, "reason": msg}
        try:
            err = "%s %s" % (contact, extract_http_error(httperror.read()))
            data.update({"error": err})
            msg += "\n" + err
        except Exception as exp:
            data.update({"httperror": None})
            msg += "\n" + str(exp)
        print msg
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    except Exception as exp:
        msg = "HTTPError, url=%s, args=%s, headers=%s" % (url, params, headers)
        print msg + "\n" + str(exp)
        data = {"error": "Unable to contact %s" % contact, "reason": msg}
        data = json.dumps(data)
        expire = expire_timestamp(error_expire)
    das_timer(timer_key, verbose)
    return data, expire