def fetch(self, api, params=None): "Fetch data for given api" if api == 'sites': api = 'site-names' url = '%s/%s' % (self.url, api) data = super(SiteDBService, self).fetch(url, params) for row in sitedb_parser(data): if api == 'people': rid = genkey(str(row['dn']), truncate=5) rec = {'dn':row['dn'], 'rid':rid} if api == 'site-names': rid = genkey(str(row['alias']), truncate=5) rec = {'site':row['alias'], 'rid':rid} yield rec
def fetch(self, url, params, cache=True):
    "Fetch data for given api"
    debug = 0
    data = "[]"  # fallback payload if all attempts fail
    if cache:
        docid = genkey("url=%s params=%s" % (url, params))
        res = self.storage.fetch_one('cache', {'_id': docid})
        if res and 'data' in res:
            if self.verbose:
                print("%s::fetch url=%s, params=%s, docid=%s" \
                        % (self.name, url, params, docid))
            return res['data']
    if self.verbose:
        print("%s::fetch url=%s, params=%s" % (self.name, url, params))
        debug = self.verbose - 1
    try:
        data = getdata(url, params, debug=debug)
    except Exception as exc:
        print(str(exc))
        # retry a few times with a short pause before giving up
        for attempt in xrange(3):
            time.sleep(0.1)
            print("Attempt %s" % attempt)
            try:
                data = getdata(url, params, debug=debug)
                break
            except Exception as err:
                print(str(err))
    if cache:
        self.storage.insert('cache',
            {'_id': docid, 'data': data, 'url': url, 'params': params})
    return data
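# The retry logic above is easy to exercise in isolation. Below is a minimal,
# self-contained sketch of the same pattern (one attempt, then a fixed number
# of retries with a short pause). The 'getdata' argument here is a stand-in
# callable, not the project's real HTTP helper or its exact signature.
import time

def fetch_with_retries(getdata, url, params, attempts=3, pause=0.1):
    "Return payload from getdata, retrying a few times before giving up"
    data = "[]"  # fallback payload, mirroring the fetch method above
    try:
        return getdata(url, params)
    except Exception as exc:
        print(str(exc))
    for attempt in range(attempts):
        time.sleep(pause)
        print("Attempt %s" % attempt)
        try:
            return getdata(url, params)
        except Exception as err:
            print(str(err))
    return data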
def fetch(self, api, params=None): "Fetch data for given api" url = '%s/%s/?%s' % (self.url, api, urllib.urlencode(params, doseq=True)) docid = genkey("url=%s params=%s" % (url, params)) res = self.storage.fetch_one('cache', {'_id':docid}) if res and 'data' in res: if self.verbose: print("%s::fetch url=%s, params=%s, docid=%s" \ % (self.name, url, params, docid)) data = res['data'] else: if self.verbose: print("%s::fetch url=%s, params=%s" % (self.name, url, params)) # NOTE: popularity DB has two different access points, one # within CERN network and out outside. The former does not require # authentication, while later passes through CERN SSO. # The following block reflects this, in a future, when popularity DB # will move into cmsweb domain we'll no longer need it if self.url.find('cms-popularity-prod') != -1 or \ self.url.find('cmsweb') != -1: data = getdata(url, ckey=self.ckey, cert=self.cert, debug=self.verbose) else: data = sso_getdata(url, ckey=self.ckey, cert=self.cert, debug=self.verbose) self.storage.insert('cache', {'_id':docid, 'data': data, 'url': url, 'params': params}) data = json.loads(data) for row in data['DATA']: yield row
def convert(config, sep=',', sortby='tier'):
    "Lookup DBS data tiers"
    dbs = DBSService(config)
    tiers = {}
    salt = config.get('core', {}).get('salt', 'secret sauce')
    for tier in dbs.data_tiers():
        tid = genkey(tier, salt, 5)
        if sortby == 'tier':
            tiers[tier] = tid
        else:
            tiers[tid] = tier
    for tier in sorted(tiers.keys()):
        if sortby == 'tier':
            print('%s%s%s' % (tiers[tier], sep, tier))
        else:
            print('%s%s%s' % (tier, sep, tiers[tier]))
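# convert() relies on the project's genkey helper to turn a data-tier name
# plus a salt into a short, stable identifier. The real genkey implementation
# is not shown in this section; the function below is only a sketch with the
# same call shape, assuming an MD5-style salted digest truncated to a few
# characters, and may differ from the actual helper.
import hashlib

def genkey_sketch(value, salt='', truncate=5):
    "Hypothetical salted-hash key generator with optional truncation"
    digest = hashlib.md5(('%s%s' % (salt, value)).encode('utf-8')).hexdigest()
    return digest[:truncate] if truncate else digest

# Example: genkey_sketch('AOD', 'secret sauce', 5) yields the same short id
# for the same tier/salt pair on every run.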
def fetch(self, api, params=None): "Fetch data for given api" url = '%s/%s/?%s' % (self.url, api, urllib.urlencode(params, doseq=True)) docid = genkey("url=%s params=%s" % (url, params)) res = self.storage.fetch_one('cache', {'_id':docid}) if res and 'data' in res: if self.verbose: print("%s::fetch url=%s, params=%s, docid=%s" \ % (self.name, url, params, docid)) data = res['data'] else: if self.verbose: print("%s::fetch url=%s, params=%s" % (self.name, url, params)) data = getdata(url, ckey=self.ckey, cert=self.cert, debug=self.verbose) self.storage.insert('cache', {'_id':docid, 'data': data, 'url': url, 'params': params}) data = json.loads(data) for row in data['DATA']: yield row
def fetch(self, api, params=None, dbsinst='prod/global', cache=True):
    "Fetch data for given api"
    if dbsinst:
        dbs_url = self.url.replace('prod/global', dbsinst)
        inst = {'dbs_instance': self.all_dbs.index(dbsinst)}
    if api == 'releases':
        url = '%s/releaseversions' % dbs_url
    else:
        url = '%s/%s' % (dbs_url, api)
    data = json.loads(super(DBSService, self).fetch(url, params, cache))
    if api == 'releases':
        data = data[0]['release_version']
    for row in data:
        if api == 'datasets':
            try:
                row['rid'] = row['dataset_id']
            except KeyError:
                print("Unable to process dataset row", row)
                if 'dataset' in row:
                    # fall back to a deterministic id derived from the
                    # dataset name via MD5
                    h = hashlib.md5()
                    h.update(row['dataset'])
                    row['rid'] = int(h.hexdigest()[:10], 16)
                    print("Generated new dataset_id", row['dataset'],
                          h.hexdigest(), row['rid'])
            except:
                print("Unable to process dataset row", row)
                raise
            row.update(inst)
            yield row
        elif api == 'releases':
            rid = genkey(row, truncate=5)
            rec = {'release': row, 'rid': rid}
            yield rec
        elif api == 'filesummaries':
            yield row
        else:
            yield row
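# The KeyError branch above derives a stable integer id whenever DBS does not
# return a 'dataset_id': it hashes the dataset name with MD5 and interprets
# the first 10 hex digits as an integer. A standalone version of that fallback
# is sketched below; the dataset name in the example is made up.
import hashlib

def fallback_dataset_id(dataset):
    "Deterministic integer id derived from a dataset name via MD5"
    digest = hashlib.md5(dataset.encode('utf-8')).hexdigest()
    return int(digest[:10], 16)

# fallback_dataset_id('/Primary/Processed-v1/AOD') returns the same integer
# on every call, so repeated runs assign the same id to the same dataset.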
def dataset_info(self, timeframe, dataset, dtypes, stypes, rtypes, tiers, dformat, target=0): "Return common dataset info in specified data format" dbsinst = self.dbs.dataset_dbsinst(dataset) if not dbsinst: return row = self.dbs.dataset_info(dataset, dbsinst) if row: if self.multitask: releases, sites, parents, summary, dashboard = \ self.dataset_info_all(dataset, dbsinst, timeframe) else: releases = [rname for rname in self.dbs.dataset_release_versions(dataset, dbsinst)] sites = [sname for sname in self.phedex.sites(dataset)] parents = [r for r in self.dbs.dataset_parents(dataset, dbsinst)] summary = self.dbs.dataset_summary(dataset, dbsinst) dashboard = self.dashboard.dataset_info(dataset, timeframe[0], timeframe[1]) nrels = len(releases) series = rtypes['series'] majors = rtypes['majors'] minors = rtypes['minors'] relclf = rtypes['rtypes'] for rel in releases: rserie, rmajor, rminor = rel_ver(rel) if not cmssw_test(rserie, rmajor, rminor): continue rtype = rel_type(rel) try: series['rel1_%s'%rserie] += 1 except: pass try: majors['rel2_%s'%rmajor] += 1 except: pass try: minors['rel3_%s'%rminor] += 1 except: pass try: relclf['relt_%s'%rtype] += 1 except: pass nsites = len(sites) for site in sites: stier = site_tier(site) stypes['s_%s'%stier] += 1 dataset_id = row['rid'] era = genkey(row['acquisition_era_name'], self.salt, 5) create_dn = self.sitedb.dnid(row['create_by']) dbsinstid = row['dbs_instance'] dtype = row['primary_ds_type'] # number of data types should be small and simple # list look-up shouldn't be a problem if dtype not in dtypes: dtypes.append(dtype) dtype = dtypes.index(dtype) _, prim, proc, tier = dataset.split('/') prim = genkey(prim, self.salt, 5) proc = genkey(proc, self.salt, 5) if tier not in tiers: tiers.append(tier) tier = genkey(tier, self.salt, 5) parent = parents[0] if len(parents) else 0 uid = genuid(yyyymmdd(timeframe[0]), dbsinstid, dataset_id) size_norm = 2**30 # normalization factor for file size rec = dict(id=uid, dataset=dataset_id, primds=prim, procds=proc, tier=tier, dtype=dtype, creator=create_dn, nrel=nrels, nsites=nsites, parent=parent, era=era, dbs=dbsinstid, nfiles=summary.get('num_file', 0), nlumis=summary.get('num_lumi', 0), nblk=summary.get('num_block', 0), nevt=summary.get('num_event', 0), size=summary.get('file_size', 0)/size_norm, cpu=dashboard.get('cpu', 0), wct=dashboard.get('wct', 0), proc_evts=dashboard.get('nevt', 0)) if isinstance(target, dict): rec.update(target) for key,val in series.items(): rec.update({key:val}) for key, val in majors.items(): rec.update({key:val}) for key, val in minors.items(): rec.update({key:val}) for key, val in relclf.items(): rec.update({key:val}) for key, val in stypes.items(): rec.update({key:val}) headers = rec.keys() headers.sort() headers.remove('id') headers = ['id'] + headers # let dataset id be the first column if dformat == 'headers': yield headers elif dformat == 'csv': res = [str(rec[h]) for h in headers] yield ','.join(res) elif dformat == 'vw': target_str = target.get('rnaccess') vals = ' '.join([str(rec[h]) for h in headers]) uid = genkey(vals, self.salt, 5) # unique row identified vwrow = "%s '%s |f %s" % (target_str, uid, vals) yield vwrow
def dataset_info(self, timeframe, dataset, dtypes, stypes, rtypes, tiers, dformat, target=0): "Return common dataset info in specified data format" dbsinst = self.dbs.dataset_dbsinst(dataset) if not dbsinst: return row = self.dbs.dataset_info(dataset, dbsinst) if row: if self.multitask: releases, sites, parents, summary, dashboard = \ self.dataset_info_all(dataset, dbsinst, timeframe) else: releases = [ rname for rname in self.dbs.dataset_release_versions( dataset, dbsinst) ] sites = [sname for sname in self.phedex.sites(dataset)] parents = [ r for r in self.dbs.dataset_parents(dataset, dbsinst) ] summary = self.dbs.dataset_summary(dataset, dbsinst) dashboard = self.dashboard.dataset_info( dataset, timeframe[0], timeframe[1]) nrels = len(releases) series = {} for k in rtypes['series'].keys(): series[k] = 0 majors = {} for k in rtypes['majors'].keys(): majors[k] = 0 minors = {} for k in rtypes['minors'].keys(): minors[k] = 0 relclf = {} for k in rtypes['rtypes'].keys(): relclf[k] = 0 for rel in releases: rserie, rmajor, rminor = rel_ver(rel) if not cmssw_test(rserie, rmajor, rminor): continue rtype = rel_type(rel) try: series['rel1_%s' % rserie] += 1 except: pass try: majors['rel2_%s' % rmajor] += 1 except: pass try: minors['rel3_%s' % rminor] += 1 except: pass try: relclf['relt_%s' % rtype] += 1 except: pass nsites = len(sites) for site in sites: stier = site_tier(site) stypes['s_%s' % stier] += 1 dataset_id = row['rid'] era = genkey(row['acquisition_era_name'], self.salt, 5) create_dn = self.sitedb.dnid(row['create_by']) dbsinstid = row['dbs_instance'] dtype = row['primary_ds_type'] # number of data types should be small and simple # list look-up shouldn't be a problem if dtype not in dtypes: dtypes.append(dtype) dtype = dtypes.index(dtype) _, prim, proc, tier = dataset.split('/') prim = genkey(prim, self.salt, 5) proc = genkey(proc, self.salt, 5) if tier not in tiers: tiers.append(tier) tier = genkey(tier, self.salt, 5) parent = parents[0] if len(parents) else 0 uid = genuid(yyyymmdd(timeframe[0]), dbsinstid, dataset_id) size_norm = 2**30 # normalization factor for file size if not summary: summary = {} # we need a dict type rec = dict(id=uid, dataset=dataset_id, primds=prim, procds=proc, tier=tier, dtype=dtype, creator=create_dn, nrel=nrels, nsites=nsites, parent=parent, era=era, dbs=dbsinstid, nfiles=summary.get('num_file', 0), nlumis=summary.get('num_lumi', 0), nblk=summary.get('num_block', 0), nevt=summary.get('num_event', 0), size=summary.get('file_size', 0) / size_norm, cpu=dashboard.get('cpu', 0), wct=dashboard.get('wct', 0), proc_evts=dashboard.get('nevt', 0)) if isinstance(target, dict): rec.update(target) for key, val in series.items(): rec.update({key: val}) for key, val in majors.items(): rec.update({key: val}) for key, val in minors.items(): rec.update({key: val}) for key, val in relclf.items(): rec.update({key: val}) for key, val in stypes.items(): rec.update({key: val}) headers = rec.keys() headers.sort() headers.remove('id') headers = ['id'] + headers # let dataset id be the first column if dformat == 'headers': yield headers elif dformat == 'csv': res = [str(rec[h]) for h in headers] yield ','.join(res) elif dformat == 'vw': target_str = target.get('rnaccess') vals = ' '.join([str(rec[h]) for h in headers]) uid = genkey(vals, self.salt, 5) # unique row identified vwrow = "%s '%s |f %s" % (target_str, uid, vals) yield vwrow
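# The dformat == 'vw' branch above writes one Vowpal Wabbit line per dataset:
# "<target> '<tag> |f <space-separated feature values>", with the id column
# kept first. Below is a self-contained sketch of that assembly; the record,
# target value, and tag are made-up examples, not real DBS output.
def to_vw_row(rec, target, tag):
    "Build a Vowpal Wabbit row from a record dict, id column first"
    headers = sorted(rec.keys())
    headers.remove('id')
    headers = ['id'] + headers
    vals = ' '.join([str(rec[h]) for h in headers])
    return "%s '%s |f %s" % (target, tag, vals)

# Example:
# to_vw_row({'id': 123, 'nfiles': 10, 'size': 2.5}, 1, 'abc12')
# -> "1 'abc12 |f 123 10 2.5"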