Example #1
 def fetch(self, api, params=None):
     "Fetch data for given api"
     if  api == 'sites':
         api = 'site-names'
     url = '%s/%s' % (self.url, api)
     data = super(SiteDBService, self).fetch(url, params)
     for row in sitedb_parser(data):
         if  api == 'people':
             rid = genkey(str(row['dn']), truncate=5)
             rec = {'dn':row['dn'], 'rid':rid}
         elif api == 'site-names':
             rid = genkey(str(row['alias']), truncate=5)
             rec = {'site':row['alias'], 'rid':rid}
         else: # unknown api: skip instead of yielding an unbound rec
             continue
         yield rec
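All of these examples lean on a genkey helper that is never shown. A minimal sketch consistent with its call sites (genkey(value, truncate=5) above, genkey(value, salt, 5) further down) might look like the following; the real implementation may differ.

import hashlib

def genkey(data, salt='', truncate=0):
    """Hypothetical sketch of genkey: a salted md5 hex digest,
    optionally truncated to the first N characters."""
    digest = hashlib.md5(('%s%s' % (salt, data)).encode('utf-8')).hexdigest()
    return digest[:truncate] if truncate else digest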
Example #2
 def fetch(self, url, params, cache=True):
     "Fetch data for given api"
     debug = 0
     data = "[]" # default payload: an empty JSON list
     if  cache:
         docid = genkey("url=%s params=%s" % (url, params))
         res = self.storage.fetch_one('cache', {'_id':docid})
         if  res and 'data' in res:
             if  self.verbose:
                 print("%s::fetch url=%s, params=%s, docid=%s" \
                         % (self.name, url, params, docid))
             return res['data']
     if  self.verbose:
         print("%s::fetch url=%s, params=%s" % (self.name, url, params))
         debug = self.verbose-1
     try:
         data = getdata(url, params, debug=debug)
     except Exception as exc:
         print(str(exc))
         # retry up to three times with a short pause between attempts
         # (xrange is Python 2; use range under Python 3)
         for attempt in xrange(3):
             time.sleep(0.1)
             print("Attempt %s" % attempt)
             try:
                 data = getdata(url, params, debug=debug)
                 break
             except Exception as err:
                 print(str(err))
     if  cache:
         self.storage.insert('cache', {'_id':docid, 'data': data, 'url': url, 'params': params})
     return data
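The one-initial-attempt-plus-retries shape above recurs in several of these services. A hypothetical standalone helper capturing it could look like this; getdata is passed in as a callable, and unlike the example the final failure is re-raised instead of falling back to the "[]" default.

import time

def fetch_with_retries(getdata, url, params, retries=3, delay=0.1, debug=0):
    """Hypothetical helper: one initial attempt plus a fixed number of
    retries with a short pause in between; re-raises the last error."""
    last_exc = None
    for attempt in range(1 + retries):
        try:
            return getdata(url, params, debug=debug)
        except Exception as exc:
            last_exc = exc
            print("Attempt %s failed: %s" % (attempt, exc))
            time.sleep(delay)
    raise last_exc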
Example #3
 def fetch(self, api, params=None):
     "Fetch data for given api"
     url = '%s/%s/?%s' % (self.url, api, urllib.urlencode(params or {}, doseq=True))
     docid = genkey("url=%s params=%s" % (url, params))
     res = self.storage.fetch_one('cache', {'_id':docid})
     if  res and 'data' in res:
         if  self.verbose:
             print("%s::fetch url=%s, params=%s, docid=%s" \
                     % (self.name, url, params, docid))
         data = res['data']
     else:
         if  self.verbose:
             print("%s::fetch url=%s, params=%s" % (self.name, url, params))
         # NOTE: the popularity DB has two different access points, one
         # within the CERN network and one outside. The former does not
         # require authentication, while the latter passes through CERN
         # SSO. The following block reflects this; in the future, when
         # the popularity DB moves into the cmsweb domain, we will no
         # longer need it.
         if  self.url.find('cms-popularity-prod') != -1 or \
                 self.url.find('cmsweb') != -1:
             data = getdata(url, ckey=self.ckey, cert=self.cert, debug=self.verbose)
         else:
             data = sso_getdata(url, ckey=self.ckey, cert=self.cert, debug=self.verbose)
         self.storage.insert('cache', {'_id':docid, 'data': data, 'url': url, 'params': params})
     data = json.loads(data)
     for row in data['DATA']:
         yield row
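The cache layer in these fetch methods is exercised through exactly two calls, self.storage.fetch_one('cache', spec) and self.storage.insert('cache', doc). A hypothetical in-memory stand-in with just that interface is enough to follow the logic without a real backend:

class DictStorage(object):
    """Hypothetical in-memory stand-in for the self.storage object used
    throughout these examples; it mimics only fetch_one and insert."""
    def __init__(self):
        self.colls = {}
    def fetch_one(self, coll, spec):
        "Return the first document matching all spec key/value pairs"
        for doc in self.colls.get(coll, []):
            if all(doc.get(key) == val for key, val in spec.items()):
                return doc
        return None
    def insert(self, coll, doc):
        "Append a document to the named collection"
        self.colls.setdefault(coll, []).append(doc)

storage = DictStorage()
storage.insert('cache', {'_id': 'abc', 'data': '[]'})
print(storage.fetch_one('cache', {'_id': 'abc'}))  # {'_id': 'abc', 'data': '[]'}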
Example #4
def convert(config, sep=',', sortby='tier'):
    "Lookup DBS data tiers"
    # NB: sortby only selects the sort key (tier name vs. hashed id);
    # both branches print "<tid><sep><tier>" lines
    dbs = DBSService(config)
    tiers = {}
    salt = config.get('core', {}).get('salt', 'secret sauce')
    for tier in dbs.data_tiers():
        tid = genkey(tier, salt, 5)
        if  sortby == 'tier':
            tiers[tier] = tid
        else:
            tiers[tid] = tier
    for tier in sorted(tiers.keys()):
        if  sortby == 'tier':
            print('%s%s%s' % (tiers[tier], sep, tier))
        else:
            print('%s%s%s' % (tier, sep, tiers[tier]))
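Since both branches print "<tid><sep><tier>" and differ only in the sort key, a worked sketch with made-up 5-character ids standing in for genkey output shows the sortby='tier' ordering:

tiers = {'AOD': '5d41a', 'GEN-SIM': '7d793', 'MINIAOD': '2063c'}  # made-up ids
for tier in sorted(tiers.keys()):
    print('%s,%s' % (tiers[tier], tier))
# 5d41a,AOD
# 7d793,GEN-SIM
# 2063c,MINIAOD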
Example #5
 def fetch(self, api, params=None):
     "Fetch data for given api"
     url = '%s/%s/?%s' % (self.url, api, urllib.urlencode(params or {}, doseq=True))
     docid = genkey("url=%s params=%s" % (url, params))
     res = self.storage.fetch_one('cache', {'_id':docid})
     if  res and 'data' in res:
         if  self.verbose:
             print("%s::fetch url=%s, params=%s, docid=%s" \
                     % (self.name, url, params, docid))
         data = res['data']
     else:
         if  self.verbose:
             print("%s::fetch url=%s, params=%s" % (self.name, url, params))
         data = getdata(url, ckey=self.ckey, cert=self.cert, debug=self.verbose)
         self.storage.insert('cache', {'_id':docid, 'data': data, 'url': url, 'params': params})
     data = json.loads(data)
     for row in data['DATA']:
         yield row
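Both the cached and the fresh responses are JSON strings whose top level carries a DATA list (see the json.loads(data)['DATA'] lines). A minimal payload of that shape, with hypothetical row fields:

import json

payload = '{"DATA": [{"field": 1}, {"field": 2}]}'  # assumed row contents
for row in json.loads(payload)['DATA']:
    print(row)  # {'field': 1} then {'field': 2}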
Example #6
 def fetch(self, api, params=None, dbsinst='prod/global', cache=True):
     "Fetch data for given api"
     if dbsinst:
         dbs_url = self.url.replace('prod/global', dbsinst)
     else: # fall back to the default DBS instance
         dbsinst = 'prod/global'
         dbs_url = self.url
     inst = {'dbs_instance': self.all_dbs.index(dbsinst)}
     if api == 'releases':
         url = '%s/releaseversions' % dbs_url
     else:
         url = '%s/%s' % (dbs_url, api)
     data = json.loads(super(DBSService, self).fetch(url, params, cache))
     if api == 'releases':
         data = data[0]['release_version']
     for row in data:
         if api == 'datasets':
             try:
                 row['rid'] = row['dataset_id']
             except KeyError:
                 print("Unable to process dataset row", row)
                 if 'dataset' in row:
                     h = hashlib.md5()
                     h.update(row['dataset'])
                     row['rid'] = int(h.hexdigest()[:10], 16)
                     print("Generated new dataset_id", row['dataset'],
                           h.hexdigest(), row['rid'])
             except Exception:
                 print("Unable to process dataset row", row)
                 raise
             row.update(inst)
             yield row
         elif api == 'releases':
             rid = genkey(row, truncate=5)
             rec = {'release': row, 'rid': rid}
             yield rec
         else: # filesummaries and any other api: yield rows unchanged
             yield row
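The KeyError fallback above derives a stable integer id from the dataset name. A standalone sketch of that computation:

import hashlib

def fallback_rid(dataset):
    """Standalone sketch of the fallback id: the first 10 hex digits
    of the dataset name's md5, read as an integer."""
    h = hashlib.md5()
    h.update(dataset.encode('utf-8'))  # Python 3 needs bytes; Python 2 takes str
    return int(h.hexdigest()[:10], 16)

print(fallback_rid('/Prim/Proc/TIER'))  # deterministic integer id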
Example #7
 def dataset_info(self, timeframe, dataset, dtypes, stypes, rtypes, tiers, dformat, target=0):
     "Return common dataset info in specified data format"
     dbsinst = self.dbs.dataset_dbsinst(dataset)
     if  not dbsinst:
         return
     row = self.dbs.dataset_info(dataset, dbsinst)
     if  row:
         if  self.multitask:
             releases, sites, parents, summary, dashboard = \
                     self.dataset_info_all(dataset, dbsinst, timeframe)
         else:
             releases = [rname for rname in self.dbs.dataset_release_versions(dataset, dbsinst)]
             sites = [sname for sname in self.phedex.sites(dataset)]
             parents = [r for r in self.dbs.dataset_parents(dataset, dbsinst)]
             summary = self.dbs.dataset_summary(dataset, dbsinst)
             dashboard = self.dashboard.dataset_info(dataset, timeframe[0], timeframe[1])
         nrels = len(releases)
         series = rtypes['series']
         majors = rtypes['majors']
         minors = rtypes['minors']
         relclf = rtypes['rtypes']
         for rel in releases:
             rserie, rmajor, rminor = rel_ver(rel)
             if  not cmssw_test(rserie, rmajor, rminor):
                 continue
             rtype = rel_type(rel)
             # count only keys pre-seeded in the rtypes maps
             try:
                 series['rel1_%s'%rserie] += 1
             except KeyError:
                 pass
             try:
                 majors['rel2_%s'%rmajor] += 1
             except KeyError:
                 pass
             try:
                 minors['rel3_%s'%rminor] += 1
             except KeyError:
                 pass
             try:
                 relclf['relt_%s'%rtype] += 1
             except KeyError:
                 pass
         nsites = len(sites)
         for site in sites:
             stier = site_tier(site)
             stypes['s_%s'%stier] += 1
         dataset_id = row['rid']
         era = genkey(row['acquisition_era_name'], self.salt, 5)
         create_dn = self.sitedb.dnid(row['create_by'])
         dbsinstid = row['dbs_instance']
         dtype = row['primary_ds_type']
         # the number of data types is small, so a simple
         # list look-up shouldn't be a problem
         if  dtype not in dtypes:
             dtypes.append(dtype)
         dtype = dtypes.index(dtype)
         _, prim, proc, tier = dataset.split('/')
         prim = genkey(prim, self.salt, 5)
         proc = genkey(proc, self.salt, 5)
         if  tier not in tiers:
             tiers.append(tier)
         tier = genkey(tier, self.salt, 5)
         parent = parents[0] if len(parents) else 0
         uid = genuid(yyyymmdd(timeframe[0]), dbsinstid, dataset_id)
         size_norm = 2**30 # normalization factor for file size
         rec = dict(id=uid, dataset=dataset_id, primds=prim, procds=proc, tier=tier,
                 dtype=dtype, creator=create_dn, nrel=nrels, nsites=nsites,
                 parent=parent, era=era, dbs=dbsinstid,
                 nfiles=summary.get('num_file', 0),
                 nlumis=summary.get('num_lumi', 0),
                 nblk=summary.get('num_block', 0),
                 nevt=summary.get('num_event', 0),
                 size=summary.get('file_size', 0)/size_norm,
                 cpu=dashboard.get('cpu', 0),
                 wct=dashboard.get('wct', 0),
                 proc_evts=dashboard.get('nevt', 0))
         if  isinstance(target, dict):
             rec.update(target)
         for key, val in series.items():
             rec.update({key:val})
         for key, val in majors.items():
             rec.update({key:val})
         for key, val in minors.items():
             rec.update({key:val})
         for key, val in relclf.items():
             rec.update({key:val})
         for key, val in stypes.items():
             rec.update({key:val})
         headers = rec.keys()
         headers.sort()
         headers.remove('id')
         headers = ['id'] + headers # let dataset id be the first column
         if  dformat == 'headers':
             yield headers
         elif  dformat == 'csv':
             res = [str(rec[h]) for h in headers]
             yield ','.join(res)
         elif dformat == 'vw':
             target_str = target.get('rnaccess')
             vals = ' '.join([str(rec[h]) for h in headers])
             uid = genkey(vals, self.salt, 5) # unique row identifier
             vwrow = "%s '%s |f %s" % (target_str, uid, vals)
             yield vwrow
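The 'vw' branch emits rows in Vowpal Wabbit input format: a label, a quoted tag, and the feature values in an |f namespace. A standalone sketch of that formatting (names hypothetical):

def vw_row(label, tag, values):
    "Assemble a Vowpal Wabbit input line: <label> '<tag> |f <values>"
    return "%s '%s |f %s" % (label, tag, ' '.join(str(v) for v in values))

print(vw_row(1, 'a1b2c', [42, 0.5, 7]))  # 1 'a1b2c |f 42 0.5 7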
Example #8
 def dataset_info(self,
                  timeframe,
                  dataset,
                  dtypes,
                  stypes,
                  rtypes,
                  tiers,
                  dformat,
                  target=0):
     "Return common dataset info in specified data format"
     dbsinst = self.dbs.dataset_dbsinst(dataset)
     if not dbsinst:
         return
     row = self.dbs.dataset_info(dataset, dbsinst)
     if row:
         if self.multitask:
             releases, sites, parents, summary, dashboard = \
                     self.dataset_info_all(dataset, dbsinst, timeframe)
         else:
             releases = [
                 rname for rname in self.dbs.dataset_release_versions(
                     dataset, dbsinst)
             ]
             sites = [sname for sname in self.phedex.sites(dataset)]
             parents = [
                 r for r in self.dbs.dataset_parents(dataset, dbsinst)
             ]
             summary = self.dbs.dataset_summary(dataset, dbsinst)
             dashboard = self.dashboard.dataset_info(
                 dataset, timeframe[0], timeframe[1])
         nrels = len(releases)
         # fresh zero counters for each classification map
         series = dict.fromkeys(rtypes['series'], 0)
         majors = dict.fromkeys(rtypes['majors'], 0)
         minors = dict.fromkeys(rtypes['minors'], 0)
         relclf = dict.fromkeys(rtypes['rtypes'], 0)
         for rel in releases:
             rserie, rmajor, rminor = rel_ver(rel)
             if not cmssw_test(rserie, rmajor, rminor):
                 continue
             rtype = rel_type(rel)
             # count only keys pre-seeded in the rtypes maps
             try:
                 series['rel1_%s' % rserie] += 1
             except KeyError:
                 pass
             try:
                 majors['rel2_%s' % rmajor] += 1
             except KeyError:
                 pass
             try:
                 minors['rel3_%s' % rminor] += 1
             except KeyError:
                 pass
             try:
                 relclf['relt_%s' % rtype] += 1
             except KeyError:
                 pass
         nsites = len(sites)
         for site in sites:
             stier = site_tier(site)
             stypes['s_%s' % stier] += 1
         dataset_id = row['rid']
         era = genkey(row['acquisition_era_name'], self.salt, 5)
         create_dn = self.sitedb.dnid(row['create_by'])
         dbsinstid = row['dbs_instance']
         dtype = row['primary_ds_type']
         # the number of data types is small, so a simple
         # list look-up shouldn't be a problem
         if dtype not in dtypes:
             dtypes.append(dtype)
         dtype = dtypes.index(dtype)
         _, prim, proc, tier = dataset.split('/')
         prim = genkey(prim, self.salt, 5)
         proc = genkey(proc, self.salt, 5)
         if tier not in tiers:
             tiers.append(tier)
         tier = genkey(tier, self.salt, 5)
         parent = parents[0] if len(parents) else 0
         uid = genuid(yyyymmdd(timeframe[0]), dbsinstid, dataset_id)
         size_norm = 2**30  # normalization factor for file size
         if not summary:
             summary = {}  # we need a dict type
         rec = dict(id=uid,
                    dataset=dataset_id,
                    primds=prim,
                    procds=proc,
                    tier=tier,
                    dtype=dtype,
                    creator=create_dn,
                    nrel=nrels,
                    nsites=nsites,
                    parent=parent,
                    era=era,
                    dbs=dbsinstid,
                    nfiles=summary.get('num_file', 0),
                    nlumis=summary.get('num_lumi', 0),
                    nblk=summary.get('num_block', 0),
                    nevt=summary.get('num_event', 0),
                    size=summary.get('file_size', 0) / size_norm,
                    cpu=dashboard.get('cpu', 0),
                    wct=dashboard.get('wct', 0),
                    proc_evts=dashboard.get('nevt', 0))
         if isinstance(target, dict):
             rec.update(target)
         for key, val in series.items():
             rec.update({key: val})
         for key, val in majors.items():
             rec.update({key: val})
         for key, val in minors.items():
             rec.update({key: val})
         for key, val in relclf.items():
             rec.update({key: val})
         for key, val in stypes.items():
             rec.update({key: val})
         headers = rec.keys()
         headers.sort()
         headers.remove('id')
         headers = ['id'] + headers  # let dataset id be the first column
         if dformat == 'headers':
             yield headers
         elif dformat == 'csv':
             res = [str(rec[h]) for h in headers]
             yield ','.join(res)
         elif dformat == 'vw':
             target_str = target.get('rnaccess')
             vals = ' '.join([str(rec[h]) for h in headers])
             uid = genkey(vals, self.salt, 5)  # unique row identifier
             vwrow = "%s '%s |f %s" % (target_str, uid, vals)
             yield vwrow
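One portability note on the header handling in the last two examples: headers = rec.keys() followed by headers.sort() only works under Python 2, where keys() returns a list; under Python 3 it raises AttributeError. A version that behaves the same on both:

rec = {'id': 1, 'dataset': 2, 'tier': 3}   # stand-in record
headers = sorted(rec)                      # sorted key list on Python 2 and 3
headers.remove('id')
headers = ['id'] + headers                 # let dataset id be the first column
print(headers)                             # ['id', 'dataset', 'tier']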