def dataframe(self, timeframe, seed, dformat, dbs_extra, newdata=None):
    """Form a dataframe from various CMS data-providers"""
    dtypes, stypes, rtypes, tiers = self.data_types()
    pop_datasets = 0
    dbs_datasets = 0
    popdb_results = [r for r in self.popdb.dataset_stat(timeframe[0], timeframe[1])]
    if dformat == 'csv':
        dataset = None
        for row in popdb_results:
            if len(row['dataset'].split('/')) == 4: # dataset with 3 slashes
                dataset = row['dataset']
                break
        if not dataset:
            raise Exception("Unable to find valid dataset name in popdb output")
        target = dict(naccess=row['naccess'], nusers=row['nusers'],
                      totcpu=row['totcpu'], rnaccess=row['rnaccess'],
                      rnusers=row['rnusers'], rtotcpu=row['rtotcpu'])
        # use a seed dataset to determine the headers of the dataframe
        rows = self.dataset_info(timeframe, seed, dtypes, stypes,
                                 rtypes, tiers, 'headers', target)
        headers = [r for r in rows][0]
        yield ','.join(headers)
    tstamp = time.strftime("%Y-%m-%d %H:%M:%S GMT", time.gmtime())
    if newdata: # request new datasets only
        if self.verbose:
            print("Generate dataframe for new datasets", tstamp)
        n_days = 7
        if timeframe:
            n_days = ndays(yyyymmdd(timeframe[0]), yyyymmdd(timeframe[1]))
        new_datasets = self.dbs.new_datasets(n_days)
        target = dict(naccess=0, nusers=0, totcpu=0,
                      rnaccess=0, rnusers=0, rtotcpu=0)
        for row in new_datasets:
            dataset = row['dataset']
            rows = self.dataset_info(timeframe, dataset, dtypes, stypes,
                                     rtypes, tiers, dformat, target)
            for row in rows:
                yield row
        return
    # loop over popular datasets in the given time frame
    popdb_datasets = {}
    for row in popdb_results:
        dataset = row['dataset']
        if not DATASET_PAT.match(dataset):
            continue
        if self.verbose:
            print("Generate dataframe for %s, timeframe: %s, %s"
                  % (dataset, timeframe, tstamp))
        target = dict(naccess=row['naccess'], nusers=row['nusers'],
                      totcpu=row['totcpu'], rnaccess=row['rnaccess'],
                      rnusers=row['rnusers'], rtotcpu=row['rtotcpu'])
        rows = self.dataset_info(timeframe, dataset, dtypes, stypes,
                                 rtypes, tiers, dformat, target)
        popdb_datasets[dataset] = row
        for row in rows:
            yield row
        pop_datasets += 1
    # get the list of datasets from DBS and discard those already
    # present in PopDB
    all_dbs_datasets = self.dbs.datasets()
    dbsdatasets = [d for d in all_dbs_datasets if d not in popdb_datasets]
    target = dict(naccess=0, nusers=0, totcpu=0,
                  rnaccess=0, rnusers=0, rtotcpu=0)
    for dataset in random.sample(dbsdatasets, dbs_extra):
        rows = self.dataset_info(timeframe, dataset, dtypes, stypes,
                                 rtypes, tiers, dformat, target)
        for row in rows:
            yield row
        dbs_datasets += 1
    if self.verbose:
        print("DBS datasets : %s" % dbs_datasets)
        print("PopDB datasets: %s out of %s" % (pop_datasets, len(popdb_results)))
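# Illustrative usage sketch (not part of the original code): assuming `mgr`
# is an instance of this class, the dataframe generator can be drained into
# a CSV file. The timeframe, seed dataset name, and dbs_extra value below
# are made-up examples:
#
#     with open('dataframe.csv', 'w') as ostream:
#         gen = mgr.dataframe(('20150101', '20150131'),
#                             seed='/PrimDS/ProcDS/TIER',
#                             dformat='csv', dbs_extra=1000)
#         for line in gen:
#             ostream.write(line + '\n')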
def dataset_info(self, timeframe, dataset, dtypes, stypes,
                 rtypes, tiers, dformat, target=0):
    "Return common dataset info in specified data format"
    dbsinst = self.dbs.dataset_dbsinst(dataset)
    if not dbsinst:
        return
    row = self.dbs.dataset_info(dataset, dbsinst)
    if row:
        if self.multitask:
            releases, sites, parents, summary, dashboard = \
                self.dataset_info_all(dataset, dbsinst, timeframe)
        else:
            releases = [rname for rname in
                        self.dbs.dataset_release_versions(dataset, dbsinst)]
            sites = [sname for sname in self.phedex.sites(dataset)]
            parents = [r for r in self.dbs.dataset_parents(dataset, dbsinst)]
            summary = self.dbs.dataset_summary(dataset, dbsinst)
            dashboard = self.dashboard.dataset_info(
                dataset, timeframe[0], timeframe[1])
        nrels = len(releases)
        # start from fresh zero counters so counts do not leak across datasets
        series = dict.fromkeys(rtypes['series'], 0)
        majors = dict.fromkeys(rtypes['majors'], 0)
        minors = dict.fromkeys(rtypes['minors'], 0)
        relclf = dict.fromkeys(rtypes['rtypes'], 0)
        for rel in releases:
            rserie, rmajor, rminor = rel_ver(rel)
            if not cmssw_test(rserie, rmajor, rminor):
                continue
            rtype = rel_type(rel)
            try:
                series['rel1_%s' % rserie] += 1
            except KeyError:
                pass
            try:
                majors['rel2_%s' % rmajor] += 1
            except KeyError:
                pass
            try:
                minors['rel3_%s' % rminor] += 1
            except KeyError:
                pass
            try:
                relclf['relt_%s' % rtype] += 1
            except KeyError:
                pass
        nsites = len(sites)
        for site in sites:
            stier = site_tier(site)
            stypes['s_%s' % stier] += 1
        dataset_id = row['rid']
        era = genkey(row['acquisition_era_name'], self.salt, 5)
        create_dn = self.sitedb.dnid(row['create_by'])
        dbsinstid = row['dbs_instance']
        dtype = row['primary_ds_type']
        # the number of data types is small, so a list look-up is cheap
        if dtype not in dtypes:
            dtypes.append(dtype)
        dtype = dtypes.index(dtype)
        _, prim, proc, tier = dataset.split('/')
        prim = genkey(prim, self.salt, 5)
        proc = genkey(proc, self.salt, 5)
        if tier not in tiers:
            tiers.append(tier)
        tier = genkey(tier, self.salt, 5)
        parent = parents[0] if len(parents) else 0
        uid = genuid(yyyymmdd(timeframe[0]), dbsinstid, dataset_id)
        size_norm = 2**30 # normalization factor for file size (bytes -> GiB)
        if not summary:
            summary = {} # we need a dict type
        rec = dict(id=uid, dataset=dataset_id, primds=prim, procds=proc,
                   tier=tier, dtype=dtype, creator=create_dn, nrel=nrels,
                   nsites=nsites, parent=parent, era=era, dbs=dbsinstid,
                   nfiles=summary.get('num_file', 0),
                   nlumis=summary.get('num_lumi', 0),
                   nblk=summary.get('num_block', 0),
                   nevt=summary.get('num_event', 0),
                   size=summary.get('file_size', 0)/size_norm,
                   cpu=dashboard.get('cpu', 0),
                   wct=dashboard.get('wct', 0),
                   proc_evts=dashboard.get('nevt', 0))
        if isinstance(target, dict):
            rec.update(target)
        rec.update(series)
        rec.update(majors)
        rec.update(minors)
        rec.update(relclf)
        rec.update(stypes)
        headers = sorted(rec.keys())
        headers.remove('id')
        headers = ['id'] + headers # let dataset id be the first column
        if dformat == 'headers':
            yield headers
        elif dformat == 'csv':
            res = [str(rec[h]) for h in headers]
            yield ','.join(res)
        elif dformat == 'vw':
            # Vowpal Wabbit input format: <label> '<tag> |f <features>
            target_str = target.get('rnaccess')
            vals = ' '.join([str(rec[h]) for h in headers])
            uid = genkey(vals, self.salt, 5) # unique row identifier
            vwrow = "%s '%s |f %s" % (target_str, uid, vals)
            yield vwrow
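# Illustrative, self-contained sketch (not part of the original code): the
# 'vw' branch above emits rows in Vowpal Wabbit input format,
#     <label> '<tag> |f <feature values>
# The toy record and helper below are hypothetical and only demonstrate how
# such a row is composed:
#
#     def vw_row(rec, label):
#         "Compose a VW row from a record dict, id column first"
#         headers = ['id'] + sorted(k for k in rec if k != 'id')
#         vals = ' '.join(str(rec[h]) for h in headers)
#         return "%s '%s |f %s" % (label, rec['id'], vals)
#
#     vw_row({'id': 1, 'naccess': 10, 'nusers': 2}, label=0.5)
#     # => "0.5 '1 |f 1 10 2"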