def dataframe(self, timeframe, seed, dformat, dbs_extra, newdata=None):
    """Form a dataframe from various CMS data-providers.

    This is a generator. Depending on the arguments it yields, in order:

    * for ``dformat == 'csv'``: one comma-joined header line, obtained by
      asking ``self.dataset_info`` for the ``'headers'`` of ``seed``;
    * if ``newdata`` is truthy: the rows of every dataset returned by
      ``self.dbs.new_datasets`` (with all popularity counters set to 0),
      after which the generator stops;
    * otherwise: the rows of every PopDB dataset whose name matches
      ``DATASET_PAT``, followed by the rows of up to ``dbs_extra``
      randomly chosen DBS datasets that were not seen in PopDB (counters
      set to 0).

    :param timeframe: indexable pair; items 0 and 1 are passed to
        ``self.popdb.dataset_stat`` and the whole object to
        ``self.dataset_info``
    :param seed: dataset handed to ``self.dataset_info`` to obtain headers
    :param dformat: output format forwarded to ``self.dataset_info``;
        ``'csv'`` additionally triggers the header line
    :param dbs_extra: number of extra DBS datasets to sample; values larger
        than the number of available datasets are clamped
    :param newdata: when truthy, only new DBS datasets are processed
    :raises Exception: in csv mode, when no PopDB row has a dataset name
        made of four '/'-separated parts
    """
    counters = ('naccess', 'nusers', 'totcpu',
                'rnaccess', 'rnusers', 'rtotcpu')
    dtypes, stypes, rtypes, tiers = self.data_types()
    pop_datasets = 0
    dbs_datasets = 0
    popdb_results = list(
        self.popdb.dataset_stat(timeframe[0], timeframe[1]))
    if dformat == 'csv':
        # pick the first PopDB row whose dataset name has 3 slashes
        header_row = None
        for row in popdb_results:
            if len(row['dataset'].split('/')) == 4:
                header_row = row
                break
        if header_row is None:
            raise Exception(
                "Unable to find valid dataset name in popdb output")
        target = dict((key, header_row[key]) for key in counters)
        # seed dataset to determine headers of the dataframe
        rows = self.dataset_info(timeframe, seed, dtypes, stypes,
                                 rtypes, tiers, 'headers', target)
        headers = list(rows)[0]
        yield ','.join(headers)
    tstamp = time.strftime("%Y-%m-%d %H:%M:%S GMT", time.gmtime())
    if newdata:  # request new dataset
        if self.verbose:
            print("Generate dataframe for new datasets", tstamp)
        n_days = 7
        if timeframe:
            n_days = ndays(yyyymmdd(timeframe[0]), yyyymmdd(timeframe[1]))
        new_datasets = self.dbs.new_datasets(n_days)
        target = dict.fromkeys(counters, 0)
        for entry in new_datasets:
            rows = self.dataset_info(timeframe, entry['dataset'], dtypes,
                                     stypes, rtypes, tiers, dformat, target)
            for record in rows:
                yield record
        return
    # get list of popular datasets in certain time frame
    popdb_datasets = {}
    for row in popdb_results:
        dataset = row['dataset']
        if not DATASET_PAT.match(dataset):
            continue
        if self.verbose:
            print("Generate dataframe for %s, timeframe: %s, %s"
                  % (dataset, timeframe, tstamp))
        target = dict((key, row[key]) for key in counters)
        rows = self.dataset_info(timeframe, dataset, dtypes, stypes,
                                 rtypes, tiers, dformat, target)
        popdb_datasets[dataset] = row
        # distinct name: do not rebind the PopDB ``row`` while streaming
        for record in rows:
            yield record
        pop_datasets += 1
    # get list of datasets from DBS and discard from this list
    # those which were present in popdb
    dbsdatasets = [d for d in self.dbs.datasets()
                   if d not in popdb_datasets]
    target = dict.fromkeys(counters, 0)
    # random.sample raises ValueError when asked for more items than exist,
    # so never request more than the population size
    nsample = min(dbs_extra, len(dbsdatasets))
    for dataset in random.sample(dbsdatasets, nsample):
        rows = self.dataset_info(timeframe, dataset, dtypes, stypes,
                                 rtypes, tiers, dformat, target)
        for record in rows:
            yield record
        dbs_datasets += 1
    if self.verbose:
        print("DBS datasets : %s" % dbs_datasets)
        print("PopDB datasets: %s out of %s"
              % (pop_datasets, len(popdb_results)))
def dataframe(self, timeframe, seed, dformat, dbs_extra, newdata=None):
    """Form a dataframe from various CMS data-providers.

    This is a generator. Depending on the arguments it yields, in order:

    * for ``dformat == 'csv'``: one comma-joined header line, obtained by
      asking ``self.dataset_info`` for the ``'headers'`` of ``seed``;
    * if ``newdata`` is truthy: the rows of every dataset returned by
      ``self.dbs.new_datasets`` (with all popularity counters set to 0),
      after which the generator stops;
    * otherwise: the rows of every PopDB dataset whose name matches
      ``DATASET_PAT``, followed by the rows of up to ``dbs_extra``
      randomly chosen DBS datasets that were not seen in PopDB (counters
      set to 0).

    :param timeframe: indexable pair; items 0 and 1 are passed to
        ``self.popdb.dataset_stat`` and the whole object to
        ``self.dataset_info``
    :param seed: dataset handed to ``self.dataset_info`` to obtain headers
    :param dformat: output format forwarded to ``self.dataset_info``;
        ``'csv'`` additionally triggers the header line
    :param dbs_extra: number of extra DBS datasets to sample; values larger
        than the number of available datasets are clamped
    :param newdata: when truthy, only new DBS datasets are processed
    """
    counters = ('naccess', 'nusers', 'totcpu',
                'rnaccess', 'rnusers', 'rtotcpu')
    dtypes, stypes, rtypes, tiers = self.data_types()
    pop_datasets = 0
    dbs_datasets = 0
    popdb_results = list(
        self.popdb.dataset_stat(timeframe[0], timeframe[1]))
    if dformat == 'csv':
        if popdb_results:
            first = popdb_results[0]
            target = dict((key, first[key]) for key in counters)
        else:
            # An empty PopDB answer used to raise IndexError here; fall
            # back to zero counters so the header can still be built.
            # NOTE(review): assumes the 'headers' output does not depend
            # on the counter values -- confirm against dataset_info.
            target = dict.fromkeys(counters, 0)
        # seed dataset to determine headers of the dataframe
        rows = self.dataset_info(timeframe, seed, dtypes, stypes,
                                 rtypes, tiers, 'headers', target)
        headers = list(rows)[0]
        yield ','.join(headers)
    tstamp = time.strftime("%Y-%m-%d %H:%M:%S GMT", time.gmtime())
    if newdata:  # request new dataset
        if self.verbose:
            print("Generate dataframe for new datasets", tstamp)
        n_days = 7
        if timeframe:
            n_days = ndays(yyyymmdd(timeframe[0]), yyyymmdd(timeframe[1]))
        new_datasets = self.dbs.new_datasets(n_days)
        target = dict.fromkeys(counters, 0)
        for entry in new_datasets:
            rows = self.dataset_info(timeframe, entry['dataset'], dtypes,
                                     stypes, rtypes, tiers, dformat, target)
            for record in rows:
                yield record
        return
    # get list of popular datasets in certain time frame
    popdb_datasets = {}
    for row in popdb_results:
        dataset = row['dataset']
        if not DATASET_PAT.match(dataset):
            continue
        if self.verbose:
            print("Generate dataframe for %s, timeframe: %s, %s"
                  % (dataset, timeframe, tstamp))
        target = dict((key, row[key]) for key in counters)
        rows = self.dataset_info(timeframe, dataset, dtypes, stypes,
                                 rtypes, tiers, dformat, target)
        popdb_datasets[dataset] = row
        # distinct name: do not rebind the PopDB ``row`` while streaming
        for record in rows:
            yield record
        pop_datasets += 1
    # get list of datasets from DBS and discard from this list
    # those which were present in popdb
    dbsdatasets = [d for d in self.dbs.datasets()
                   if d not in popdb_datasets]
    target = dict.fromkeys(counters, 0)
    # random.sample raises ValueError when asked for more items than exist,
    # so never request more than the population size
    nsample = min(dbs_extra, len(dbsdatasets))
    for dataset in random.sample(dbsdatasets, nsample):
        rows = self.dataset_info(timeframe, dataset, dtypes, stypes,
                                 rtypes, tiers, dformat, target)
        for record in rows:
            yield record
        dbs_datasets += 1
    if self.verbose:
        print("DBS datasets : %s" % dbs_datasets)
        print("PopDB datasets: %s out of %s"
              % (pop_datasets, len(popdb_results)))
def test_ndays(self):
    """Check the value ndays returns for a fixed pair of date strings."""
    # the two inputs look like YYYYMMDD stamps four days apart
    start, stop = '20141120', '20141124'
    self.assertEqual(4, ndays(start, stop))