Beispiel #1
0
    def dataframe(self, timeframe, seed, dformat, dbs_extra, newdata=None):
        """Form a dataframe from various CMS data-providers"""
        dtypes, stypes, rtypes, tiers = self.data_types()
        pop_datasets = 0
        dbs_datasets = 0
        popdb_results = [
            r for r in self.popdb.dataset_stat(timeframe[0], timeframe[1])
        ]
        if dformat == 'csv':
            dataset = None
            for row in popdb_results:
                if len(row['dataset'].split(
                        '/')) == 4:  # dataset with 3 slashes
                    dataset = row['dataset']
                    break
            if not dataset:
                raise Exception(
                    "Unable to find valid dataset name in popdb output")
            target = dict(naccess=row['naccess'],
                          nusers=row['nusers'],
                          totcpu=row['totcpu'],
                          rnaccess=row['rnaccess'],
                          rnusers=row['rnusers'],
                          rtotcpu=row['rtotcpu'])
            # seed dataset to determine headers of the dataframe
            rows = self.dataset_info(timeframe, seed, dtypes, stypes, rtypes,
                                     tiers, 'headers', target)
            headers = [r for r in rows][0]
            yield ','.join(headers)
        tstamp = time.strftime("%Y-%m-%d %H:%M:%S GMT", time.gmtime())
        if newdata:  # request new dataset
            if self.verbose:
                print("Generate dataframe for new datasets", tstamp)
            n_days = 7
            if timeframe:
                n_days = ndays(yyyymmdd(timeframe[0]), yyyymmdd(timeframe[1]))
            new_datasets = self.dbs.new_datasets(n_days)
            target = dict(naccess=0,
                          nusers=0,
                          totcpu=0,
                          rnaccess=0,
                          rnusers=0,
                          rtotcpu=0)
            for row in new_datasets:
                dataset = row['dataset']
                rows = self.dataset_info(timeframe, dataset, dtypes, stypes, \
                        rtypes, tiers, dformat, target)
                for row in rows:
                    yield row
            return
        # get list of popular datasets in certain time frame
#        popdb_results = [r for r in self.popdb.dataset_stat(timeframe[0], timeframe[1])]
        popdb_datasets = {}  #
        for row in popdb_results:
            dataset = row['dataset']
            if not DATASET_PAT.match(dataset):
                continue
            if self.verbose:
                print("Generate dataframe for %s, timeframe: %s, %s" \
                        % (dataset, timeframe, tstamp))
            target = dict(naccess=row['naccess'],
                          nusers=row['nusers'],
                          totcpu=row['totcpu'],
                          rnaccess=row['rnaccess'],
                          rnusers=row['rnusers'],
                          rtotcpu=row['rtotcpu'])
            rows = self.dataset_info(timeframe, dataset, dtypes, stypes, \
                    rtypes, tiers, dformat, target)
            popdb_datasets[dataset] = row
            for row in rows:
                yield row
                pop_datasets += 1

        # get list of datasets from DBS and discard from this list
        # those who were presented in popdb
        all_dbs_datasets = self.dbs.datasets()
        dbsdatasets = [
            d for d in all_dbs_datasets if d not in popdb_datasets.keys()
        ]
        target = dict(naccess=0,
                      nusers=0,
                      totcpu=0,
                      rnaccess=0,
                      rnusers=0,
                      rtotcpu=0)
        for dataset in random.sample(dbsdatasets, dbs_extra):
            rows = self.dataset_info(timeframe, dataset, dtypes, stypes, \
                    rtypes, tiers, dformat, target)
            for row in rows:
                yield row
                dbs_datasets += 1
        if self.verbose:
            print("DBS datasets  : %s" % dbs_datasets)
            print("PopDB datasets: %s out of %s" %
                  (pop_datasets, len(popdb_results)))
Beispiel #2
0
    def dataframe(self, timeframe, seed, dformat, dbs_extra, newdata=None):
        """Form a dataframe from various CMS data-providers"""
        dtypes, stypes, rtypes, tiers = self.data_types()
        pop_datasets = 0
        dbs_datasets = 0
        popdb_results = [r for r in self.popdb.dataset_stat(timeframe[0], timeframe[1])]
        if  dformat == 'csv':
            row = popdb_results[0]
            dataset = row['dataset']
            target = dict(naccess=row['naccess'],nusers=row['nusers'],totcpu=row['totcpu'],
                    rnaccess=row['rnaccess'],rnusers=row['rnusers'],rtotcpu=row['rtotcpu'])
            # seed dataset to determine headers of the dataframe
            rows = self.dataset_info(timeframe, seed, dtypes, stypes, rtypes,
                    tiers, 'headers', target)
            headers = [r for r in rows][0]
            yield ','.join(headers)
        tstamp = time.strftime("%Y-%m-%d %H:%M:%S GMT", time.gmtime())
        if  newdata: # request new dataset
            if  self.verbose:
                print("Generate dataframe for new datasets", tstamp)
            n_days = 7
            if  timeframe:
                n_days = ndays(yyyymmdd(timeframe[0]), yyyymmdd(timeframe[1]))
            new_datasets = self.dbs.new_datasets(n_days)
            target = dict(naccess=0,nusers=0,totcpu=0,
                    rnaccess=0,rnusers=0,rtotcpu=0)
            for row in new_datasets:
                dataset = row['dataset']
                rows = self.dataset_info(timeframe, dataset, dtypes, stypes, \
                        rtypes, tiers, dformat, target)
                for row in rows:
                    yield row
            return
        # get list of popular datasets in certain time frame
#        popdb_results = [r for r in self.popdb.dataset_stat(timeframe[0], timeframe[1])]
        popdb_datasets = {} #
        for row in popdb_results:
            dataset = row['dataset']
            if  not DATASET_PAT.match(dataset):
                continue
            if  self.verbose:
                print("Generate dataframe for %s, timeframe: %s, %s" \
                        % (dataset, timeframe, tstamp))
            target = dict(naccess=row['naccess'],nusers=row['nusers'],totcpu=row['totcpu'],
                    rnaccess=row['rnaccess'],rnusers=row['rnusers'],rtotcpu=row['rtotcpu'])
            rows = self.dataset_info(timeframe, dataset, dtypes, stypes, \
                    rtypes, tiers, dformat, target)
            popdb_datasets[dataset] = row
            for row in rows:
                yield row
                pop_datasets += 1

        # get list of datasets from DBS and discard from this list
        # those who were presented in popdb
        all_dbs_datasets = self.dbs.datasets()
        dbsdatasets = [d for d in all_dbs_datasets if d not in popdb_datasets.keys()]
        target = dict(naccess=0,nusers=0,totcpu=0,
                rnaccess=0,rnusers=0,rtotcpu=0)
        for dataset in random.sample(dbsdatasets, dbs_extra):
            rows = self.dataset_info(timeframe, dataset, dtypes, stypes, \
                    rtypes, tiers, dformat, target)
            for row in rows:
                yield row
                dbs_datasets += 1
        if  self.verbose:
            print("DBS datasets  : %s" % dbs_datasets)
            print("PopDB datasets: %s out of %s" % (pop_datasets, len(popdb_results)))
Beispiel #3
0
 def test_ndays(self):
     "Test ndays function"
     time1, time2 = '20141120', '20141124'
     result = ndays(time1, time2)
     expect = 4
     self.assertEqual(expect, result)