コード例 #1
0
    def execute(self):
        """Execute ConvertDataFrame2RooDataSet

        Convert a pandas data frame from the datastore into a roodataset.

        There are 3 steps to the code:

        1. basic checks of the data frame, the category map, the optional
           variable set and the requested columns
        2. convert the selected columns of the data frame to a roodataset
        3. store the roodataset and its observables in the rooworkspace
           (``into_ws``) or in the datastore; optionally delete the original
           data frame (``rm_original``) and create a RooNDKeysPdf of the
           dataset (``create_keys_pdf``)

        :returns: StatusCode.Success
        :raises AssertionError: if a key is missing from the datastore, the
            data frame is empty, a setting has the wrong type, or a requested
            column/pattern matches no column of the data frame
        :raises TypeError: if the object at ``read_key`` is not a pandas DataFrame
        :raises RuntimeError: if the roodataset cannot be put into the rooworkspace
        """

        proc_mgr = ProcessManager()
        # NOTE(review): settings is retrieved but not used below.
        settings = proc_mgr.service(ConfigObject)
        ds = proc_mgr.service(DataStore)
        ws = proc_mgr.service(RooFitManager).ws

        # 1a. basic checks on contents of the data frame
        assert self.read_key in list(
            ds.keys()), 'key %s not in DataStore' % self.read_key
        df = ds[self.read_key]
        if not isinstance(df, pd.DataFrame):
            raise TypeError(
                'retrieved object "%s" not of type pandas DataFrame' %
                self.read_key)
        assert len(df.index) > 0, 'dataframe "%s" is empty' % self.read_key

        # 1b. retrieve map_to_factorized from ds if it's a string
        if self.map_to_factorized:
            if isinstance(self.map_to_factorized, str):
                assert len(self.map_to_factorized
                           ), 'map_to_factorized needs to be a filled string'
                # the message is now actually formatted with the missing key
                assert self.map_to_factorized in ds, \
                    'map_to_factorized key "%s" not found in datastore' % \
                    self.map_to_factorized
                self.map_to_factorized = ds[self.map_to_factorized]
            assert isinstance(self.map_to_factorized,
                              dict), 'map_to_factorized needs to be a dict'

        # 1c. retrieve read_key_vars rooargset from datastore
        if self.read_key_vars:
            assert isinstance(self.read_key_vars, str) and len(self.read_key_vars), \
                'read_key_vars should be a filled string'
            assert self.read_key_vars in ds, 'read_key_vars not in datastore'
            varset = ds[self.read_key_vars]
            assert isinstance(
                varset, ROOT.RooArgSet), 'read_key_vars is not a RooArgSet'
            self._varset = varset
        if self._varset:
            # varset overrules provided columns
            self.columns = [rv.GetName() for rv in self._varset]

        # 1d. check all columns
        if not self.columns:
            self.columns = df.columns.tolist()
        # match all columns/pattern in self.columns to df.columns
        matched_columns = []
        for c in self.columns:
            match_c = fnmatch.filter(df.columns, c)
            if not match_c:
                # two placeholders for two arguments; with a single "%s" the
                # formatting itself raised a TypeError instead of this error
                raise AssertionError(
                    'column or pattern "%s" not in data frame "%s"' %
                    (c, self.read_key))
            matched_columns += match_c
        # Build the list of accepted columns instead of removing entries from
        # the list being inspected: a string-based column that is also listed
        # in ignore_columns used to be removed twice, raising a ValueError.
        # NOTE(review): pd.types.dtypes and np.string_ are not available in
        # recent pandas / NumPy releases -- confirm the supported versions.
        kept_columns = []
        for col in matched_columns:
            dt = df[col].dtype.type
            # keep categorical observables -- convert these to roocategories in conversion
            # NOTE(review): categorical columns bypass the ignore_columns check,
            # as in the original code -- confirm this is intended.
            if issubclass(dt, pd.types.dtypes.CategoricalDtypeType):
                kept_columns.append(col)
                continue
            # reject all string-based columns
            if (dt is np.string_) or (dt is np.object_):
                self.log().warning('Skipping string-based column "%s"', col)
                continue
            if col in self.ignore_columns:
                continue
            kept_columns.append(col)
        self.columns = kept_columns
        self.log().debug('Picking up columns: %s', self.columns)

        # 2. do conversion of df to roodataset
        #    self.map_to_factorized are categorical variables to be turned into roocategories
        #    (the third returned item is not used here)
        rds, obs_vars, _, map_to_original = data_conversion.df_to_rds(
            df[self.columns],
            rf_varset=self._varset,
            category_vars=self.map_to_factorized,
            name=self.read_key,
            store_index=self.store_index)

        # 3a. remove original df?
        if self.rm_original:
            del ds[self.read_key]

        # 3b. put objects from the datastore into the workspace
        if self.into_ws:
            try:
                ws.put(rds, ROOT.RooFit.Rename(self.store_key))
                ws.defineSet(self.store_key_vars, obs_vars)
            except Exception:
                # only regular errors are converted; KeyboardInterrupt and
                # SystemExit are no longer swallowed by a bare except
                raise RuntimeError(
                    'could not import object "%s" into rooworkspace' %
                    self.read_key)
        # 3c. put objects into datastore
        else:
            ds[self.store_key_vars] = obs_vars
            ds[self.store_key] = rds

        # 3d. create pdf of dataset as well?
        if self.create_keys_pdf:
            if self.into_ws:
                # retrieve for consistency
                obs_vars = ws.set(self.store_key_vars)
            obs_list = ROOT.RooArgList(obs_vars)
            keys_name = self.create_keys_pdf
            keys_pdf = ROOT.RooNDKeysPdf(keys_name, keys_name, obs_list, rds,
                                         'ma')
            ds[keys_name] = keys_pdf

        # 3e. store the category map and the number of entries in the datastore
        ds[self.sk_map_to_original] = map_to_original
        n_rds = rds.numEntries()
        ds['n_' + self.store_key] = n_rds
        self.log().debug('Stored roodataset "%s" with length: %d',
                         self.store_key, n_rds)

        return StatusCode.Success
コード例 #2
0
    def execute(self):
        """Execute RooDataHistFiller

        Fill a roodatahist object with a pandas dataframe.  It is possible to
        fill the roodatahist iteratively, in a loop over dataframes.

        There are 5 steps to the code:

        1. basic checks of the dataframe
        2. convert the dataframe to a roodataset
        3. instantiate a roodatahist object
        4. fill the roodatahist object with the roodataset
        5. store the roodatahist.
           optionally, at the storage stage a pdf can be created of the roodatahist as well.

        :returns: StatusCode.Success
        :raises AssertionError: if a key or column is missing, the dataframe is
            empty, or a setting / cached object has the wrong type
        :raises RuntimeError: if the object at ``read_key`` is not a pandas DataFrame
        """

        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)

        # 1a. basic checks on contents of the data frame
        assert self.read_key in list(
            ds.keys()), 'key "%s" not in DataStore' % self.read_key
        df = ds[self.read_key]
        if not isinstance(df, pd.DataFrame):
            raise RuntimeError(
                'retrieved object "%s" not of type pandas DataFrame' %
                self.read_key)
        assert len(df.index) > 0, 'dataframe "%s" is empty' % self.read_key

        # 1b. retrieve map_to_factorized from ds if it's a string
        if self.map_to_factorized:
            if isinstance(self.map_to_factorized, str):
                assert len(self.map_to_factorized
                           ), 'map_to_factorized needs to be a filled string'
                # the message is now actually formatted with the missing key
                assert self.map_to_factorized in ds, \
                    'map_to_factorized key "%s" not found in datastore' % \
                    self.map_to_factorized
                self.map_to_factorized = ds[self.map_to_factorized]
            assert isinstance(self.map_to_factorized,
                              dict), 'map_to_factorized needs to be a dict'

        # 1c. varset, if already set, overrules provided columns
        if self._varset:
            assert isinstance(self._varset,
                              ROOT.RooArgSet), 'varset is not a rooargset'
            self.columns = [rv.GetName() for rv in self._varset]

        # 1d. check all columns
        if not self.columns:
            self.columns = df.columns.tolist()
        # Build the list of accepted columns instead of removing entries from
        # the list being inspected: a string-based column that is also listed
        # in ignore_columns used to be removed twice, raising a ValueError.
        # NOTE(review): pd.types.dtypes and np.string_ are not available in
        # recent pandas / NumPy releases -- confirm the supported versions.
        kept_columns = []
        for col in self.columns:
            assert col in df.columns, 'column "%s" not in dataframe "%s"' % (
                col, self.read_key)
            dt = df[col].dtype.type
            # keep categorical observables -- convert these to roocategories in conversion to tree
            # NOTE(review): categorical columns bypass the ignore_columns check,
            # as in the original code -- confirm this is intended.
            if issubclass(dt, pd.types.dtypes.CategoricalDtypeType):
                kept_columns.append(col)
                continue
            # reject all string-based columns
            if (dt is np.string_) or (dt is np.object_):
                self.log().warning('Skipping string-based column "%s"', col)
                continue
            if col in self.ignore_columns:
                continue
            kept_columns.append(col)
        self.columns = kept_columns
        self.log().debug('Picking up columns: %s', self.columns)

        # 2. do conversion of df to roodataset, pass this to roodatahist below.
        #    self.map_to_factorized are categorical variables to be turned into roocategories
        #    (the third returned item is not used here)
        rds, obs, _, map_to_original = data_conversion.df_to_rds(
            df[self.columns],
            rf_varset=self._varset,
            category_vars=self.map_to_factorized,
            name=self.read_key)

        # 3a. determine max number of bin for continuous observables
        #     (do this at first iteration only.)
        n_max_bins = int(self.n_max_total_bins)
        if not self._varset:
            # the bin budget left for continuous variables is the total budget
            # divided by the number of category-state combinations
            n_total_bins_in_categories = 1
            for mto in map_to_original.values():
                n_total_bins_in_categories *= len(mto)
            n_total_bins_in_vars = self.n_max_total_bins / n_total_bins_in_categories
            n_vars = len(self.columns) - len(map_to_original)
            assert n_total_bins_in_vars >= 0, 'total number of bins in vars is negative'
            assert n_vars >= 0, 'number of roorealvars is negative'
            if n_vars >= 1:
                # n-th root of the budget; 1.0 forces true division so the
                # exponent is not truncated to 0 under integer division
                n_max_bins = int(math.pow(n_total_bins_in_vars, 1.0 / n_vars))
                if n_max_bins < 1:
                    n_max_bins = 1
                elif n_max_bins > int(self.n_max_total_bins):
                    n_max_bins = int(self.n_max_total_bins)
                self.log().debug('Max number of variable bins set to: %d',
                                 n_max_bins)

        # 3b. instantiate roodatahist, to be filled up below.
        #     secondly, fix the roofit variable set
        if not self._varset:
            self._varset = obs
            self._catset = ROOT.RooArgSet()
            # update variable range and number of bins
            for rv in self._varset:
                if isinstance(rv, ROOT.RooCategory):
                    self._catset.add(rv)
                    continue
                if not isinstance(rv, ROOT.RooRealVar):
                    continue
                name = rv.GetName()
                if name in self.var_number_of_bins:
                    n_bins = self.var_number_of_bins[name]
                else:
                    n_bins = N_BINS_DEFAULT
                if n_bins > n_max_bins:
                    n_bins = n_max_bins
                    self.log().info('Capping n_bins of column "%s" to: %d',
                                    name, n_max_bins)
                rv.setBins(n_bins)
                if name in self.var_min_value:
                    min_val = self.var_min_value[name]
                    rv.setMin(min_val)
                if name in self.var_max_value:
                    max_val = self.var_max_value[name]
                    rv.setMax(max_val)
        else:
            assert isinstance(self._varset, ROOT.RooArgSet) and len(
                self._varset), 'varset is not a filled rooargset'
        if not self._rdh:
            name = str(rds.GetName()).replace('rds_', 'rdh_')
            self._rdh = ROOT.RooDataHist(name, name, self._varset)
        else:
            assert isinstance(self._rdh, ROOT.RooDataHist)

        # 4. fill the roodatahist with the roodataset
        try:
            self._rdh.add(rds)
            del rds
            # keep the category map of the first filled dataset only
            if not self._mto:
                self._mto.update(map_to_original)
        except Exception:
            self.log().critical(
                'Could not fill roodatahist object with roodataset')
            # bare raise re-raises the active exception with its original traceback
            raise

        # 5. storage of roodatahist and its variables
        if not self.store_at_finalize:
            self.do_storage()

        return StatusCode.Success