def execute(self):
    """Execute ConvertDataFrame2RooDataSet.

    Convert the pandas data frame stored under ``read_key`` in the DataStore
    into a RooDataSet.  The steps are:

    1. basic checks of the data frame, resolution of ``map_to_factorized``
       and ``read_key_vars`` from the DataStore, and selection of the
       columns to convert
    2. conversion of the selected columns with ``data_conversion.df_to_rds``
    3. storage of the dataset and its observables in the RooWorkspace
       (``into_ws``) or in the DataStore, optionally with a RooNDKeysPdf
       built from the dataset (``create_keys_pdf``)

    :returns: ``StatusCode.Success``
    :raises TypeError: if the object at ``read_key`` is not a pandas DataFrame
    :raises AssertionError: if a key, column or pattern is missing, or a
        retrieved object has the wrong type
    :raises RuntimeError: if the dataset cannot be put into the RooWorkspace
    """
    proc_mgr = ProcessManager()
    # NOTE(review): ``settings`` is not used below; the call is kept in case
    # requesting the service has side effects -- confirm before removing.
    settings = proc_mgr.service(ConfigObject)
    ds = proc_mgr.service(DataStore)
    ws = proc_mgr.service(RooFitManager).ws

    # 1a. basic checks on contents of the data frame
    assert self.read_key in ds, 'key %s not in DataStore' % self.read_key
    df = ds[self.read_key]
    if not isinstance(df, pd.DataFrame):
        raise TypeError(
            'retrieved object "%s" not of type pandas DataFrame' % self.read_key)
    assert len(df.index) > 0, 'dataframe "%s" is empty' % self.read_key

    # 1b. retrieve map_to_factorized from ds if it's a string
    if self.map_to_factorized:
        if isinstance(self.map_to_factorized, str):
            assert len(self.map_to_factorized), \
                'map_to_factorized needs to be a filled string'
            assert self.map_to_factorized in ds, \
                'map_to_factorized key "%s" not found in datastore' % self.map_to_factorized
            self.map_to_factorized = ds[self.map_to_factorized]
        assert isinstance(self.map_to_factorized, dict), \
            'map_to_factorized needs to be a dict'

    # 1c. retrieve read_key_vars rooargset from datastore
    if self.read_key_vars:
        assert isinstance(self.read_key_vars, str) and len(self.read_key_vars), \
            'read_key_vars should be a filled string'
        assert self.read_key_vars in ds, 'read_key_vars not in datastore'
        varset = ds[self.read_key_vars]
        assert isinstance(varset, ROOT.RooArgSet), 'read_key_vars is not a RooArgSet'
        self._varset = varset
    if self._varset:
        # varset overrules provided columns
        self.columns = [rv.GetName() for rv in self._varset]

    # 1d. check all columns
    if not self.columns:
        self.columns = df.columns.tolist()
    # match all columns/pattern in self.columns to df.columns
    matched_columns = []
    for c in self.columns:
        match_c = fnmatch.filter(df.columns, c)
        if not match_c:
            raise AssertionError(
                'column or pattern "%s" not in data frame "%s"' % (c, self.read_key))
        matched_columns += match_c
    self.columns = matched_columns

    # iterate over a copy, since columns are removed from self.columns in the loop
    for col in self.columns[:]:
        dt = df[col].dtype.type
        # keep categorical observables -- convert these to roocategories in conversion
        # NOTE(review): categorical columns therefore skip the ignore_columns
        # check below -- confirm this is intended.
        if issubclass(dt, pd.types.dtypes.CategoricalDtypeType):
            continue
        # reject all string-based columns
        if (dt is np.string_) or (dt is np.object_):
            self.log().warning('Skipping string-based column "%s"', col)
            self.columns.remove(col)
        # elif: a string-based column has already been removed above; removing
        # it a second time would raise ValueError
        elif col in self.ignore_columns:
            self.columns.remove(col)
    self.log().debug('Picking up columns: %s', self.columns)

    # 2. do conversion of df to roodataset
    #    self.map_to_factorized are categorical variables to be turned into roocategories
    rds, obs_vars, mtf, map_to_original = data_conversion.df_to_rds(
        df[self.columns],
        rf_varset=self._varset,
        category_vars=self.map_to_factorized,
        name=self.read_key,
        store_index=self.store_index)

    # 3a. remove original df?
    if self.rm_original:
        del ds[self.read_key]

    # 3b. put objects from the datastore into the workspace
    if self.into_ws:
        try:
            ws.put(rds, ROOT.RooFit.Rename(self.store_key))
            ws.defineSet(self.store_key_vars, obs_vars)
        except Exception as exc:
            # keep the underlying error attached as the cause
            raise RuntimeError(
                'could not import object "%s" into rooworkspace' % self.read_key) from exc
    # 3c. put objects into datastore
    else:
        ds[self.store_key_vars] = obs_vars
        ds[self.store_key] = rds

    # 3d. create pdf of dataset as well?
    if self.create_keys_pdf:
        if self.into_ws:
            # retrieve for consistency
            obs_vars = ws.set(self.store_key_vars)
        obs_list = ROOT.RooArgList(obs_vars)
        keys_name = self.create_keys_pdf
        keys_pdf = ROOT.RooNDKeysPdf(keys_name, keys_name, obs_list, rds, 'ma')
        ds[keys_name] = keys_pdf

    # 3e. store the mapping back to the original values and the dataset length
    ds[self.sk_map_to_original] = map_to_original
    n_rds = rds.numEntries()
    ds['n_' + self.store_key] = n_rds
    self.log().debug('Stored roodataset "%s" with length: %d', self.store_key, n_rds)

    return StatusCode.Success
def execute(self):
    """Execute RooDataHistFiller.

    Fill a roodatahist object with a pandas dataframe.  It is possible to
    fill the roodatahist iteratively, in a loop over dataframes.

    There are 5 steps to the code:

    1. basic checks of the dataframe
    2. convert the dataframe to a roodataset
    3. instantiate a roodatahist object
    4. fill the roodatahist object with the roodataset
    5. store the roodatahist.
       optionally, at the storage stage a pdf can be created of the
       roodatahist as well.

    :returns: ``StatusCode.Success``
    :raises RuntimeError: if the object at ``read_key`` is not a pandas DataFrame
    :raises AssertionError: if a key or column is missing, or a retrieved or
        cached object has the wrong type
    """
    proc_mgr = ProcessManager()
    ds = proc_mgr.service(DataStore)

    # 1a. basic checks on contents of the data frame
    assert self.read_key in ds, 'key "%s" not in DataStore' % self.read_key
    df = ds[self.read_key]
    if not isinstance(df, pd.DataFrame):
        raise RuntimeError(
            'retrieved object "%s" not of type pandas DataFrame' % self.read_key)
    assert len(df.index) > 0, 'dataframe "%s" is empty' % self.read_key

    # 1b. retrieve map_to_factorized from ds if it's a string
    if self.map_to_factorized:
        if isinstance(self.map_to_factorized, str):
            assert len(self.map_to_factorized), \
                'map_to_factorized needs to be a filled string'
            assert self.map_to_factorized in ds, \
                'map_to_factorized key "%s" not found in datastore' % self.map_to_factorized
            self.map_to_factorized = ds[self.map_to_factorized]
        assert isinstance(self.map_to_factorized, dict), \
            'map_to_factorized needs to be a dict'

    # 1c. varset, if already set, overrules provided columns
    if self._varset:
        assert isinstance(self._varset, ROOT.RooArgSet), 'varset is not a rooargset'
        self.columns = [rv.GetName() for rv in self._varset]

    # 1d. check all columns
    if not self.columns:
        self.columns = df.columns.tolist()
    # iterate over a copy, since columns are removed from self.columns in the loop
    for col in self.columns[:]:
        assert col in df.columns, 'column "%s" not in dataframe "%s"' % (
            col, self.read_key)
        dt = df[col].dtype.type
        # keep categorical observables -- convert these to roocategories in conversion to tree
        # NOTE(review): categorical columns therefore skip the ignore_columns
        # check below -- confirm this is intended.
        if issubclass(dt, pd.types.dtypes.CategoricalDtypeType):
            continue
        # reject all string-based columns
        if (dt is np.string_) or (dt is np.object_):
            self.log().warning('Skipping string-based column "%s"', col)
            self.columns.remove(col)
        # elif: a string-based column has already been removed above; removing
        # it a second time would raise ValueError
        elif col in self.ignore_columns:
            self.columns.remove(col)
    self.log().debug('Picking up columns: %s', self.columns)

    # 2. do conversion of df to roodataset, pass this to roodatahist below.
    #    self.map_to_factorized are categorical variables to be turned into roocategories
    rds, obs, mtf, map_to_original = data_conversion.df_to_rds(
        df[self.columns],
        rf_varset=self._varset,
        category_vars=self.map_to_factorized,
        name=self.read_key)

    # 3a. determine max number of bin for continuous observables
    #     (do this at first iteration only.)
    n_max_bins = int(self.n_max_total_bins)
    if not self._varset:
        n_total_bins_in_categories = 1
        for mto in map_to_original.values():
            n_total_bins_in_categories *= len(mto)
        n_total_bins_in_vars = self.n_max_total_bins / n_total_bins_in_categories
        n_vars = len(self.columns) - len(map_to_original)
        assert n_total_bins_in_vars >= 0, 'total number of bins in vars is negative'
        assert n_vars >= 0, 'number of roorealvars is negative'
        if n_vars >= 1:
            n_max_bins = int(math.pow(n_total_bins_in_vars, 1 / n_vars))
            # math.pow works in floating point, so an exact root can come out
            # just below the integer (e.g. 1000 ** (1 / 3) == 9.999...) and be
            # truncated one too low; step up when the next integer still fits
            if (n_max_bins + 1) ** n_vars <= n_total_bins_in_vars:
                n_max_bins += 1
            if n_max_bins < 1:
                n_max_bins = 1
            elif n_max_bins > int(self.n_max_total_bins):
                n_max_bins = int(self.n_max_total_bins)
            self.log().debug('Max number of variable bins set to: %d', n_max_bins)

    # 3b. instantiate roodatahist, to be filled up below.
    #     secondly, fix the roofit variable set
    if not self._varset:
        self._varset = obs
        self._catset = ROOT.RooArgSet()
        # update variable range and number of bins
        for rv in self._varset:
            if isinstance(rv, ROOT.RooCategory):
                self._catset.add(rv)
                continue
            if not isinstance(rv, ROOT.RooRealVar):
                continue
            name = rv.GetName()
            if name in self.var_number_of_bins:
                n_bins = self.var_number_of_bins[name]
            else:
                n_bins = N_BINS_DEFAULT
            if n_bins > n_max_bins:
                n_bins = n_max_bins
                self.log().info('Capping n_bins of column "%s" to: %d', name, n_max_bins)
            rv.setBins(n_bins)
            if name in self.var_min_value:
                min_val = self.var_min_value[name]
                rv.setMin(min_val)
            if name in self.var_max_value:
                max_val = self.var_max_value[name]
                rv.setMax(max_val)
    else:
        assert isinstance(self._varset, ROOT.RooArgSet) and len(self._varset), \
            'varset is not a filled rooargset'

    if not self._rdh:
        name = str(rds.GetName()).replace('rds_', 'rdh_')
        self._rdh = ROOT.RooDataHist(name, name, self._varset)
    else:
        assert isinstance(self._rdh, ROOT.RooDataHist)

    # 4. fill the roodatahist with the roodataset
    try:
        self._rdh.add(rds)
        del rds
        # only record the mapping to original values if none is stored yet
        if not self._mto:
            self._mto.update(map_to_original)
    except Exception:
        self.log().critical('Could not fill roodatahist object with roodataset')
        raise

    # 5. storage of roodatahist and its variables
    if not self.store_at_finalize:
        self.do_storage()

    return StatusCode.Success