Example #1

import logging

log = logging.getLogger('macro.esk204_apply_query_to_pandas_df')

from eskapade import ConfigObject, ProcessManager, DataStore
from eskapade import core_ops, analysis

log.debug('Now parsing configuration file esk204_apply_query_to_pandas_df')

#########################################################################################
# --- minimal analysis information

settings = ProcessManager().service(ConfigObject)
settings['analysisName'] = 'esk204_apply_query_to_pandas_df'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

# generate a dummy dataframe and add it to the datastore;
# the selections below are applied to this dataset during link execution.

# NB: normally such a dataframe would be read or constructed on the fly
# during link execution.

from numpy.random import randn
from pandas import DataFrame
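
# a minimal sketch of what the comment above describes: build a dummy dataframe
# and register it in the datastore (the key 'incoming_records' and the column
# names are illustrative assumptions, not taken from the original macro)
df = DataFrame(randn(100, 3), columns=['var_a', 'var_b', 'var_c'])
ds = ProcessManager().service(DataStore)
ds['incoming_records'] = df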

import logging
log = logging.getLogger('macro.esk109_debugging_tips')

from eskapade import ConfigObject, ProcessManager, DataStore
from eskapade import core_ops

log.debug('Now parsing configuration file esk109_debugging_tips')

#########################################################################################
# --- minimal analysis information

proc_mgr = ProcessManager()

settings = proc_mgr.service(ConfigObject)
settings['analysisName'] = 'esk109_debugging_tips'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

msg = r"""

To end the run_eskapade.py session with an interactive ipython shell,
from the cmd line use this flag: -i
"""
log.info(msg)
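
# for example (illustrative invocation; the macro path is an assumption, not part
# of this file):
#
#   run_eskapade.py -i esk109_debugging_tips.py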

import logging
import os
log = logging.getLogger('macro.esk410_testing_correlations_between_categories')

from eskapade import ConfigObject, ProcessManager
from eskapade import core_ops, analysis, root_analysis, visualization

log.debug(
    'Now parsing configuration file esk410_testing_correlations_between_categories'
)

#########################################################################################
# --- minimal analysis information

proc_mgr = ProcessManager()

settings = proc_mgr.service(ConfigObject)
settings['analysisName'] = 'esk410_testing_correlations_between_categories'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

input_files = [os.environ['ESKAPADE'] + '/data/mock_accounts.csv.gz']

#########################################################################################
# --- now set up the chains and links based on configuration flags

ch = proc_mgr.add_chain('Data')
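
# a minimal sketch of attaching a csv-reading link to the chain created above;
# the ReadToDf keyword arguments follow the usual eskapade macro pattern and are
# assumptions, not copied from the original macro
read_data = analysis.ReadToDf(name='read_accounts', key='accounts', reader='csv',
                              path=input_files)
ch.add_link(read_data)
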
Example #4
# *
# * Authors:                                                                       *
# *      KPMG Big Data team, Amstelveen, The Netherlands
# *                                                                                *
# * Redistribution and use in source and binary forms, with or without             *
# * modification, are permitted according to the terms listed in the file          *
# * LICENSE.                                                                       *
# **********************************************************************************

from eskapade import ConfigObject, ProcessManager
from eskapade import core_ops
import tempfile

#########################################################################################
# --- minimal analysis information
settings = ProcessManager().service(ConfigObject)
settings['analysisName'] = 'esk108_eventlooper'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['do_map'] = False if 'do_map' not in settings else settings['do_map']
settings['do_reduce'] = False if 'do_reduce' not in settings else settings['do_reduce']

settings['TESTING'] = False if 'TESTING' not in settings else settings['TESTING']

# --- create dummy example dataset, which is used below
if settings['TESTING']:
    def execute(self):
        """Execute ConvertDataFrame2RooDataSet"""

        proc_mgr = ProcessManager()
        settings = proc_mgr.service(ConfigObject)
        ds = proc_mgr.service(DataStore)
        ws = proc_mgr.service(RooFitManager).ws

        # 1a. basic checks on contents of the data frame
        assert self.read_key in list(
            ds.keys()), 'key %s not in DataStore' % self.read_key
        df = ds[self.read_key]
        if not isinstance(df, pd.DataFrame):
            raise TypeError(
                'retrieved object "%s" not of type pandas DataFrame' %
                self.read_key)
        assert len(df.index) > 0, 'dataframe "%s" is empty' % self.read_key

        # 1b. retrieve map_to_factorized from ds if it's a string
        if self.map_to_factorized:
            if isinstance(self.map_to_factorized, str):
                assert len(self.map_to_factorized), \
                    'map_to_factorized needs to be a filled string'
                assert self.map_to_factorized in ds, \
                    'map_to_factorized key "%s" not found in datastore' % self.map_to_factorized
                self.map_to_factorized = ds[self.map_to_factorized]
            assert isinstance(self.map_to_factorized,
                              dict), 'map_to_factorized needs to be a dict'

        # 1c. retrieve read_key_vars rooargset from datastore
        if self.read_key_vars:
            assert isinstance(self.read_key_vars, str) and len(self.read_key_vars), \
                'read_key_vars should be a filled string'
            assert self.read_key_vars in ds, 'read_key_vars not in datastore'
            varset = ds[self.read_key_vars]
            assert isinstance(
                varset, ROOT.RooArgSet), 'read_key_vars is not a RooArgSet'
            self._varset = varset
        if self._varset:
            # varset overrules provided columns
            self.columns = [rv.GetName() for rv in self._varset]

        # 1d. check all columns
        if not self.columns:
            self.columns = df.columns.tolist()
        for col in self.columns[:]:
            assert col in df.columns, 'column "%s" not in dataframe "%s"' % (
                col, self.read_key)
            dt = df[col].dtype.type
            # keep categorical observables -- convert these to roocategories in conversion
            if issubclass(dt, pd.types.dtypes.CategoricalDtypeType):
                continue
            # reject all string-based columns
            if (dt is np.string_) or (dt is np.object_):
                self.log().warning('Skipping string-based column "%s"', col)
                self.columns.remove(col)
                continue
            if col in self.ignore_columns:
                self.columns.remove(col)
        self.log().debug('Picking up columns: %s', self.columns)

        # 2. do conversion of df to roodataset
        #    self.map_to_factorized are categorical variables to be turned into roocategories
        rds, obs_vars, mtf, map_to_original = data_conversion.df_to_rds(
            df[self.columns],
            rf_varset=self._varset,
            category_vars=self.map_to_factorized,
            name=self.read_key,
            store_index=self.store_index)

        # create pdf of dataset as well?
        if self.create_keys_pdf:
            obs_list = ROOT.RooArgList(obs_vars)
            keys_name = self.create_keys_pdf
            keys_pdf = ROOT.RooNDKeysPdf(keys_name, keys_name, obs_list, rds,
                                         'ma')

        # 3a. remove original df?
        if self.rm_original:
            del ds[self.read_key]

        # 3b. put objects from the datastore into the workspace
        if self.into_ws:
            try:
                ws[self.store_key] = rds
                ws.defineSet(self.store_key_vars, obs_vars)
            except Exception:
                raise RuntimeError(
                    'could not import object "%s" into rooworkspace' %
                    self.read_key)
        # 3c. put objects into datastore
        else:
            ds[self.store_key_vars] = obs_vars
            ds[self.store_key] = rds

        # 3d. workspace doesn't like keys pdf, so always keep in ds
        if self.create_keys_pdf:
            ds[keys_name] = keys_pdf

        # 3e.
        ds[self.sk_map_to_original] = map_to_original
        n_rds = rds.numEntries()
        ds['n_' + self.store_key] = n_rds
        self.log().debug('Stored roodataset "%s" with length: %d',
                         self.store_key, n_rds)

        return StatusCode.Success
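
# a minimal macro-style usage sketch for the link whose execute() is shown above
# (ConvertDataFrame2RooDataSet, assumed importable from root_analysis); the
# constructor keywords mirror the attribute names used in the method, and the
# chain and datastore key names are illustrative assumptions
from eskapade import ProcessManager, root_analysis

ch = ProcessManager().add_chain('Conversion')
conv = root_analysis.ConvertDataFrame2RooDataSet(read_key='accounts_df',
                                                 store_key='accounts_rds',
                                                 store_key_vars='accounts_vars',
                                                 rm_original=False)
ch.add_link(conv)
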
    def execute(self):
        """ Execute ApplySelectionToDf

        Applies queries or column selection to a pandas DataFrame.
        Input dataframe is not overwritten, unless told to do so in kwargs.

        1. Apply queries, in order of provided query list.
        2. Select columns (if provided). 
        """

        ds = ProcessManager().service(DataStore)
        assert self.readKey in list(
            ds.keys()), 'Key %s not in DataStore.' % self.readKey
        assert isinstance(
            ds[self.readKey], pd.DataFrame
        ), 'Object with key %s is not a pandas DataFrame.' % self.readKey

        # 1. apply queries to input dataframe.
        #    input dataframe is not overwritten, unless told to do so in kwargs.
        do_continue = True
        if len(self.querySet):
            # apply first query
            query = self.querySet[0]
            try:
                df = ds[self.readKey].query(query, **self.kwargs)
            except Exception:
                if not self.continueIfFailure:
                    raise ValueError(
                        'Failed to apply query <%s> to dataframe <%s>.' %
                        (query, self.readKey))
                else:
                    orig_df_cols = (ds[self.readKey]).columns
                    df = pd.DataFrame(columns=orig_df_cols)
                    do_continue = False
            # apply rest of the queries if any
            if do_continue:
                for query in self.querySet[1:]:
                    try:
                        df = df.query(query, **self.kwargs)
                    except Exception:
                        if not self.continueIfFailure:
                            raise ValueError(
                                'Failed to apply query <%s> to dataframe <%s>.'
                                % (query, self.readKey))
                        else:
                            orig_df_cols = (ds[self.readKey]).columns
                            df = pd.DataFrame(columns=orig_df_cols)
                            break

        # 2. apply column selection to input dataframe.
        #    input dataframe is not overwritten.
        if len(self.selectColumns):
            if 'df' not in vars():
                df = (ds[self.readKey]).copy(deep=False)
            try:
                df = df[self.selectColumns]
            except Exception:
                if not self.continueIfFailure:
                    raise ValueError(
                        'Failed to select columns <%s> of dataframe <%s>.' %
                        (str(self.selectColumns), self.readKey))
                else:
                    df = pd.DataFrame(columns=self.selectColumns)

        assert 'df' in vars(), 'No dataframe available for storage?'

        ds[self.storeKey] = df
        ds['n_' + self.storeKey] = len(df.index)

        self.log().info('Stored dataframe with key <%s> and length <%d>.' %
                        (self.storeKey, len(df.index)))

        return StatusCode.Success
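
# a minimal macro-style usage sketch for ApplySelectionToDf (assumed importable
# from the analysis subpackage); the keyword names mirror the attributes used in
# execute() above, and the key names and query are illustrative assumptions
from eskapade import ProcessManager, analysis

ch = ProcessManager().add_chain('Selection')
selector = analysis.ApplySelectionToDf(readKey='incoming_records',
                                       storeKey='selected_records',
                                       querySet=['var_a > 0'])
ch.add_link(selector)
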
Example #7
    def execute(self):
        """Execute UncorrelationHypothesisTester"""

        proc_mgr = ProcessManager()
        settings = proc_mgr.service(ConfigObject)
        ds = proc_mgr.service(DataStore)

        # 1a. basic checks on contents of the roodataset
        if self.from_ws:
            ws = proc_mgr.service(RooFitManager).ws
            rds = ws.data(self.read_key)
            assert rds is not None, 'Key %s not in workspace' % self.read_key
        else:
            assert self.read_key in ds, 'key "%s" not found in datastore' % self.read_key
            rds = ds[self.read_key]
        if not isinstance(rds, ROOT.RooDataSet):
            raise TypeError('retrieved object "%s" not of type RooDataSet, but: %s' % (self.read_key, type(rds)))
        assert rds.numEntries() > 0, 'RooDataSet "%s" is empty' % self.read_key

        # 1b. retrieve read_key_vars rooargset from datastore
        if self.read_key_vars:
            assert isinstance(self.read_key_vars, str) and len(self.read_key_vars), \
                'read_key_vars should be a filled string'
            assert self.read_key_vars in ds, 'read_key_vars not in datastore'
            varset = ds[self.read_key_vars]
            assert isinstance(varset, ROOT.RooArgSet), 'read_key_vars is not a RooArgSet'
        else:
            # first record in dataset
            varset = rds.get(0)
        self._all_columns = [rv.GetName() for rv in varset]
        assert len(self._all_columns) >= 2, 'need at least two variables in roodataset %s.' % self.read_key

        # 1c. check provided columns
        #     match all columns/pattern in self.columns to _all_columns
        if isinstance(self.columns, bool):
            self.columns = self._all_columns if self.columns else []
        matched_columns = []
        for c in self.columns:
            match_c = fnmatch.filter(self._all_columns, c)
            if not match_c:
                raise AssertionError('column or pattern "%s" not present in roodataset "%s"' % (c, self.read_key))
            matched_columns += match_c
        self.columns = sorted(list(set(matched_columns)))  # sorted unique list

        # 1d. retrieve left and right pair columns (multiplied as left x right)
        matched_columns = []
        for c in self.x_columns:
            match_c = fnmatch.filter(self._all_columns, c)
            if not match_c:
                raise AssertionError('column or pattern "%s" not present in roodataset "%s"' % (c, self.read_key))
            matched_columns += match_c
        self.x_columns = sorted(list(set(matched_columns)))  # sorted unique list
        matched_columns = []
        for c in self.y_columns:
            match_c = fnmatch.filter(self._all_columns, c)
            if not match_c:
                raise AssertionError('column or pattern "%s" not present in roodataset "%s"' % (c, self.read_key))
            matched_columns += match_c
        self.y_columns = sorted(list(set(matched_columns)))  # sorted unique list
        self.y_columns = sorted([c for c in self.y_columns if c not in self.x_columns])

        # 1e. retrieve map_to_original from ds
        if self.map_to_original:
            if isinstance(self.map_to_original, str):
                assert len(self.map_to_original), 'map_to_original needs to be a filled string'
                assert self.map_to_original in ds, 'map_to_original key not found in datastore'
                mto = ds[self.map_to_original]
            elif isinstance(self.map_to_original, dict):
                mto = self.map_to_original
            assert isinstance(mto, dict), 'map_to_original needs to be a dict'
            # pandas replace() will not do transformations that are identical,
            # including int 0/1 to bool. skip those column-transformations
            self.mto = copy.copy(mto)
            for c, c_mto in mto.items():
                k = list(c_mto.keys())
                v = list(c_mto.values())
                if set(k) & set(v):
                    # true in case of identical transformation
                    self.log().debug('Identical transformation for column "%s". Skipping column', c)
                    del self.mto[c]

        # 1f. create report pages
        # data scientist report
        self.pages = []
        if self.pages_key:
            self.pages = ds.get(self.pages_key, [])
            assert isinstance(self.pages, list), 'Pages key %s does not refer to a list' % self.pages_key
        # client report
        self.clientpages = []
        if self.clientpages_key:
            self.clientpages = ds.get(self.clientpages_key, [])
            assert isinstance(self.clientpages, list), 'Client pages key %s does not refer to a list' % self.clientpages_key

        # 1g. initialize significance_matrix
        nx = ny = 0
        x_cols = y_cols = []
        if len(self.columns):
            nx = len(self.columns)
            ny = len(self.columns)
            x_cols = self.columns
            y_cols = self.columns
        if len(self.x_columns) or len(self.y_columns):
            nx = len(self.x_columns)
            ny = len(self.y_columns)
            x_cols = self.x_columns
            y_cols = self.y_columns
        significance_matrix = np.zeros((ny, nx))
        symmetrize = True if self.columns else False
        n_bins = nx * ny if not symmetrize else nx * nx - nx
        n_unique = n_bins if not symmetrize else (nx * nx - nx) / 2

        # 2a. loop over unique column pairs and add to combinations
        for idx, c1 in enumerate(self.columns):
            for c2 in self.columns[idx + 1:]:
                self.combinations.append([c1, c2])
        #     add left-right pair combinations
        if self.x_columns and self.inproduct:
            assert len(self.x_columns) == len(self.y_columns)
        for i, c1 in enumerate(self.x_columns):
            if self.inproduct:
                c2 = self.y_columns[i]
                self.combinations.append([c1, c2])
            else:
                for j, c2 in enumerate(self.y_columns):
                    self.combinations.append([c1, c2])

        # 2b. loop over all combinations: calculate significance and residuals
        n_combos = len(self.combinations)
        n_entries = rds.numEntries()
        for i_c, combo in enumerate(self.combinations):
            combo_name = ':'.join(combo)
            # make roodatahist for each combination
            obsset = ROOT.RooArgSet()
            for c in combo:
                obsset.add(varset.find(c))
            catCutStr = '1'
            for j, var in enumerate(obsset):
                if isinstance(var, ROOT.RooRealVar):
                    n_bins = self._n_bins(combo, j)
                    var.setBins(n_bins)
                elif isinstance(var, ROOT.RooCategory):
                    ignore_categories = self._ignore_categories(combo, j)
                    for ic in ignore_categories:
                        if not var.isValidLabel(ic):
                            continue
                        catCutStr += ' && (%s!=%s::%s)' % (var.GetName(), var.GetName(), ic)
            rdh = ROOT.RooDataHist(combo_name, combo_name, obsset)
            # remove specific categories (e.g. nan) if this has been requested.
            red = rds.reduce(ROOT.RooFit.Cut(catCutStr))
            rdh.add(red)
            del red
            # rdh.add(rds)
            # a) calculate global significance of combo
            self.log().debug('Now processing combination (%d/%d): %s with %d bins and %d entries' %
                             (i_c + 1, n_combos, str(combo), rdh.numEntries(), rdh.sumEntries()))
            Zi = ROOT.Eskapade.ABCD.SignificanceOfUncorrelatedHypothesis(rdh, obsset, self.nsims_per_significance)
            self.significance_map[combo_name] = Zi
            if len(combo) == 2:
                x = x_cols.index(combo[0])
                y = y_cols.index(combo[1])
                if x < nx and y < ny:
                    significance_matrix[y, x] = Zi
                    if symmetrize:
                        significance_matrix[x, y] = Zi
            # b) calculate residuals
            success = ROOT.Eskapade.ABCD.checkInputData(rdh)
            self.log().debug('Combination %s has significance: %f. Can calculate residuals? %s' % (str(combo), Zi, success))
            if not success:
                self.log().warning('Cannot calculate residuals for combination: %s. Skipping.' % str(combo))
                del rdh
                continue
            residi = ROOT.Eskapade.ABCD.GetNormalizedResiduals(rdh, obsset)
            dfri = data_conversion.rds_to_df(residi)
            del rdh
            del residi
            # do the mapping of roofit categories back to original format
            if self.mto:
                dfri.replace(self.mto, inplace=True)
            self.residuals_map[combo_name] = dfri

        # below, create report page for each variable in data frame
        # create resulting heatmaps and histograms

        # 1. make significance heatmap
        f_path = self.results_path + self.prefix + 'all_correlation_significance.pdf'
        var_label = 'Significance correlation matrix (s.d.)'
        vis_utils.plot_correlation_matrix(significance_matrix, x_cols, y_cols, f_path, var_label, -5, 5)
        stats = [('entries', n_entries), ('bins', n_bins), ('unique', n_unique),
                 ('> 0', (significance_matrix.ravel() > 0).sum()),
                 ('< 0', (significance_matrix.ravel() < 0).sum()),
                 ('avg', np.average(significance_matrix.ravel())),
                 ('max', max(significance_matrix.ravel())),
                 ('min', min(significance_matrix.ravel()))] if nx > 0 and ny > 0 else []
        stats_table = tabulate.tabulate(stats, tablefmt='latex')
        self.pages.append(self.page_template.replace('VAR_LABEL', var_label)
                          .replace('VAR_STATS_TABLE', stats_table)
                          .replace('VAR_HISTOGRAM_PATH', f_path))
        significance = self.significance_map.copy()
        for key in list(significance.keys()):
            significance[key] = [significance[key]]
        dfsignificance = pd.DataFrame(significance).stack().reset_index(level=1)\
                                                           .rename(columns={'level_1': 'Questions', 0: 'Significance'})\
                                                           .sort_values(by='Significance', ascending=False)
        keep_cols = ['Questions', 'Significance']
        table = latex_residuals_table(dfsignificance, keep_cols, self.z_threshold, normResidCol='Significance')
        if table:
            self.clientpages.append(self.table_template.replace('VAR_LABEL', 'Significance').replace('VAR_STATS_TABLE', table))

        # 2a. create one residual table containing the top outliers, i.e. the answers
        #     deviating most strongly from the uncorrelated hypothesis
        resid_all = []
        if len(self.combinations) > 1:
            # create one dataframe containing all data
            resid_list = []
            ndim_max = 2
            for key in list(self.residuals_map.keys()):
                if abs(self.significance_map[key]) < self.z_threshold:
                    continue
                dftmp = self.residuals_map[key].copy()
                resid_list.append(self._format_df(dftmp, key))
                if len(key.split(':')) > ndim_max:
                    ndim_max = len(key.split(':'))
            # convert top residuals into latex table
            if len(resid_list) >= 1:
                resid_all = resid_list[0]
                if len(resid_list) > 1:
                    resid_all = resid_list[0].append(resid_list[1:], ignore_index=True)
                resid_all = resid_all.reindex(resid_all.normResid.abs().sort_values(ascending=False).index)
                keep_cols = ['question_%d' % i for i in range(ndim_max)] + \
                            ['answer_%d' % i for i in range(ndim_max)] + \
                            ['num_entries', 'abcd', 'abcd_error', 'pValue', 'normResid']
                table = latex_residuals_table(resid_all, keep_cols, self.z_threshold)
                self.pages.append(self.table_template.replace('VAR_LABEL', 'Most significant outliers').replace('VAR_STATS_TABLE', table))
                keep_cols = ['question_%d' % i for i in range(ndim_max)] + \
                            ['answer_%d' % i for i in range(ndim_max)] + \
                            ['num_entries', 'abcd', 'normResid']
                table = latex_residuals_table(resid_all, keep_cols, self.z_threshold)
                self.clientpages.append(self.table_template.replace('VAR_LABEL', 'Most significant outliers').replace('VAR_STATS_TABLE', table))

        # 2b. make residuals heatmaps
        for combo in self.combinations:
            if len(combo) != 2:
                continue
            combo_name = ':'.join(combo)
            residi = self.residuals_map[combo_name]
            mat_normresiduals, x_vals, y_vals = extract_matrix(residi, combo[0], combo[1])
            mat_observed, x_vals, y_vals = extract_matrix(residi, combo[0], combo[1], 'num_entries')
            f_path = self.results_path + self.prefix + 'normalized_residuals_heatmap_' + '_'.join(combo) + '.pdf'
            vis_utils.plot_correlation_matrix(mat_normresiduals, x_vals, y_vals, f_path, 'significance relation', -5, 5,
                                              x_label=combo[0], y_label=combo[1],
                                              matrix_numbers=mat_observed,
                                              print_both_numbers=self.verbose_plots)
            stats = [('entries', residi['num_entries'].sum()), ('bins', len(residi.index)),
                     ('> 0', (residi['normResid'] > 0).sum()),
                     ('< 0', (residi['normResid'] < 0).sum()),
                     ('avg', residi['normResid'].mean()),
                     ('max', residi['normResid'].max()),
                     ('min', residi['normResid'].min())]
            stats_table = tabulate.tabulate(stats, tablefmt='latex')
            self.pages.append(self.page_template.replace('VAR_LABEL', 'relation (abcd): ' + ' vs '.join(combo))
                              .replace('VAR_STATS_TABLE', stats_table)
                              .replace('VAR_HISTOGRAM_PATH', f_path))

        # 2c. make residuals tables
        for combo in self.combinations:
            combo_name = ':'.join(combo)
            residi = self.residuals_map[combo_name]
            keep_cols = combo + ['num_entries', 'abcd', 'abcd_error', 'pValue', 'normResid']
            table = latex_residuals_table(residi, keep_cols, self.z_threshold)
            if not table:
                continue
            self.pages.append(self.table_template.replace('VAR_LABEL', 'outliers: ' + ' vs '.join(combo)).replace('VAR_STATS_TABLE', table))

        # 2d. make residuals histograms
        p_all = ROOT.TH1F('p_all', 'p_all', 20, 0, 1)
        z_all = ROOT.TH1F('z_all', 'z_all', 50, -10, 10)
        for combo in self.combinations:
            combo_name = ':'.join(combo)
            residi = self.residuals_map[combo_name]
            root_numpy.fill_hist(p_all, residi['pValue'].values)
            root_numpy.fill_hist(z_all, residi['normResid'].values)
            p_i = ROOT.TH1F('p_' + combo_name, 'p_' + combo_name, 20, 0, 1)
            z_i = ROOT.TH1F('z_' + combo_name, 'z_' + combo_name, 40, -8, 8)
            root_numpy.fill_hist(p_i, residi['pValue'].values)
            root_numpy.fill_hist(z_i, residi['normResid'].values)
            self.hist_dict['normalized residuals: ' + ' vs '.join(combo)] = z_i
            self.hist_dict['p-values: ' + ' vs '.join(combo)] = p_i
        self.hist_dict['all normalized residuals'] = z_all
        self.hist_dict['all p-values'] = p_all

        # 3. storage
        if self.hist_dict_key:
            ds[self.hist_dict_key] = self.hist_dict
        if self.pages_key:
            ds[self.pages_key] = self.pages
        if self.sk_significance_map:
            ds[self.sk_significance_map] = self.significance_map
            self.log().debug('Stored significance map in data store under key: %s' % self.sk_significance_map)
        if self.sk_residuals_map:
            ds[self.sk_residuals_map] = self.residuals_map
            self.log().debug('Stored residuals map in data store under key: %s' % self.sk_residuals_map)
        if self.sk_residuals_overview and len(resid_all)>0:
            ds[self.sk_residuals_overview] = resid_all
            self.log().debug('Stored residuals list in data store under key: %s' % self.sk_residuals_overview)

        return StatusCode.Success
Example #8
    def initialize(self):
        """Initialize UncorrelationHypothesisTester"""

        # check input arguments
        self.check_arg_types(read_key=str, significance_key=str, sk_significance_map=str, sk_residuals_map=str,
                             sk_residuals_overview=str, default_number_of_bins=int, nsims_per_significance=int, prefix=str,
                             z_threshold=float, pages_key=str, clientpages_key=str, hist_dict_key=str)
        self.check_arg_types(recurse=True, allow_none=True, columns=str)
        self.check_arg_types(recurse=True, allow_none=True, x_columns=str)
        self.check_arg_types(recurse=True, allow_none=True, y_columns=str)
        self.check_arg_types(recurse=True, allow_none=True, ignore_categories=str)
        self.check_arg_types(recurse=True, allow_none=True, var_ignore_categories=str)
        self.check_arg_vals('read_key')
        self.check_arg_vals('significance_key')

        if self.map_to_original and not isinstance(self.map_to_original, str) \
                and not isinstance(self.map_to_original, dict):
            raise TypeError('map_to_original needs to be a dict or string (to fetch a dict from the datastore)')

        # get I/O configuration
        io_conf = ProcessManager().service(ConfigObject).io_conf()

        # read report templates
        with open(persistence.io_path('templates', io_conf, 'df_summary_report.tex')) as templ_file:
            self.report_template = templ_file.read()
        with open(persistence.io_path('templates', io_conf, 'df_summary_report_page.tex')) as templ_file:
            self.page_template = templ_file.read()
        with open(persistence.io_path('templates', io_conf, 'df_summary_table_page.tex')) as templ_file:
            self.table_template = templ_file.read()

        # get path to results directory
        if not self.results_path:
            self.results_path = persistence.io_path('results_data', io_conf, 'report')
        if self.results_path and not self.results_path.endswith('/'):
            self.results_path = self.results_path + '/'

        # check if output directory exists
        if os.path.exists(self.results_path):
            # check if path is a directory
            if not os.path.isdir(self.results_path):
                self.log().critical('output path "%s" is not a directory', self.results_path)
                raise AssertionError('output path is not a directory')
        else:
            # create directory
            self.log().debug('Making output directory "%s"', self.results_path)
            os.makedirs(self.results_path)

        # prefix for file storage
        if self.prefix and not self.prefix.endswith('_'):
            self.prefix = self.prefix + '_'

        # check provided columns
        if len(self.columns):
            assert len(self.x_columns) == 0 and len(self.y_columns) == 0, \
                'Set either columns OR x_columns and y_columns.'
        if len(self.x_columns):
            assert len(self.columns) == 0 and len(self.y_columns) > 0, \
                'Set either columns OR x_columns and y_columns.'
        self._all_columns = []

        # check that var_ignore_categories are set correctly.
        for col, ic in self.var_ignore_categories.items():
            if isinstance(ic, str):
                self.var_ignore_categories[col] = [ic]
            elif not isinstance(ic, list):
                raise TypeError('var_ignore_categories key "%s" needs to be a string or list of strings' % col)

        # load roofit classes
        roofit_utils.load_libesroofit()

        return StatusCode.Success
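
# a minimal macro-style usage sketch for UncorrelationHypothesisTester (assumed
# importable from root_analysis, as imported in the esk410 macro above); keyword
# names follow the check_arg_types() calls in initialize(), key names are
# illustrative assumptions
from eskapade import ProcessManager, root_analysis

ch = ProcessManager().add_chain('Summary')
tester = root_analysis.UncorrelationHypothesisTester(read_key='accounts_rds',
                                                     significance_key='significance',
                                                     pages_key='report_pages',
                                                     z_threshold=3.0)
ch.add_link(tester)
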
Example #9
    def execute(self):
        """Execute WsUtils

        Operations are executed in this order:

        1. put objects from the datastore into rooworkspace
        2. execute rooworkspace factory commands
        3. pass the workspace to (a list of) functions, to execute bits of (workspace) code
        4. simulate data from a pdf
        5. fit a pdf to a dataset
        6. make a plot of a dataset, pdf, or function
        7. move objects from the workspace to the datastore
        """

        proc_mgr = ProcessManager()
        settings = proc_mgr.service(ConfigObject)
        ds = proc_mgr.service(DataStore)
        ws = proc_mgr.service(RooFitManager).ws

        # --- open existing report pages
        if self.pages_key:
            self.pages = ds.get(self.pages_key, [])
            if not isinstance(self.pages, list):
                raise TypeError('pages key "{}" does not refer to a list'.format(self.pages_key))
            elif len(self.pages) > 0:
                self.log().debug('Retrieved %d report pages under key "%s"', len(self.pages), self.pages_key)

        # --- put objects from the datastore into the workspace
        #     by doing this here, the object can be picked up by the factory
        for key in self.copy_into_ws:
            assert key in ds, 'key "%s" not found in datastore' % key
            try:
                ws[key] = ds[key]
                if self.rm_original:
                    del ds[key]
            except BaseException:
                raise RuntimeError('could not import object "%s" into rooworkspace' % key)

        # --- workspace factory commands
        #     by doing this here, the previously imported objects
        #     can be picked up by the factory
        for cmd in self.factory:
            ws.factory(cmd)

        # --- pass ws to list of functions, to execute bits of (workspace) code
        #     by doing this here, the objects previously created can be picked up.
        for func in self.apply:
            func(ws)

        # --- simulation
        #     needs input pdf and observables to generate
        for i, tp in enumerate(self._simulate):
            assert isinstance(tp, tuple) and len(tp) == 2, 'simulate item "%d" needs to be an args, kwargs tuple' % i
            self.do_simulate(ds, ws, *tp[0], **tp[1])

        # --- fitting
        #     needs input pdf and dataset to fit
        for i, tp in enumerate(self._fit):
            assert isinstance(tp, tuple) and len(tp) == 2, 'fit item "%d" needs to be an args, kwargs tuple' % i
            self.do_fit(ds, ws, *tp[0], **tp[1])

        # --- plotting
        #     needs a single observable, plus a pdf and/or dataset
        for i, tp in enumerate(self._plot):
            assert isinstance(tp, tuple) and len(tp) == 2, 'plot item "%d" needs to be an args, kwargs tuple' % i
            self.do_plot(ds, ws, *tp[0], **tp[1])

        # --- storage into ds
        #     put objects from the workspace into the datastore
        for key in self.copy_into_ds:
            assert key in ws, 'key "%s" not found in workspace' % key
            try:
                ds[key] = ws[key].Clone()
                if self.rm_original:
                    self.rm_from_ws.append(key)
            except BaseException:
                raise RuntimeError('could not import object "%s" from workspace into ds' % key)

        # --- deletion
        #     try to remove keys from the workspace
        for key in self.rm_from_ws:
            try:
                ws.cd()
                ROOT.gDirectory.Delete("%s;*" % key)
            except BaseException:
                self.log().warning('Could not remove "%s" from workspace. Pass', key)

        # storage
        if self.pages_key:
            ds[self.pages_key] = self.pages
            self.log().debug('%d report pages stored under key: %s', len(self.pages), self.pages_key)

        return StatusCode.Success
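
# a minimal macro-style usage sketch for WsUtils (assumed importable from
# root_analysis); the factory command and key names are illustrative assumptions,
# and the simulate/fit/plot lists used above are filled via the link's own helper
# methods, which are not shown in this excerpt
from eskapade import ProcessManager, root_analysis

ch = ProcessManager().add_chain('WorkspaceOps')
ws_utils = root_analysis.WsUtils(name='ws_utils',
                                 factory=['Gaussian::model(x[-5,5], mean[0], width[1])'],
                                 copy_into_ds=['model'],
                                 pages_key='report_pages')
ch.add_link(ws_utils)
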

import logging
log = logging.getLogger('macro.esk103_printdatastore')

from eskapade import ConfigObject, ProcessManager
from eskapade import core_ops

log.debug('Now parsing configuration file esk103_printdatastore')

#########################################################################################
# --- minimal analysis information
settings = ProcessManager().service(ConfigObject)
settings['analysisName'] = 'esk103_printdatastore'
settings['version'] = 0

#########################################################################################
# --- for this macro, fill the datastore with some dummy information

from eskapade import DataStore

ProcessManager().service(DataStore)['hello'] = 'world'
ProcessManager().service(DataStore)['d'] = {'a': 1, 'b': 2, 'c': 3}

#########################################################################################
# --- now set up the chains and links based on configuration flags

proc_mgr = ProcessManager()
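
# a minimal sketch of the chain setup this macro builds towards: add a chain with
# a datastore-printing link from core_ops (the PrintDs link and the chain name
# 'Overview' are assumptions, not part of the excerpt above)
ch = proc_mgr.add_chain('Overview')
link = core_ops.PrintDs()
ch.add_link(link)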