Code Example #1
def generate_test_data_score(method):
    # get number of parameters
    hf = h5py.File(h5_file, 'r')
    parameters = len(hf['parameters']['names'])
    sa = SampleAugmenter(h5_file, include_nuisance_parameters=False)

    theta_input = inputs[str(method)]['theta']
    theta_sampling = theta_input['sampling_method']

    if (theta_sampling == 'random_morphing_points'):

        prior = []
        for p in range(parameters):
            this_tuple = theta_input['prior']['parameter_' + str(p)]
            prior.append((str(this_tuple['prior_shape']),
                          float(this_tuple['prior_param_0']),
                          float(this_tuple['prior_param_1'])))

        _ = sa.sample_train_local(
            theta=eval(theta_sampling)(theta_input['n_thetas'], prior),
            n_samples=inputs['n_samples']['test'],
            folder='/home/test/' + method + '/',
            filename='test',
            switch_train_test_events=False,
        )

    if (theta_sampling == 'benchmark'):
        _ = sa.sample_train_local(
            theta=eval(theta_sampling)(theta_input['argument']),
            n_samples=inputs['n_samples']['test'],
            folder='/home/test/' + method + '/',
            filename='test',
            switch_train_test_events=False,
        )
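# The function above resolves the sampling helper with eval(). A minimal sketch of an
# eval()-free alternative, assuming the configured names refer to the helpers in
# madminer.sampling (SAMPLING_HELPERS and resolve_theta are illustrative, not part of
# the original project):
from madminer import sampling

SAMPLING_HELPERS = {
    'benchmark': sampling.benchmark,
    'benchmarks': sampling.benchmarks,
    'morphing_point': sampling.morphing_point,
    'morphing_points': sampling.morphing_points,
    'random_morphing_points': sampling.random_morphing_points,
}

def resolve_theta(sampling_method, *args):
    # Look the helper up by name instead of eval()-ing a string from the configuration
    return SAMPLING_HELPERS[sampling_method](*args)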
Code Example #2
 def _make_histo_data(self, thetas, n_samples, test_split=0.2):
     sampler = SampleAugmenter(
         self.madminer_filename,
         include_nuisance_parameters=self.include_nuisance_parameters)
     x, theta, _ = sampler.sample_train_plain(
         theta=sampling.morphing_points(thetas),
         n_samples=n_samples,
         test_split=test_split,
         filename=None,
         folder=None,
     )
     return theta, x
Code Example #3
    def _make_sampled_histo_data(self,
                                 summary_function,
                                 thetas,
                                 n_toys_per_theta,
                                 test_split=0.2,
                                 histo_theta_batchsize=100):
        sampler = SampleAugmenter(
            self.madminer_filename,
            include_nuisance_parameters=self.include_nuisance_parameters)
        all_summary_stats, all_theta = None, None

        if n_toys_per_theta is None:
            n_toys_per_theta = 10000

        n_thetas = len(thetas)
        n_batches = (n_thetas - 1) // histo_theta_batchsize + 1
        for i_batch in range(n_batches):
            logger.debug("Generating histogram data for batch %s / %s",
                         i_batch + 1, n_batches)
            theta_batch = thetas[i_batch *
                                 histo_theta_batchsize:(i_batch + 1) *
                                 histo_theta_batchsize]
            logger.debug(
                "Theta data: indices %s to %s, shape %s",
                i_batch * histo_theta_batchsize,
                (i_batch + 1) * histo_theta_batchsize,
                theta_batch.shape,
            )
            x, theta, _ = sampler.sample_train_plain(
                theta=sampling.morphing_points(theta_batch),
                n_samples=n_toys_per_theta * len(theta_batch),
                test_split=test_split,
                filename=None,
                folder=None,
                suppress_logging=True,
            )
            summary_stats = summary_function(x)
            logger.debug("Output: x has shape %s, summary_stats %s, theta %s",
                         x.shape, summary_stats.shape, theta.shape)
            if all_theta is None or all_summary_stats is None:
                all_theta = theta
                all_summary_stats = summary_stats
            else:
                all_theta = np.concatenate((all_theta, theta), 0)
                all_summary_stats = np.concatenate(
                    (all_summary_stats, summary_stats), 0)
        return all_theta, all_summary_stats
Code Example #4
def generate_test_data_score(method: str):
    """
    Generates test data files given a particular method (score)
    :param method: name of the MadMiner method to generate theta
    """

    sampler = SampleAugmenter(data_file, include_nuisance_parameters=False)
    thetas = inputs[method]

    theta_spec = thetas["theta_0"]
    theta_vals = get_theta_values(theta_spec)

    sampler.sample_train_local(
        theta=theta_vals,
        n_samples=n_samples_test,
        folder=f"{tests_dir}/{method}",
        filename="test",
    )
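# `get_theta_values` is defined elsewhere in this project. A hypothetical sketch of what such a
# helper might look like, assuming the spec dict uses the 'sampling_method' / 'argument' /
# 'n_thetas' / 'prior' keys seen in the other examples and the madminer.sampling helpers
# (the body below is illustrative, not the project's implementation):
from madminer import sampling

def get_theta_values(theta_spec: dict):
    sampling_method = theta_spec["sampling_method"]
    if sampling_method == "benchmark":
        return sampling.benchmark(theta_spec["argument"])
    if sampling_method == "random_morphing_points":
        # Assumes 'prior' is already a list of (shape, param_0, param_1) tuples
        return sampling.random_morphing_points(theta_spec["n_thetas"], theta_spec["prior"])
    raise ValueError(f"Unknown sampling method: {sampling_method}")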
Code Example #5
    def _make_sampled_histo_data(self, summary_function, thetas, n_toys_per_theta, test_split=0.2):
        sampler = SampleAugmenter(self.madminer_filename, include_nuisance_parameters=self.include_nuisance_parameters)

        if n_toys_per_theta is None:
            n_toys_per_theta = 100000

        with less_logging():
            x, theta, _ = sampler.sample_train_plain(
                theta=sampling.morphing_points(thetas),
                n_samples=n_toys_per_theta * len(thetas),
                test_split=test_split,
                filename=None,
                folder=None,
            )

        summary_stats = summary_function(x)
        summary_stats = summary_stats.reshape((len(thetas), n_toys_per_theta, -1))

        return summary_stats
Code Example #6
def generate_test_data_ratio(method: str):
    """
    Generates test data files given a particular method (ratio)
    :param method: name of the MadMiner method to generate theta
    """

    sampler = SampleAugmenter(data_file, include_nuisance_parameters=False)
    thetas = inputs[method]

    if len(thetas) == 1:
        theta_spec = thetas["theta_0"]
        theta_vals = get_theta_values(theta_spec)

        sampler.sample_test(
            theta=theta_vals,
            n_samples=n_samples_test,
            folder=f"{tests_dir}/{method}",
            filename="test",
        )

    elif len(thetas) == 2:
        theta_0_spec = thetas["theta_0"]
        theta_1_spec = thetas["theta_1"]
        theta_0_vals = get_theta_values(theta_0_spec)
        theta_1_vals = get_theta_values(theta_1_spec)

        sampler.sample_train_ratio(
            theta0=theta_0_vals,
            theta1=theta_1_vals,
            n_samples=n_samples_test,
            folder=f"{tests_dir}/{method}",
            filename="test",
        )
Code Example #7
for key in logging.Logger.manager.loggerDict:
    if "madminer" not in key:
        logging.getLogger(key).setLevel(logging.WARNING)

# ## 1. Make (unweighted) training and test samples with augmented data

# At this point, we have all the information we need from the simulations. But the data is not quite ready to be used for machine learning. The `madminer.sampling` class `SampleAugmenter` will take care of the remaining book-keeping steps before we can train our estimators:
#
# First, it unweights the samples, i.e. for a given parameter vector `theta` (or a distribution `p(theta)`) it picks events `x` such that their distribution follows `p(x|theta)`. The selected samples will all come from the event file we have so far, but their frequency is changed -- some events will appear multiple times, some will disappear.
#
# Second, `SampleAugmenter` calculates all the augmented data ("gold") that is the key to our new inference methods. Depending on the specific technique, these are the joint likelihood ratio and / or the joint score. It saves all these pieces of information for the selected events in a set of numpy files that can easily be used in any machine learning framework.

# In[3]:

#sampler = SampleAugmenter('data/lhe_data_shuffled.h5')
sampler = SampleAugmenter('data/delphes_data_shuffled.h5')

# The `SampleAugmenter` class defines six different high-level functions to generate train or test samples:
# - `sample_train_plain()`, which only saves observations x, for instance for histograms or ABC;
# - `sample_train_local()` for methods like SALLY and SALLINO, which will be demonstrated in the second part of the tutorial;
# - `sample_train_density()` for neural density estimation techniques like MAF or SCANDAL;
# - `sample_train_ratio()` for techniques like CARL, ROLR, CASCAL, and RASCAL, when only theta0 is parameterized;
# - `sample_train_more_ratios()` for the same techniques, but with both theta0 and theta1 parameterized;
# - `sample_test()` for the evaluation of any method.
#
# For the arguments `theta`, `theta0`, or `theta1`, you can (and should!) use the helper functions `benchmark()`, `benchmarks()`, `morphing_point()`, `morphing_points()`, and `random_morphing_points()`, all defined in the `madminer.sampling` module.
#
# Here we'll train a likelihood ratio estimator with the ALICES method, so we focus on the `sample_train_ratio()` function. We'll sample the numerator hypothesis in the likelihood ratio with 1000 points drawn from a Gaussian prior, and fix the denominator hypothesis to the SM.
#
# Note the keyword `sample_only_from_closest_benchmark=True`, which makes sure that for each parameter point we only use the events that were originally (in MG) generated from the closest benchmark. This reduces the statistical fluctuations in the outcome quite a bit.
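# A sketch of the `sample_train_ratio()` call described above -- the prior widths, sample count,
# and file names are placeholders rather than this notebook's actual values, and `sampling`
# refers to the `madminer.sampling` module imported earlier in the tutorial:

_ = sampler.sample_train_ratio(
    theta0=sampling.random_morphing_points(1000, [("gaussian", 0.0, 15.0), ("gaussian", 0.0, 15.0)]),
    theta1=sampling.benchmark("sm"),
    sample_only_from_closest_benchmark=True,
    n_samples=100000,
    folder="./data/samples",
    filename="train_ratio",
)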
Code Example #8
        logging.getLogger(key).setLevel(logging.WARNING)


# ## 1. Make (unweighted) training and test samples with augmented data

# At this point, we have all the information we need from the simulations. But the data is not quite ready to be used for machine learning. The `madminer.sampling` class `SampleAugmenter` will take care of the remaining book-keeping steps before we can train our estimators:
# 
# First, it unweights the samples, i.e. for a given parameter vector `theta` (or a distribution `p(theta)`) it picks events `x` such that their distribution follows `p(x|theta)`. The selected samples will all come from the event file we have so far, but their frequency is changed -- some events will appear multiple times, some will disappear.
# 
# Second, `SampleAugmenter` calculates all the augmented data ("gold") that is the key to our new inference methods. Depending on the specific technique, these are the joint likelihood ratio and / or the joint score. It saves all these pieces of information for the selected events in a set of numpy files that can easily be used in any machine learning framework.

# In[3]:


#sampler = SampleAugmenter('data/lhe_data_shuffled.h5')
sampler = SampleAugmenter('data/delphes_data_shuffled.h5')


# The `SampleAugmenter` class defines six different high-level functions to generate train or test samples:
# - `sample_train_plain()`, which only saves observations x, for instance for histograms or ABC;
# - `sample_train_local()` for methods like SALLY and SALLINO, which will be demonstrated in the second part of the tutorial;
# - `sample_train_density()` for neural density estimation techniques like MAF or SCANDAL;
# - `sample_train_ratio()` for techniques like CARL, ROLR, CASCAL, and RASCAL, when only theta0 is parameterized;
# - `sample_train_more_ratios()` for the same techniques, but with both theta0 and theta1 parameterized;
# - `sample_test()` for the evaluation of any method.
# 
# For the arguments `theta`, `theta0`, or `theta1`, you can (and should!) use the helper functions `benchmark()`, `benchmarks()`, `morphing_point()`, `morphing_points()`, and `random_morphing_points()`, all defined in the `madminer.sampling` module.
# 
# Here we'll train a likelihood estimator with the SCANDAL method, so we focus on the `sample_train_density()` function. We'll sample 1000 training parameter points from a Gaussian prior.
# 
# Note the keyword `sample_only_from_closest_benchmark=True`, which makes sure that for each parameter point we only use the events that were originally (in MG) generated from the closest benchmark. This reduces the statistical fluctuations in the outcome quite a bit.
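# A sketch of the `sample_train_density()` call described above -- the prior widths, sample
# count, and file names are placeholders, and `sampling` refers to the `madminer.sampling`
# module imported earlier in the tutorial:

_ = sampler.sample_train_density(
    theta=sampling.random_morphing_points(1000, [("gaussian", 0.0, 15.0), ("gaussian", 0.0, 15.0)]),
    sample_only_from_closest_benchmark=True,
    n_samples=100000,
    folder="./data/samples",
    filename="train_density",
)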
Code Example #9
#############################
### Configuration parsing ###
#############################

data_dir = f'{output_dir}/data'

with open(inputs_file) as f:
    inputs = yaml.safe_load(f)

methods = inputs['methods']

#############################
#### Instantiate Sampler ####
#############################

sampler = SampleAugmenter(data_file, include_nuisance_parameters=nuisance)

#############################
## Create training samples ##
#############################

# Different methods have different arguments
train_ratio_methods = {'alice', 'alices', 'cascal', 'carl', 'rolr', 'rascal'}
train_local_methods = {'sally', 'sallino'}
train_global_methods = {'scandal'}

# Iterate through the methods
for method in methods:
    logger.info(f'Sampling from method: {method}')
    training_params = inputs[method]
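    # The snippet ends here; a sketch of how the method sets defined above could dispatch to the
    # matching SampleAugmenter call (the thetas, priors, sample counts, and file layout are
    # placeholders, and `sampling` is assumed to be the madminer.sampling module):
    if method in train_ratio_methods:
        sampler.sample_train_ratio(
            theta0=sampling.random_morphing_points(100, [('flat', -10.0, 10.0)]),
            theta1=sampling.benchmark('sm'),
            n_samples=10000,
            folder=f'{data_dir}/samples/{method}',
            filename='train',
        )
    elif method in train_local_methods:
        sampler.sample_train_local(
            theta=sampling.benchmark('sm'),
            n_samples=10000,
            folder=f'{data_dir}/samples/{method}',
            filename='train',
        )
    elif method in train_global_methods:
        sampler.sample_train_density(
            theta=sampling.random_morphing_points(100, [('flat', -10.0, 10.0)]),
            n_samples=10000,
            folder=f'{data_dir}/samples/{method}',
            filename='train',
        )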
Code Example #10
# Output of all other modules (e.g. matplotlib)
for key in logging.Logger.manager.loggerDict:
    if "madminer" not in key:
        logging.getLogger(key).setLevel(logging.WARNING)

# ## 1. Make (unweighted) training and test samples with augmented data

# At this point, we have all the information we need from the simulations. But the data is not quite ready to be used for machine learning. The `madminer.sampling` class `SampleAugmenter` will take care of the remaining book-keeping steps before we can train our estimators:
#
# First, it unweights the samples, i.e. for a given parameter vector `theta` (or a distribution `p(theta)`) it picks events `x` such that their distribution follows `p(x|theta)`. The selected samples will all come from the event file we have so far, but their frequency is changed -- some events will appear multiple times, some will disappear.
#
# Second, `SampleAugmenter` calculates all the augmented data ("gold") that is the key to our new inference methods. Depending on the specific technique, these are the joint likelihood ratio and / or the joint score. It saves all these pieces of information for the selected events in a set of numpy files that can easily be used in any machine learning framework.

# In[3]:

sampler = SampleAugmenter('data/delphes_data_shuffled.h5')

# The `SampleAugmenter` class defines six different high-level functions to generate train or test samples:
# - `sample_train_plain()`, which only saves observations x, for instance for histograms or ABC;
# - `sample_train_local()` for methods like SALLY and SALLINO, which will be demonstrated in the second part of the tutorial;
# - `sample_train_density()` for neural density estimation techniques like MAF or SCANDAL;
# - `sample_train_ratio()` for techniques like CARL, ROLR, CASCAL, and RASCAL, when only theta0 is parameterized;
# - `sample_train_more_ratios()` for the same techniques, but with both theta0 and theta1 parameterized;
# - `sample_test()` for the evaluation of any method.
#
# For the arguments `theta`, `theta0`, or `theta1`, you can (and should!) use the helper functions `benchmark()`, `benchmarks()`, `morphing_point()`, `morphing_points()`, and `random_morphing_points()`, all defined in the `madminer.sampling` module.
#
# Here we'll train a likelihood ratio estimator with the ALICES method, so we focus on the `sample_train_ratio()` function. We'll sample the numerator hypothesis in the likelihood ratio with 1000 points drawn from a Gaussian prior, and fix the denominator hypothesis to the SM.
#
# Note the keyword `sample_only_from_closest_benchmark=True`, which makes sure that for each parameter point we only use the events that were originally (in MG) generated from the closest benchmark. This reduces the statistical fluctuations in the outcome quite a bit.
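# A sketch of the matching evaluation sample, using `sample_test()` from the list above -- the
# benchmark name, sample count, and file names are placeholders:

_ = sampler.sample_test(
    theta=sampling.benchmark("sm"),
    n_samples=1000,
    folder="./data/samples",
    filename="test",
)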
Code Example #11
File: tth_object.py Project: luclepot/ttH_CP
 def init(self):
     if self.use_parton_level:
         self.sa = SampleAugmenter(
             'data/madminer_example_shuffled_parton.h5')
     else:
         self.sa = SampleAugmenter('data/madminer_example_shuffled_reco.h5')
Code Example #12
File: tth_object.py Project: luclepot/ttH_CP
class sampling(tth_util):
    def __init__(self, use_parton_level=True, n_samples=100000):
        self.use_parton_level = use_parton_level
        self.n_samples = n_samples

    def run(self):
        self._hprint("Starting sampling run")
        self._tprint("", "Initializing")
        self.init()
        self._tprint("", "Training ratio")
        self.train_ratio()
        self._tprint("", "Extracting samples", "")
        self.extract_samples()
        self._hprint("Finished sampling run")
        # self.plot_distributions()

    def init(self):
        if self.use_parton_level:
            self.sa = SampleAugmenter(
                'data/madminer_example_shuffled_parton.h5')
        else:
            self.sa = SampleAugmenter('data/madminer_example_shuffled_reco.h5')

    def train_ratio(self, n_theta_samples=100):
        self.x, self.theta0, self.theta1, self.y, self.r_xz, self.t_xz = self.sa.extract_samples_train_ratio(
            theta0=random_morphing_thetas(n_theta_samples, [('flat', 0., 1.)]),
            theta1=constant_benchmark_theta('sm'),
            n_samples=self.n_samples,
            folder='./data/samples',
            filename='train1')

    def extract_samples(self):
        self.x, self.theta = self.sa.extract_samples_test(
            theta=constant_benchmark_theta('sm'),
            n_samples=self.n_samples,
            folder='./data/samples',
            filename='test')

        self.x_bsm, self.theta_bsm = self.sa.extract_samples_test(
            theta=constant_benchmark_theta('w'),
            n_samples=self.n_samples,
            folder='./data/samples',
            filename='test_bsm')

        self.x_bsm_morph, self.theta_bsm_morph = self.sa.extract_samples_test(
            theta=constant_benchmark_theta('morphing_basis_vector_2'),
            n_samples=self.n_samples,
            folder='./data/samples',
            filename='test_bsm_morph')

    def plot_distributions(self):
        if self.use_parton_level:  # parton level analysis
            labels = [r'$\Delta\eta_{t,\bar{t}}$', r'$p_{T, x0}$ [GeV]']
            ranges = [(-8., 8.), (0., 600.)]
            bins = (25, 25)
        else:
            labels = [
                r'$\Delta \phi_{\gamma \gamma}$', r'$p_{T, \gamma \gamma}$'
            ]
            bins = (25, 25)
            ranges = [(-3.15, 3.15), (0., 600.)]

        fig = corner.corner(self.x_bsm_morph,
                            color='C2',
                            labels=labels,
                            range=ranges,
                            bins=bins)
        _ = corner.corner(self.x_bsm,
                          color='C1',
                          labels=labels,
                          range=ranges,
                          bins=bins,
                          fig=fig)
        _ = corner.corner(self.x,
                          color='C0',
                          labels=labels,
                          range=ranges,
                          bins=bins,
                          fig=fig)
        fig.show()
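# Note: the class above uses an older MadMiner API (`extract_samples_*`, `random_morphing_thetas`,
# `constant_benchmark_theta`). In more recent MadMiner releases the equivalent call is roughly
# `sample_train_ratio()` with the `madminer.sampling` helpers -- a sketch, assuming the same
# shuffled parton-level file and the default sample count used above:
from madminer import sampling
from madminer.sampling import SampleAugmenter

sa = SampleAugmenter('data/madminer_example_shuffled_parton.h5')
_ = sa.sample_train_ratio(
    theta0=sampling.random_morphing_points(100, [('flat', 0., 1.)]),
    theta1=sampling.benchmark('sm'),
    n_samples=100000,
    folder='./data/samples',
    filename='train1',
)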
Code Example #13
def generate_test_data_ratio(method):
    # get number of parameters
    hf = h5py.File(h5_file, 'r')
    parameters = len(hf['parameters']['names'])
    sa = SampleAugmenter(h5_file, include_nuisance_parameters=False)

    if (len(inputs['evaluation'][str(method)]) == 1):  #only one theta

        theta_sampling = inputs['evaluation'][str(
            method)]['theta']['sampling_method']
        theta = inputs['evaluation'][str(method)]['theta']
        if (theta_sampling != 'random_morphing_points'):

            x, theta, y, r_xz, t_xz, n_effective = sa.sample_test(
                theta=eval(theta_sampling)(theta['argument']),
                n_samples=inputs['n_samples']['test'],
                folder='/home/test/' + method + '/',
                filename='test',
                switch_train_test_events=True)

        else:

            prior = []
            for p in range(parameters):
                this_tuple = theta['prior']['parameter_' + str(p)]
                prior.append((str(this_tuple['prior_shape']),
                              float(this_tuple['prior_param_0']),
                              float(this_tuple['prior_param_1'])))

            x, theta, y, r_xz, t_xz, n_effective = sa.sample_test(
                theta=eval(theta_sampling)(theta['n_thetas'], prior),
                n_samples=inputs['n_samples']['test'],
                folder='/home/test/' + method + '/',
                filename='test',
                switch_train_test_events=True,
            )

    elif (len(inputs['evaluation'][str(method)]) == 2):  #two thetas

        theta0_sampling = inputs['evaluation'][str(method)]['theta_0'][
            'sampling_method']  #sampling method for theta0
        theta1_sampling = inputs['evaluation'][str(method)]['theta_1'][
            'sampling_method']  #sampling method for theta1
        theta_0 = inputs['evaluation'][str(method)][
            'theta_0']  #parameters for theta0 sampling
        theta_1 = inputs['evaluation'][str(method)][
            'theta_1']  #parameters for theta1 sampling

        if (theta0_sampling == 'random_morphing_points'
                and theta1_sampling != 'random_morphing_points'):

            prior = []
            for p in range(parameters):
                this_tuple = theta_0['prior']['parameter_' + str(p)]
                prior.append((str(this_tuple['prior_shape']),
                              float(this_tuple['prior_param_0']),
                              float(this_tuple['prior_param_1'])))

            x, th0, th1, y, r_xz, t_xz = sa.sample_train_ratio(
                theta0=eval(theta0_sampling)(theta_0['n_thetas'], prior),
                theta1=eval(theta1_sampling)(theta_1['argument']),
                n_samples=inputs['n_samples']['test'],
                folder='/home/test/' + method + '/',
                filename='test',
                switch_train_test_events=True,
            )

        elif (theta1_sampling == 'random_morphing_points'
              and theta0_sampling != 'random_morphing_points'):
            tuple_0 = theta_1['prior']['parameter_0']  #tuple for parameter 0
            tuple_1 = theta_1['prior']['parameter_1']  #tuple for parameter 1
            prior = [ (str(tuple_0['prior_shape']), float(tuple_0['prior_param_0']), float(tuple_0['prior_param_1'])), \
                      (str(tuple_1['prior_shape']), float(tuple_1['prior_param_0']), float(tuple_1['prior_param_1']))  ]

            x, theta0, theta1, y, r_xz, t_xz = sa.sample_train_ratio(
                theta0=eval(theta0_sampling)(theta_0['argument']),
                theta1=eval(theta1_sampling)(theta_1['n_thetas'], prior),
                n_samples=inputs['n_samples']['test'],
                folder='/home/test/' + method + '/',
                filename='test',
                switch_train_test_events=True,
            )

        elif (theta0_sampling == 'random_morphing_points'
              and theta1_sampling == 'random_morphing_points'):
            tuple0_0 = theta_0['prior']['parameter_0']  #tuple for parameter 0
            tuple0_1 = theta_0['prior']['parameter_1']  #tuple for parameter 1
            prior0 = [ (str(tuple0_0['prior_shape']), float(tuple0_0['prior_param_0']), float(tuple0_0['prior_param_1'])), \
                               (str(tuple0_1['prior_shape']), float(tuple0_1['prior_param_0']), float(tuple0_1['prior_param_1']))  ]

            tuple1_0 = theta_1['prior']['parameter_0']  #tuple for parameter 0
            tuple1_1 = theta_1['prior']['parameter_1']  #tuple for parameter 1
            prior1 = [ (str(tuple1_0['prior_shape']), float(tuple1_0['prior_param_0']), float(tuple1_0['prior_param_1'])), \
                           (str(tuple1_1['prior_shape']), float(tuple1_1['prior_param_0']), float(tuple1_1['prior_param_1']))  ]

            x, theta0, theta1, y, r_xz, t_xz = sa.sample_train_ratio(
                theta0=eval(theta0_sampling)(theta_0['n_thetas'], prior0),
                theta1=eval(theta1_sampling)(theta_1['n_thetas'], prior1),
                n_samples=inputs['n_samples']['test'],
                folder='/home/test/' + method + '/',
                filename='test',
                switch_train_test_events=True,
            )

        else:
            x, theta0, theta1, y, r_xz, t_xz, n_effective = sa.sample_train_ratio(
                theta0=eval(theta0_sampling)(theta_0['argument']),
                theta1=eval(theta1_sampling)(theta_1['argument']),
                n_samples=inputs['n_samples']['test'],
                folder='/home/test/' + method + '/',
                filename='test',
                switch_train_test_events=True)
Code Example #14
    #################### RATES & GRID

    theta_grid, p_values_expected_xsec, best_fit_expected_xsec = limits.expected_limits(
        theta_true=theta_true,
        theta_ranges=theta_ranges,
        mode="rate",
        include_xsec=True,
        resolutions=resolutions,
        luminosity=uselumi)

    np.save('/home/rates/grid.npy', theta_grid)
    np.save('/home/rates/rate.npy',
            [p_values_expected_xsec, best_fit_expected_xsec])

    sa_rates = SampleAugmenter(h5_file, include_nuisance_parameters=False)
    xs_grid = []
    neff_grid = []
    n_test = 10000

    for theta_element in theta_grid:
        _, xs, _ = sa_rates.cross_sections(
            theta=sampling.morphing_point(theta_element))
        _, _, neff = sa_rates.sample_train_plain(
            theta=sampling.morphing_point(theta_element), n_samples=n_test)
        xs_grid.append(xs)
        neff_grid.append(neff / float(n_test))
    neff_grid = np.array(neff_grid)
    xs_grid = np.array(xs_grid)

    np.save('/home/rates/neff_grid.npy', neff_grid)
Code Example #15
    grid_resolutions=grid_resolutions,
    luminosity=lumi,
    include_xsec=False,
)

p_values["SCANDAL"] = p_values_expected_scandal
mle["SCANDAL"] = best_fit_expected_scandal

# ## 6. Toy signal

# In addition to these expected limits (based on the SM), let us inject a mock signal. We first generate the data:

# In[ ]:

#sampler = SampleAugmenter('data/lhe_data_shuffled.h5')
sampler = SampleAugmenter('data/delphes_data_shuffled.h5')
sc = 1.  #1./16.52
x_observed, _, _ = sampler.sample_test(
    #theta=sampling.morphing_point([5.,1.]),
    theta=sampling.morphing_point([15.2 * sc, 0.1 * sc]),
    n_samples=1000,
    #n_samples=100000,
    folder=None,
    filename=None,
)

# In[ ]:

_, p_values_observed, best_fit_observed, _, _, _ = limits.observed_limits(
    x_observed=x_observed,
    mode="ml",
Code Example #16
File: plotting.py Project: dlvp/madminer
def plot_distributions(
    filename,
    observables=None,
    parameter_points=None,
    uncertainties="nuisance",
    nuisance_parameters=None,
    draw_nuisance_toys=None,
    normalize=False,
    log=False,
    observable_labels=None,
    n_bins=50,
    line_labels=None,
    colors=None,
    linestyles=None,
    linewidths=1.5,
    toy_linewidths=0.5,
    alpha=0.15,
    toy_alpha=0.75,
    n_events=None,
    n_toys=100,
    n_cols=3,
):
    """
    Plots one-dimensional histograms of observables in a MadMiner file for a given set of benchmarks.

    Parameters
    ----------
    filename : str
        Filename of a MadMiner HDF5 file.

    observables : list of str or None, optional
        Which observables to plot, given by a list of their names. If None, all observables in the file
        are plotted. Default value: None.

    parameter_points : list of (str or ndarray) or None, optional
        Which parameter points to use for histogramming the data. Given by a list, each element can either be the name
        of a benchmark in the MadMiner file, or an ndarray specifying any parameter point in a morphing setup. If None,
        all physics (non-nuisance) benchmarks defined in the MadMiner file are plotted. Default value: None.

    uncertainties : {"nuisance", "none"}, optional
        Defines how uncertainty bands are drawn. With "nuisance", the variation in cross section from all nuisance
        parameters is added in quadrature. With "none", no error bands are drawn.

    nuisance_parameters : None or list of int, optional
        If uncertainties is "nuisance", this can restrict which nuisance parameters are used to draw the uncertainty
        bands. Each entry of this list is the index of one nuisance parameter (same order as in the MadMiner file).

    draw_nuisance_toys : None or int, optional
        If not None and uncertainties is "nuisance", sets the number of nuisance toy distributions that are drawn
        (in addition to the error bands).

    normalize : bool, optional
        Whether the distribution is normalized to the total cross section. Default value: False.

    log : bool, optional
        Whether to draw the y axes on a logarithmic scale. Default value: False.

    observable_labels : None or list of (str or None), optional
        x-axis labels naming the observables. If None, the observable names from the MadMiner file are used. Default
        value: None.

    n_bins : int, optional
        Number of histogram bins. Default value: 50.

    line_labels : None or list of (str or None), optional
        Labels for the different parameter points. If None and if parameter_points is None, the benchmark names from
        the MadMiner file are used. Default value: None.

    colors : None or str or list of str, optional
        Matplotlib line (and error band) colors for the distributions. If None, uses default colors. Default value:
        None.

    linestyles : None or str or list of str, optional
        Matplotlib line styles for the distributions. If None, uses default linestyles. Default value: None.

    linewidths : float or list of float, optional
        Line widths for the contours. Default value: 1.5.

    toy_linewidths : float or list of float or None, optional
        Line widths for the toy replicas, if uncertainties is "nuisance" and draw_nuisance_toys is not None. If None,
        linewidths is used. Default value: 0.5.

    alpha : float, optional
        alpha value for the uncertainty bands. Default value: 0.15.

    toy_alpha : float, optional
        alpha value for the toy replicas, if uncertainties is "nuisance" and draw_nuisance_toys is not None. Default
        value: 0.75.

    n_events : None or int, optional
        If not None, sets the number of events from the MadMiner file that will be analyzed and plotted. Default value:
        None.

    n_toys : int, optional
        Number of toy nuisance parameter vectors used to estimate the systematic uncertainties. Default value: 100.

    n_cols : int, optional
        Number of columns of subfigures in the plot. Default value: 3.

    Returns
    -------
    figure : Figure
        Plot as Matplotlib Figure instance.

    """

    # Load data
    sa = SampleAugmenter(filename, include_nuisance_parameters=True)
    if uncertainties == "nuisance":
        nuisance_morpher = NuisanceMorpher(
            sa.nuisance_parameters,
            list(sa.benchmarks.keys()),
            reference_benchmark=sa.reference_benchmark)

    # Default settings
    if parameter_points is None:
        parameter_points = []

        for key, is_nuisance in zip(sa.benchmarks, sa.benchmark_is_nuisance):
            if not is_nuisance:
                parameter_points.append(key)

        if line_labels is None:
            line_labels = parameter_points

    n_parameter_points = len(parameter_points)

    if colors is None:
        colors = ["C" + str(i)
                  for i in range(10)] * (n_parameter_points // 10 + 1)
    elif not isinstance(colors, list):
        colors = [colors for _ in range(n_parameter_points)]

    if linestyles is None:
        linestyles = ["solid", "dashed", "dotted", "dashdot"
                      ] * (n_parameter_points // 4 + 1)
    elif not isinstance(linestyles, list):
        linestyles = [linestyles for _ in range(n_parameter_points)]

    if not isinstance(linewidths, list):
        linewidths = [linewidths for _ in range(n_parameter_points)]

    if toy_linewidths is None:
        toy_linewidths = linewidths
    if not isinstance(toy_linewidths, list):
        toy_linewidths = [toy_linewidths for _ in range(n_parameter_points)]

    # Observables
    observable_indices = []
    if observables is None:
        observable_indices = list(range(len(sa.observables)))
    else:
        all_observables = list(sa.observables.keys())
        for obs in observables:
            try:
                observable_indices.append(all_observables.index(str(obs)))
            except ValueError:
                logging.warning("Ignoring unknown observable %s", obs)

    logger.debug("Observable indices: %s", observable_indices)

    n_observables = len(observable_indices)

    if observable_labels is None:
        all_observables = list(sa.observables.keys())
        observable_labels = [
            all_observables[obs] for obs in observable_indices
        ]

    # Get event data (observations and weights)
    x, weights_benchmarks = sa.extract_raw_data()
    logger.debug("Loaded raw data with shapes %s, %s", x.shape,
                 weights_benchmarks.shape)

    # Remove negative weights
    sane_event_filter = np.all(weights_benchmarks >= 0.0, axis=1)

    n_events_before = weights_benchmarks.shape[0]
    x = x[sane_event_filter]
    weights_benchmarks = weights_benchmarks[sane_event_filter]
    n_events_removed = n_events_before - weights_benchmarks.shape[0]

    if int(np.sum(sane_event_filter, dtype=int)) < len(sane_event_filter):
        logger.warning("Removed %s / %s events with negative weights",
                       n_events_removed, n_events_before)

    # Shuffle events
    x, weights_benchmarks = shuffle(x, weights_benchmarks)

    # Only analyze n_events
    if n_events is not None and n_events < x.shape[0]:
        logger.debug("Only analyzing first %s / %s events", n_events,
                     x.shape[0])

        x = x[:n_events]
        weights_benchmarks = weights_benchmarks[:n_events]

    if uncertainties != "nuisance":
        n_toys = 0

    n_nuisance_toys_drawn = 0
    if draw_nuisance_toys is not None:
        n_nuisance_toys_drawn = draw_nuisance_toys

    theta_matrices = []
    for theta in parameter_points:
        if isinstance(theta, six.string_types):
            matrix = get_theta_benchmark_matrix("benchmark", theta,
                                                sa.benchmarks)
        else:
            matrix = get_theta_benchmark_matrix("morphing", theta,
                                                sa.benchmarks, sa.morpher)
        theta_matrices.append(matrix)

    logger.debug("Calculated %s theta matrices", len(theta_matrices))

    # Nuisance parameters
    nuisance_toy_factors = []

    if uncertainties == "nuisance":
        n_nuisance_params = sa.n_nuisance_parameters

        if not n_nuisance_params > 0:
            raise RuntimeError(
                "Cannot draw systematic uncertainties -- no nuisance parameters found!"
            )

        logger.debug("Drawing nuisance toys")

        nuisance_toys = np.random.normal(loc=0.0,
                                         scale=1.0,
                                         size=n_nuisance_params * n_toys)
        nuisance_toys = nuisance_toys.reshape(n_toys, n_nuisance_params)

        # Restrict nuisance parameters
        if nuisance_parameters is not None:
            for i in range(n_nuisance_params):
                if i not in nuisance_parameters:
                    nuisance_toys[:, i] = 0.0

        logger.debug("Drew %s toy values for nuisance parameters",
                     n_toys * n_nuisance_params)

        nuisance_toy_factors = np.array([
            nuisance_morpher.calculate_nuisance_factors(
                nuisance_toy, weights_benchmarks)
            for nuisance_toy in nuisance_toys
        ])  # Shape (n_toys, n_events)

        nuisance_toy_factors = sanitize_array(nuisance_toy_factors,
                                              min_value=1.0e-2,
                                              max_value=100.0)
        # Shape (n_toys, n_events)

    # Preparing plot
    n_rows = (n_observables + n_cols - 1) // n_cols
    n_events_for_range = 10000 if n_events is None else min(10000, n_events)

    fig = plt.figure(figsize=(4.0 * n_cols, 4.0 * n_rows))

    for i_panel, (i_obs, xlabel) in enumerate(
            zip(observable_indices, observable_labels)):
        logger.debug("Plotting panel %s: observable %s, label %s", i_panel,
                     i_obs, xlabel)

        # Figure out x range
        xmins, xmaxs = [], []
        for theta_matrix in theta_matrices:
            x_small = x[:n_events_for_range]
            weights_small = mdot(theta_matrix,
                                 weights_benchmarks[:n_events_for_range])

            xmin = weighted_quantile(x_small[:, i_obs], 0.05, weights_small)
            xmax = weighted_quantile(x_small[:, i_obs], 0.95, weights_small)
            xwidth = xmax - xmin
            xmin -= xwidth * 0.1
            xmax += xwidth * 0.1

            xmin = max(xmin, np.min(x[:, i_obs]))
            xmax = min(xmax, np.max(x[:, i_obs]))

            xmins.append(xmin)
            xmaxs.append(xmax)

        xmin = min(xmins)
        xmax = max(xmaxs)
        x_range = (xmin, xmax)

        logger.debug("Ranges for observable %s: min = %s, max = %s", xlabel,
                     xmins, xmaxs)

        # Subfigure
        ax = plt.subplot(n_rows, n_cols, i_panel + 1)

        # Calculate histograms
        bin_edges = None
        histos = []
        histos_up = []
        histos_down = []
        histos_toys = []

        for i_theta, theta_matrix in enumerate(theta_matrices):
            theta_weights = mdot(theta_matrix,
                                 weights_benchmarks)  # Shape (n_events,)

            histo, bin_edges = np.histogram(x[:, i_obs],
                                            bins=n_bins,
                                            range=x_range,
                                            weights=theta_weights,
                                            density=normalize)
            histos.append(histo)

            if uncertainties == "nuisance":
                histos_toys_this_theta = []
                for i_toy, nuisance_toy_factors_this_toy in enumerate(
                        nuisance_toy_factors):
                    toy_histo, _ = np.histogram(
                        x[:, i_obs],
                        bins=n_bins,
                        range=x_range,
                        weights=theta_weights * nuisance_toy_factors_this_toy,
                        density=normalize,
                    )
                    histos_toys_this_theta.append(toy_histo)

                histos_up.append(
                    np.percentile(histos_toys_this_theta, 84.0, axis=0))
                histos_down.append(
                    np.percentile(histos_toys_this_theta, 16.0, axis=0))
                histos_toys.append(
                    histos_toys_this_theta[:n_nuisance_toys_drawn])

        # Draw error bands
        if uncertainties == "nuisance":
            for histo_up, histo_down, lw, color, label, ls in zip(
                    histos_up, histos_down, linewidths, colors, line_labels,
                    linestyles):
                bin_edges_ = np.repeat(bin_edges, 2)[1:-1]
                histo_down_ = np.repeat(histo_down, 2)
                histo_up_ = np.repeat(histo_up, 2)

                plt.fill_between(bin_edges_,
                                 histo_down_,
                                 histo_up_,
                                 facecolor=color,
                                 edgecolor="none",
                                 alpha=alpha)

            # Draw some toys
            for histo_toys, lw, color, ls in zip(histos_toys, toy_linewidths,
                                                 colors, linestyles):
                for k in range(n_nuisance_toys_drawn):
                    bin_edges_ = np.repeat(bin_edges, 2)[1:-1]
                    histo_ = np.repeat(histo_toys[k], 2)

                    plt.plot(bin_edges_,
                             histo_,
                             color=color,
                             alpha=toy_alpha,
                             lw=lw,
                             ls=ls)

        # Draw central lines
        for histo, lw, color, label, ls in zip(histos, linewidths, colors,
                                               line_labels, linestyles):
            bin_edges_ = np.repeat(bin_edges, 2)[1:-1]
            histo_ = np.repeat(histo, 2)

            plt.plot(bin_edges_,
                     histo_,
                     color=color,
                     lw=lw,
                     ls=ls,
                     label=label,
                     alpha=1.0)

        plt.legend()

        plt.xlabel(xlabel)
        if normalize:
            plt.ylabel("Normalized distribution")
        else:
            plt.ylabel(r"$\frac{d\sigma}{dx}$ [pb / bin]")

        plt.xlim(x_range[0], x_range[1])
        if log:
            ax.set_yscale("log", nonpositive="clip")
        else:
            plt.ylim(0.0, None)

    plt.tight_layout()

    return fig
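# A minimal usage sketch -- the file path, observable name, and benchmark name are placeholders
# and must exist in your MadMiner file:
fig = plot_distributions(
    filename="data/delphes_data_shuffled.h5",
    observables=["pt_j1"],
    parameter_points=["sm"],
    uncertainties="none",
    normalize=True,
)
fig.savefig("distributions.png")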
Code Example #17
# Output of all other modules (e.g. matplotlib)
for key in logging.Logger.manager.loggerDict:
    if "madminer" not in key:
        logging.getLogger(key).setLevel(logging.WARNING)

# ## 1. Make (unweighted) training and test samples with augmented data

# At this point, we have all the information we need from the simulations. But the data is not quite ready to be used for machine learning. The `madminer.sampling` class `SampleAugmenter` will take care of the remaining book-keeping steps before we can train our estimators:
#
# First, it unweights the samples, i.e. for a given parameter vector `theta` (or a distribution `p(theta)`) it picks events `x` such that their distribution follows `p(x|theta)`. The selected samples will all come from the event file we have so far, but their frequency is changed -- some events will appear multiple times, some will disappear.
#
# Second, `SampleAugmenter` calculates all the augmented data ("gold") that is the key to our new inference methods. Depending on the specific technique, these are the joint likelihood ratio and / or the joint score. It saves all these pieces of information for the selected events in a set of numpy files that can easily be used in any machine learning framework.

# In[3]:

sampler = SampleAugmenter('data/delphes_data_shuffled.h5')

# The relevant `SampleAugmenter` function for local score estimators is `sample_train_local()`. As in part 3a of the tutorial, for the argument `theta` you can use the helper functions `sampling.benchmark()`, `sampling.benchmarks()`, `sampling.morphing_point()`, `sampling.morphing_points()`, and `sampling.random_morphing_points()`.

# In[4]:

x, theta, t_xz, _ = sampler.sample_train_local(
    theta=sampling.benchmark('sm'),
    #n_samples=4 * 10**5, #100000,
    n_samples=2 * 10**6,  # fewer than others
    folder='./data/samples',
    filename='train_score')

# We can use the same data as in part 3a, so you only have to execute this if you haven't gone through tutorial 3a:

# In[5]:
Code Example #18
methods = [str(method) for method in methods]

test_split = float(inputs['test_split'])  #training-test split

# get number of parameters
hf = h5py.File(h5_file, 'r')
parameters = len(hf['parameters']['names'])

#to shuffle or not to shuffle
if (inputs['shuffle']):
    h5shuffle_file = '/home/data/madminer_example_shuffled.h5'

    combine_and_shuffle([h5_file], h5shuffle_file)

    sampler = SampleAugmenter(h5shuffle_file,
                              include_nuisance_parameters=nuisance
                              )  #'data/madminer_example_shuffled.h5'

else:
    sampler = SampleAugmenter(h5_file, include_nuisance_parameters=nuisance)

for method in methods:
    print('sampling from method ', method)

    for i in range(n_trainsamples):

        # creates training samples

        # different methods have different arguments
        # TRAIN RATIO
Code Example #19
File: plotting.py Project: gitter-badger/madminer
def plot_uncertainty(
    filename,
    theta,
    observable,
    obs_label,
    obs_range,
    n_bins=50,
    nuisance_parameters=None,
    n_events=None,
    n_toys=100,
    linecolor="black",
    bandcolor1="#CC002E",
    bandcolor2="orange",
    ratio_range=(0.8, 1.2),
):
    """
    Plots absolute and relative uncertainty bands in a histogram of one observable in a MadMiner file.

    Parameters
    ----------
    filename : str
        Filename of a MadMiner HDF5 file.

    theta : ndarray
        Parameter point to use for histogramming the data.

    observable : str
        Which observable to plot, given by its name in the MadMiner file.

    obs_label : str
        x-axis label naming the observable.

    obs_range : tuple of two float
        Range to be plotted for the observable.

    n_bins : int
        Number of bins. Default value: 50.

    nuisance_parameters : None or list of int, optional
        This can restrict which nuisance parameters are used to draw the uncertainty
        bands. Each entry of this list is the index of one nuisance parameter (same order as in the MadMiner file).

    n_events : None or int, optional
        If not None, sets the number of events from the MadMiner file that will be analyzed and plotted. Default value:
        None.

    n_toys : int, optional
        Number of toy nuisance parameter vectors used to estimate the systematic uncertainties. Default value: 100.

    linecolor : str, optional
        Line color for central prediction. Default value: "black".

    bandcolor1 : str, optional
        Error band color for 1 sigma uncertainty. Default value: "#CC002E".

    bandcolor2 : str, optional
        Error band color for 2 sigma uncertainty. Default value: "orange".

    ratio_range : tuple of two float
        y-axis range for the plots of the ratio to the central prediction. Default value: (0.8, 1.2).

    Returns
    -------
    figure : Figure
        Plot as Matplotlib Figure instance.

    """

    # Load data
    sa = SampleAugmenter(filename, include_nuisance_parameters=True)
    nuisance_morpher = NuisanceMorpher(
        sa.nuisance_parameters, list(sa.benchmarks.keys()), reference_benchmark=sa.reference_benchmark
    )

    # Observable index
    obs_idx = list(sa.observables.keys()).index(observable)

    # Get event data (observations and weights)
    x, weights_benchmarks = sa.weighted_events()
    x = x[:, obs_idx]

    # Theta matrix
    theta_matrix = sa._get_theta_benchmark_matrix(theta)
    weights = mdot(theta_matrix, weights_benchmarks)

    # Remove negative weights
    x = x[weights >= 0.0]
    weights_benchmarks = weights_benchmarks[weights >= 0.0]
    weights = weights[weights >= 0.0]

    # Shuffle events
    x, weights, weights_benchmarks = shuffle(x, weights, weights_benchmarks)

    # Only analyze n_events
    if n_events is not None and n_events < x.shape[0]:
        x = x[:n_events]
        weights_benchmarks = weights_benchmarks[:n_events]
        weights = weights[:n_events]

    # Nuisance parameters
    n_nuisance_params = sa.n_nuisance_parameters

    nuisance_toys = np.random.normal(loc=0.0, scale=1.0, size=n_nuisance_params * n_toys)
    nuisance_toys = nuisance_toys.reshape(n_toys, n_nuisance_params)

    # Restrict nuisance parameters
    if nuisance_parameters is not None:
        for i in range(n_nuisance_params):
            if i not in nuisance_parameters:
                nuisance_toys[:, i] = 0.0

    nuisance_toy_factors = np.array(
        [
            nuisance_morpher.calculate_nuisance_factors(nuisance_toy, weights_benchmarks)
            for nuisance_toy in nuisance_toys
        ]
    )  # Shape (n_toys, n_events)

    nuisance_toy_factors = sanitize_array(nuisance_toy_factors, min_value=1.0e-2, max_value=100.0)
    # Shape (n_toys, n_events)

    # Calculate histogram for central prediction, not normalized
    histo, bin_edges = np.histogram(x, bins=n_bins, range=obs_range, weights=weights, density=False)

    # Calculate toy histograms, not normalized
    histos_toys_this_theta = []
    for i_toy, nuisance_toy_factors_this_toy in enumerate(nuisance_toy_factors):
        toy_histo, _ = np.histogram(
            x, bins=n_bins, range=obs_range, weights=weights * nuisance_toy_factors_this_toy, density=False
        )
        histos_toys_this_theta.append(toy_histo)

    histo_plus2sigma = np.percentile(histos_toys_this_theta, 97.5, axis=0)
    histo_plus1sigma = np.percentile(histos_toys_this_theta, 84.0, axis=0)
    histo_minus1sigma = np.percentile(histos_toys_this_theta, 16.0, axis=0)
    histo_minus2sigma = np.percentile(histos_toys_this_theta, 2.5, axis=0)

    # Calculate histogram for central prediction,  normalized
    histo_norm, bin_edges_norm = np.histogram(x, bins=n_bins, range=obs_range, weights=weights, density=True)

    # Calculate toy histograms, normalized
    histos_toys_this_theta = []
    for i_toy, nuisance_toy_factors_this_toy in enumerate(nuisance_toy_factors):
        toy_histo, _ = np.histogram(
            x, bins=n_bins, range=obs_range, weights=weights * nuisance_toy_factors_this_toy, density=True
        )
        histos_toys_this_theta.append(toy_histo)

    histo_plus2sigma_norm = np.percentile(histos_toys_this_theta, 97.5, axis=0)
    histo_plus1sigma_norm = np.percentile(histos_toys_this_theta, 84.0, axis=0)
    histo_minus1sigma_norm = np.percentile(histos_toys_this_theta, 16.0, axis=0)
    histo_minus2sigma_norm = np.percentile(histos_toys_this_theta, 2.5, axis=0)

    # Prepare plotting
    def plot_mc(edges, histo_central, histo_m2, histo_m1, histo_p1, histo_p2, relative=False):
        bin_edges_ = np.repeat(edges, 2)[1:-1]
        histo_ = np.repeat(histo_central, 2)
        histo_m2_ = np.repeat(histo_m2, 2)
        histo_m1_ = np.repeat(histo_m1, 2)
        histo_p1_ = np.repeat(histo_p1, 2)
        histo_p2_ = np.repeat(histo_p2, 2)

        if relative:
            histo_m2_ /= histo_
            histo_m1_ /= histo_
            histo_p1_ /= histo_
            histo_p2_ /= histo_
            histo_ /= histo_

        plt.fill_between(bin_edges_, histo_m2_, histo_p2_, facecolor=bandcolor2, edgecolor="none")
        plt.fill_between(bin_edges_, histo_m1_, histo_p1_, facecolor=bandcolor1, edgecolor="none")
        plt.plot(bin_edges_, histo_, color=linecolor, lw=1.5, ls="-")

    # Make plot
    fig = plt.figure(figsize=(10, 7))
    gs = gridspec.GridSpec(2, 2, height_ratios=[2, 1])

    # MC, absolute residuals
    ax = plt.subplot(gs[2])
    plot_mc(bin_edges, histo, histo_minus2sigma, histo_minus1sigma, histo_plus1sigma, histo_plus2sigma, relative=True)
    plt.xlabel(obs_label)
    plt.ylabel(r"Relative to central pred.")
    plt.xlim(obs_range[0], obs_range[1])
    plt.ylim(ratio_range[0], ratio_range[1])

    # MC, absolute
    ax = plt.subplot(gs[0], sharex=ax)
    plot_mc(bin_edges, histo, histo_minus2sigma, histo_minus1sigma, histo_plus1sigma, histo_plus2sigma)
    plt.ylabel(r"Differential cross section [pb/bin]")
    plt.ylim(0.0, None)
    plt.setp(ax.get_xticklabels(), visible=False)

    # MC, relative residuals
    ax = plt.subplot(gs[3])
    plot_mc(
        bin_edges_norm,
        histo_norm,
        histo_minus2sigma_norm,
        histo_minus1sigma_norm,
        histo_plus1sigma_norm,
        histo_plus2sigma_norm,
        relative=True,
    )
    plt.xlabel(obs_label)
    plt.ylabel(r"Relative to central pred.")
    plt.xlim(obs_range[0], obs_range[1])
    plt.ylim(ratio_range[0], ratio_range[1])

    # MC, relative
    ax = plt.subplot(gs[1], sharex=ax)
    plot_mc(
        bin_edges_norm,
        histo_norm,
        histo_minus2sigma_norm,
        histo_minus1sigma_norm,
        histo_plus1sigma_norm,
        histo_plus2sigma_norm,
    )
    plt.ylabel(r"Normalized distribution")
    plt.ylim(0.0, None)
    plt.setp(ax.get_xticklabels(), visible=False)

    # Return
    plt.tight_layout()
    return fig
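# A minimal usage sketch -- the file path, parameter point, and observable name are placeholders
# and must match the contents of your MadMiner file:
import numpy as np

fig = plot_uncertainty(
    filename="data/delphes_data_shuffled.h5",
    theta=np.array([0.0, 0.0]),
    observable="pt_j1",
    obs_label=r"$p_{T,j1}$ [GeV]",
    obs_range=(0.0, 600.0),
)
fig.savefig("uncertainty_bands.png")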
Code Example #20
    if "madminer" not in key:
        logging.getLogger(key).setLevel(logging.WARNING)


# ## 1. Make (unweighted) training and test samples with augmented data

# At this point, we have all the information we need from the simulations. But the data is not quite ready to be used for machine learning. The `madminer.sampling` class `SampleAugmenter` will take care of the remaining book-keeping steps before we can train our estimators:
#
# First, it unweights the samples, i.e. for a given parameter vector `theta` (or a distribution `p(theta)`) it picks events `x` such that their distribution follows `p(x|theta)`. The selected samples will all come from the event file we have so far, but their frequency is changed -- some events will appear multiple times, some will disappear.
#
# Second, `SampleAugmenter` calculates all the augmented data ("gold") that is the key to our new inference methods. Depending on the specific technique, these are the joint likelihood ratio and / or the joint score. It saves all these pieces of information for the selected events in a set of numpy files that can easily be used in any machine learning framework.

# In[3]:


sampler = SampleAugmenter('data/delphes_data_shuffled.h5')


# The `SampleAugmenter` class defines six different high-level functions to generate train or test samples:
# - `sample_train_plain()`, which only saves observations x, for instance for histograms or ABC;
# - `sample_train_local()` for methods like SALLY and SALLINO, which will be demonstrated in the second part of the tutorial;
# - `sample_train_density()` for neural density estimation techniques like MAF or SCANDAL;
# - `sample_train_ratio()` for techniques like CARL, ROLR, CASCAL, and RASCAL, when only theta0 is parameterized;
# - `sample_train_more_ratios()` for the same techniques, but with both theta0 and theta1 parameterized;
# - `sample_test()` for the evaluation of any method.
#
# For the arguments `theta`, `theta0`, or `theta1`, you can (and should!) use the helper functions `benchmark()`, `benchmarks()`, `morphing_point()`, `morphing_points()`, and `random_morphing_points()`, all defined in the `madminer.sampling` module.
#
# Here we'll train a likelihood estimator with the SCANDAL method, so we focus on the `sample_train_density()` function. We'll sample 1000 training parameter points from a Gaussian prior.
#
# Note the keyword `sample_only_from_closest_benchmark=True`, which makes sure that for each parameter point we only use the events that were originally (in MG) generated from the closest benchmark. This reduces the statistical fluctuations in the outcome quite a bit.