Esempio n. 1
0
    def __init__(self,
                 filename,
                 disable_morphing=False,
                 include_nuisance_parameters=True):
        """
        Reads the MadMiner settings from `filename` and sets up the morphing helpers.

        Parameters
        ----------
        filename : str
            Path to the MadMiner file; it is passed on to `load_madminer_settings`.

        disable_morphing : bool, optional
            If True, no `PhysicsMorpher` is created, even if the file provides a morphing matrix and morphing
            components. Default value: False.

        include_nuisance_parameters : bool, optional
            If False, nuisance parameters found in the file are discarded. Default value: True.
        """
        # Remember where the data lives and what the caller asked for
        self.madminer_filename = filename
        self.include_nuisance_parameters = include_nuisance_parameters

        # Read the settings stored in the file
        logger.info("Loading data from %s", filename)
        settings = load_madminer_settings(
            filename, include_nuisance_benchmarks=include_nuisance_parameters)
        (
            self.parameters,
            self.benchmarks,
            self.benchmark_is_nuisance,
            self.morphing_components,
            self.morphing_matrix,
            self.observables,
            self.n_samples,
            _,  # eighth entry of the settings tuple is not used here
            self.reference_benchmark,
            self.nuisance_parameters,
            self.n_events_generated_per_benchmark,
            self.n_events_backgrounds,
        ) = settings

        # Derived counts
        self.n_parameters = len(self.parameters)
        self.n_benchmarks = len(self.benchmarks)
        is_physical = np.logical_not(self.benchmark_is_nuisance)
        self.n_benchmarks_phys = np.sum(is_physical)
        if self.observables is None:
            self.n_observables = 0
        else:
            self.n_observables = len(self.observables)

        # Nuisance parameters are dropped when absent or not requested
        if self.nuisance_parameters is None or not include_nuisance_parameters:
            self.nuisance_parameters = None
            self.n_nuisance_parameters = 0
        else:
            self.n_nuisance_parameters = len(self.nuisance_parameters)

        # Morphing (only with a complete setup and when not disabled)
        self.morpher = None
        has_morphing_setup = (self.morphing_matrix is not None
                              and self.morphing_components is not None)
        if has_morphing_setup and not disable_morphing:
            self.morpher = PhysicsMorpher(self.parameters)
            self.morpher.set_components(self.morphing_components)
            self.morpher.set_basis(self.benchmarks,
                                   morphing_matrix=self.morphing_matrix)

        # Nuisance morphing
        self.nuisance_morpher = None
        if self.nuisance_parameters is None:
            self.include_nuisance_parameters = False
        else:
            benchmark_names = list(self.benchmarks.keys())
            self.nuisance_morpher = NuisanceMorpher(self.nuisance_parameters,
                                                    benchmark_names,
                                                    self.reference_benchmark)

        # Sanity check of the event counts, then log a summary
        self._check_n_events()
        self._report_setup()
Esempio n. 2
0
class DataAnalyzer(object):
    """
    Collects common functionality that is used when analysing data in the MadMiner file.

    Parameters
    ----------
    filename : str
        Path to MadMiner file (for instance the output of `madminer.delphes.DelphesProcessor.save()`).

    disable_morphing : bool, optional
        If True, the morphing setup is not loaded from the file. Default value: False.

    include_nuisance_parameters : bool, optional
        If True, nuisance parameters are taken into account. Default value: True.

    """
    def __init__(self,
                 filename,
                 disable_morphing=False,
                 include_nuisance_parameters=True):
        """
        Loads the settings stored in the MadMiner file and prepares the morphing objects.

        The arguments are stored / forwarded as follows: `filename` is kept as `self.madminer_filename` and passed
        to `load_madminer_settings`; `disable_morphing` suppresses the creation of a `PhysicsMorpher`;
        `include_nuisance_parameters` decides whether nuisance parameters from the file are kept.
        """
        # Save setup
        self.madminer_filename = filename
        self.include_nuisance_parameters = include_nuisance_parameters

        # Load data
        logger.info("Loading data from %s", filename)
        loaded = load_madminer_settings(
            filename, include_nuisance_benchmarks=include_nuisance_parameters)
        (
            self.parameters,
            self.benchmarks,
            self.benchmark_is_nuisance,
            self.morphing_components,
            self.morphing_matrix,
            self.observables,
            self.n_samples,
            _,  # this entry of the loaded tuple is deliberately discarded
            self.reference_benchmark,
            self.nuisance_parameters,
            self.n_events_generated_per_benchmark,
            self.n_events_backgrounds,
        ) = loaded

        # Sizes of the loaded collections
        self.n_parameters = len(self.parameters)
        self.n_benchmarks = len(self.benchmarks)
        physical_mask = np.logical_not(self.benchmark_is_nuisance)
        self.n_benchmarks_phys = np.sum(physical_mask)
        if self.observables is None:
            self.n_observables = 0
        else:
            self.n_observables = len(self.observables)

        # Keep nuisance parameters only if they exist and were requested
        keep_nuisance = (self.nuisance_parameters is not None
                         and include_nuisance_parameters)
        if keep_nuisance:
            self.n_nuisance_parameters = len(self.nuisance_parameters)
        else:
            self.n_nuisance_parameters = 0
            self.nuisance_parameters = None

        # Morphing
        self.morpher = None
        morphing_available = (self.morphing_matrix is not None
                              and self.morphing_components is not None)
        if morphing_available and not disable_morphing:
            self.morpher = PhysicsMorpher(self.parameters)
            self.morpher.set_components(self.morphing_components)
            self.morpher.set_basis(self.benchmarks,
                                   morphing_matrix=self.morphing_matrix)

        # Nuisance morphing
        self.nuisance_morpher = None
        if self.nuisance_parameters is None:
            self.include_nuisance_parameters = False
        else:
            names = list(self.benchmarks.keys())
            self.nuisance_morpher = NuisanceMorpher(self.nuisance_parameters,
                                                    names,
                                                    self.reference_benchmark)

        # Check event numbers and report what was found
        self._check_n_events()
        self._report_setup()

    def event_loader(
        self,
        start=0,
        end=None,
        batch_size=100000,
        include_nuisance_parameters=None,
        generated_close_to=None,
        return_sampling_ids=False,
    ):
        """
        Generator over batches of events in the MadMiner file.

        This is a thin wrapper around `madminer_event_loader` that fills in the file name, the nuisance-benchmark
        mask, the sampling benchmark, and the sampling factors from this object.

        Parameters
        ----------
        start : int, optional
            First event index to load

        end : int or None, optional
            Last event index to load

        batch_size : int, optional
            Batch size

        include_nuisance_parameters : bool or None, optional
            Whether nuisance parameter benchmarks are included in the returned data. If None, the value of
            `self.include_nuisance_parameters` is used.

        generated_close_to : None or ndarray, optional
            If None, this function yields all events. Otherwise, it yields just the events that were generated
            at the closest benchmark point to a given parameter point.

        return_sampling_ids : bool, optional
            If True, the iterator returns the sampling IDs in addition to observables and weights.

        Yields
        ------
        observations : ndarray
            Event data

        weights : ndarray
            Event weights

        sampling_ids : int
            Sampling IDs (benchmark used for sampling for signal events, -1 for background events). Only returned if
            return_sampling_ids = True was set.

        """
        # Fall back to the instance-wide setting when the caller does not decide
        if include_nuisance_parameters is None:
            use_nuisance = self.include_nuisance_parameters
        else:
            use_nuisance = include_nuisance_parameters

        closest_benchmark = self._find_closest_benchmark(generated_close_to)
        logger.debug("Sampling benchmark closest to %s: %s",
                     generated_close_to, closest_benchmark)

        # With a selected sampling benchmark all factors are one; otherwise they are calculated
        if closest_benchmark is not None:
            factors = np.ones(self.n_benchmarks_phys + 1)
        else:
            factors = self._calculate_sampling_factors()
        logger.debug("Sampling factors: %s", factors)

        batches = madminer_event_loader(
            self.madminer_filename,
            start,
            end,
            batch_size,
            use_nuisance,
            benchmark_is_nuisance=self.benchmark_is_nuisance,
            sampling_benchmark=closest_benchmark,
            sampling_factors=factors,
            return_sampling_ids=return_sampling_ids,
        )
        for batch in batches:
            yield batch

    def weighted_events(
        self,
        theta=None,
        nu=None,
        start_event=None,
        end_event=None,
        derivative=False,
        generated_close_to=None,
        n_draws=None,
    ):
        """
        Returns all events together with the benchmark weights (if theta is None) or weights for a given theta.

        Parameters
        ----------
        theta : None or ndarray or str, optional
            If None, the function returns all benchmark weights. If str, the function returns the weights for a given
            benchmark name. If ndarray, it uses morphing to calculate the weights for this value of theta. Default
            value: None.

        nu : None or ndarray, optional
            If None, the nuisance parameters are set to their nominal values. Otherwise, and if theta is an ndarray,
            sets the values of the nuisance parameters.

        start_event : int
            Index (in the MadMiner file) of the first event to consider.

        end_event : int
            Index (in the MadMiner file) of the last unweighted event to consider.

        derivative : bool, optional
            If True and if theta is not None, the derivative of the weights with respect to theta are returned. Default
            value: False.

        generated_close_to : None or int, optional
            Only returns benchmarks generated from this benchmark (and background events). Default value: None.

        n_draws : None or int, optional
            If not None, returns only this number of events, drawn randomly.

        Returns
        -------
        x : ndarray
            Observables with shape `(n_unweighted_samples, n_observables)`.

        weights : ndarray
            If theta is None and derivative is False, benchmark weights with shape
            `(n_unweighted_samples, n_benchmarks)` in pb. If theta is not None and derivative is True, the gradient of
            the weight for the given parameter with respect to theta with shape `(n_unweighted_samples, n_gradients)`
            in pb. Otherwise, weights for the given parameter theta with shape `(n_unweighted_samples,)` in pb.

        """

        x, weights_benchmarks = next(
            self.event_loader(batch_size=None,
                              start=start_event,
                              end=end_event,
                              generated_close_to=generated_close_to))

        # Pick events randomly
        n_events = len(x)
        if n_draws is not None and n_draws < n_events:
            idx = np.random.choice(n_events, n_draws, replace=False)
            x = x[idx]
            weights_benchmarks = weights_benchmarks[idx]
        elif n_draws is not None:
            logger.warning("Requested %s events, but only %s available",
                           n_draws, n_events)

        # Process and return appropriate weights
        if theta is None:
            return x, weights_benchmarks
        elif isinstance(theta, six.string_types):
            i_benchmark = list(self.benchmarks.keys()).index(theta)
            return x, weights_benchmarks[:, i_benchmark]
        elif derivative:
            dtheta_matrix = self._get_dtheta_benchmark_matrix(theta)
            gradients_theta = mdot(
                dtheta_matrix, weights_benchmarks)  # (n_gradients, n_samples)
            gradients_theta = gradients_theta.T
            return x, gradients_theta
        else:
            # TODO: nuisance params
            if nu is not None:
                raise NotImplementedError
            theta_matrix = self._get_theta_benchmark_matrix(theta)
            weights_theta = mdot(theta_matrix, weights_benchmarks)
            return x, weights_theta

    def xsecs(
        self,
        thetas=None,
        nus=None,
        events="all",
        test_split=0.2,
        include_nuisance_benchmarks=True,
        batch_size=100000,
        generated_close_to=None,
    ):
        """
        Returns the total cross sections for benchmarks or parameter points.

        Parameters
        ----------
        thetas : None or list of (ndarray or str), optional
            If None, the function returns all benchmark cross sections. Otherwise, it returns the cross sections for a
            series of parameter points that are either given by their benchmark name (as a str), their benchmark index
            (as an int), or their parameter value (as an ndarray, using morphing). Default value: None.

        nus : None or list of (None or ndarray), optional
             If None, the nuisance parameters are set to their nominal values (0), i.e. no systematics are taken into
             account. Otherwise, the list has to have the same number of elements as thetas, and each entry can specify
             nuisance parameters at nominal value (None) or a value of the nuisance parameters (ndarray).

        include_nuisance_benchmarks : bool, optional
            Whether to include nuisance benchmarks if thetas is None. Default value: True.

        test_split : float, optional
            Fraction of events reserved for testing. Default value: 0.2.

        events : {"train", "test", "all"}, optional
            Which events to use. Default: "all".

        batch_size : int, optional
            Size of the batches of events that are loaded into memory at the same time. Default value: 100000.

        generated_close_to : None or ndarray, optional
            If not None, only events originally generated from the closest benchmark to this parameter point will be
            used. Default value : None.

        Returns
        -------
        xsecs : ndarray
            Calculated cross sections in pb.

        xsec_uncertainties : ndarray
            Cross-section uncertainties in pb. Basically calculated as sum(weights**2)**0.5.
        """

        logger.debug("Calculating cross sections for thetas = %s and nus = %s",
                     thetas, nus)

        # Inputs
        if thetas is not None:
            include_nuisance_benchmarks = True

        if thetas is not None:
            if nus is None:
                nus = [None for _ in thetas]
            assert len(nus) == len(
                thetas), "Numbers of thetas and nus don't match!"

        # Which events to use
        if events == "all":
            start_event, end_event = None, None
            correction_factor = 1.0
        elif events == "train":
            start_event, end_event, correction_factor = self._train_test_split(
                True, test_split)
        elif events == "test":
            start_event, end_event, correction_factor = self._train_test_split(
                False, test_split)
        else:
            raise ValueError(
                "Events has to be either 'all', 'train', or 'test', but got {}!"
                .format(events))

        # Theta matrices (translation of benchmarks to theta, at nominal nuisance params)
        theta_matrices = [
            self._get_theta_benchmark_matrix(theta) for theta in thetas
        ]
        theta_matrices = np.asarray(
            theta_matrices)  # Shape (n_thetas, n_benchmarks)

        # Loop over events
        xsecs = 0.0
        xsec_uncertainties = 0.0
        n_events = 0

        for i_batch, (_, benchmark_weights) in enumerate(
                self.event_loader(
                    start=start_event,
                    end=end_event,
                    include_nuisance_parameters=include_nuisance_benchmarks,
                    batch_size=batch_size,
                    generated_close_to=generated_close_to,
                )):
            n_batch, _ = benchmark_weights.shape
            n_events += n_batch

            # Benchmark xsecs
            if thetas is None:
                xsecs += np.sum(benchmark_weights, axis=0)
                xsec_uncertainties += np.sum(benchmark_weights *
                                             benchmark_weights,
                                             axis=0)

            # xsecs at given parameters(theta, nu)
            else:
                # Weights at nominal nuisance params (nu=0)
                weights_nom = mdot(
                    theta_matrices,
                    benchmark_weights)  # Shape (n_thetas, n_batch)
                weights_sq_nom = mdot(theta_matrices, benchmark_weights *
                                      benchmark_weights)  # same

                # Effect of nuisance parameters
                nuisance_factors = self._calculate_nuisance_factors(
                    nus, benchmark_weights)
                weights = nuisance_factors * weights_nom
                weights_sq = nuisance_factors * weights_sq_nom

                # Sum up
                xsecs += np.sum(weights, axis=1)
                xsec_uncertainties += np.sum(weights_sq, axis=1)

        if n_events == 0:
            raise RuntimeError(
                "Did not find events with test_split = %s and generated_close_to = %s",
                test_split, generated_close_to)

        xsec_uncertainties = np.maximum(xsec_uncertainties, 0.0)**0.5

        # Correct for not using all events
        xsecs *= correction_factor
        xsec_uncertainties *= correction_factor

        logger.debug("xsecs and uncertainties [pb]:")
        for this_xsec, this_uncertainty in zip(xsecs, xsec_uncertainties):
            logger.debug("  (%4f +/- %4f) pb (%4f %%)", this_xsec,
                         this_uncertainty, 100 * this_uncertainty / this_xsec)

        return xsecs, xsec_uncertainties

    def xsec_gradients(
        self,
        thetas,
        nus=None,
        events="all",
        test_split=0.2,
        gradients="all",
        batch_size=100000,
        generated_close_to=None,
    ):
        """
        Returns the gradient of total cross sections with respect to parameters.

        Parameters
        ----------
        thetas : list of (ndarray or str)
            Parameter points at which the gradients are evaluated. Each entry is passed to
            `_get_theta_benchmark_matrix()` and `_get_dtheta_benchmark_matrix()`.

        nus : None or list of (None or ndarray), optional
             If None, the nuisance parameters are set to their nominal values (0), i.e. no systematics are taken into
             account. Otherwise, the list has to have the same number of elements as thetas, and each entry can specify
             nuisance parameters at nominal value (None) or a value of the nuisance parameters (ndarray).

        events : {"train", "test", "all"}, optional
            Which events to use. Default: "all".

        test_split : float, optional
            Fraction of events reserved for testing. Default value: 0.2.

        gradients : {"all", "theta", "nu"}, optional
            Which gradients to calculate. If "all" is requested but no nuisance parameters are set up
            (`self.n_nuisance_parameters == 0`), only the theta gradients are calculated, in line with
            `_weight_gradients()`. Default value: "all".

        batch_size : int, optional
            Size of the batches of events that are loaded into memory at the same time. Default value: 100000.

        generated_close_to : None or ndarray, optional
            If not None, only events originally generated from the closest benchmark to this parameter point will be
            used. Default value : None.

        Returns
        -------
        xsecs_gradients : ndarray
            Calculated cross section gradients in pb. The per-event gradients are summed over the event axis, so
            going by the shape comments in the code the result has shape (n_thetas, n_gradients).

        Raises
        ------
        RuntimeError
            If gradients is not one of "all", "theta", or "nu".

        ValueError
            If events is not one of "all", "train", or "test".
        """

        logger.debug(
            "Calculating cross section gradients for thetas = %s and nus = %s",
            thetas, nus)

        # Inputs
        nus_given = nus is not None
        if nus is None:
            nus = [None for _ in thetas]
        assert len(nus) == len(
            thetas), "Numbers of thetas and nus don't match!"
        if gradients not in ["all", "theta", "nu"]:
            raise RuntimeError(
                "Gradients has to be 'all', 'theta', or 'nu', but got {}".
                format(gradients))

        # Without nuisance parameters there is no nuisance morpher, so "all" reduces to the theta gradients
        # (same fallback as in _weight_gradients)
        if gradients == "all" and self.n_nuisance_parameters == 0:
            gradients = "theta"
        calculate_theta = gradients in ["all", "theta"]
        calculate_nu = gradients in ["all", "nu"]
        include_nuisance_benchmarks = nus_given or calculate_nu

        # Which events to use
        if events == "all":
            start_event, end_event = None, None
            correction_factor = 1.0
        elif events == "train":
            start_event, end_event, correction_factor = self._train_test_split(
                True, test_split)
        elif events == "test":
            start_event, end_event, correction_factor = self._train_test_split(
                False, test_split)
        else:
            raise ValueError(
                "Events has to be either 'all', 'train', or 'test', but got {}!"
                .format(events))

        # Theta matrices (translation of benchmarks to theta, at nominal nuisance params)
        theta_matrices = np.asarray([
            self._get_theta_benchmark_matrix(theta) for theta in thetas
        ])  # shape (n_thetas, n_benchmarks)
        theta_gradient_matrices = np.asarray([
            self._get_dtheta_benchmark_matrix(theta) for theta in thetas
        ])  # shape (n_thetas, n_gradients, n_benchmarks)

        # Loop over events
        xsec_gradients = 0.0

        for i_batch, (_, benchmark_weights) in enumerate(
                self.event_loader(
                    start=start_event,
                    end=end_event,
                    include_nuisance_parameters=include_nuisance_benchmarks,
                    batch_size=batch_size,
                    generated_close_to=generated_close_to,
                )):
            n_batch, _ = benchmark_weights.shape
            logger.debug("Batch %s with %s events", i_batch + 1, n_batch)

            gradient_parts = []

            if calculate_theta:
                nom_gradients = mdot(
                    theta_gradient_matrices, benchmark_weights
                )  # Shape (n_thetas, n_phys_gradients, n_batch)
                nuisance_factors = self._calculate_nuisance_factors(
                    nus, benchmark_weights)  # Shape (n_thetas, n_batch), or the scalar 1.0
                if np.ndim(nuisance_factors) == 0:
                    # All nuisance parameters at nominal values: nothing to rescale
                    dweight_dtheta = nom_gradients
                else:
                    dweight_dtheta = nuisance_factors[:, np.
                                                      newaxis, :] * nom_gradients
                gradient_parts.append(dweight_dtheta)

            if calculate_nu:
                weights_nom = mdot(
                    theta_matrices,
                    benchmark_weights)  # Shape (n_thetas, n_batch)
                nuisance_factor_gradients = np.asarray([
                    self.nuisance_morpher.calculate_nuisance_factor_gradients(
                        nu, benchmark_weights) for nu in nus
                ])  # Shape (n_thetas, n_nuisance_gradients, n_batch)
                dweight_dnu = nuisance_factor_gradients * weights_nom[:, np.
                                                                      newaxis, :]
                gradient_parts.append(dweight_dnu)

            # Theta gradients come first, nuisance gradients second (along the gradient axis)
            if len(gradient_parts) == 1:
                dweight_dall = gradient_parts[0]
            else:
                dweight_dall = np.concatenate(gradient_parts, 1)
            xsec_gradients += np.sum(dweight_dall, axis=2)

        # Correct for not using all events
        xsec_gradients *= correction_factor

        return xsec_gradients

    def _check_n_events(self):
        if self.n_events_generated_per_benchmark is None:
            return

        n_events_check = sum(self.n_events_generated_per_benchmark)
        if self.n_events_backgrounds is not None:
            n_events_check += self.n_events_backgrounds

        if self.n_samples != n_events_check:
            logger.warning(
                "Inconsistent event numbers in HDF5 file! Please recalculate them by calling "
                "combine_and_shuffle(recalculate_header=True).")

    def _report_setup(self):
        """
        Writes a summary of the loaded setup (parameters, nuisance parameters, benchmarks, observables, events,
        morphing) to the log.

        Side effect: `self.include_nuisance_parameters` is set to False when no nuisance parameters are present.
        """
        # Parameters
        logger.info("Found %s parameters", self.n_parameters)
        for name, spec in six.iteritems(self.parameters):
            logger.debug(
                "   %s (LHA: %s %s, maximal power in squared ME: %s, range: %s)",
                name,
                spec[0],
                spec[1],
                spec[2],
                spec[3],
            )

        # Nuisance parameters
        if self.nuisance_parameters is None:
            logger.info("Did not find nuisance parameters")
            self.include_nuisance_parameters = False
        else:
            logger.info("Found %s nuisance parameters",
                        self.n_nuisance_parameters)
            for name, spec in six.iteritems(self.nuisance_parameters):
                logger.debug("   %s (%s)", name, spec)

        # Benchmarks
        logger.info("Found %s benchmarks, of which %s physical",
                    self.n_benchmarks, self.n_benchmarks_phys)
        benchmark_items = six.iteritems(self.benchmarks)
        for (name, point), nuisance_flag in zip(benchmark_items,
                                                self.benchmark_is_nuisance):
            if not nuisance_flag:
                logger.debug("   %s: %s", name, format_benchmark(point))
            else:
                logger.debug("   %s: systematics", name)

        # Observables
        logger.info("Found %s observables", self.n_observables)
        if self.observables is not None:
            for index, observable in enumerate(self.observables):
                logger.debug("  %2.2s %s", index, observable)

        # Events
        logger.info("Found %s events", self.n_samples)
        if self.n_events_generated_per_benchmark is None:
            logger.debug("  Did not find sample summary information")
        else:
            for n_generated, benchmark_name in zip(
                    self.n_events_generated_per_benchmark,
                    six.iterkeys(self.benchmarks)):
                if n_generated > 0:
                    logger.info("  %s signal events sampled from benchmark %s",
                                n_generated, benchmark_name)
            n_background = self.n_events_backgrounds
            if n_background is not None and n_background > 0:
                logger.info("  %s background events",
                            self.n_events_backgrounds)

        # Morphing
        if self.morpher is None:
            logger.info("Did not find morphing setup.")
        else:
            logger.info("Found morphing setup with %s components",
                        len(self.morphing_components))

        # Nuisance morphing
        if self.nuisance_morpher is None:
            logger.info("Did not find nuisance morphing setup")
        else:
            logger.info("Found nuisance morphing setup")

    def _calculate_nuisance_factors(self, nus, benchmark_weights):
        if self._any_nontrivial_nus(nus):
            return np.asarray([
                self.nuisance_morpher.calculate_nuisance_factors(
                    nu, benchmark_weights) for nu in nus
            ])  # Shape (n_thetas, n_batch)
        else:
            return 1.0

    @staticmethod
    def _any_nontrivial_nus(nus):
        if nus is None:
            return False
        for nu in nus:
            if nu is not None:
                return True
        return False

    def _weights(self, thetas, nus, benchmark_weights, theta_matrices=None):
        """
        Turns benchmark weights into weights for given parameter points (theta, nu).

        Parameters
        ----------
        thetas : list of (ndarray or str)
            Parameter points. Each entry is passed to `_get_theta_benchmark_matrix()` unless `theta_matrices` is
            provided.

        nus : None or list of (None or ndarray)
             If None, the nuisance parameters are set to their nominal values (0), i.e. no systematics are taken into
             account. Otherwise, the list has to have the same number of elements as thetas, and each entry can specify
             nuisance parameters at nominal value (None) or a value of the nuisance parameters (ndarray).

        benchmark_weights : ndarray
            Benchmark weights of the events; has to be two-dimensional.

        theta_matrices : None or list of ndarray, optional
            Precomputed results of `_get_theta_benchmark_matrix()` for each theta. If None, they are calculated here.

        Returns
        -------
        weights : ndarray
            Calculated weights in pb.
        """

        # The unpacking doubles as a check that the weights are two-dimensional
        _n_rows, _n_columns = benchmark_weights.shape

        # Inputs
        nu_list = [None for _ in thetas] if nus is None else nus
        assert len(nu_list) == len(
            thetas), "Numbers of thetas and nus don't match!"

        # Theta matrices (translation of benchmarks to theta, at nominal nuisance params)
        if theta_matrices is None:
            theta_matrices = [
                self._get_theta_benchmark_matrix(theta) for theta in thetas
            ]
        benchmark_to_theta = np.asarray(
            theta_matrices)  # Shape (n_thetas, n_benchmarks)

        # Weights at nominal nuisance params (nu=0), shape (n_thetas, n_batch)
        nominal_weights = mdot(benchmark_to_theta, benchmark_weights)

        # Rescale with the effect of the nuisance parameters
        factors = self._calculate_nuisance_factors(nu_list, benchmark_weights)
        return factors * nominal_weights

    def _weight_gradients(self,
                          thetas,
                          nus,
                          benchmark_weights,
                          gradients="all",
                          theta_matrices=None,
                          theta_gradient_matrices=None):
        """
        Turns benchmark weights into gradients of the weights at given parameter points (theta, nu).

        Parameters
        ----------
        thetas : list of (ndarray or str)
            Parameter points. Each entry is passed to `_get_theta_benchmark_matrix()` and
            `_get_dtheta_benchmark_matrix()` unless the corresponding matrices are provided.

        nus : None or list of (None or ndarray)
             If None, the nuisance parameters are set to their nominal values (0), i.e. no systematics are taken into
             account. Otherwise, the list has to have the same number of elements as thetas, and each entry can specify
             nuisance parameters at nominal value (None) or a value of the nuisance parameters (ndarray).

        benchmark_weights : ndarray
            Benchmark weights of the events; has to be two-dimensional.

        gradients : {"all", "theta", "nu"}, optional
            Which gradients to calculate. "all" is treated as "theta" when `self.n_nuisance_parameters` is zero.
            Default value: "all".

        theta_matrices : None or list of ndarray, optional
            Precomputed results of `_get_theta_benchmark_matrix()`. If None, they are calculated here.

        theta_gradient_matrices : None or list of ndarray, optional
            Precomputed results of `_get_dtheta_benchmark_matrix()`. If None, they are calculated here.

        Returns
        -------
        gradients : ndarray
            Calculated gradients in pb. For "all", theta gradients and nuisance gradients are concatenated along
            axis 1.
        """

        # The unpacking doubles as a check that the weights are two-dimensional
        _n_rows, _n_columns = benchmark_weights.shape

        # Inputs
        mode = gradients
        if mode == "all" and self.n_nuisance_parameters == 0:
            mode = "theta"
        nu_list = [None for _ in thetas] if nus is None else nus
        assert len(nu_list) == len(
            thetas), "Numbers of thetas and nus don't match!"

        # Theta matrices (translation of benchmarks to theta, at nominal nuisance params)
        if theta_matrices is None:
            theta_matrices = [
                self._get_theta_benchmark_matrix(theta) for theta in thetas
            ]
        if theta_gradient_matrices is None:
            theta_gradient_matrices = [
                self._get_dtheta_benchmark_matrix(theta) for theta in thetas
            ]
        benchmark_to_theta = np.asarray(
            theta_matrices)  # Shape (n_thetas, n_benchmarks)
        benchmark_to_dtheta = np.asarray(
            theta_gradient_matrices
        )  # Shape (n_thetas, n_gradients, n_benchmarks)

        # Gradient with respect to theta
        theta_part = None
        if mode in ("all", "theta"):
            nominal_gradients = mdot(
                benchmark_to_dtheta,
                benchmark_weights)  # (n_thetas, n_phys_gradients, n_batch)
            factors = self._calculate_nuisance_factors(nu_list,
                                                       benchmark_weights)
            try:
                theta_part = factors[:, np.newaxis, :] * nominal_gradients
            except TypeError:
                # factors is the plain float 1.0 (no nuisance parameters set) and cannot be indexed
                theta_part = nominal_gradients

        # Gradient with respect to nu
        nu_part = None
        if mode in ("all", "nu"):
            nominal_weights = mdot(
                benchmark_to_theta,
                benchmark_weights)  # Shape (n_thetas, n_batch)
            factor_gradients = np.asarray([
                self.nuisance_morpher.calculate_nuisance_factor_gradients(
                    nu, benchmark_weights) for nu in nu_list
            ])  # Shape (n_thetas, n_nuisance_gradients, n_batch)
            nu_part = factor_gradients * nominal_weights[:, np.newaxis, :]

        if mode == "theta":
            return theta_part
        if mode == "nu":
            return nu_part
        return np.concatenate((theta_part, nu_part), 1)

    def _train_test_split(self, train, test_split):
        """
        Returns the start and end event for train samples (train = True) or test samples (train = False).

        Parameters
        ----------
        train : bool
            True if training data is generated, False if test data is generated.

        test_split : float or None
            Fraction of events reserved for testing. If None, or not strictly between 0 and 1, no split is
            performed and all events are used.

        Returns
        -------
        start_event : int
            Index (in the MadMiner file) of the first event to consider.

        end_event : int or None
            Index (in the MadMiner file) of the last unweighted event to consider. None if there is no upper
            limit.

        correction_factor : float
            Factor with which the weights and cross sections will have to be multiplied to make up for the missing
            events.

        Raises
        ------
        ValueError
            If the event index that separates the two samples lies outside of [0, n_samples].

        """
        # Without a usable split fraction, train and test samples both cover all events
        if test_split is None or test_split <= 0.0 or test_split >= 1.0:
            return 0, None, 1.0

        n_train = int(round((1.0 - test_split) * self.n_samples, 0))

        if train:
            start_event, end_event = 0, n_train
            boundary = end_event
            correction_factor = 1.0 / (1.0 - test_split)
        else:
            start_event, end_event = n_train + 1, None
            boundary = start_event
            correction_factor = 1.0 / test_split

        if boundary < 0 or boundary > self.n_samples:
            # The message has to be formatted explicitly: extra positional arguments to an exception are only
            # stored in its args tuple, they are not interpolated into the first argument.
            raise ValueError(
                "Irregular train / test split: sample {} / {}".format(boundary, self.n_samples)
            )

        return start_event, end_event, correction_factor

    def _get_theta_value(self, theta):
        """
        Translates a parameter point specification into an array of parameter values.

        A string is looked up as a benchmark name in `self.benchmarks`, an int is taken as the position of a
        benchmark in `self.benchmarks`, and anything else is converted with `np.asarray`.
        """
        if isinstance(theta, six.string_types):
            benchmark_name = theta
        elif isinstance(theta, int):
            benchmark_name = list(self.benchmarks.keys())[theta]
        else:
            # Already a numerical parameter point
            return np.asarray(theta)

        parameter_values = self.benchmarks[benchmark_name]
        return np.array([parameter_values[name] for name in parameter_values])

    def _get_nu_value(self, nu):
        """
        Returns the nuisance parameter point as an array.

        None is replaced by a vector of `self.n_nuisance_parameters` zeros; everything else is converted with
        `np.asarray`.
        """
        if nu is not None:
            return np.asarray(nu)
        return np.zeros(self.n_nuisance_parameters)

    def _get_theta_benchmark_matrix(self, theta, zero_pad=True):
        """
        Calculates vector A such that dsigma(theta) = A * dsigma_benchmarks

        Parameters
        ----------
        theta : str or int or ndarray
            Parameter point. A string is interpreted as a benchmark name, an int as the position of a benchmark
            in `self.benchmarks`, anything else is passed on to the morpher as a numerical parameter point.

        zero_pad : bool, optional
            If True, the result is embedded in a zero vector of length `self.n_benchmarks`. Default value: True.

        Returns
        -------
        theta_matrix : ndarray
            Weights of the benchmarks for this parameter point.

        Raises
        ------
        RuntimeError
            If a numerical parameter point is given, but no morphing setup is available.

        """
        if zero_pad:
            unpadded_theta_matrix = self._get_theta_benchmark_matrix(theta, zero_pad=False)
            theta_matrix = np.zeros(self.n_benchmarks)
            theta_matrix[:unpadded_theta_matrix.shape[0]] = unpadded_theta_matrix
            return theta_matrix

        if isinstance(theta, six.string_types):
            # Benchmark given by name: translate into its position and handle it like an index
            i_benchmark = list(self.benchmarks).index(theta)
            return self._get_theta_benchmark_matrix(i_benchmark)

        if isinstance(theta, int):
            # A benchmark is exactly reproduced by a unit vector
            theta_matrix = np.zeros(len(self.benchmarks))
            theta_matrix[theta] = 1.0
            return theta_matrix

        # Arbitrary parameter points need the morphing setup. Fail with a clear message (mirroring
        # _get_dtheta_benchmark_matrix) instead of an AttributeError on None.
        if self.morpher is None:
            raise RuntimeError("Cannot calculate weights for numerical parameter points without morphing")

        return self.morpher.calculate_morphing_weights(theta)

    def _get_dtheta_benchmark_matrix(self, theta, zero_pad=True):
        """
        Calculates matrix A_ij such that d dsigma(theta) / d theta_i = A_ij * dsigma (benchmark j)

        `theta` may be a benchmark name, the position of a benchmark in `self.benchmarks`, or a numerical
        parameter point. With `zero_pad=True` the columns are zero-padded up to `self.n_benchmarks`. Raises a
        RuntimeError if no morpher is set up.
        """
        if self.morpher is None:
            raise RuntimeError("Cannot calculate score without morphing")

        if zero_pad:
            core = self._get_dtheta_benchmark_matrix(theta, zero_pad=False)
            padded = np.zeros((core.shape[0], self.n_benchmarks))
            padded[:, :core.shape[1]] = core
            return padded

        given_by_name = isinstance(theta, six.string_types)
        if given_by_name or isinstance(theta, int):
            # Resolve the benchmark to its parameter values, then evaluate the gradient at that point
            name = theta if given_by_name else list(self.benchmarks.keys())[theta]
            point = np.array(list(six.itervalues(self.benchmarks[name])))
            return self._get_dtheta_benchmark_matrix(point)

        # Shape (n_parameters, n_benchmarks_phys)
        return self.morpher.calculate_morphing_weight_gradient(theta)

    def _calculate_sampling_factors(self):
        """
        Calculates the fraction of generated events per benchmark.

        Returns
        -------
        factors : ndarray
            `self.n_events_generated_per_benchmark` normalized to unit sum, with one additional entry of 1.0
            appended for the background events.

        """
        # The builtin float replaces the np.float alias, which newer NumPy releases no longer provide
        events = np.asarray(self.n_events_generated_per_benchmark, dtype=float)
        logger.debug("Events per benchmark: %s", events)

        factors = events / np.sum(events)
        return np.hstack((factors, 1.0))  # background events

    def _find_closest_benchmark(self, theta):
        """
        Finds the benchmark with the smallest Euclidean distance to a parameter point.

        Parameters
        ----------
        theta : ndarray or None
            Parameter point.

        Returns
        -------
        closest_idx : int or None
            Position of the closest benchmark in the array returned by `self._benchmark_array()`, or None if
            theta is None. If `self.n_events_generated_per_benchmark` is available, benchmarks without any
            generated events are penalized so that they are only chosen if no other benchmark exists.

        """
        if theta is None:
            return None

        benchmarks = self._benchmark_array()
        distances = [
            np.linalg.norm(benchmark - theta) for benchmark in benchmarks
        ]

        logger.debug("Distances from %s: %s", theta, distances)

        # Don't use benchmarks where we don't actually have events
        if self.n_events_generated_per_benchmark is not None:
            logger.debug("n_events_generated_per_benchmark: %s",
                         self.n_events_generated_per_benchmark)
            # np.asarray lets plain sequences work as well; the builtin float replaces the np.float alias,
            # which newer NumPy releases no longer provide
            no_events = np.asarray(self.n_events_generated_per_benchmark) == 0
            distances = distances + 1.0e9 * no_events.astype(float)

        return np.argmin(distances)

    def _benchmark_array(self):
        """Collects the parameter values of all benchmarks in one array, with one row per benchmark."""
        rows = [
            list(six.itervalues(parameter_values))
            for parameter_values in six.itervalues(self.benchmarks)
        ]
        return np.asarray(rows)
Esempio n. 3
0
def plot_distributions(
    filename,
    observables=None,
    parameter_points=None,
    uncertainties="nuisance",
    nuisance_parameters=None,
    draw_nuisance_toys=None,
    normalize=False,
    log=False,
    observable_labels=None,
    n_bins=50,
    line_labels=None,
    colors=None,
    linestyles=None,
    linewidths=1.5,
    toy_linewidths=0.5,
    alpha=0.15,
    toy_alpha=0.75,
    n_events=None,
    n_toys=100,
    n_cols=3,
    quantiles_for_range=(0.025, 0.975),
    sample_only_from_closest_benchmark=True,
):
    """
    Plots one-dimensional histograms of observables in a MadMiner file for a given set of benchmarks.

    Parameters
    ----------
    filename : str
        Filename of a MadMiner HDF5 file.

    observables : list of str or None, optional
        Which observables to plot, given by a list of their names. If None, all observables in the file
        are plotted. Unknown names are ignored with a warning. Default value: None.

    parameter_points : list of (str or ndarray) or None, optional
        Which parameter points to use for histogramming the data. Given by a list, each element can either be the name
        of a benchmark in the MadMiner file, or an ndarray specifying any parameter point in a morphing setup. If None,
        all physics (non-nuisance) benchmarks defined in the MadMiner file are plotted. Default value: None.

    uncertainties : {"nuisance", "none"}, optional
        Defines how uncertainty bands are drawn. With "nuisance", the bands span the 16th to 84th percentile of
        histograms calculated for `n_toys` randomly drawn nuisance parameter points. With "none", no error bands
        are drawn. Default value: "nuisance".

    nuisance_parameters : None or list of int, optional
        If uncertainties is "nuisance", this can restrict which nuisance parameters are used to draw the uncertainty
        bands. Each entry of this list is the index of one nuisance parameter (same order as in the MadMiner file).

    draw_nuisance_toys : None or int, optional
        If not None and uncertainties is "nuisance", sets the number of nuisance toy distributions that are drawn
        (in addition to the error bands). At most `n_toys` toy distributions are available.

    normalize : bool, optional
        Whether the distribution is normalized to the total cross section. Default value: False.

    log : bool, optional
        Whether to draw the y axes on a logarithmic scale. Default value: False.

    observable_labels : None or list of (str or None), optional
        x-axis labels naming the observables. If None, the observable names from the MadMiner file are used. Default
        value: None.

    n_bins : int, optional
        Number of histogram bins. Default value: 50.

    line_labels : None or list of (str or None), optional
        Labels for the different parameter points. If None, the entries of parameter_points are used (the benchmark
        names from the MadMiner file if parameter_points is None as well). Default value: None.

    colors : None or str or list of str, optional
        Matplotlib line (and error band) colors for the distributions. If None, uses default colors. Default value:
        None.

    linestyles : None or str or list of str, optional
        Matplotlib line styles for the distributions. If None, uses default linestyles. Default value: None.

    linewidths : float or list of float, optional
        Line widths for the contours. Default value: 1.5.

    toy_linewidths : float or list of float or None, optional
        Line widths for the toy replicas, if uncertainties is "nuisance" and draw_nuisance_toys is not None. If None,
        linewidths is used. Default value: 0.5.

    alpha : float, optional
        alpha value for the uncertainty bands. Default value: 0.15.

    toy_alpha : float, optional
        alpha value for the toy replicas, if uncertainties is "nuisance" and draw_nuisance_toys is not None. Default
        value: 0.75.

    n_events : None or int, optional
        If not None, sets the number of events from the MadMiner file that will be analyzed and plotted. Default value:
        None.

    n_toys : int, optional
        Number of toy nuisance parameter vectors used to estimate the systematic uncertainties. Default value: 100.

    n_cols : int, optional
        Number of columns of subfigures in the plot. Default value: 3.

    quantiles_for_range : tuple of two float, optional
        Tuple `(min_quantile, max_quantile)` that defines how the observable range is determined for each panel.
        Default: (0.025, 0.975).

    sample_only_from_closest_benchmark : bool, optional
        If True, the central histogram of each parameter point is filled from the events returned by
        `SampleAugmenter.weighted_events(generated_close_to=theta)` rather than from all events. The plot ranges
        and the uncertainty bands are always calculated from all events. Default value: True.

    Returns
    -------
    figure : Figure
        Plot as Matplotlib Figure instance.

    Raises
    ------
    RuntimeError
        If uncertainties is "nuisance", but the MadMiner file does not contain nuisance parameters.

    """

    # Load data
    sa = SampleAugmenter(filename, include_nuisance_parameters=True)
    if uncertainties == "nuisance":
        nuisance_morpher = NuisanceMorpher(
            sa.nuisance_parameters, list(sa.benchmarks.keys()), reference_benchmark=sa.reference_benchmark
        )

    # Default settings
    if parameter_points is None:
        parameter_points = [
            key for key, is_nuisance in zip(sa.benchmarks, sa.benchmark_is_nuisance) if not is_nuisance
        ]

    # The labels are zipped with the histograms further down, so they must never stay None -- also not when the
    # caller provides parameter points without labels
    if line_labels is None:
        line_labels = parameter_points

    n_parameter_points = len(parameter_points)

    if colors is None:
        colors = ["C" + str(i) for i in range(10)] * (n_parameter_points // 10 + 1)
    elif not isinstance(colors, list):
        colors = [colors for _ in range(n_parameter_points)]

    if linestyles is None:
        linestyles = ["solid", "dashed", "dotted", "dashdot"] * (n_parameter_points // 4 + 1)
    elif not isinstance(linestyles, list):
        linestyles = [linestyles for _ in range(n_parameter_points)]

    if not isinstance(linewidths, list):
        linewidths = [linewidths for _ in range(n_parameter_points)]

    if toy_linewidths is None:
        toy_linewidths = linewidths
    if not isinstance(toy_linewidths, list):
        toy_linewidths = [toy_linewidths for _ in range(n_parameter_points)]

    # Observables
    observable_indices = []
    if observables is None:
        observable_indices = list(range(len(sa.observables)))
    else:
        all_observables = list(sa.observables.keys())
        for obs in observables:
            try:
                observable_indices.append(all_observables.index(str(obs)))
            except ValueError:
                # Use the module logger like everywhere else in this function, not the root logger
                logger.warning("Ignoring unknown observable %s", obs)

    logger.debug("Observable indices: %s", observable_indices)

    n_observables = len(observable_indices)

    if observable_labels is None:
        all_observables = list(sa.observables.keys())
        observable_labels = [all_observables[obs] for obs in observable_indices]

    # Parse thetas
    theta_values = [sa._get_theta_value(theta) for theta in parameter_points]
    theta_matrices = [sa._get_theta_benchmark_matrix(theta) for theta in parameter_points]
    logger.debug("Calculated %s theta matrices", len(theta_matrices))

    # Get event data (observations and weights)
    all_x, all_weights_benchmarks = sa.weighted_events(generated_close_to=None)
    logger.debug("Loaded raw data with shapes %s, %s", all_x.shape, all_weights_benchmarks.shape)

    indiv_x, indiv_weights_benchmarks = [], []
    if sample_only_from_closest_benchmark:
        for theta in theta_values:
            this_x, this_weights = sa.weighted_events(generated_close_to=theta)
            indiv_x.append(this_x)
            indiv_weights_benchmarks.append(this_weights)

    # Remove negative weights
    sane_event_filter = np.all(all_weights_benchmarks >= 0.0, axis=1)

    n_events_before = all_weights_benchmarks.shape[0]
    all_x = all_x[sane_event_filter]
    all_weights_benchmarks = all_weights_benchmarks[sane_event_filter]
    n_events_removed = n_events_before - all_weights_benchmarks.shape[0]

    if n_events_removed > 0:
        logger.warning("Removed %s / %s events with negative weights", n_events_removed, n_events_before)

    for i, (x, weights) in enumerate(zip(indiv_x, indiv_weights_benchmarks)):
        sane_event_filter = np.all(weights >= 0.0, axis=1)
        indiv_x[i] = x[sane_event_filter]
        indiv_weights_benchmarks[i] = weights[sane_event_filter]

    # Shuffle events
    all_x, all_weights_benchmarks = shuffle(all_x, all_weights_benchmarks)

    for i, (x, weights) in enumerate(zip(indiv_x, indiv_weights_benchmarks)):
        indiv_x[i], indiv_weights_benchmarks[i] = shuffle(x, weights)

    # Only analyze n_events
    if n_events is not None and n_events < all_x.shape[0]:
        logger.debug("Only analyzing first %s / %s events", n_events, all_x.shape[0])

        all_x = all_x[:n_events]
        all_weights_benchmarks = all_weights_benchmarks[:n_events]

        for i, (x, weights) in enumerate(zip(indiv_x, indiv_weights_benchmarks)):
            indiv_x[i] = x[:n_events]
            indiv_weights_benchmarks[i] = weights[:n_events]

    if uncertainties != "nuisance":
        n_toys = 0

    n_nuisance_toys_drawn = 0
    if draw_nuisance_toys is not None:
        n_nuisance_toys_drawn = draw_nuisance_toys

    # Nuisance parameters
    nuisance_toy_factors = []

    if uncertainties == "nuisance":
        n_nuisance_params = sa.n_nuisance_parameters

        if not n_nuisance_params > 0:
            raise RuntimeError("Cannot draw systematic uncertainties -- no nuisance parameters found!")

        logger.debug("Drawing nuisance toys")

        nuisance_toys = np.random.normal(loc=0.0, scale=1.0, size=n_nuisance_params * n_toys)
        nuisance_toys = nuisance_toys.reshape(n_toys, n_nuisance_params)

        # Restrict nuisance parameters
        if nuisance_parameters is not None:
            for i in range(n_nuisance_params):
                if i not in nuisance_parameters:
                    nuisance_toys[:, i] = 0.0

        logger.debug("Drew %s toy values for nuisance parameters", n_toys * n_nuisance_params)

        nuisance_toy_factors = np.array(
            [
                nuisance_morpher.calculate_nuisance_factors(nuisance_toy, all_weights_benchmarks)
                for nuisance_toy in nuisance_toys
            ]
        )  # Shape (n_toys, n_events)

        nuisance_toy_factors = sanitize_array(nuisance_toy_factors, min_value=1.0e-2, max_value=100.0)
        # Shape (n_toys, n_events)

    # Preparing plot
    n_rows = (n_observables + n_cols - 1) // n_cols
    n_events_for_range = 10000 if n_events is None else min(10000, n_events)

    fig = plt.figure(figsize=(4.0 * n_cols, 4.0 * n_rows))

    for i_panel, (i_obs, xlabel) in enumerate(zip(observable_indices, observable_labels)):
        logger.debug("Plotting panel %s: observable %s, label %s", i_panel, i_obs, xlabel)

        # Figure out x range
        xmins, xmaxs = [], []
        for theta_matrix in theta_matrices:
            x_small = all_x[:n_events_for_range]
            weights_small = mdot(theta_matrix, all_weights_benchmarks[:n_events_for_range])

            xmin = weighted_quantile(x_small[:, i_obs], quantiles_for_range[0], weights_small)
            xmax = weighted_quantile(x_small[:, i_obs], quantiles_for_range[1], weights_small)
            xwidth = xmax - xmin
            xmin -= xwidth * 0.1
            xmax += xwidth * 0.1

            xmin = max(xmin, np.min(all_x[:, i_obs]))
            xmax = min(xmax, np.max(all_x[:, i_obs]))

            xmins.append(xmin)
            xmaxs.append(xmax)

        xmin = min(xmins)
        xmax = max(xmaxs)
        x_range = (xmin, xmax)

        logger.debug("Ranges for observable %s: min = %s, max = %s", xlabel, xmins, xmaxs)

        # Subfigure
        ax = plt.subplot(n_rows, n_cols, i_panel + 1)

        # Calculate histograms
        bin_edges = None
        histos = []
        histos_up = []
        histos_down = []
        histos_toys = []

        for i_theta, theta_matrix in enumerate(theta_matrices):
            theta_weights = mdot(theta_matrix, all_weights_benchmarks)  # Shape (n_events,)

            if sample_only_from_closest_benchmark:
                indiv_theta_weights = mdot(theta_matrix, indiv_weights_benchmarks[i_theta])  # Shape (n_events,)
                histo, bin_edges = np.histogram(
                    indiv_x[i_theta][:, i_obs],
                    bins=n_bins,
                    range=x_range,
                    weights=indiv_theta_weights,
                    density=normalize,
                )
            else:
                histo, bin_edges = np.histogram(
                    all_x[:, i_obs], bins=n_bins, range=x_range, weights=theta_weights, density=normalize
                )
            histos.append(histo)

            if uncertainties == "nuisance":
                histos_toys_this_theta = []
                for nuisance_toy_factors_this_toy in nuisance_toy_factors:
                    toy_histo, _ = np.histogram(
                        all_x[:, i_obs],
                        bins=n_bins,
                        range=x_range,
                        weights=theta_weights * nuisance_toy_factors_this_toy,
                        density=normalize,
                    )
                    histos_toys_this_theta.append(toy_histo)

                histos_up.append(np.percentile(histos_toys_this_theta, 84.0, axis=0))
                histos_down.append(np.percentile(histos_toys_this_theta, 16.0, axis=0))
                histos_toys.append(histos_toys_this_theta[:n_nuisance_toys_drawn])

        # Every histogram is drawn as a step function: each inner bin edge appears twice, each bin content twice
        step_edges = np.repeat(bin_edges, 2)[1:-1]

        # Draw error bands
        if uncertainties == "nuisance":
            # linewidths, line_labels and linestyles are not used for the bands, but stay in the zip so that
            # bands are drawn for exactly the same curves as the central lines below
            for histo_up, histo_down, _lw, color, _label, _ls in zip(
                histos_up, histos_down, linewidths, colors, line_labels, linestyles
            ):
                plt.fill_between(
                    step_edges,
                    np.repeat(histo_down, 2),
                    np.repeat(histo_up, 2),
                    facecolor=color,
                    edgecolor="none",
                    alpha=alpha,
                )

            # Draw some toys
            for histo_toys, lw, color, ls in zip(histos_toys, toy_linewidths, colors, linestyles):
                # Loop over the stored toys themselves: there are at most n_toys of them, even if
                # draw_nuisance_toys asks for more
                for toy_histo in histo_toys:
                    plt.plot(step_edges, np.repeat(toy_histo, 2), color=color, alpha=toy_alpha, lw=lw, ls=ls)

        # Draw central lines
        for histo, lw, color, label, ls in zip(histos, linewidths, colors, line_labels, linestyles):
            plt.plot(step_edges, np.repeat(histo, 2), color=color, lw=lw, ls=ls, label=label, alpha=1.0)

        plt.legend()

        plt.xlabel(xlabel)
        if normalize:
            plt.ylabel("Normalized distribution")
        else:
            plt.ylabel(r"$\frac{d\sigma}{dx}$ [pb / bin]")

        plt.xlim(x_range[0], x_range[1])
        if log:
            # Newer Matplotlib releases spell this keyword "nonpositive" and reject "nonposy"; older releases
            # only know "nonposy". Try the current spelling first and fall back to the old one.
            try:
                ax.set_yscale("log", nonpositive="clip")
            except (TypeError, ValueError):
                ax.set_yscale("log", nonposy="clip")
        else:
            plt.ylim(0.0, None)

    plt.tight_layout()

    return fig
Esempio n. 4
0
def plot_uncertainty(
    filename,
    theta,
    observable,
    obs_label,
    obs_range,
    n_bins=50,
    nuisance_parameters=None,
    n_events=None,
    n_toys=100,
    linecolor="black",
    bandcolor1="#CC002E",
    bandcolor2="orange",
    ratio_range=(0.8, 1.2),
):
    """
    Plots absolute and relative uncertainty bands in a histogram of one observable in a MadMiner file.

    The figure has four panels: the distribution in absolute units (top left) and normalized to unit area (top
    right), each with the ratio of the uncertainty bands to the central prediction below it.

    Parameters
    ----------
    filename : str
        Filename of a MadMiner HDF5 file.

    theta : str or int or ndarray
        Which parameter point to use for histogramming the data. Passed on to
        `SampleAugmenter._get_theta_benchmark_matrix()`.

    observable : str
        Which observable to plot, given by its name in the MadMiner file.

    obs_label : str
        x-axis label naming the observable.

    obs_range : tuple of two float
        Range to be plotted for the observable.

    n_bins : int
        Number of bins. Default value: 50.

    nuisance_parameters : None or list of int, optional
        This can restrict which nuisance parameters are used to draw the uncertainty
        bands. Each entry of this list is the index of one nuisance parameter (same order as in the MadMiner file).

    n_events : None or int, optional
        If not None, sets the number of events from the MadMiner file that will be analyzed and plotted. Default value:
        None.

    n_toys : int, optional
        Number of toy nuisance parameter vectors used to estimate the systematic uncertainties. Default value: 100.

    linecolor : str, optional
        Line color for central prediction. Default value: "black".

    bandcolor1 : str, optional
        Error band color for 1 sigma uncertainty. Default value: "#CC002E".

    bandcolor2 : str, optional
        Error band color for 2 sigma uncertainty. Default value: "orange".

    ratio_range : tuple of two float
        y-axis range for the plots of the ratio to the central prediction. Default value: (0.8, 1.2).

    Returns
    -------
    figure : Figure
        Plot as Matplotlib Figure instance.

    """

    # Load data
    sa = SampleAugmenter(filename, include_nuisance_parameters=True)
    nuisance_morpher = NuisanceMorpher(
        sa.nuisance_parameters, list(sa.benchmarks.keys()), reference_benchmark=sa.reference_benchmark
    )

    # Observable index
    obs_idx = list(sa.observables.keys()).index(observable)

    # Get event data (observations and weights)
    x, weights_benchmarks = sa.weighted_events()
    x = x[:, obs_idx]

    # Theta matrix
    theta_matrix = sa._get_theta_benchmark_matrix(theta)
    weights = mdot(theta_matrix, weights_benchmarks)

    # Remove negative weights
    positive = weights >= 0.0
    x = x[positive]
    weights_benchmarks = weights_benchmarks[positive]
    weights = weights[positive]

    # Shuffle events
    x, weights, weights_benchmarks = shuffle(x, weights, weights_benchmarks)

    # Only analyze n_events
    if n_events is not None and n_events < x.shape[0]:
        x = x[:n_events]
        weights_benchmarks = weights_benchmarks[:n_events]
        weights = weights[:n_events]

    # Nuisance parameters
    n_nuisance_params = sa.n_nuisance_parameters

    nuisance_toys = np.random.normal(loc=0.0, scale=1.0, size=n_nuisance_params * n_toys)
    nuisance_toys = nuisance_toys.reshape(n_toys, n_nuisance_params)

    # Restrict nuisance parameters
    if nuisance_parameters is not None:
        for i in range(n_nuisance_params):
            if i not in nuisance_parameters:
                nuisance_toys[:, i] = 0.0

    nuisance_toy_factors = np.array(
        [
            nuisance_morpher.calculate_nuisance_factors(nuisance_toy, weights_benchmarks)
            for nuisance_toy in nuisance_toys
        ]
    )  # Shape (n_toys, n_events)

    nuisance_toy_factors = sanitize_array(nuisance_toy_factors, min_value=1.0e-2, max_value=100.0)
    # Shape (n_toys, n_events)

    def histogram_with_bands(density):
        """Returns bin edges, the central histogram, and the -2, -1, +1, +2 sigma toy percentiles."""
        central, edges = np.histogram(x, bins=n_bins, range=obs_range, weights=weights, density=density)

        toy_histos = []
        for toy_factors in nuisance_toy_factors:
            toy_histo, _ = np.histogram(
                x, bins=n_bins, range=obs_range, weights=weights * toy_factors, density=density
            )
            toy_histos.append(toy_histo)

        minus2 = np.percentile(toy_histos, 2.5, axis=0)
        minus1 = np.percentile(toy_histos, 16.0, axis=0)
        plus1 = np.percentile(toy_histos, 84.0, axis=0)
        plus2 = np.percentile(toy_histos, 97.5, axis=0)

        return edges, central, minus2, minus1, plus1, plus2

    # Histograms in absolute units and normalized to unit area
    absolute = histogram_with_bands(density=False)
    normalized = histogram_with_bands(density=True)

    # Prepare plotting
    def plot_mc(edges, histo_central, histo_m2, histo_m1, histo_p1, histo_p2, relative=False):
        """Draws central line and 1 / 2 sigma bands as step functions, optionally divided by the central line."""
        bin_edges_ = np.repeat(edges, 2)[1:-1]
        histo_ = np.repeat(histo_central, 2)
        histo_m2_ = np.repeat(histo_m2, 2)
        histo_m1_ = np.repeat(histo_m1, 2)
        histo_p1_ = np.repeat(histo_p1, 2)
        histo_p2_ = np.repeat(histo_p2, 2)

        if relative:
            histo_m2_ = histo_m2_ / histo_
            histo_m1_ = histo_m1_ / histo_
            histo_p1_ = histo_p1_ / histo_
            histo_p2_ = histo_p2_ / histo_
            histo_ = histo_ / histo_

        plt.fill_between(bin_edges_, histo_m2_, histo_p2_, facecolor=bandcolor2, edgecolor="none")
        plt.fill_between(bin_edges_, histo_m1_, histo_p1_, facecolor=bandcolor1, edgecolor="none")
        plt.plot(bin_edges_, histo_, color=linecolor, lw=1.5, ls="-")

    # Make plot
    fig = plt.figure(figsize=(10, 7))
    gs = gridspec.GridSpec(2, 2, height_ratios=[2, 1])

    # Absolute distribution, ratio to central prediction (bottom left)
    ax = plt.subplot(gs[2])
    plot_mc(*absolute, relative=True)
    plt.xlabel(obs_label)
    plt.ylabel(r"Relative to central pred.")
    plt.xlim(obs_range[0], obs_range[1])
    plt.ylim(ratio_range[0], ratio_range[1])

    # Absolute distribution (top left), shares its x axis with the ratio panel below
    ax = plt.subplot(gs[0], sharex=ax)
    plot_mc(*absolute)
    plt.ylabel(r"Differential cross section [pb/bin]")
    plt.ylim(0.0, None)
    plt.setp(ax.get_xticklabels(), visible=False)

    # Normalized distribution, ratio to central prediction (bottom right)
    ax = plt.subplot(gs[3])
    plot_mc(*normalized, relative=True)
    # Use the label given by the caller; this panel shows the same observable as the bottom left one
    plt.xlabel(obs_label)
    plt.ylabel(r"Relative to central pred.")
    plt.xlim(obs_range[0], obs_range[1])
    plt.ylim(ratio_range[0], ratio_range[1])

    # Normalized distribution (top right), shares its x axis with the ratio panel below
    ax = plt.subplot(gs[1], sharex=ax)
    plot_mc(*normalized)
    plt.ylabel(r"Normalized distribution")
    plt.ylim(0.0, None)
    plt.setp(ax.get_xticklabels(), visible=False)

    # Return
    plt.tight_layout()
    return fig
Esempio n. 5
0
    def __init__(self,
                 filename,
                 disable_morphing=False,
                 include_nuisance_parameters=True):
        """
        Loads the settings stored in a MadMiner file, logs a summary, and sets up the morphing helpers.

        Parameters
        ----------
        filename : str
            Path to the MadMiner file.

        disable_morphing : bool, optional
            If True, no physics morpher is set up, even if the file contains a morphing setup. Default value:
            False.

        include_nuisance_parameters : bool, optional
            If True, nuisance benchmarks are loaded and nuisance parameters found in the file are taken into
            account. Default value: True.

        """
        # Save setup
        self.include_nuisance_parameters = include_nuisance_parameters
        self.madminer_filename = filename

        logger.info("Loading data from %s", filename)

        # Load data
        (
            self.parameters,
            self.benchmarks,
            self.benchmark_is_nuisance,
            self.morphing_components,
            self.morphing_matrix,
            self.observables,
            self.n_samples,
            _,
            self.reference_benchmark,
            self.nuisance_parameters,
        ) = load_madminer_settings(
            filename, include_nuisance_benchmarks=include_nuisance_parameters)

        self.n_parameters = len(self.parameters)
        self.n_benchmarks = len(self.benchmarks)
        self.n_benchmarks_phys = np.sum(
            np.logical_not(self.benchmark_is_nuisance))
        # Observables can be missing from the file; treat that as "no observables" instead of failing on len(None)
        self.n_observables = 0 if self.observables is None else len(
            self.observables)

        # Nuisance parameters only count if they exist in the file and are requested
        self.n_nuisance_parameters = 0
        if self.nuisance_parameters is not None and include_nuisance_parameters:
            self.n_nuisance_parameters = len(self.nuisance_parameters)
        else:
            self.nuisance_parameters = None

        logger.info("Found %s parameters", self.n_parameters)
        for key, values in six.iteritems(self.parameters):
            logger.debug(
                "   %s (LHA: %s %s, maximal power in squared ME: %s, range: %s)",
                key,
                values[0],
                values[1],
                values[2],
                values[3],
            )

        if self.nuisance_parameters is not None:
            logger.info("Found %s nuisance parameters",
                        self.n_nuisance_parameters)
            for key, values in six.iteritems(self.nuisance_parameters):
                logger.debug("   %s (%s)", key, values)
        else:
            logger.info("Did not find nuisance parameters")

        logger.info("Found %s benchmarks, of which %s physical",
                    self.n_benchmarks, self.n_benchmarks_phys)
        for (key, values), is_nuisance in zip(six.iteritems(self.benchmarks),
                                              self.benchmark_is_nuisance):
            if is_nuisance:
                logger.debug("   %s: systematics", key)
            else:
                logger.debug("   %s: %s", key, format_benchmark(values))

        logger.info("Found %s observables", self.n_observables)
        if self.observables is not None:
            for i, obs in enumerate(self.observables):
                logger.debug("  %2.2s %s", i, obs)
        logger.info("Found %s events", self.n_samples)

        # Morphing
        self.morpher = None
        if self.morphing_matrix is not None and self.morphing_components is not None and not disable_morphing:
            self.morpher = PhysicsMorpher(self.parameters)
            self.morpher.set_components(self.morphing_components)
            self.morpher.set_basis(self.benchmarks,
                                   morphing_matrix=self.morphing_matrix)

            logger.info("Found morphing setup with %s components",
                        len(self.morphing_components))

        else:
            logger.info("Did not find morphing setup.")

        # Nuisance morphing
        self.nuisance_morpher = None
        if self.nuisance_parameters is not None:
            self.nuisance_morpher = NuisanceMorpher(
                self.nuisance_parameters, list(self.benchmarks.keys()),
                self.reference_benchmark)
            logger.info("Found nuisance morphing setup")
Esempio n. 6
0
class DataAnalyzer:
    """
    Collects common functionality that is used when analysing data in the MadMiner file.

    Parameters
    ----------
    filename : str
        Path to MadMiner file (for instance the output of `madminer.delphes.DelphesProcessor.save()`).

    disable_morphing : bool, optional
        If True, the morphing setup is not loaded from the file. Default value: False.

    include_nuisance_parameters : bool, optional
        If True, nuisance parameters are taken into account. Default value: True.

    """
    def __init__(self,
                 filename,
                 disable_morphing=False,
                 include_nuisance_parameters=True):
        """
        Reads the settings stored in a MadMiner file and prepares the morphing helpers.

        The arguments are described in the class docstring. After loading, the event numbers
        are cross-checked and a summary of the setup is logged.
        """
        # Remember how this analyzer was configured
        self.include_nuisance_parameters = include_nuisance_parameters
        self.madminer_filename = filename

        # Read the settings; the loader hands them back as one tuple that is unpacked
        # straight into the corresponding attributes
        logger.info("Loading data from %s", filename)
        settings = load_madminer_settings(
            filename, include_nuisance_benchmarks=include_nuisance_parameters)
        (
            self.parameters,
            self.benchmarks,
            self.benchmark_nuisance_flags,
            self.morphing_components,
            self.morphing_matrix,
            self.observables,
            self.n_samples,
            self.systematics,
            self.reference_benchmark,
            self.nuisance_parameters,
            self.n_events_generated_per_benchmark,
            self.n_events_backgrounds,
            self.finite_difference_benchmarks,
            self.finite_difference_epsilon,
        ) = settings

        # Derived counts
        self.n_observables = len(self.observables)
        self.n_parameters = len(self.parameters)
        self.n_benchmarks = len(self.benchmarks)
        physical_flags = np.logical_not(self.benchmark_nuisance_flags)
        self.n_benchmarks_phys = np.sum(physical_flags)
        self.n_nuisance_parameters = len(self.nuisance_parameters)

        # Physics morphing: only when the file provides both ingredients and the
        # caller did not switch it off
        self.morpher = None
        morphing_available = (self.morphing_matrix is not None
                              and self.morphing_components is not None)
        if morphing_available and not disable_morphing:
            physics_morpher = PhysicsMorpher(self.parameters)
            physics_morpher.set_components(self.morphing_components)
            physics_morpher.set_basis(self.benchmarks,
                                      morphing_matrix=self.morphing_matrix)
            self.morpher = physics_morpher

        # Nuisance morphing: only when there is at least one nuisance parameter
        self.nuisance_morpher = None
        if self.n_nuisance_parameters > 0:
            self.nuisance_morpher = NuisanceMorpher(
                self.nuisance_parameters,
                self.benchmarks.keys(),
                self.reference_benchmark,
            )

        # Sanity check of the event numbers, then log what was found
        self._check_n_events()
        self._report_setup()

    def event_loader(
        self,
        start=0,
        end=None,
        batch_size=100000,
        include_nuisance_parameters=None,
        generated_close_to=None,
        return_sampling_ids=False,
    ):
        """
        Iterates over the events in the MadMiner file, one batch at a time.

        Parameters
        ----------
        start : int, optional
            Index of the first event to load. Default value: 0.

        end : int or None, optional
            Index of the last event to load. Default value: None.

        batch_size : int, optional
            Number of events per batch. Default value: 100000.

        include_nuisance_parameters : bool or None, optional
            Whether nuisance parameter benchmarks are part of the returned data. If None, the setting given to
            the constructor is used. Default value: None.

        generated_close_to : None or ndarray, optional
            If None, all events are yielded. Otherwise only the events that were generated at the benchmark
            closest to this parameter point are yielded. Default value: None.

        return_sampling_ids : bool, optional
            If True, the sampling IDs are yielded in addition to observables and weights. Default value: False.

        Yields
        ------
        observations : ndarray
            Event data

        weights : ndarray
            Event weights

        sampling_ids : int
            Sampling IDs (benchmark used for sampling for signal events, -1 for background events). Only yielded
            if return_sampling_ids = True was set.
        """

        # Fall back to the instance-wide setting when the caller did not decide
        use_nuisance = include_nuisance_parameters
        if use_nuisance is None:
            use_nuisance = self.include_nuisance_parameters

        sampling_benchmark = self._find_closest_benchmark(generated_close_to)
        logger.debug(
            f"Sampling benchmark closest to {generated_close_to}: {sampling_benchmark}"
        )

        # With a fixed sampling benchmark the factors are all one (one entry per
        # physical benchmark plus one extra); otherwise they are computed
        if sampling_benchmark is not None:
            sampling_factors = np.ones(self.n_benchmarks_phys + 1)
        else:
            sampling_factors = self._calculate_sampling_factors()

        loader_options = dict(
            file_name=self.madminer_filename,
            start_index=start,
            final_index=end,
            batch_size=batch_size,
            benchmark_nuisance_flags=self.benchmark_nuisance_flags,
            sampling_benchmark=sampling_benchmark,
            sampling_factors=sampling_factors,
            include_nuisance_params=use_nuisance,
            include_sampling_ids=return_sampling_ids,
        )
        for batch in load_events(**loader_options):
            yield batch

    def weighted_events(
        self,
        theta=None,
        nu=None,
        start_event=None,
        end_event=None,
        derivative=False,
        generated_close_to=None,
        n_draws=None,
    ):
        """
        Returns all events together with the benchmark weights (if theta is None) or weights for a given theta.

        Parameters
        ----------
        theta : None or ndarray or str, optional
            If None, the function returns all benchmark weights. If str, the function returns the weights for a given
            benchmark name. If ndarray, it uses morphing to calculate the weights for this value of theta. Default
            value: None.

        nu : None or ndarray, optional
            If None, the nuisance parameters are set to their nominal values. Otherwise, and if theta is an ndarray,
            sets the values of the nuisance parameters.

        start_event : int
            Index (in the MadMiner file) of the first event to consider.

        end_event : int
            Index (in the MadMiner file) of the last unweighted event to consider.

        derivative : bool, optional
            If True and if theta is not None, the derivative of the weights with respect to theta are returned. Default
            value: False.

        generated_close_to : None or int, optional
            Only returns benchmarks generated from this benchmark (and background events). Default value: None.

        n_draws : None or int, optional
            If not None, returns only this number of events, drawn randomly.

        Returns
        -------
        x : ndarray
            Observables with shape `(n_unweighted_samples, n_observables)`.

        weights : ndarray
            If theta is None and derivative is False, benchmark weights with shape
            `(n_unweighted_samples, n_benchmarks)` in pb. If theta is not None and derivative is True, the gradient of
            the weight for the given parameter with respect to theta with shape `(n_unweighted_samples, n_gradients)`
            in pb. Otherwise, weights for the given parameter theta with shape `(n_unweighted_samples,)` in pb.

        """

        x, weights_benchmarks = next(
            self.event_loader(
                start=start_event,
                end=end_event,
                batch_size=None,
                generated_close_to=generated_close_to,
            ))

        # Pick events randomly
        n_events = len(x)

        if n_draws is not None and n_draws < n_events:
            idx = np.random.choice(n_events, n_draws, replace=False)
            x = x[idx]
            weights_benchmarks = weights_benchmarks[idx]
        elif n_draws is not None:
            logger.warning(
                f"Requested {n_draws} events, but only {n_events} available")

        # Process and return appropriate weights
        if theta is None:
            return x, weights_benchmarks
        elif isinstance(theta, str):
            i_benchmark = list(self.benchmarks.keys()).index(theta)
            return x, weights_benchmarks[:, i_benchmark]
        elif derivative:
            dtheta_matrix = self._get_dtheta_benchmark_matrix(theta)
            gradients_theta = mdot(
                dtheta_matrix, weights_benchmarks)  # (n_gradients, n_samples)
            gradients_theta = gradients_theta.T
            return x, gradients_theta
        else:
            # TODO: nuisance params
            if nu is not None:
                raise NotImplementedError()
            theta_matrix = self._get_theta_benchmark_matrix(theta)
            weights_theta = mdot(theta_matrix, weights_benchmarks)
            return x, weights_theta

    def xsecs(
        self,
        thetas=None,
        nus=None,
        partition="all",
        test_split=0.2,
        validation_split=0.2,
        include_nuisance_benchmarks=True,
        batch_size=100000,
        generated_close_to=None,
    ):
        """
        Returns the total cross sections for benchmarks or parameter points.

        Parameters
        ----------
        thetas : None or list of (ndarray or str), optional
            If None, the function returns all benchmark cross sections. Otherwise, it returns the cross sections for a
            series of parameter points that are either given by their benchmark name (as a str), their benchmark index
            (as an int), or their parameter value (as an ndarray, using morphing). Default value: None.

        nus : None or list of (None or ndarray), optional
             If None, the nuisance parameters are set to their nominal values (0), i.e. no systematics are taken into
             account. Otherwise, the list has to have the same number of elements as thetas, and each entry can specify
             nuisance parameters at nominal value (None) or a value of the nuisance parameters (ndarray).

        partition : {"train", "test", "validation", "all"}, optional
            Which event partition to use. Default: "all".

        test_split : float, optional
            Fraction of events reserved for testing. Default value: 0.2.

        validation_split : float, optional
            Fraction of weighted events reserved for validation. Default value: 0.2.

        include_nuisance_benchmarks : bool, optional
            Whether to include nuisance benchmarks if thetas is None. Default value: True.

        batch_size : int, optional
            Size of the batches of events that are loaded into memory at the same time. Default value: 100000.

        generated_close_to : None or ndarray, optional
            If not None, only events originally generated from the closest benchmark to this parameter point will be
            used. Default value : None.

        Returns
        -------
        xsecs : ndarray
            Calculated cross sections in pb.

        xsec_uncertainties : ndarray
            Cross-section uncertainties in pb. Basically calculated as sum(weights**2)**0.5.
        """

        logger.debug("Calculating cross sections for thetas = %s and nus = %s",
                     thetas, nus)

        # Inputs
        if thetas is not None:
            include_nuisance_benchmarks = True
        if thetas is not None:
            if nus is None:
                nus = [None for _ in thetas]
            assert len(nus) == len(
                thetas), "Numbers of thetas and nus don't match!"

        # Which events to use
        if partition == "all":
            start_event, end_event = None, None
            correction_factor = 1.0
        elif partition in ["train", "validation", "test"]:
            start_event, end_event, correction_factor = self._train_validation_test_split(
                partition, test_split, validation_split)
        else:
            raise ValueError(f"Invalid partition type: {partition}")

        # Theta matrices (translation of benchmarks to theta, at nominal nuisance params)
        if thetas is None:
            theta_matrices = np.identity(self.n_benchmarks)
        else:
            theta_matrices = [
                self._get_theta_benchmark_matrix(theta) for theta in thetas
            ]
            theta_matrices = np.asarray(
                theta_matrices)  # Shape (n_thetas, n_benchmarks)

        # Loop over events
        xsecs = 0.0
        xsec_uncertainties = 0.0
        n_events = 0

        for i_batch, (_, benchmark_weights) in enumerate(
                self.event_loader(
                    start=start_event,
                    end=end_event,
                    include_nuisance_parameters=include_nuisance_benchmarks,
                    batch_size=batch_size,
                    generated_close_to=generated_close_to,
                )):
            n_batch, _ = benchmark_weights.shape
            n_events += n_batch

            # Benchmark xsecs
            if thetas is None:
                xsecs += np.sum(benchmark_weights, axis=0)
                xsec_uncertainties += np.sum(benchmark_weights *
                                             benchmark_weights,
                                             axis=0)

            # xsecs at given parameters(theta, nu)
            else:
                # Weights at nominal nuisance params (nu=0)
                weights_nom = mdot(
                    theta_matrices,
                    benchmark_weights)  # Shape (n_thetas, n_batch)
                weights_sq_nom = mdot(theta_matrices, benchmark_weights *
                                      benchmark_weights)  # same

                # Effect of nuisance parameters
                nuisance_factors = self._calculate_nuisance_factors(
                    nus, benchmark_weights)
                weights = nuisance_factors * weights_nom
                weights_sq = nuisance_factors * weights_sq_nom

                # Sum up
                xsecs += np.sum(weights, axis=1)
                xsec_uncertainties += np.sum(weights_sq, axis=1)

        if n_events == 0:
            raise RuntimeError(
                f"Did not find events with test_split = {test_split} "
                f"and generated_close_to = {generated_close_to}")

        xsec_uncertainties = np.maximum(xsec_uncertainties, 0.0)**0.5

        # Correct for not using all events
        xsecs *= correction_factor
        xsec_uncertainties *= correction_factor

        logger.debug("xsecs and uncertainties [pb]:")
        for this_xsec, this_uncertainty in zip(xsecs, xsec_uncertainties):
            logger.debug("  (%4f +/- %4f) pb (%4f %%)", this_xsec,
                         this_uncertainty, 100 * this_uncertainty / this_xsec)

        return xsecs, xsec_uncertainties

    def xsec_gradients(
        self,
        thetas,
        nus=None,
        partition="all",
        test_split=0.2,
        validation_split=0.2,
        gradients="all",
        batch_size=100000,
        generated_close_to=None,
    ):
        """
        Returns the gradient of total cross sections with respect to parameters.

        Parameters
        ----------
        thetas : list of (ndarray or str), optional
            If None, the function returns all benchmark cross sections. Otherwise, it returns the cross sections for a
            series of parameter points that are either given by their benchmark name (as a str), their benchmark index
            (as an int), or their parameter value (as an ndarray, using morphing). Default value: None.

        nus : None or list of (None or ndarray), optional
             If None, the nuisance parameters are set to their nominal values (0), i.e. no systematics are taken into
             account. Otherwise, the list has to have the same number of elements as thetas, and each entry can specify
             nuisance parameters at nominal value (None) or a value of the nuisance parameters (ndarray).

        partition : {"train", "test", "validation", "all"}, optional
            Which events to use. Default: "all".

        test_split : float, optional
            Fraction of events reserved for testing. Default value: 0.2.

        validation_split : float, optional
            Fraction of weighted events reserved for validation. Default value: 0.2.

        gradients : {"all", "theta", "nu"}, optional
            Which gradients to calculate. Default value: "all".

        batch_size : int, optional
            Size of the batches of events that are loaded into memory at the same time. Default value: 100000.

        generated_close_to : None or ndarray, optional
            If not None, only events originally generated from the closest benchmark to this parameter point will be
            used. Default value : None.

        Returns
        -------
        xsecs_gradients : ndarray
            Calculated cross section gradients in pb with shape (n_gradients,).
        """

        logger.debug(
            f"Calculating cross section gradients for thetas = {thetas} and nus = {nus}"
        )

        # Inputs
        include_nuisance_benchmarks = nus is not None or gradients in [
            "all", "nu"
        ]
        if nus is None:
            nus = [None for _ in thetas]
        assert len(nus) == len(
            thetas), "Numbers of thetas and nus don't match!"
        if gradients not in ["all", "theta", "nu"]:
            raise RuntimeError(f"Invalid gradients type: {gradients}")

        # Which events to use
        if partition == "all":
            start_event, end_event = None, None
            correction_factor = 1.0
        elif partition in ["train", "validation", "test"]:
            start_event, end_event, correction_factor = self._train_validation_test_split(
                partition, test_split, validation_split)
        else:
            raise ValueError(f"Invalid partition type: {partition}")

        # Theta matrices (translation of benchmarks to theta, at nominal nuisance params)
        theta_matrices = np.asarray([
            self._get_theta_benchmark_matrix(theta) for theta in thetas
        ])  # shape (n_thetas, n_benchmarks)

        theta_gradient_matrices = np.asarray([
            self._get_dtheta_benchmark_matrix(theta) for theta in thetas
        ])  # shape (n_thetas, n_gradients, n_benchmarks)

        # Loop over events
        xsec_gradients = 0.0

        for i_batch, (_, benchmark_weights) in enumerate(
                self.event_loader(
                    start=start_event,
                    end=end_event,
                    include_nuisance_parameters=include_nuisance_benchmarks,
                    batch_size=batch_size,
                    generated_close_to=generated_close_to,
                )):
            n_batch, _ = benchmark_weights.shape
            logger.debug(f"Batch {i_batch+1} with {n_batch} events")

            if gradients in ["all", "theta"]:
                nom_gradients = mdot(
                    theta_gradient_matrices, benchmark_weights
                )  # Shape (n_thetas, n_phys_gradients, n_batch)
                nuisance_factors = self._calculate_nuisance_factors(
                    nus, benchmark_weights)  # Shape (n_thetas, n_batch)
                try:
                    dweight_dtheta = nuisance_factors[:, np.
                                                      newaxis, :] * nom_gradients
                except TypeError:
                    dweight_dtheta = nom_gradients

            if gradients in ["all", "nu"]:
                weights_nom = mdot(
                    theta_matrices,
                    benchmark_weights)  # Shape (n_thetas, n_batch)
                nuisance_factor_gradients = np.asarray([
                    self.nuisance_morpher.calculate_nuisance_factor_gradients(
                        nu, benchmark_weights) for nu in nus
                ])  # Shape (n_thetas, n_nuisance_gradients, n_batch)
                dweight_dnu = nuisance_factor_gradients * weights_nom[:, np.
                                                                      newaxis, :]

            if gradients == "all":
                dweight_dall = np.concatenate((dweight_dtheta, dweight_dnu), 1)
            elif gradients == "theta":
                dweight_dall = dweight_dtheta
            elif gradients == "nu":
                dweight_dall = dweight_dnu
            xsec_gradients += np.sum(dweight_dall, axis=2)

        # Correct for not using all events
        xsec_gradients *= correction_factor

        return xsec_gradients

    def _check_n_events(self):
        n_events_check = \
            sum(self.n_events_generated_per_benchmark) \
            + self.n_events_backgrounds

        if self.n_samples != n_events_check:
            logger.warning(
                "Inconsistent event numbers in HDF5 file! Please recalculate them by calling "
                "combine_and_shuffle(recalculate_header=True).")

    def _report_setup(self):
        logger.info(f"Found {self.n_parameters} parameters")
        for i, param in enumerate(self.parameters.values()):
            logger.info("  %s: %s", i, param)

        if self.n_nuisance_parameters > 0:
            logger.info(
                f"Found {self.n_nuisance_parameters} nuisance parameters")
            for i, param in enumerate(self.nuisance_parameters.values()):
                logger.info("  %s: %s", i, param)
        else:
            logger.info("Did not find nuisance parameters")

        logger.info(f"Found {self.n_benchmarks} benchmarks")
        for benchmark in self.benchmarks.values():
            if benchmark.is_nuisance:
                logger.debug("   %s: systematics", benchmark.name)
            else:
                logger.debug("   %s", benchmark)

        logger.info(f"Found {self.n_observables} observables")
        for i, obs in enumerate(self.observables):
            logger.debug("  %2.2s %s", i, obs)

        logger.info(f"Found {self.n_samples} events")
        if len(self.n_events_generated_per_benchmark) > 0:
            for events, name in zip(self.n_events_generated_per_benchmark,
                                    self.benchmarks.keys()):
                logger.info("  %s signal events sampled from benchmark %s",
                            events, name)
            if self.n_events_backgrounds is not None and self.n_events_backgrounds > 0:
                logger.info("  %s background events",
                            self.n_events_backgrounds)
        else:
            logger.debug("  Did not find sample summary information")

        if self.morpher is not None:
            logger.info("Found morphing setup with %s components",
                        len(self.morphing_components))
        else:
            logger.info("Did not find morphing setup.")

        if self.nuisance_morpher is not None:
            logger.info("Found nuisance morphing setup")
        else:
            logger.info("Did not find nuisance morphing setup")

    def _calculate_nuisance_factors(self, nus, benchmark_weights):
        if self._any_nontrivial_nus(nus):
            return np.asarray([
                self.nuisance_morpher.calculate_nuisance_factors(
                    nu, benchmark_weights) for nu in nus
            ])  # Shape (n_thetas, n_batch)
        else:
            return 1.0

    def _finite_differences_theta_gradient_matrices(self):
        """ Constructs the matrix that translates benchmark weights to the gradient of the weight evaluated at the benchmarks """
        assert self.finite_difference_benchmarks is not None
        assert self.finite_difference_epsilon is not None

        matrix = np.zeros(
            (self.n_benchmarks, self.n_parameters,
             self.n_benchmarks))  # (n_thetas, n_gradients, n_benchmarks)

        benchmark_names = list(self.benchmarks.keys())

        # We'll generally try to find the tuples p, i, j, k such that
        # matrix[i, p, j] = - 1 / eps and matrix[i, p, i] = 1 / eps
        for i, b_name in enumerate(self.benchmarks.keys()):
            # For the FD-shifted benchmarks, we assume that the gradients are
            # the same as at the original point, and will just copy the matrix later
            copy_to = []
            if b_name not in self.finite_difference_benchmarks.keys():
                continue

            for p, p_name in enumerate(self.parameters.keys()):
                shifted_benchmark_dict = self.finite_difference_benchmarks[
                    b_name].shift_names
                shifted_benchmark_name = shifted_benchmark_dict[p_name]
                j = benchmark_names.index(shifted_benchmark_name)
                copy_to.append(j)

                matrix[i, p, j] = +1.0 / self.finite_difference_epsilon
                matrix[i, p, i] = -1.0 / self.finite_difference_epsilon

            for j in copy_to:
                matrix[j, :, :] = matrix[i, :, :]

        return matrix

    @staticmethod
    def _any_nontrivial_nus(nus):
        if nus is None:
            return False
        for nu in nus:
            if nu is not None:
                return True
        return False

    def _derivative_mode(self):
        if self.morpher is not None:
            mode = "morphing"
        elif self.finite_difference_benchmarks is not None:
            mode = "fd"
        else:
            raise RuntimeError(
                "Cannot compute xsec gradients when neither morphing nor finite differences are correctly set up!"
            )
        return mode

    def _weights(self, thetas, nus, benchmark_weights, theta_matrices=None):
        """
        Turns benchmark weights into weights for given parameter points (theta, nu).

        Parameters
        ----------
        thetas : list of (ndarray or str)
            If None, the function returns all benchmark cross sections. Otherwise, it returns the cross sections for a
            series of parameter points that are either given by their benchmark name (as a str), their benchmark index
            (as an int), or their parameter value (as an ndarray, using morphing).

        nus : None or list of (None or ndarray)
             If None, the nuisance parameters are set to their nominal values (0), i.e. no systematics are taken into
             account. Otherwise, the list has to have the same number of elements as thetas, and each entry can specify
             nuisance parameters at nominal value (None) or a value of the nuisance parameters (ndarray).

        Returns
        -------
        weights : ndarray
            Calculated weights in pb.
        """

        n_events, _ = benchmark_weights.shape

        # Inputs
        if nus is None:
            nus = [None for _ in thetas]
        assert len(nus) == len(
            thetas), "Numbers of thetas and nus don't match!"

        # Theta matrices (translation of benchmarks to theta, at nominal nuisance params)
        if theta_matrices is None:
            theta_matrices = [
                self._get_theta_benchmark_matrix(theta) for theta in thetas
            ]
        theta_matrices = np.asarray(
            theta_matrices)  # Shape (n_thetas, n_benchmarks)

        # Weights at nominal nuisance params (nu=0)
        weights_nom = mdot(theta_matrices,
                           benchmark_weights)  # Shape (n_thetas, n_batch)

        # Effect of nuisance parameters
        nuisance_factors = self._calculate_nuisance_factors(
            nus, benchmark_weights)
        weights = nuisance_factors * weights_nom

        return weights

    def _weight_gradients(self,
                          thetas,
                          nus,
                          benchmark_weights,
                          gradients="all",
                          theta_matrices=None,
                          theta_gradient_matrices=None):
        """
        Turns benchmark weights into weights for given parameter points (theta, nu).

        Parameters
        ----------
        thetas : list of (ndarray or str)
            If None, the function returns all benchmark cross sections. Otherwise, it returns the cross sections for a
            series of parameter points that are either given by their benchmark name (as a str), their benchmark index
            (as an int), or their parameter value (as an ndarray, using morphing).

        nus : None or list of (None or ndarray)
             If None, the nuisance parameters are set to their nominal values (0), i.e. no systematics are taken into
             account. Otherwise, the list has to have the same number of elements as thetas, and each entry can specify
             nuisance parameters at nominal value (None) or a value of the nuisance parameters (ndarray).

        gradients : {"all", "theta", "nu"}, optional
            Which gradients to calculate. Default value: "all".

        Returns
        -------
        gradients : ndarray
            Calculated gradients in pb.
        """

        n_events, _ = benchmark_weights.shape

        # Inputs
        if gradients == "all" and self.n_nuisance_parameters == 0:
            gradients = "theta"
        if nus is None:
            nus = [None for _ in thetas]
        assert len(nus) == len(
            thetas), "Numbers of thetas and nus don't match!"

        # Theta matrices (translation of benchmarks to theta, at nominal nuisance params)
        if theta_matrices is None:
            theta_matrices = [
                self._get_theta_benchmark_matrix(theta) for theta in thetas
            ]
        if theta_gradient_matrices is None:
            theta_gradient_matrices = [
                self._get_dtheta_benchmark_matrix(theta) for theta in thetas
            ]
        theta_matrices = np.asarray(
            theta_matrices)  # Shape (n_thetas, n_benchmarks)
        theta_gradient_matrices = np.asarray(
            theta_gradient_matrices
        )  # Shape (n_thetas, n_gradients, n_benchmarks)

        # Calculate theta gradient
        if gradients in ["all", "theta"]:
            nom_gradients = mdot(
                theta_gradient_matrices,
                benchmark_weights)  # (n_thetas, n_phys_gradients, n_batch)
            nuisance_factors = self._calculate_nuisance_factors(
                nus, benchmark_weights)
            try:
                dweight_dtheta = nuisance_factors[:, np.
                                                  newaxis, :] * nom_gradients
            except TypeError:
                dweight_dtheta = nom_gradients
        else:
            dweight_dtheta = None

        # Calculate nu gradient
        if gradients in ["all", "nu"]:
            weights_nom = mdot(theta_matrices,
                               benchmark_weights)  # Shape (n_thetas, n_batch)
            nuisance_factor_gradients = np.asarray([
                self.nuisance_morpher.calculate_nuisance_factor_gradients(
                    nu, benchmark_weights) for nu in nus
            ])  # Shape (n_thetas, n_nuisance_gradients, n_batch)
            dweight_dnu = nuisance_factor_gradients * weights_nom[:, np.
                                                                  newaxis, :]
        else:
            dweight_dnu = None

        if gradients == "theta":
            return dweight_dtheta
        elif gradients == "nu":
            return dweight_dnu
        return np.concatenate((dweight_dtheta, dweight_dnu), 1)

    def _train_test_split(self, train, test_split):
        """
        Determines the event range that makes up the training or the test partition.

        Parameters
        ----------
        train : bool
            If True, the range of the training partition is returned, otherwise the range of the test partition.

        test_split : float or None
            Fraction of events reserved for testing. None, or a value outside the open interval (0, 1), switches
            the split off: the range (0, None) with a correction factor of 1 is returned for either partition.

        Returns
        -------
        start_event : int
            Index (in the MadMiner file) of the first event to consider.

        end_event : int or None
            Index (in the MadMiner file) of the last unweighted event to consider. None if no upper boundary is set.

        correction_factor : float
            Factor with which the weights and cross sections will have to be multiplied to make up for the missing
            events.

        Raises
        ------
        ValueError
            If the computed boundary index lies outside [0, n_samples].

        """
        # No usable split fraction: both partitions cover the same unbounded range
        if test_split is None or test_split <= 0.0 or test_split >= 1.0:
            return 0, None, 1.0

        # Both partitions are defined relative to the same boundary event
        boundary = int(round((1.0 - test_split) * self.n_samples, 0))

        if train:
            start_event, end_event = 0, boundary
            correction_factor = 1.0 / (1.0 - test_split)
            checked = end_event
        else:
            start_event, end_event = boundary + 1, None
            correction_factor = 1.0 / test_split
            checked = start_event

        # Only the index derived from the boundary needs to be validated
        if checked < 0 or checked > self.n_samples:
            raise ValueError(
                f"Irregular split: sample {checked} / {self.n_samples}")

        return start_event, end_event, correction_factor

    def _train_validation_test_split(self, partition, test_split,
                                     validation_split):
        """
        Returns the start and end event for the training, validation, or test partition.

        The events are laid out as [training | validation | test]. The training partition starts at event 0, the
        validation partition follows it, and the test partition takes the remaining events.

        Parameters
        ----------
        partition : {"train", "validation", "test"}
            Which partition the boundaries are requested for.

        test_split : float or None
            Fraction of events reserved for testing. None and negative values are treated as 0.

        validation_split : float or None
            Fraction of events reserved for validation. None and negative values are treated as 0.

        Returns
        -------
        start_event : int
            Index (in the MadMiner file) of the first event to consider.

        end_event : int or None
            Index (in the MadMiner file) of the last unweighted event to consider. None if no upper boundary is set.

        correction_factor : float
            Factor with which the weights and cross sections will have to be multiplied to make up for the missing
            events.

        Raises
        ------
        ValueError
            If a computed boundary lies outside [0, n_samples], or if the two fractions leave no events for
            training when the training partition is requested.

        RuntimeError
            If `partition` is not one of the supported names.

        """
        if test_split is None or test_split < 0.0:
            test_split = 0.0
        if validation_split is None or validation_split < 0.0:
            validation_split = 0.0
        assert test_split + validation_split <= 1.0, "test_split + validation_split must not exceed 1"
        train_split = 1.0 - test_split - validation_split

        if partition == "train":
            start_event = 0

            # The training range must shrink whenever *either* fraction reserves events. Looking only at
            # test_split would hand the validation events to the training partition as well.
            # A single fraction of 1.0 or more is treated as "split disabled", like in the other partitions.
            if train_split >= 1.0 or test_split >= 1.0 or validation_split >= 1.0:
                end_event = None
                correction_factor = 1.0
            elif train_split <= 0.0:
                # Guard the division below: nothing is left over for training
                raise ValueError(
                    f"No events left for training: test_split {test_split} + "
                    f"validation_split {validation_split} >= 1")
            else:
                end_event = int(round(train_split * self.n_samples, 0))
                correction_factor = 1.0 / train_split

                if end_event < 0 or end_event > self.n_samples:
                    raise ValueError(
                        f"Irregular split: sample {end_event} / {self.n_samples}"
                    )

        elif partition == "validation":
            if validation_split <= 0.0 or validation_split >= 1.0:
                start_event = 0
                end_event = None
                correction_factor = 1.0

            else:
                # Starts right after the training boundary and stops where the test partition begins
                start_event = int(round(train_split * self.n_samples, 0)) + 1
                end_event = int(round((1.0 - test_split) * self.n_samples, 0))
                correction_factor = 1.0 / validation_split

                if start_event < 0 or start_event > self.n_samples:
                    raise ValueError(
                        f"Irregular split: sample {start_event} / {self.n_samples}"
                    )

                if end_event < 0 or end_event > self.n_samples:
                    raise ValueError(
                        f"Irregular split: sample {end_event} / {self.n_samples}"
                    )

        elif partition == "test":
            end_event = None

            if test_split <= 0.0 or test_split >= 1.0:
                start_event = 0
                correction_factor = 1.0
            else:
                start_event = int(round(
                    (1.0 - test_split) * self.n_samples, 0)) + 1
                correction_factor = 1.0 / test_split
                if start_event < 0 or start_event > self.n_samples:
                    raise ValueError(
                        f"Irregular split: sample {start_event} / {self.n_samples}"
                    )

        else:
            raise RuntimeError(f"Unknown partition {partition}")

        return start_event, end_event, correction_factor

    def _get_theta_value(self, theta):
        """
        Converts a parameter point specification into an array of parameter values.

        Parameters
        ----------
        theta : str or int or array-like
            A benchmark name (str), the position of a benchmark in `self.benchmarks` (int), or the parameter
            values themselves.

        Returns
        -------
        theta_value : ndarray
            For a benchmark, its values in the order stored in the benchmark; otherwise `theta` as an array.

        """
        # Anything that is neither a name nor a position is taken to be the parameter values already
        if not isinstance(theta, (str, int)):
            return np.asarray(theta)

        if isinstance(theta, str):
            name = theta
        else:
            name = list(self.benchmarks.keys())[theta]

        return np.array(list(self.benchmarks[name].values.values()))

    def _get_nu_value(self, nu):
        """
        Converts a nuisance parameter specification into an array.

        Parameters
        ----------
        nu : None or array-like
            Nuisance parameter values. None selects a vector of zeros with one entry per nuisance parameter.

        Returns
        -------
        nu_value : ndarray
            The nuisance parameter values as an array.

        """
        return np.zeros(self.n_nuisance_parameters) if nu is None else np.asarray(nu)

    def _get_theta_benchmark_matrix(self, theta, zero_pad=True):
        """
        Calculates vector A such that dsigma(theta) = A * dsigma_benchmarks

        Parameters
        ----------
        theta : str or int or array-like
            A benchmark name (str), the position of a benchmark in `self.benchmarks` (int), or a parameter point
            that is passed on to the morpher.

        zero_pad : bool, optional
            If True, the vector is extended with trailing zeros to length `self.n_benchmarks`. Default: True.

        Returns
        -------
        theta_matrix : ndarray
            Vector of benchmark weights.

        """
        if zero_pad:
            # Build the plain vector first, then copy it into the front of a zero vector of full length
            raw = self._get_theta_benchmark_matrix(theta, zero_pad=False)
            padded = np.zeros(self.n_benchmarks)
            padded[:raw.shape[0]] = raw
            return padded

        if isinstance(theta, str):
            # Resolve the name to its position and reuse the integer case
            position = list(self.benchmarks).index(theta)
            return self._get_theta_benchmark_matrix(position)

        if isinstance(theta, int):
            # A benchmark is described by a unit vector selecting itself
            one_hot = np.zeros(len(self.benchmarks))
            one_hot[theta] = 1.0
            return one_hot

        return self.morpher.calculate_morphing_weights(theta)

    def _get_dtheta_benchmark_matrix(self, theta, zero_pad=True):
        """
        Calculates matrix A_ij such that d dsigma(theta) / d theta_i = A_ij * dsigma (benchmark j)

        Parameters
        ----------
        theta : str or int or array-like
            A benchmark name (str), the position of a benchmark in `self.benchmarks` (int), or a parameter point.

        zero_pad : bool, optional
            If True, the matrix is extended with trailing zero columns to `self.n_benchmarks` columns.
            Default: True.

        Returns
        -------
        dtheta_matrix : ndarray
            Gradient matrix. In "morphing" mode it comes from the morpher, otherwise from the finite-difference
            gradient matrices.

        Raises
        ------
        RuntimeError
            If a parameter point that is not a benchmark is requested in "fd" mode.

        """
        mode = self._derivative_mode()

        if zero_pad:
            # Build the plain matrix first, then copy it into the leading columns of a zero matrix
            raw = self._get_dtheta_benchmark_matrix(theta, zero_pad=False)
            padded = np.zeros((raw.shape[0], self.n_benchmarks))
            padded[:, :raw.shape[1]] = raw
            return padded

        by_name = isinstance(theta, str)
        by_position = isinstance(theta, int)

        if (by_name or by_position) and mode == "morphing":
            # With morphing, a benchmark is handled like any other point: look up its parameter values
            key = theta if by_name else list(self.benchmarks.keys())[theta]
            point = np.array(list(self.benchmarks[key].values.values()))
            return self._get_dtheta_benchmark_matrix(point)

        if by_name:
            # Resolve the name to its position and reuse the integer case
            position = list(self.benchmarks.keys()).index(theta)
            return self._get_dtheta_benchmark_matrix(position)

        if by_position:  # finite differences
            # TODO: avoid constructing the full matrix every time
            return self._finite_differences_theta_gradient_matrices()[theta]

        if mode == "fd":
            raise RuntimeError(
                "Cannot calculate score for arbitrary parameter points without morphing setup"
            )

        # Shape (n_parameters, n_benchmarks_phys)
        return self.morpher.calculate_morphing_weight_gradient(theta)

    def _calculate_sampling_factors(self):
        """
        Calculates the fraction of generated events per benchmark, followed by a factor for background events.

        Returns
        -------
        factors : ndarray
            The entries of `self.n_events_generated_per_benchmark` divided by their sum, with a final entry of
            1.0 appended for background events.

        """
        events = np.asarray(self.n_events_generated_per_benchmark,
                            dtype=np.float64)
        logger.debug(f"Events per benchmark: {events}")

        # Normalize the event counts, then append the constant factor used for background events
        total = np.sum(events)
        return np.hstack((events / total, 1.0))

    def _find_closest_benchmark(self, theta):
        """
        Finds the physical benchmark closest (in Euclidean distance) to a parameter point.

        Parameters
        ----------
        theta : None or ndarray
            Parameter point. For None, None is returned.

        Returns
        -------
        closest_idx : int or None
            Position of the closest benchmark among the first `self.n_benchmarks_phys` benchmarks.

        """
        if theta is None:
            return None

        candidates = self._benchmark_array()[:self.n_benchmarks_phys]
        distances = [np.linalg.norm(point - theta) for point in candidates]

        # Don't use benchmarks where we don't actually have events: a large penalty pushes them out of reach
        if len(self.n_events_generated_per_benchmark) > 0:
            without_events = self.n_events_generated_per_benchmark == 0
            distances = distances + 1.0e9 * without_events.astype(np.float64)

        return np.argmin(distances)

    def _benchmark_array(self):
        """
        Collects the parameter values of all benchmarks in a single array.

        Returns
        -------
        benchmark_array : ndarray
            One row per entry of `self.benchmarks`, in iteration order, holding that benchmark's values.

        """
        rows = []
        for benchmark in self.benchmarks.values():
            rows.append(list(benchmark.values.values()))
        return np.asarray(rows)