Beispiel #1
0
    def sel(self, *args, **kwargs):
        """Computes a boolean mask of EFPs matching each of the
        specifications provided by the `args`.

        **Arguments**

        - ***args** : arbitrary positional arguments
            - Each argument can be either a string or a length-two iterable. If
            the argument is a string, it should consist of three parts: a
            character which is a valid element of `cols`, a comparison
            operator (one of `<`, `>`, `<=`, `>=`, `==`, `!=`), and a number
            (the value is converted with `int` before comparing). Whitespace
            of any kind (spaces, tabs, newlines) between the parts does not
            matter. If the argument is a tuple, the first element should be a
            string containing a column header character and a comparison
            operator; the second element is the value to be compared. The
            tuple version is useful when the value is a variable that changes
            (such as in a list comprehension).
        - **specs** : keyword argument, optional
            - The array of specifications to select from, indexed as
            `specs[:, column]`. If not given (or `None`), `self.specs` is used.

        **Returns**

        - _1-d numpy.ndarray_
            - A boolean array with one entry per row of `specs`; an entry is
            `True` only if that row satisfies every selection in `args`.

        **Raises**

        - _ValueError_
            - If a non-string argument does not have length two, if the
            selection string does not match the expected pattern, or if the
            selected column is not in `self.cols`.
        - _TypeError_
            - If an argument is neither a string nor indexable.
        """

        # `specs` is the only keyword consumed here; anything left over is
        # handed to `kwargs_check` for validation
        specs = kwargs.pop('specs', None)
        kwargs_check('sel', kwargs)

        # use default specs if none provided
        if specs is None:
            specs = self.specs

        # start with everything selected and AND in each requirement
        mask = np.ones(len(specs), dtype=bool)
        for arg in args:

            # build the selection string from either accepted form
            if isinstance(arg, six.string_types):
                s = arg
            elif hasattr(arg, '__getitem__'):
                if len(arg) != 2:
                    raise ValueError('{} is not length 2'.format(arg))
                s = arg[0] + str(arg[1])
            else:
                raise TypeError('invalid argument {}'.format(arg))

            # strip ALL whitespace, not only ' ', so that tabs and newlines
            # between the parts are tolerated as the documentation promises
            s = ''.join(s.split())

            # match string to pattern
            match = self._sel_re.match(s)
            if match is None:
                raise ValueError('could not understand \'{}\''.format(arg))

            # get the variable of the selection
            var = match.group(1)
            if var not in self.cols:
                raise ValueError('\'{}\' not in {}'.format(var, self.cols))

            # get the comparison and value
            comp, val = match.group(2, 3)

            # the column index for `var` is stored on the instance under the
            # attribute name `<var>_ind`; AND the selection with mask
            column = specs[:, getattr(self, var + '_ind')]
            mask &= explicit_comp(column, comp, int(val))

        return mask
Beispiel #2
0
    def emd_wasserstein(ev0, ev1, dists=None, R=1.0, beta=1.0, norm=False, gdim=2, mask=False,
                                  return_flow=False, do_timing=False,
                                  n_iter_max=100000,
                                  epsilon_large_factor=10000.0, epsilon_small_factor=1.0,
                                  **kwargs):
        r"""Compute the EMD between two events using the Wasserstein library.

        **Arguments**

        - **ev0** : _numpy.ndarray_
            - The first event, given as a two-dimensional array. The event is
            assumed to be an `(M,1+gdim)` array of particles, where `M` is the
            multiplicity and `gdim` is the dimension of the ground space in
            which to compute euclidean distances between particles (as specified
            by the `gdim` keyword argument). The zeroth column is the weights of
            the particles, typically their energies or transverse momenta. For
            typical hadron collider jet applications, each particle will be of
            the form `(pT,y,phi)` where  `y` is the rapidity and `phi` is the
            azimuthal angle. If `dists` are provided, then the columns after the
            zeroth are ignored; alternatively a one-dimensional array consisting
            of just the particle weights may be passed in this case.
        - **ev1** : _numpy.ndarray_
            - The other event, same format as `ev0`.
        - **dists** : _numpy.ndarray_
            - A distance matrix between particles in `ev0` and `ev1`. If `None`,
            then the columns of the events after the zeroth are taken to be
            coordinates and the `gdim`-dimensional Euclidean distance is used.
        - **R** : _float_
            - The R parameter in the EMD definition that controls the relative
            importance of the two terms. Must be greater than or equal to half
            of the maximum ground distance in the space in order for the EMD
            to be a valid metric satisfying the triangle inequality.
        - **beta** : _float_
            - The angular weighting exponent. The internal pairwise distance
            matrix is raised to this power prior to solving the optimal
            transport problem.
        - **norm** : _bool_
            - Whether or not to normalize the particle weights to sum to one
            prior to computing the EMD.
        - **gdim** : _int_
            - The dimension of the ground metric space. Useful for restricting
            which dimensions are considered part of the ground space when using
            the internal euclidean distances between particles. Has no effect if
            `dists` are provided.
        - **mask** : _bool_
            - If `True`, masks out particles farther than `R` away from the
            origin of the ground space. Only the coordinate columns enter the
            distance; the weight column is not included. Has no effect if
            `dists` are provided.
        - **return_flow** : _bool_
            - Whether or not to return the flow matrix describing the optimal
            transport found during the computation of the EMD. Note that since
            the second term in Eq. 1 is implemented by including an additional
            particle in the event with lesser total weight, this will be
            reflected in the flow matrix.
        - **do_timing** : _bool_
            - Accepted for interface compatibility; it is not used by this
            function.
        - **n_iter_max** : _int_
            - Maximum number of iterations for solving the optimal transport
            problem.
        - **epsilon_large_factor** : _float_
            - Controls some tolerances in the optimal transport solver. This
            value is multiplied by the floating points epsilon (around 1e-16 for
            64-bit floats) to determine the actual tolerance.
        - **epsilon_small_factor** : _float_
            - Analogous to `epsilon_large_factor` but used where the numerical
            tolerance can be stricter.
        - ****kwargs** : legacy keyword arguments
            - The old options `measure`, `coords`, `periodic_phi`, `phi_col`
            and `empty_policy` are passed to `kwargs_check` as the permitted
            set; every keyword that gets through has no effect and triggers a
            warning.

        **Returns**

        - _float_
            - The EMD value.
        - [_numpy.ndarray_], optional
            - The flow matrix found while solving for the EMD. The `(i,j)`th
            entry is the amount of `pT` that flows between particle i in `ev0`
            and particle j in `ev1`. Only returned if `return_flow` is `True`.
        """

        # warn about old kwargs
        old_kwargs = {'measure', 'coords', 'periodic_phi', 'phi_col', 'empty_policy'}
        kwargs_check('emd_wasserstein', kwargs, old_kwargs)
        for k in kwargs:
            warnings.warn("Keyword argument '{}' has no effect on `emd_wasserstein`.".format(k)
                          + " Use `emd_pot` if you need previous functionality.")

        # set options on the `_EMD` object (defined outside this function)
        # before it is invoked below
        _EMD.set_R(R)
        _EMD.set_beta(beta)
        _EMD.set_norm(norm)
        _EMD.set_network_simplex_params(n_iter_max, epsilon_large_factor, epsilon_small_factor)

        # run using euclidean distances
        if dists is None:

            # keep the weight column plus the first `gdim` coordinate columns
            ev0, ev1 = np.atleast_2d(ev0)[:,:gdim+1], np.atleast_2d(ev1)[:,:gdim+1]

            # mask out particles
            if mask:
                R2 = R*R

                # the distance from the origin is computed from the coordinate
                # columns only; column zero holds the weights and must not
                # contribute to the squared radius
                ev0 = ev0[np.sum(ev0[:,1:]**2, axis=1) <= R2]
                ev1 = ev1[np.sum(ev1[:,1:]**2, axis=1) <= R2]

            # evaluate EMD from (weights, coordinates) of each event
            emd = _EMD(ev0[:,0], ev0[:,1:], ev1[:,0], ev1[:,1:])

        # run using custom distances
        else:

            # coerce to arrays so that sequences without an `ndim` attribute
            # are handled the same way as in the euclidean branch
            ev0, ev1 = np.asarray(ev0), np.asarray(ev1)

            # if events are 2d, extract weights as just the first column
            if ev0.ndim == 2:
                ev0 = ev0[:,0]
            if ev1.ndim == 2:
                ev1 = ev1[:,0]

            # evaluate EMD from the weights and the provided distance matrix
            emd = _EMD(ev0, ev1, dists)

        # get flows if requested
        if return_flow:
            return emd, _EMD.flows()

        return emd
Beispiel #3
0
    def emds_wasserstein(events0, events1=None, R=1.0, beta=1.0, norm=False, gdim=2, mask=False,
                                                external_emd_handler=None,
                                                n_jobs=-1, print_every=0, verbose=0,
                                                throw_on_error=True, n_iter_max=100000,
                                                epsilon_large_factor=10000.0, epsilon_small_factor=1.0,
                                                **kwargs):
        r"""Compute the EMDs between collections of events, either among all
        pairs of events in a single set or between the events of two different
        sets. The work is delegated to a `wasserstein.PairwiseEMD` object.

        **Arguments**

        - **events0** : _list_
            - Iterable collection of events. Each event is assumed to be an
            `(M,1+gdim)` array of particles, where `M` is the multiplicity and
            `gdim` is the dimension of the ground space in which to compute
            euclidean distances between particles (as specified by the `gdim`
            keyword argument). The zeroth column is the weights of the
            particles, typically their energies or transverse momenta. For
            typical hadron collider jet applications, each particle will be of
            the form `(pT,y,phi)` where `y` is the rapidity and `phi` is the
            azimuthal angle.
        - **events1** : _list_ or `None`
            - Iterable collection of events in the same format as `events0`, or
            `None`. If the latter, the pairwise distances between events in
            `events0` will be computed and the returned matrix will be
            symmetric.
        - **R** : _float_
            - The R parameter in the EMD definition that controls the relative
            importance of the two terms. Must be greater than or equal to half
            of the maximum ground distance in the space in order for the EMD
            to be a valid metric satisfying the triangle inequality.
        - **beta** : _float_
            - The angular weighting exponent. The internal pairwise distance
            matrix is raised to this power prior to solving the optimal
            transport problem.
        - **norm** : _bool_
            - Whether or not to normalize the particle weights to sum to one
            prior to computing the EMD.
        - **gdim** : _int_
            - The dimension of the ground metric space. Useful for restricting
            which dimensions are considered part of the ground space when using
            the internal euclidean distances between particles.
        - **mask** : _bool_
            - If `True`, ignores particles farther than `R` away from the
            origin.
        - **external_emd_handler** : _wasserstein.ExternalEMDHandler_
            - An instance of an external EMD handler from the wasserstein
            module, e.g. `CorrelationDimension`.
        - **n_jobs** : _int_ or `None`
            - The number of cpu cores to use. A value of `None` or `-1` will use
            as many threads as there are CPUs on the machine.
        - **print_every** : _int_
            - The number of computations to do in between printing the
            progress. Even if the verbosity level is zero, this still plays a
            role in determining when the worker threads report the results
            back to the main thread and check for interrupt signals.
        - **verbose** : _int_
            - Controls the verbosity level. A value greater than `0` will print
            the progress of the computation at intervals specified by
            `print_every`.
        - **throw_on_error** : _bool_
            - Whether or not to raise an exception when an issue is encountered.
            Can be useful when debugging.
        - **n_iter_max** : _int_
            - Maximum number of iterations for solving the optimal transport
            problem.
        - **epsilon_large_factor** : _float_
            - Controls some tolerances in the optimal transport solver. This
            value is multiplied by the floating points epsilon (around 1e-16 for
            64-bit floats) to determine the actual tolerance.
        - **epsilon_small_factor** : _float_
            - Analogous to `epsilon_large_factor` but used where the numerical
            tolerance can be stricter.
        - ****kwargs** : legacy keyword arguments
            - The old options `X0`, `X1`, `measure`, `coords`, `periodic_phi`,
            `phi_col` and `empty_policy` are passed to `kwargs_check` as the
            permitted set; every keyword that gets through has no effect and
            triggers a warning.

        **Returns**

        - _numpy.ndarray_
            - The EMD values as a two-dimensional array, except if an external
            EMD handler was provided, in which case no value is returned. If
            `events1` was `None`, then the shape will be `(len(events0),
            len(events0))` and the array will be symmetric, otherwise it will
            have shape `(len(events0), len(events1))`.
        """

        # legacy keywords are tolerated, but each one present earns a warning
        legacy_names = {'X0', 'X1', 'measure', 'coords', 'periodic_phi', 'phi_col', 'empty_policy'}
        kwargs_check('emds_wasserstein', kwargs, legacy_names)
        for name in kwargs:
            warnings.warn("Keyword argument '{}' has no effect on `emds_wasserstein`.".format(name)
                          + " Use `emds_pot` if you need previous functionality.")

        # `None` and `-1` both mean "one job per CPU" (at least one)
        wants_all_cpus = n_jobs is None or n_jobs == -1
        if wants_all_cpus:
            n_jobs = multiprocessing.cpu_count() or 1

        # solver settings that are passed by keyword
        solver_options = dict(throw_on_error=throw_on_error,
                              n_iter_max=n_iter_max,
                              epsilon_large_factor=epsilon_large_factor,
                              epsilon_small_factor=epsilon_small_factor)

        # build the pairwise EMD object
        runner = wasserstein.PairwiseEMD(R, beta, norm, n_jobs, print_every, bool(verbose),
                                         **solver_options)
        if verbose > 0:
            print(runner)

        # attach the handler, if any, before the computation starts
        handler_given = external_emd_handler is not None
        if handler_given:
            runner.set_external_emd_handler(external_emd_handler)

        # run computation
        runner(events0, events1, gdim, mask)

        # with a handler attached there is nothing to hand back
        if handler_given:
            return None

        # otherwise return the matrix of EMD values
        return runner.emds()
Beispiel #4
0
    def __init__(self, kwargs):
        """Create the `Measure` for this object from a dictionary of options.

        **Arguments**

        - **kwargs** : _dict_
            - Dictionary of options (passed as a single positional argument,
            not unpacked). It is first handed to `kwargs_check` together with
            `MEASURE_KWARGS` as the allowed set. It must contain the key
            `'measure'`, which is removed from the dictionary in place and
            given to `Measure` as its first argument; the remaining items are
            forwarded to `Measure` as keyword arguments.
        """

        # validate the options against the allowed measure keywords
        kwargs_check('EFBase', kwargs, allowed=MEASURE_KWARGS)

        # pull out the measure first (this mutates the caller's dictionary),
        # then forward whatever is left as keyword arguments
        measure_arg = kwargs.pop('measure')
        self._measure = Measure(measure_arg, **kwargs)