Example #1
0
    def _emd(ev0, ev1, R, norm, n_iter_max, periodic_phi, phi_col):
        """Compute the EMD between two already-processed events.

        Each event is a `(pTs, coords)` pair. The ground distances come from
        `_cdist_euclidean` and are divided by `R`. When `norm` is false, the
        last slot of each pT array (the zero-pT extra particle the caller is
        expected to have appended) is temporarily used to absorb the total pT
        difference, and the last row and column of the distance matrix are
        set to 1.0. The extra slots are always reset to zero before this
        function exits, even if the solver or the result check raises.

        Returns the transport cost converted back to the input pT units.
        """

        pTs0, coords0 = ev0
        pTs1, coords1 = ev1

        thetas = _cdist_euclidean(coords0, coords1, periodic_phi, phi_col)/R

        # extra particles (with zero pt) already added if going in here
        rescale = 1.0
        if not norm:

            # distances to/from the extra particle are fixed at 1
            thetas[:,-1] = 1.0
            thetas[-1,:] = 1.0

            pT0, pT1 = pTs0.sum(), pTs1.sum()

            # change units for numerical stability
            # NOTE(review): rescale is 0 if both events have zero total pT,
            # which makes the divisions below 0/0 - confirm callers exclude this
            rescale = max(pT0, pT1)

            # done last so that nothing between here and the try block can
            # fail while the shared pT arrays are in their modified state
            pTdiff = pT1 - pT0
            if pTdiff > 0:
                pTs0[-1] = pTdiff
            elif pTdiff < 0:
                pTs1[-1] = -pTdiff

        try:
            # compute the emd with POT
            _, cost, _, _, result_code = emd_c(pTs0/rescale, pTs1/rescale, thetas, n_iter_max)
            check_result(result_code)
        finally:
            # important! must reset extra particles to have pt zero, including
            # on failure, since the pT arrays are shared with the caller
            if not norm:
                pTs0[-1] = pTs1[-1] = 0

        return cost * rescale
Example #2
0
    def _emd(ev0, ev1, R, no_norm, beta, euclidean, n_iter_max, periodic_phi,
             phi_col, empty_policy):
        """Compute the EMD between two already-processed events.

        Each event is a `(pTs, coords)` pair. If either pT array is `None`,
        `empty_policy` is returned immediately. Otherwise the ground distances
        come from `_cdist`, are divided by `R` and, when `beta != 1`, raised
        to the power `beta`. When `no_norm` is true, the last slot of each pT
        array (the zero-pT extra particle the caller is expected to have
        appended) is temporarily used to absorb the total pT difference, and
        the last row and column of the distance matrix are set to 1.0. The
        extra slots are always reset to zero before this function exits, even
        if the solver or the result check raises.

        Returns the transport cost converted back to the input pT units.
        """

        pTs0, coords0 = ev0
        pTs1, coords1 = ev1

        if pTs0 is None or pTs1 is None:
            return empty_policy

        thetas = _cdist(coords0, coords1, euclidean, periodic_phi, phi_col) / R

        # implement angular exponent
        if beta != 1:
            thetas **= beta

        # extra particles (with zero pt) already added if going in here
        rescale = 1.0
        if no_norm:

            # distances to/from the extra particle are fixed at 1
            thetas[:, -1] = 1.0
            thetas[-1, :] = 1.0

            pT0, pT1 = pTs0.sum(), pTs1.sum()

            # change units for numerical stability
            # NOTE(review): rescale is 0 if both events have zero total pT,
            # which makes the divisions below 0/0 - confirm callers exclude this
            rescale = max(pT0, pT1)

            # done last so that nothing between here and the try block can
            # fail while the shared pT arrays are in their modified state
            pTdiff = pT1 - pT0
            if pTdiff > 0:
                pTs0[-1] = pTdiff
            elif pTdiff < 0:
                pTs1[-1] = -pTdiff

        try:
            # compute the emd with POT
            _, cost, _, _, result_code = emd_c(pTs0 / rescale, pTs1 / rescale,
                                               thetas, n_iter_max)
            check_result(result_code)
        finally:
            # important! must reset extra particles to have pt zero, including
            # on failure, since the pT arrays are shared with the caller
            if no_norm:
                pTs0[-1] = pTs1[-1] = 0

        return cost * rescale
Example #3
0
    def emd(ev0, ev1, R=1.0, norm=False, return_flow=False, gdim=None, n_iter_max=100000,
                      periodic_phi=False, phi_col=2):
        r"""Compute the EMD between a pair of events.

        **Arguments**

        - **ev0** : _numpy.ndarray_
            - First event as a two-dimensional `(M,1+gdim)` array of particles,
            with `M` the multiplicity and `gdim` the dimension of the ground
            space in which euclidean distances between particles are computed
            (see the `gdim` keyword argument). Column zero is taken to hold
            the energies (or equivalently, the transverse momenta) of the
            particles. In typical hadron collider jet applications each
            particle has the form `(pT,y,phi)`, with `y` the rapidity and
            `phi` the azimuthal angle.
        - **ev1** : _numpy.ndarray_
            - Second event, in the same format as **ev0**.
        - **R** : _float_
            - The R parameter of the EMD definition, which sets the relative
            importance of the two terms. For the EMD to be a valid metric it
            must be at least half of the maximum ground distance in the space.
        - **norm** : _bool_
            - If `True`, the pT values of each event are normalized before the
            EMD is computed.
        - **return_flow** : _bool_
            - If `True`, the flow matrix of the optimal transport found while
            computing the EMD is returned as well. Because the second term in
            Eq. 1 is implemented by adding an extra particle to the event
            with lesser total pT, that particle shows up in the flow matrix.
        - **gdim** : _int_
            - Dimension of the ground metric space; use it to restrict which
            dimensions count as part of the ground space. It may exceed the
            number of dimensions present in the events, in which case all of
            them are used. `None` has no effect.
        - **n_iter_max** : _int_
            - Maximum number of iterations allowed when solving the optimal
            transport problem.
        - **periodic_phi** : _bool_
            - Whether periodicity in the azimuthal angle $\phi$ coordinate
            should be expected and handled. Typically `True` for event-level
            applications; `False` is slightly faster and suitable for jet
            applications where every $\phi$ difference is at most $\pi$.
        - **phi_col** : _int_
            - Index of the column of the event array that holds $\phi$.

        **Returns**

        - _float_
            - The EMD value.
        - [_numpy.ndarray_], optional
            - The flow matrix found while solving for the EMD. Entry `(i,j)`
            is the amount of `pT` flowing between particle i in `ev0` and
            particle j in `ev1`.
        """

        _check_params(norm, gdim, phi_col)

        # the helpers take the phi column index shifted down by one
        # (presumably because the pT column is split off from the coordinates)
        phi_idx = phi_col - 1

        # split each event into its pT weights and ground-space coordinates
        weights0, points0 = _process_for_emd(ev0, None, gdim, periodic_phi, phi_idx)
        weights1, points1 = _process_for_emd(ev1, None, gdim, periodic_phi, phi_idx)

        total0, total1 = weights0.sum(), weights1.sum()

        # which event, if any, received the extra particle
        padded0 = padded1 = False

        if norm:
            # unit total pT on both sides, so no extra particle is needed
            weights0 /= total0
            weights1 /= total1
            rescale = 1.0

        else:
            # Eq. 1 of the paper: the event with less total pT gets an extra
            # particle carrying the difference
            deficit = total1 - total0
            if deficit > 0:
                padded0 = True
                weights0 = np.hstack((weights0, deficit))
                points0 = np.vstack((points0, np.zeros(points0.shape[1], dtype=np.float64)))
            elif deficit < 0:
                padded1 = True
                weights1 = np.hstack((weights1, -deficit))
                points1 = np.vstack((points1, np.zeros(points1.shape[1], dtype=np.float64)))

            # work in units of the larger total pT for numerical stability
            rescale = max(total0, total1)

        thetas = _cdist_euclidean(points0, points1, periodic_phi, phi_idx)/R

        # the extra particle sits at distance 1 from everything
        if padded0:
            thetas[-1,:] = 1.0
        if padded1:
            thetas[:,-1] = 1.0

        G, cost, _, _, result_code = emd_c(weights0/rescale, weights1/rescale, thetas, n_iter_max)
        check_result(result_code)

        # convert back to the original pT units
        if return_flow:
            return cost * rescale, G * rescale
        return cost * rescale
Example #4
0
    def emd_pot(ev0, ev1, R=1.0, norm=False, beta=1.0, measure='euclidean', coords='hadronic',
                         return_flow=False, gdim=None, mask=False, n_iter_max=100000,
                         periodic_phi=False, phi_col=2, empty_policy='error'):
        r"""Compute the EMD between a pair of events with the Python Optimal
        Transport library.

        **Arguments**

        - **ev0** : _numpy.ndarray_
            - First event as a two-dimensional `(M,1+gdim)` array of particles,
            with `M` the multiplicity and `gdim` the dimension of the ground
            space in which euclidean distances between particles are computed
            (see the `gdim` keyword argument). Column zero is taken to hold
            the energies (or equivalently, the transverse momenta) of the
            particles. In typical hadron collider jet applications each
            particle has the form `(pT,y,phi)`, with `y` the rapidity and
            `phi` the azimuthal angle.
        - **ev1** : _numpy.ndarray_
            - Second event, in the same format as `ev0`.
        - **R** : _float_
            - The R parameter of the EMD definition, which sets the relative
            importance of the two terms. For the EMD to be a valid metric
            satisfying the triangle inequality it must be at least half of
            the maximum ground distance in the space.
        - **norm** : _bool_
            - If `True`, the pT values of each event are normalized before the
            EMD is computed.
        - **beta** : _float_
            - The angular weighting exponent. The internal pairwise distance
            matrix is raised to this power before the optimal transport
            problem is solved.
        - **measure** : _str_
            - Selects the metric used for the ground distances between
            particles. `'euclidean'` is the euclidean metric in however many
            dimensions are provided and specified by `gdim`. `'spherical'`
            is the opening angle between particles on the sphere (not fully
            tested; use cautiously).
        - **coords** : _str_
            - Only relevant when `measure='spherical'`: chooses between
            `'hadronic'` coordinates `(pT,y,phi,[m])` and `'cartesian'`
            coordinates `(E,px,py,pz)` as the expected input.
        - **return_flow** : _bool_
            - If `True`, the flow matrix of the optimal transport found while
            computing the EMD is returned as well. Because the second term in
            Eq. 1 is implemented by adding an extra particle to the event
            with lesser total pT, that particle shows up in the flow matrix.
        - **gdim** : _int_
            - Dimension of the ground metric space; use it to restrict which
            dimensions count as part of the ground space. It may exceed the
            number of dimensions present in the events, in which case all of
            them are used. `None` has no effect.
        - **mask** : _bool_
            - If `True`, particles farther than `R` from the origin are
            ignored.
        - **n_iter_max** : _int_
            - Maximum number of iterations allowed when solving the optimal
            transport problem.
        - **periodic_phi** : _bool_
            - Whether periodicity in the azimuthal angle $\phi$ coordinate
            should be expected and handled. Typically `True` for event-level
            applications; `False` is slightly faster and suitable for jet
            applications where every $\phi$ difference is at most $\pi$.
        - **phi_col** : _int_
            - Index of the column of the event array that holds $\phi$.
        - **empty_policy** : _float_ or `'error'`
            - What to do when an empty event is passed in. With `'error'`, a
            `ValueError` is raised when an empty event is encountered. With a
            float, that value is returned instead.

        **Returns**

        - _float_
            - The EMD value.
        - [_numpy.ndarray_], optional
            - The flow matrix found while solving for the EMD. Entry `(i,j)`
            is the amount of `pT` flowing between particle i in `ev0` and
            particle j in `ev1`.
        """

        _check_params(norm, gdim, phi_col, measure, coords, empty_policy)

        # translate the string options into flags for the helpers
        use_euclidean = (measure == 'euclidean')
        to_cartesian = (not use_euclidean) and (coords == 'hadronic')
        raise_on_empty = (empty_policy == 'error')

        # the helpers take the phi column index shifted down by one
        # (presumably because the pT column is split off from the coordinates)
        phi_idx = phi_col - 1

        # split each event into its pT weights and ground-space coordinates
        prep_args = (None, gdim, periodic_phi, phi_idx,
                     mask, R, to_cartesian, use_euclidean, raise_on_empty)
        weights0, points0 = _process_for_emd(ev0, *prep_args)
        weights1, points1 = _process_for_emd(ev1, *prep_args)

        # an empty event short-circuits to the user-supplied value
        if weights0 is None or weights1 is None:
            return (empty_policy, np.zeros((0,0))) if return_flow else empty_policy

        total0, total1 = weights0.sum(), weights1.sum()

        # which event, if any, received the extra particle
        padded0 = padded1 = False

        if norm:
            # unit total pT on both sides, so no extra particle is needed
            weights0 /= total0
            weights1 /= total1
            rescale = 1.0

        else:
            # Eq. 1 of the paper: the event with less total pT gets an extra
            # particle carrying the difference
            deficit = total1 - total0
            if deficit > 0:
                padded0 = True
                weights0 = np.hstack((weights0, deficit))
                points0 = np.vstack((points0, np.zeros(points0.shape[1], dtype=np.float64)))
            elif deficit < 0:
                padded1 = True
                weights1 = np.hstack((weights1, -deficit))
                points1 = np.vstack((points1, np.zeros(points1.shape[1], dtype=np.float64)))

            # work in units of the larger total pT for numerical stability
            rescale = max(total0, total1)

        thetas = _cdist(points0, points1, use_euclidean, periodic_phi, phi_idx)/R

        # the extra particle sits at distance 1 from everything
        if padded0:
            thetas[-1,:] = 1.0
        if padded1:
            thetas[:,-1] = 1.0

        # angular exponent
        if beta != 1:
            thetas **= beta

        G, cost, _, _, result_code = emd_c(weights0/rescale, weights1/rescale, thetas, n_iter_max)
        check_result(result_code)

        # convert back to the original pT units
        emd_value = cost * rescale
        if not return_flow:
            return emd_value

        G *= rescale
        return emd_value, G