def _emd(ev0, ev1, R, norm, n_iter_max, periodic_phi, phi_col):
    """Compute the EMD between two already-processed events.

    **Arguments**

    - **ev0**, **ev1** : `(pTs, coords)` pairs. When `norm` is falsy the
      last entry of each is expected to be a placeholder "extra particle"
      with zero pT (it must already have been appended by the caller).
    - **R** : value the ground distances are divided by.
    - **norm** : if truthy, the pTs are used as given; otherwise the pT
      imbalance is assigned to the placeholder particle of the lighter
      event for the duration of the solve.
    - **n_iter_max** : forwarded to `emd_c`.
    - **periodic_phi**, **phi_col** : forwarded to `_cdist_euclidean`.

    **Returns**

    - The cost reported by `emd_c`, converted back to the input pT units.
    """

    pTs0, coords0 = ev0
    pTs1, coords1 = ev1

    thetas = _cdist_euclidean(coords0, coords1, periodic_phi, phi_col) / R

    # extra particles (with zero pt) already added if going in here
    rescale = 1.0
    if not norm:
        pT0, pT1 = pTs0.sum(), pTs1.sum()
        pTdiff = pT1 - pT0

        # the placeholder particles sit at unit distance from everything
        thetas[:, -1] = 1.0
        thetas[-1, :] = 1.0

        # change units for numerical stability
        rescale = max(pT0, pT1)

        # Mutate the (shared) input arrays last, right before the guarded
        # region, so nothing can fail between the write and the reset.
        if pTdiff > 0:
            pTs0[-1] = pTdiff
        elif pTdiff < 0:
            pTs1[-1] = -pTdiff

    try:
        # compute the emd with POT
        _, cost, _, _, result_code = emd_c(pTs0 / rescale, pTs1 / rescale,
                                           thetas, n_iter_max)
        check_result(result_code)
    finally:
        # important! must reset extra particles to have pt zero, even if the
        # solver or the result check raised, otherwise the input arrays stay
        # corrupted for any later use
        if not norm:
            pTs0[-1] = pTs1[-1] = 0

    return cost * rescale
def _emd(ev0, ev1, R, no_norm, beta, euclidean, n_iter_max,
         periodic_phi, phi_col, empty_policy):
    """Compute the EMD between two already-processed events.

    **Arguments**

    - **ev0**, **ev1** : `(pTs, coords)` pairs. A `None` pT array marks an
      empty event. When `no_norm` is truthy the last entry of each event is
      expected to be a placeholder "extra particle" with zero pT (it must
      already have been appended by the caller).
    - **R** : value the ground distances are divided by.
    - **no_norm** : if truthy, the pT imbalance is assigned to the
      placeholder particle of the lighter event for the duration of the
      solve; otherwise the pTs are used as given.
    - **beta** : exponent applied to the scaled ground distances.
    - **euclidean**, **periodic_phi**, **phi_col** : forwarded to `_cdist`.
    - **n_iter_max** : forwarded to `emd_c`.
    - **empty_policy** : value returned when either event is empty.

    **Returns**

    - The cost reported by `emd_c`, converted back to the input pT units,
      or `empty_policy` if either event is empty.
    """

    pTs0, coords0 = ev0
    pTs1, coords1 = ev1

    # empty events short-circuit before any array work
    if pTs0 is None or pTs1 is None:
        return empty_policy

    thetas = _cdist(coords0, coords1, euclidean, periodic_phi, phi_col) / R

    # implement angular exponent
    if beta != 1:
        thetas **= beta

    # extra particles (with zero pt) already added if going in here
    rescale = 1.0
    if no_norm:
        pT0, pT1 = pTs0.sum(), pTs1.sum()
        pTdiff = pT1 - pT0

        # the placeholder particles sit at unit distance from everything
        thetas[:, -1] = 1.0
        thetas[-1, :] = 1.0

        # change units for numerical stability
        rescale = max(pT0, pT1)

        # Mutate the (shared) input arrays last, right before the guarded
        # region, so nothing can fail between the write and the reset.
        if pTdiff > 0:
            pTs0[-1] = pTdiff
        elif pTdiff < 0:
            pTs1[-1] = -pTdiff

    try:
        # compute the emd with POT
        _, cost, _, _, result_code = emd_c(pTs0 / rescale, pTs1 / rescale,
                                           thetas, n_iter_max)
        check_result(result_code)
    finally:
        # important! must reset extra particles to have pt zero, even if the
        # solver or the result check raised, otherwise the input arrays stay
        # corrupted for any later use
        if no_norm:
            pTs0[-1] = pTs1[-1] = 0

    return cost * rescale
def emd(ev0, ev1, R=1.0, norm=False, return_flow=False, gdim=None,
        n_iter_max=100000, periodic_phi=False, phi_col=2):
    r"""Compute the EMD between two events.

    **Arguments**

    - **ev0** : _numpy.ndarray_
        - The first event, given as a two-dimensional array. The event is
        assumed to be an `(M,1+gdim)` array of particles, where `M` is the
        multiplicity and `gdim` is the dimension of the ground space in
        which to compute euclidean distances between particles (as
        specified by the `gdim` keyword argument). The zeroth column is
        assumed to be the energies (or equivalently, the transverse
        momenta) of the particles. For typical hadron collider jet
        applications, each particle will be of the form `(pT,y,phi)` where
        `y` is the rapidity and `phi` is the azimuthal angle.
    - **ev1** : _numpy.ndarray_
        - The other event, same format as **ev0**.
    - **R** : _float_
        - The R parameter in the EMD definition that controls the relative
        importance of the two terms. Must be greater than or equal to half
        of the maximum ground distance in the space in order for the EMD
        to be a valid metric.
    - **norm** : _bool_
        - Whether or not to normalize the pT values of the events prior to
        computing the EMD.
    - **return_flow** : _bool_
        - Whether or not to return the flow matrix describing the optimal
        transport found during the computation of the EMD. Note that since
        the second term in Eq. 1 is implemented by including an additional
        particle in the event with lesser total pT, this will be reflected
        in the flow matrix.
    - **gdim** : _int_
        - The dimension of the ground metric space. Useful for restricting
        which dimensions are considered part of the ground space. Can be
        larger than the number of dimensions present in the events (in
        which case all dimensions will be included). If `None`, has no
        effect.
    - **n_iter_max** : _int_
        - Maximum number of iterations for solving the optimal transport
        problem.
    - **periodic_phi** : _bool_
        - Whether to expect (and therefore properly handle) periodicity in
        the coordinate corresponding to the azimuthal angle $\phi$. Should
        typically be `True` for event-level applications but can be set to
        `False` (which is slightly faster) for jet applications where all
        $\phi$ differences are less than or equal to $\pi$.
    - **phi_col** : _int_
        - The index of the column of $\phi$ values in the event array.

    **Returns**

    - _float_
        - The EMD value.
    - [_numpy.ndarray_], optional
        - The flow matrix found while solving for the EMD. The `(i,j)`th
        entry is the amount of `pT` that flows between particle i in `ev0`
        and particle j in `ev1`.
    """

    # parameter checks
    _check_params(norm, gdim, phi_col)

    # handle periodicity: the coords arrays no longer carry the pT column
    phi_col_m1 = phi_col - 1

    # process events
    pTs0, coords0 = _process_for_emd(ev0, None, gdim, periodic_phi, phi_col_m1)
    pTs1, coords1 = _process_for_emd(ev1, None, gdim, periodic_phi, phi_col_m1)

    pT0, pT1 = pTs0.sum(), pTs1.sum()

    # which side (if any) received an extra particle
    pad_ev0 = pad_ev1 = False

    # if norm, then we normalize the pts to 1
    if norm:
        # Divide out-of-place: `_process_for_emd` is defined elsewhere and
        # may return arrays sharing memory with the caller's events
        # (NOTE(review): confirm), which an in-place `/=` would overwrite.
        pTs0 = pTs0 / pT0
        pTs1 = pTs1 / pT1
        rescale = 1.0

    # implement the EMD in Eq. 1 of the paper by adding an appropriate extra
    # particle
    else:
        pTdiff = pT1 - pT0
        extra_coords = None
        if pTdiff > 0:
            pad_ev0 = True
            pTs0 = np.hstack((pTs0, pTdiff))
            extra_coords = np.zeros(coords0.shape[1], dtype=np.float64)
            coords0 = np.vstack((coords0, extra_coords))
        elif pTdiff < 0:
            pad_ev1 = True
            pTs1 = np.hstack((pTs1, -pTdiff))
            extra_coords = np.zeros(coords1.shape[1], dtype=np.float64)
            coords1 = np.vstack((coords1, extra_coords))
        # otherwise the pts were exactly equal already so no need to add a
        # particle

        # change units for numerical stability
        rescale = max(pT0, pT1)

    thetas = _cdist_euclidean(coords0, coords1, periodic_phi, phi_col_m1) / R

    # the extra particle is at unit (scaled) distance from every particle
    if pad_ev0:
        thetas[-1, :] = 1.0
    elif pad_ev1:
        thetas[:, -1] = 1.0

    G, cost, _, _, result_code = emd_c(pTs0 / rescale, pTs1 / rescale,
                                       thetas, n_iter_max)
    check_result(result_code)

    # need to change units back
    if return_flow:
        return cost * rescale, G * rescale
    return cost * rescale
def emd_pot(ev0, ev1, R=1.0, norm=False, beta=1.0, measure='euclidean',
            coords='hadronic', return_flow=False, gdim=None, mask=False,
            n_iter_max=100000, periodic_phi=False, phi_col=2,
            empty_policy='error'):
    r"""Compute the EMD between two events using the Python Optimal
    Transport library.

    **Arguments**

    - **ev0** : _numpy.ndarray_
        - The first event, given as a two-dimensional array. The event is
        assumed to be an `(M,1+gdim)` array of particles, where `M` is the
        multiplicity and `gdim` is the dimension of the ground space in
        which to compute euclidean distances between particles (as
        specified by the `gdim` keyword argument). The zeroth column is
        assumed to be the energies (or equivalently, the transverse
        momenta) of the particles. For typical hadron collider jet
        applications, each particle will be of the form `(pT,y,phi)` where
        `y` is the rapidity and `phi` is the azimuthal angle.
    - **ev1** : _numpy.ndarray_
        - The other event, same format as `ev0`.
    - **R** : _float_
        - The R parameter in the EMD definition that controls the relative
        importance of the two terms. Must be greater than or equal to half
        of the maximum ground distance in the space in order for the EMD
        to be a valid metric satisfying the triangle inequality.
    - **norm** : _bool_
        - Whether or not to normalize the pT values of the events prior to
        computing the EMD.
    - **beta** : _float_
        - The angular weighting exponent. The internal pairwise distance
        matrix is raised to this power prior to solving the optimal
        transport problem.
    - **measure** : _str_
        - Controls which metric is used to calculate the ground distances
        between particles. `'euclidean'` uses the euclidean metric in
        however many dimensions are provided and specified by `gdim`.
        `'spherical'` uses the opening angle between particles on the
        sphere (note that this is not fully tested and should be used
        cautiously).
    - **coords** : _str_
        - Only has an effect if `measure='spherical'`, in which case it
        controls if `'hadronic'` coordinates `(pT,y,phi,[m])` are expected
        versus `'cartesian'` coordinates `(E,px,py,pz)`.
    - **return_flow** : _bool_
        - Whether or not to return the flow matrix describing the optimal
        transport found during the computation of the EMD. Note that since
        the second term in Eq. 1 is implemented by including an additional
        particle in the event with lesser total pT, this will be reflected
        in the flow matrix.
    - **gdim** : _int_
        - The dimension of the ground metric space. Useful for restricting
        which dimensions are considered part of the ground space. Can be
        larger than the number of dimensions present in the events (in
        which case all dimensions will be included). If `None`, has no
        effect.
    - **mask** : _bool_
        - If `True`, ignores particles farther than `R` away from the
        origin.
    - **n_iter_max** : _int_
        - Maximum number of iterations for solving the optimal transport
        problem.
    - **periodic_phi** : _bool_
        - Whether to expect (and therefore properly handle) periodicity in
        the coordinate corresponding to the azimuthal angle $\phi$. Should
        typically be `True` for event-level applications but can be set to
        `False` (which is slightly faster) for jet applications where all
        $\phi$ differences are less than or equal to $\pi$.
    - **phi_col** : _int_
        - The index of the column of $\phi$ values in the event array.
    - **empty_policy** : _float_ or `'error'`
        - Controls behavior if an empty event is passed in. When set to
        `'error'`, a `ValueError` is raised if an empty event is
        encountered. If set to a float, that value is returned instead on
        an empty event.

    **Returns**

    - _float_
        - The EMD value.
    - [_numpy.ndarray_], optional
        - The flow matrix found while solving for the EMD. The `(i,j)`th
        entry is the amount of `pT` that flows between particle i in `ev0`
        and particle j in `ev1`. An empty `(0,0)` array is returned in its
        place when an empty event triggers `empty_policy`.
    """

    # parameter checks
    _check_params(norm, gdim, phi_col, measure, coords, empty_policy)
    euclidean = (measure == 'euclidean')
    hadr2cart = (not euclidean) and (coords == 'hadronic')
    error_on_empty = (empty_policy == 'error')

    # handle periodicity: the coords arrays no longer carry the pT column
    phi_col_m1 = phi_col - 1

    # process events
    args = (None, gdim, periodic_phi, phi_col_m1, mask, R,
            hadr2cart, euclidean, error_on_empty)
    pTs0, coords0 = _process_for_emd(ev0, *args)
    pTs1, coords1 = _process_for_emd(ev1, *args)

    # a `None` pT array signals an empty event
    if pTs0 is None or pTs1 is None:
        if return_flow:
            return empty_policy, np.zeros((0, 0))
        return empty_policy

    pT0, pT1 = pTs0.sum(), pTs1.sum()

    # which side (if any) received an extra particle
    pad_ev0 = pad_ev1 = False

    # if norm, then we normalize the pts to 1
    if norm:
        # Divide out-of-place: `_process_for_emd` is defined elsewhere and
        # may return arrays sharing memory with the caller's events
        # (NOTE(review): confirm), which an in-place `/=` would overwrite.
        pTs0 = pTs0 / pT0
        pTs1 = pTs1 / pT1
        rescale = 1.0

    # implement the EMD in Eq. 1 of the paper by adding an appropriate extra
    # particle
    else:
        pTdiff = pT1 - pT0
        if pTdiff > 0:
            pad_ev0 = True
            pTs0 = np.hstack((pTs0, pTdiff))
            coords0 = np.vstack((coords0,
                                 np.zeros(coords0.shape[1], dtype=np.float64)))
        elif pTdiff < 0:
            pad_ev1 = True
            pTs1 = np.hstack((pTs1, -pTdiff))
            coords1 = np.vstack((coords1,
                                 np.zeros(coords1.shape[1], dtype=np.float64)))
        # otherwise the pts were exactly equal already so no need to add a
        # particle

        # change units for numerical stability
        rescale = max(pT0, pT1)

    thetas = _cdist(coords0, coords1, euclidean, periodic_phi, phi_col_m1) / R

    # the extra particle is at unit (scaled) distance from every particle
    if pad_ev0:
        thetas[-1, :] = 1.0
    elif pad_ev1:
        thetas[:, -1] = 1.0

    # implement angular exponent
    if beta != 1:
        thetas **= beta

    G, cost, _, _, result_code = emd_c(pTs0 / rescale, pTs1 / rescale,
                                       thetas, n_iter_max)
    check_result(result_code)

    # need to change units back
    if return_flow:
        G *= rescale
        return cost * rescale, G
    return cost * rescale