Esempio n. 1
0
def test_latent_py_warning():
    '''Feed compute_py wrongly shaped `ps` arrays.

    The calls run inside `pytest.warns(UserWarning)` nested in
    `pytest.raises(TypeError)`, so the test expects a UserWarning to be
    emitted and a TypeError to escape. As soon as one call raises, the
    remaining input is never tried.
    '''
    _, _, inv = test_latent_py_ps_inv()
    malformed_ps = (
        np.array([[[0.1, 0.3, 0.6]]]),    # 3-D array, shape (1, 1, 3)
        np.array([[0.1], [0.2], [0.7]]),  # column vector, shape (3, 1)
    )
    with pytest.raises(TypeError), pytest.warns(UserWarning):
        for bad_ps in malformed_ps:
            latent_algebra.compute_py(
                ps=bad_ps,
                noise_matrix=nm,
                inverse_noise_matrix=inv,
            )
Esempio n. 2
0
def converge_estimates(
    ps,
    py,
    noise_matrix,
    inverse_noise_matrix,
    inv_noise_matrix_iterations=5,
    noise_matrix_iterations=3,
):
    '''Iteratively reconcile py := P(y=k), the noise matrix and the inverse
    noise matrix so that they agree numerically with ps := P(s=k).

    The three quantities are estimated independently, yet they are tied
    together by closed-form identities. Repeatedly recomputing one from the
    others pushes them toward mutual consistency.

    py and the inverse noise matrix P(y=k_y|s=k_s) determine one another.
    When poor estimation leaves them in disagreement, alternating between
    the two for a small number of rounds (3-10 works fine) brings them into
    line.

    Keep the iteration counts low: every round re-applies small deviations,
    which may be amplified and degrade the result.

    Ordering matters. The inverse noise matrix depends on py (latent and
    changing), whereas the noise matrix depends on ps (known and fixed).
    Hence py and the inverse noise matrix are converged first in the inner
    loop, and only then is the noise matrix refreshed in the outer loop.


    Parameters
    ----------

    ps : np.array (shape (K, ) or (1, K))
        The fraction (prior probability) of each observed, noisy class
        label, P(s = k).

    py : np.array
        Initial estimate of the latent prior P(y = k) of the true labels.

    noise_matrix : np.array of shape (K, K), K = number of classes
        A conditional probability matrix of the form P(s=k_s|y=k_y)
        containing the fraction of examples in every class, labeled as
        every other class. Assumes columns of noise_matrix sum to 1.

    inverse_noise_matrix : np.array of shape (K, K), K = number of classes
        A conditional probability matrix of the form P(y=k_y|s=k_s)
        representing the estimated fraction of observed examples in each
        class k_s that are mislabeled examples from every other class k_y.
        Assumes columns of inverse_noise_matrix sum to 1.
        Note: whenever both iteration counts are at least 1, this argument
        is overwritten on the first inner pass before it is ever read.

    inv_noise_matrix_iterations : int
        Number of inner rounds alternating between the inverse noise matrix
        and py, per outer round.

    noise_matrix_iterations : int
        Number of outer rounds; the noise matrix is recomputed once at the
        end of each.

    Output
    ------
        A tuple (py, noise_matrix, inverse_noise_matrix). If
        noise_matrix_iterations is 0 the inputs are returned untouched.'''

    for _outer_round in range(noise_matrix_iterations):
        # Inner fixed-point pass: alternate P(y|s) and p(y) until the
        # budget of rounds is spent.
        for _inner_round in range(inv_noise_matrix_iterations):
            inverse_noise_matrix = compute_inv_noise_matrix(
                py, noise_matrix, ps)
            py = compute_py(ps, noise_matrix, inverse_noise_matrix)
        # With py and P(y|s) settled, rebuild P(s|y) from them.
        noise_matrix = compute_noise_matrix_from_inverse(
            ps, inverse_noise_matrix, py)

    return py, noise_matrix, inverse_noise_matrix
Esempio n. 3
0
def test_compute_py_err():
    '''Exercise compute_py with py_method='marginal_ps' and no y_count.

    The check is tolerant: if the call succeeds, nothing is asserted. If it
    raises ValueError, the message must mention 'y_count' and an identical
    second call must raise ValueError again.
    '''
    ps, _, inv = test_latent_py_ps_inv()
    call_kwargs = dict(
        ps=ps,
        noise_matrix=nm,
        inverse_noise_matrix=inv,
        py_method='marginal_ps',
    )
    try:
        latent_algebra.compute_py(**call_kwargs)
    except ValueError as err:
        assert 'y_count' in str(err)
        # The failure must be reproducible on a repeated call.
        with pytest.raises(ValueError):
            latent_algebra.compute_py(**call_kwargs)
Esempio n. 4
0
def test_compute_py_marginal_ps():
    '''compute_py with py_method='marginal_ps' and an explicit y_count
    should match the reference py to within 1e-2 in every entry.'''
    ps, py, inv = test_latent_py_ps_inv()
    # Scale nm by ps and by the number of labels to obtain a count matrix,
    # then sum over axis 0 to get the y_count vector passed to compute_py.
    joint_counts = nm * ps * len(s)
    per_class_counts = joint_counts.sum(axis=0)
    estimated_py = latent_algebra.compute_py(
        ps=ps,
        noise_matrix=nm,
        inverse_noise_matrix=inv,
        py_method='marginal_ps',
        y_count=per_class_counts,
    )
    within_tolerance = abs(py - estimated_py) < 1e-2
    assert all(within_tolerance)
Esempio n. 5
0
def test_latent_py():
    '''compute_py with its default options should reproduce the reference
    py returned by test_latent_py_ps_inv to within 1e-3 in every entry.'''
    ps, expected_py, inv = test_latent_py_ps_inv()
    actual_py = latent_algebra.compute_py(ps, nm, inv)
    deviation = abs(expected_py - actual_py)
    assert np.all(deviation < 1e-3)
Esempio n. 6
0
def estimate_latent(
    confident_joint,
    s,
    py_method='cnt',
    converge_latent_estimates=False,
):
    r'''Computes the latent prior p(y), the noise matrix P(s|y) and the
    inverse noise matrix P(y|s) from the `confident_joint` count(s, y). The
    `confident_joint` is estimated by `compute_confident_joint` by counting
    confident examples.

    Parameters
    ----------

    confident_joint : np.array (shape (K, K), type int)
        A K,K integer matrix of count(s=k, y=k). Estimates a confident
        subset of the joint distribution of the noisy and true labels
        P_{s,y}. Each entry in the matrix contains the number of examples
        confidently counted into every pair (s=j, y=k) classes.

    s : np.array
        A discrete vector of labels, s, which may contain mislabeling. "s"
        denotes the noisy label instead of \tilde(y), for ASCII encoding
        reasons.

    py_method : str (Options: ["cnt", "eqn", "marginal", "marginal_ps"])
        How to compute the latent prior p(y=k). Default is "cnt" as it often
        works well even when the noise matrices are estimated poorly by
        using the matrix diagonals instead of all the probabilities.

    converge_latent_estimates : bool
        If true, forces numerical consistency of estimates. Each is
        estimated independently, but they are related mathematically with
        closed form equivalences. This will iteratively make them
        mathematically consistent.

    Returns
    -------
        A tuple containing (py, noise_matrix, inv_noise_matrix).'''

    # 'ps' is p(s=k). `s` is handed to value_counts exactly as received (no
    # array conversion beforehand) so its behavior is unchanged.
    ps = value_counts(s) / float(len(s))
    # Number of training examples confidently counted from each noisy class
    s_count = confident_joint.sum(axis=1).astype(float)
    # Number of training examples confidently counted into each true class
    y_count = confident_joint.sum(axis=0).astype(float)
    # NOTE(review): a class with zero confident counts yields a zero
    # denominator in the two divisions below; presumably callers guarantee
    # non-empty rows/columns -- confirm.
    # Confident Counts Estimator for p(s=k_s|y=k_y) ~ |s=k_s and y=k_y| / |y=k_y|
    # (broadcasting divides column k_y by y_count[k_y]).
    noise_matrix = confident_joint / y_count
    # Confident Counts Estimator for p(y=k_y|s=k_s) ~ |y=k_y and s=k_s| / |s=k_s|
    # (after the transpose, column k_s is divided by s_count[k_s]).
    inv_noise_matrix = confident_joint.T / s_count
    # Compute the prior p(y), the latent (uncorrupted) class distribution.
    # This uses the raw, not-yet-clipped matrices.
    py = compute_py(ps, noise_matrix, inv_noise_matrix, py_method, y_count)
    # Clip noise rates to be valid probabilities.
    noise_matrix = clip_noise_rates(noise_matrix)
    inv_noise_matrix = clip_noise_rates(inv_noise_matrix)
    # Make latent estimates mathematically agree in their algebraic relations.
    if converge_latent_estimates:
        py, noise_matrix, inv_noise_matrix = converge_estimates(
            ps, py, noise_matrix, inv_noise_matrix)
        # The convergence pass may push values out of range again, so
        # re-clip py (to [1e-5, 1.0], renormalized via new_sum=1.0) and
        # both noise matrices.
        py = clip_values(py, low=1e-5, high=1.0, new_sum=1.0)
        noise_matrix = clip_noise_rates(noise_matrix)
        inv_noise_matrix = clip_noise_rates(inv_noise_matrix)

    return py, noise_matrix, inv_noise_matrix