Example #1
0
    def test_ABC_to_xy_yscale(self):
        """Check that ABC_to_xy applies `yscale` and that xy_to_ABC undoes it."""
        scales = [1.0, 2.0, np.sqrt(3) / 2]
        for yscale in scales:
            projected = ABC_to_xy(self.ABC, yscale=yscale)
            # forward: expected output is the xy fixture with column 1
            # multiplied by the scale factor
            target = self.xy.copy()
            target[:, 1] *= yscale
            self.assertTrue(np.allclose(projected, target))
            # inverse: converting back with the same scale should give the
            # closed ABC fixture
            recovered = xy_to_ABC(projected, yscale=yscale)
            self.assertTrue(np.allclose(recovered, close(self.ABC)))
Example #2
0
def random_compositional_trend(m1, m2, c1, c2, resolution=20, size=1000):
    """
    Draw random compositions along a trend joining two end-member compositions.

    The end-member means `m1` and `m2` are closed and ilr-transformed, then the
    transformed means and the covariance matrices `c1` and `c2` are linearly
    interpolated over `resolution` steps. At every step ``size // resolution``
    samples are drawn from a multivariate normal distribution, so the result
    has ``resolution * (size // resolution)`` rows in total. The stacked
    samples are passed through `inverse_ilr` before being returned.
    """

    def _ramp(endpoints):
        # linear ramp from the first to the second entry of a two-element column
        start, stop = endpoints
        return np.linspace(start, stop, resolution)

    # step-wise means: end-members stacked as two rows, interpolated per column
    mean_ends = np.vstack([ilr(close(m)).reshape(1, -1) for m in (m1, m2)])
    means = np.apply_along_axis(_ramp, 0, mean_ends)
    # step-wise covariances: interpolate the flattened matrices element-wise,
    # then restore the original matrix shape for each step
    cov_ends = np.vstack([c.reshape(1, -1) for c in (c1, c2)])
    flat_covs = np.apply_along_axis(_ramp, 0, cov_ends)
    covs = flat_covs.reshape(flat_covs.shape[0], *c1.shape)
    # sample each step in turn
    draws = [
        np.random.multivariate_normal(mean.flatten(), cov, size=size // resolution)
        for mean, cov in zip(means, covs)
    ]
    # stack every step together and invert the transform
    return inverse_ilr(np.vstack(draws))
Example #3
0
 def test_tfm_inversion_ABCxy(self):
     """Round-tripping ABC -> xy -> ABC should give back the closed ABC data."""
     xy = ABC_to_xy(self.ABC)
     roundtrip = xy_to_ABC(xy)
     expected = close(self.ABC)
     self.assertTrue(np.allclose(roundtrip, expected))
Example #4
0
 def test_xy_to_ABC(self):
     """xy_to_ABC applied to the xy fixture should match the closed ABC fixture."""
     converted = xy_to_ABC(self.xy)
     matches = np.allclose(converted, close(self.ABC))
     self.assertTrue(matches)
Example #5
0
def EMCOMP(
    X,
    threshold=None,
    tol=0.0001,
    convergence_metric=lambda A, B, t: np.linalg.norm(np.abs(A - B)) < t,
    max_iter=30,
):
    r"""
    Replace rounded zeros in a compositional dataset using the EMCOMP algorithm.

    Zeros are imputed based on a set of per-component thresholds, after
    Palarea-Albaladejo and Martín-Fernández (2008) [#ref_1]_.

    Parameters
    ----------
    X : :class:`numpy.ndarray`
        Two-dimensional dataset with rounded zeros. The input is copied before use.
    threshold : :class:`numpy.ndarray`
        Array of threshold values for each component as a proportion.
        Although the default is :code:`None`, an array must be supplied: the
        value is indexed and log-transformed directly when the censure points
        are computed.
    tol : :class:`float`
        Tolerance to check for convergence.
    convergence_metric : :class:`callable`
        Callable function to check for convergence. It needs to accept two
        :class:`numpy.ndarray` arguments and a third tolerance argument, and is
        called once for the mean vector and once for the covariance matrix.
        The default compares :func:`numpy.linalg.norm` of the absolute
        difference against the tolerance.
    max_iter : :class:`int`
        Maximum number of iterations. Iteration stops once this count is
        reached, whether or not convergence was achieved; no error is raised.

    Returns
    -------
    X_est : :class:`numpy.ndarray`
        Dataset with rounded zeros replaced.
    prop_zeros : :class:`float`
        Proportion of entries which are non-finite once zeros have been
        converted to missing values.
    n_iters : :class:`int`
        Number of iterations performed.

    Notes
    -----

        * At least one component without missing values is needed for the divisor.
          Rounded zeros/missing values are replaced by values below their respective
          detection limits.

        * This routine is not completely numerically stable as written.

        * Internal consistency checks use :code:`assert` statements.

    Todo
    ----
        * Implement methods to deal with variable detection limits (i.e. thresholds are array shape :code:`(N, D)`)
        * Consider non-normal models for data distributions.
        * Improve numerical stability to reduce the chance of :code:`np.inf` appearing.

    References
    ----------
    .. [#ref_1] Palarea-Albaladejo J. and Martín-Fernández J. A. (2008)
            A modified EM ALR-algorithm for replacing rounded zeros in compositional data sets.
            Computers & Geosciences 34, 902–917.
            doi: `10.1016/j.cageo.2007.09.015 <https://dx.doi.org/10.1016/j.cageo.2007.09.015>`__

    """
    X = X.copy()
    n_obs, D = X.shape
    X = close(X, sumf=np.nansum)
    # ---------------------------------
    # Convert zeros into missing values
    # ---------------------------------
    X = np.where(np.isclose(X, 0.0), np.nan, X)
    # Use a divisor free of missing values
    assert np.isfinite(X).all(axis=0).any()
    # index of the first column which has no missing values
    pos = np.argmax(np.isfinite(X).all(axis=0))
    Yden = X[:, pos]
    # --------------------------------------
    # Compute the matrix of censure points Ψ
    # --------------------------------------
    # need an equivalent concept for ilr
    # log(threshold) - log(divisor) for every row/component, nudged down slightly
    cpoints = (
        np.ones((n_obs, 1)) @ np.log(threshold[np.newaxis, :])
        - np.log(Yden[:, np.newaxis]) @ np.ones((1, D))
        - np.spacing(1.0)  # Machine epsilon
    )
    assert np.isfinite(cpoints).all()
    # drop the divisor column so the columns line up with those of Y
    cpoints = cpoints[:, [i for i in range(D) if not i == pos]]  # censure points
    prop_zeroes = np.count_nonzero(~np.isfinite(X)) / (n_obs * D)
    # presumably the additive log-ratio transform using column `pos` as the
    # divisor - confirm against ALR
    Y = ALR(X, pos)
    # ---------------Log Space--------------------------------
    LD = Y.shape[1]
    M = np.nanmean(Y, axis=0)  # μ0
    C = nancov(Y)  # Σ0
    assert np.isfinite(M).all() and np.isfinite(C).all()

    # --------------------------------------------------
    # Stage 2: Find and enumerate missing data patterns
    # --------------------------------------------------
    # pID is used below as a per-row pattern number; pD maps each pattern number
    # to a record whose "pattern" entry is a boolean mask of missing variables
    pID, pD = md_pattern(Y)
    # -------------------------------------------
    # Stage 3: Regression against other variables
    # -------------------------------------------
    logger.debug(
        "Starting Iterative Regression for Matrix : ({}, {})".format(n_obs, LD)
    )
    another_iter = True
    niters = 0
    while another_iter:
        niters += 1
        # parameters as they stand at the start of this iteration; these are used
        # for the regression and later compared against the updated M and C
        Mnew, Cnew = M.copy(), C.copy()
        Ystar = Y.copy()
        # accumulates the residual covariance contributed by imputed values
        V = np.zeros((LD, LD))

        for p_no in np.unique(pID):
            logger.debug("Pattern ID: {}, {}".format(p_no, pD[p_no]["pattern"]))
            rows = np.arange(pID.size)[pID == p_no]  # rows with this pattern
            # column indices of observed and missing variables for this pattern
            varobs, varmiss = (
                np.arange(D - 1)[~pD[p_no]["pattern"]],
                np.arange(D - 1)[pD[p_no]["pattern"]],
            )
            sigmas = np.zeros((LD))
            assert np.isfinite(Y[np.ix_(rows, varobs)]).all()
            assert (~np.isfinite(Y[np.ix_(rows, varmiss)])).all()
            # patterns with nothing missing, or with nothing observed, are skipped
            if varobs.size and varmiss.size:  # Non-completely missing, but missing some
                logger.debug(
                    "Regressing {} rows for pattern {} | {}.".format(
                        rows.size,
                        "".join(varobs.astype(str)),
                        "".join(varmiss.astype(str)),
                    )
                )
                # B: intercept row followed by one coefficient row per observed
                # variable; σ2_res: residual covariance of the missing variables
                # (shapes as asserted below)
                B, σ2_res = _reg_sweep(Mnew, Cnew, varobs)
                assert B.shape == (varobs.size + 1, varmiss.size)
                assert σ2_res.shape == (varmiss.size, varmiss.size)
                assert np.isfinite(B).all()
                logger.debug(
                    "Current Estimator (1, {})".format(
                        ", ".join(["β{}".format(i) for i in range(B.shape[0] - 1)])
                    )
                )

                # regression estimate: intercept plus observed values times slopes
                Ystar[np.ix_(rows, varmiss)] = np.ones((rows.size, 1)) * B[0, :] + (
                    (Y[np.ix_(rows, varobs)] @ B[1 : (varobs.size + 1), :])
                )
                sigmas[varmiss] = np.sqrt(np.diag(σ2_res))
                assert np.isfinite(sigmas[varmiss]).all()

                x = (  # position of threshold values relative to estimated means
                    cpoints[np.ix_(rows, varmiss)] - Ystar[np.ix_(rows, varmiss)]
                )
                x /= sigmas[varmiss][np.newaxis, :]  # as standard deviations
                assert np.isfinite(x).all()
                # ----------------------------------------------------
                # Calculate inverse Mills Ratio for Heckman correction
                # ----------------------------------------------------
                ϕ = stats.norm.pdf(x, loc=0, scale=1)  # pdf
                Φ = stats.norm.cdf(x, loc=0, scale=1)  # cdf
                # replace near-zero cdf values to avoid dividing by zero below
                Φ[np.isclose(Φ, 0)] = np.finfo(np.float64).eps * 2
                assert (Φ > 0).all()  # if its not, infinity will be introduced
                inversemills = ϕ / Φ
                # shift the estimates downwards by sigma * inverse Mills ratio
                Ystar[np.ix_(rows, varmiss)] = (
                    Ystar[np.ix_(rows, varmiss)] - sigmas[varmiss] * inversemills
                )
                # weight the residual covariance by the number of rows imputed
                V[np.ix_(varmiss, varmiss)] += σ2_res * rows.size
        assert np.isfinite(V).all()
        # -----------------------------------------------
        # Update and store parameter vector (μ(t), Σ(t)).
        # -----------------------------------------------
        logger.debug("Regression finished.")
        M = np.nanmean(Ystar, axis=0)
        Ydevs = Ystar - np.ones((n_obs, 1)) * M
        Ydevs[~np.isfinite(Ydevs)] = 0.0  # remove nonfinite components
        PC = np.dot(Ydevs.T, Ydevs)
        logger.debug("Correlation:\n{}".format(PC / (n_obs - 1)))
        C = (PC + V) / (n_obs - 1)

        logger.debug("Average diff: {}".format(np.mean(Ydevs, axis=0)))
        assert np.isfinite(C).all()
        # --------------------
        # Convergence checking
        # --------------------
        # both the mean vector and the covariance matrix must satisfy the metric
        if convergence_metric(M, Mnew, tol) & convergence_metric(C, Cnew, tol):
            another_iter = False
            logger.debug("Convergence achieved.")

        # stop regardless of convergence once max_iter iterations have run
        another_iter = another_iter & (niters < max_iter)
        logger.debug("Iterations Continuing: {}".format(another_iter))
    # ----------------------------
    # Back to compositional space
    # ---------------------------
    logger.debug("Finished. Inverting to compositional space.")
    Xstar = inverse_ALR(Ystar, pos)
    return Xstar, prop_zeroes, niters
Example #6
0
import matplotlib.pyplot as plt
from pyrolite.plot import pyroplot
from pyrolite.plot.density import density
from pyrolite.comp.codata import close

# sphinx_gallery_thumbnail_number = 6

# fix the random seed so that the example data is the same on every run
np.random.seed(82)
########################################################################################
# First we create some example data:
#
oxs = ["SiO2", "CaO", "MgO", "Na2O"]
ys = np.random.rand(1000, len(oxs))
# offset the CaO and MgO columns before exponentiating
ys[:, 1] += 0.7
ys[:, 2] += 1.0
# presumably `close` renormalises each row to a constant sum - see
# pyrolite.comp.codata.close
df = pd.DataFrame(data=close(np.exp(ys)), columns=oxs)
########################################################################################
# A minimal density plot can be constructed as follows:
#
ax = df.loc[:, ["SiO2", "MgO"]].pyroplot.density()
# overlay the individual points on the same axis as the density plot
df.loc[:, ["SiO2", "MgO"]].pyroplot.scatter(ax=ax, s=10, alpha=0.3, c="k", zorder=2)
plt.show()
########################################################################################
# A colorbar linked to the KDE estimate colormap can be added using the `colorbar`
# boolean switch:
#
ax = df.loc[:, ["SiO2", "MgO"]].pyroplot.density(colorbar=True)
plt.show()
########################################################################################
# `density` by default will create a new axis, but can also be plotted over an
# existing axis for more control:
Example #7
0
 def setUp(self):
     """Build the `X` fixture: five random values stacked with their complements, then closed."""
     noise = np.random.randn(5)
     first_row = 1.0 / (noise + 4)
     second_row = 1 - first_row
     self.X = np.array([first_row, second_row])
     self.X = close(self.X)