Exemple #1
0
    def test__balance_basis_base_case(self):
        tree = u"(a,b);"
        t = TreeNode.read([tree])

        exp_basis = np.array([[-np.sqrt(1. / 2), np.sqrt(1. / 2)]])
        exp_keys = [t.name]
        res_basis, res_keys = _balance_basis(t)

        npt.assert_allclose(exp_basis, res_basis)
        self.assertListEqual(exp_keys, res_keys)
    def test__balance_basis_base_case(self):
        tree = u"(a,b);"
        t = TreeNode.read([tree])

        exp_basis = np.array([[-np.sqrt(1. / 2), np.sqrt(1. / 2)]])
        exp_keys = [t.name]
        res_basis, res_keys = _balance_basis(t)

        npt.assert_allclose(exp_basis, res_basis)
        self.assertListEqual(exp_keys, res_keys)
    def test__balance_basis_unbalanced(self):
        tree = u"((a,b)c, d);"
        t = TreeNode.read([tree])

        exp_basis = np.array(
            [[-np.sqrt(1. / 6), -np.sqrt(1. / 6),
              np.sqrt(2. / 3)], [-np.sqrt(1. / 2),
                                 np.sqrt(1. / 2), 0]])
        exp_keys = [t.name, t[0].name]
        res_basis, res_keys = _balance_basis(t)

        npt.assert_allclose(exp_basis, res_basis)
        self.assertListEqual(exp_keys, res_keys)
Exemple #4
0
    def test__balance_basis_unbalanced(self):
        tree = u"((a,b)c, d);"
        t = TreeNode.read([tree])

        exp_basis = np.array(
            [[-np.sqrt(1. / 6), -np.sqrt(1. / 6), np.sqrt(2. / 3)],
             [-np.sqrt(1. / 2), np.sqrt(1. / 2), 0]]
        )
        exp_keys = [t.name, t[0].name]
        res_basis, res_keys = _balance_basis(t)

        npt.assert_allclose(exp_basis, res_basis)
        self.assertListEqual(exp_keys, res_keys)
Exemple #5
0
def ilr_phylogenetic_differential(
        differential: pd.DataFrame,
        tree: skbio.TreeNode) -> (pd.DataFrame, skbio.TreeNode):
    t = tree.copy()
    t.bifurcate()
    diff, _tree = match_tips(differential.T, t)
    _tree = rename_internal_nodes(_tree)
    in_nodes = [n.name for n in _tree.levelorder() if not n.is_tip()]
    basis = _balance_basis(_tree)[0]
    basis = pd.DataFrame(basis.T, index=diff.columns, columns=in_nodes)
    diff_balances = (diff @ basis).T
    diff_balances.index.name = 'featureid'
    return diff_balances, t
Exemple #6
0
def multinomial_bioms(k, D, N, M, min_sv=0.11, max_sv=5.0, sigma_sq=0.1):
    """ Simulates biom tables from multinomial.

    Parameters
    ----------
    k : int
       Number of latent dimensions.
    D : int
       Number of microbes.
    N : int
       Number of samples.
    M : int
       Average sequencing depth.

    Returns
    -------
    dict of np.array
       Ground truth parameters.
    """
    dims, hdims, total = D, k, N
    eigs = min_sv + (max_sv - min_sv) * np.linspace(0, 1, hdims)
    eigvectors = ortho_group.rvs(dims - 1)[:, :hdims]
    W = np.matmul(eigvectors, np.diag(np.sqrt(eigs - sigma_sq)))
    sigma_sq = sigma_sq
    sigma = np.sqrt(sigma_sq)
    z = np.random.normal(size=(total, hdims))
    eta = np.random.normal(np.matmul(z, W.T), sigma).astype(np.float32)
    tree = random_linkage(D)
    Psi = _balance_basis(tree)[0]
    prob = closure(np.exp(eta @ Psi))
    depths = np.random.poisson(M, size=N)
    Y = np.vstack([np.random.multinomial(depths[i], prob[i])
                   for i in range(N)])
    return dict(
        sigma=sigma,
        W=W,
        Psi=Psi,
        tree=tree,
        eta=eta,
        z=z,
        Y=Y,
        depths=depths,
        eigs=eigs,
        eigvectors=eigvectors
    )
Exemple #7
0
def ilr_phylogenetic_ordination(
        table: pd.DataFrame,
        tree: skbio.TreeNode,
        pseudocount: float = 0.5,
        top_k_var: int = 10,
        clades: list = None
) -> (OrdinationResults, skbio.TreeNode, pd.DataFrame):
    t = tree.copy()
    t.bifurcate()
    _table, _tree = match_tips(table, t)
    _tree = rename_internal_nodes(_tree)
    if not clades:
        in_nodes = [n.name for n in _tree.levelorder() if not n.is_tip()]
        basis = _balance_basis(_tree)[0]
        _table = add_pseudocount(_table, pseudocount)
        basis = pd.DataFrame(basis.T, index=_table.columns, columns=in_nodes)
        balances = np.log(_table) @ basis
        var = balances.var(axis=0).sort_values(ascending=False)
        clades = var.index[:top_k_var]
        balances = balances[clades]
        basis = basis[clades]
    else:
        clades = clades[0].split(',')
        balances, basis = _fast_ilr(_tree, _table, clades, pseudocount=0.5)
        var = balances.var(axis=0).sort_values(ascending=False)

    balances.index.name = 'sampleid'
    # feature metadata
    eigvals = var
    prop = var[clades] / var.sum()
    balances = OrdinationResults(
        short_method_name='ILR',
        long_method_name='Phylogenetic Isometric Log Ratio Transform',
        samples=balances,
        features=pd.DataFrame(np.eye(len(clades)), index=clades),
        eigvals=eigvals,
        proportion_explained=prop)
    basis.index.name = 'featureid'
    return balances, _tree, basis
Exemple #8
0
def multinomial_batch_bioms(k, D, N, M, C=2,
                            min_sv=0.11, max_sv=5.0, sigma_sq=0.1):
    """ Simulates biom tables from multinomial with batch effects

    Parameters
    ----------
    k : int
       Number of latent dimensions.
    D : int
       Number of microbes.
    N : int
       Number of samples.
    M : int
       Average sequencing depth.
    C : int
       Number of batches.

    Returns
    -------
    dict of np.array
       Ground truth parameters.
    """
    dims, hdims, total = D, k, N
    eigs = min_sv + (max_sv - min_sv) * np.linspace(0, 1, hdims)
    eigvectors = ortho_group.rvs(dims - 1)[:, :hdims]
    W = np.matmul(eigvectors, np.diag(np.sqrt(eigs - sigma_sq)))
    sigma_sq = sigma_sq
    sigma = np.sqrt(sigma_sq)
    z = np.random.normal(size=(total, hdims))
    eta = np.random.normal(np.matmul(z, W.T), sigma).astype(np.float32)
    # Create ILR basis
    tree = random_linkage(D)
    Psi = _balance_basis(tree)[0]
    # add batch effects
    alpha = np.abs(np.random.normal(0, 0.5, size=(D)))
    alphaILR = np.abs(Psi)  @ alpha  # variances must always be positive
    m = np.zeros(D - 1)
    B = np.random.multivariate_normal(m, np.diag(alphaILR), size=C)
    batch_idx = np.random.randint(C, size=N)
    eta = np.vstack([eta[i] + B[batch_idx[i]] for i in range(N)])
    # Convert latent variables to observed counts
    prob = closure(np.exp(eta @ Psi))
    depths = np.random.poisson(M, size=N)
    Y = np.vstack([np.random.multinomial(depths[i], prob[i])
                   for i in range(N)])
    return dict(
        sigma=sigma,
        W=W,
        Psi=Psi,
        tree=tree,
        eta=eta,
        z=z,
        Y=Y,
        alpha=alpha,
        alphaILR=alphaILR,
        B=B,
        batch_idx=batch_idx,
        depths=depths,
        eigs=eigs,
        eigvectors=eigvectors
    )