コード例 #1
0
    def test_fit(self):
        """Fit MMvec on the training fixtures and compare with references.

        The model is trained for 1000 epochs.  Pairwise distances of the
        fitted ``U`` and ``V`` and the softmax of ``model.ranks()`` are
        then rank-correlated (Spearman) against the values built from
        ``self.U``, ``self.V``, ``self.Ubias`` and ``self.Vbias``.  Every
        correlation must exceed 0.5 with a p-value below 0.05, and the
        cross-validation value must evaluate to less than 500.
        """
        np.random.seed(1)
        tf.reset_default_graph()
        _, num_x_features = self.trainX.shape
        # Unpacked as a two-tuple like trainX; the values are not used.
        _, _ = self.trainY.shape
        with tf.Graph().as_default(), tf.Session() as session:
            set_random_seed(0)
            model = MMvec(beta_1=0.8, beta_2=0.9, latent_dim=2)
            train_x = coo_matrix(self.trainX.values)
            train_y = self.trainY.values
            test_x = coo_matrix(self.testX.values)
            test_y = self.testY.values
            model(session, train_x, train_y, test_x, test_y)
            model.fit(epoch=1000)

            # Reference factors augmented with the bias terms and the
            # matching columns/rows of ones.
            ones_col = np.ones((self.U.shape[0], 1))
            ones_row = np.ones((1, self.V.shape[1]))
            ref_U = np.hstack((ones_col, self.Ubias, self.U))
            ref_V = np.vstack((self.Vbias, ones_row, self.V))

            u_corr, u_pval = spearmanr(pdist(model.U), pdist(self.U))
            v_corr, v_pval = spearmanr(pdist(model.V.T), pdist(self.V.T))

            observed = softmax(model.ranks())
            zero_col = np.zeros((num_x_features, 1))
            expected = softmax(np.hstack((zero_col, ref_U @ ref_V)))
            s_corr, s_pval = spearmanr(np.ravel(observed),
                                       np.ravel(expected))

            for corr in (u_corr, v_corr, s_corr):
                self.assertGreater(corr, 0.5)
            for pval in (u_pval, v_pval, s_pval):
                self.assertLess(pval, 5e-2)

            # sanity check cross validation
            self.assertLess(model.cv.eval(), 500)
コード例 #2
0
ファイル: sim.py プロジェクト: mortonjt/benchmark-mae
def partition_metabolites(uU, sigmaU, uV, sigmaV, num_metabolites, latent_dim,
                          microbe_partition, metabolite_in, state):
    """ Split up a single chemical abundance into multiple subspecies.

    Parameters
    ----------
    uU, sigmaU, uV, sigmaV : float, float, float, float
        Means and standard deviations of the normal distributions from
        which the entries of U and V are drawn.
    num_metabolites : int
        Number of chemicals to be represented
    latent_dim : int
        Number of latent dimensions in conditional probability
        matrix.
    microbe_partition : np.array
        The input microbial abundances for multiple strains.  The number
        of strains is taken from its second dimension.
    metabolite_in : np.array
        The input intensities for a single chemical.  It is reshaped to
        a column before scaling the partition.
    state : numpy random state
        Random number generator

    Returns
    -------
    U: np.array
        Microbial latent variables, of shape (num_microbes, latent_dim).
    V: np.array
        Metabolomic latent variables, of shape
        (latent_dim, num_metabolites).
    metabolites_out: np.array
        Multiple chemical abundances.
    """
    num_microbes = microbe_partition.shape[1]

    U = state.normal(uU, sigmaU, size=(num_microbes, latent_dim))
    V = state.normal(uV, sigmaV, size=(latent_dim, num_metabolites))

    # Randomly generate conditional probability matrices
    # Question : how to incorporate the existing abundances?
    probs = softmax(U @ V)

    # for each submicrobe strain, generate metabolite distribution
    metabolite_partition = closure(microbe_partition @ probs)

    # Return partitioned metabolites: every row of the partition is
    # scaled by the corresponding input intensity.
    metabolites_out = np.multiply(metabolite_partition,
                                  metabolite_in.reshape(-1, 1))

    return U, V, metabolites_out
コード例 #3
0
ファイル: multimodal.py プロジェクト: prashantbajpai/rhapsody
    def predict(self, X):
        """ Performs a prediction

        Parameters
        ----------
        X : np.array
           Input table (likely OTUs).

        Returns
        -------
        np.array :
           Predicted abundances: the softmax of the bias-augmented
           ``U`` rows selected by the one-hot hits of `X`, multiplied
           by the bias-augmented ``V``, with a leading column of zeros.
        """
        X_hits, _ = onehot(X)
        num_hits = X_hits.shape[0]

        # Augment the factors with their biases and matching ones.
        ones_col = np.ones((self.U.shape[0], 1))
        ones_row = np.ones((1, self.V.shape[1]))
        U_aug = np.hstack((ones_col, self.Ubias, self.U))
        V_aug = np.vstack((self.Vbias, ones_row, self.V))

        logits = U_aug[X_hits] @ V_aug
        zero_col = np.zeros((num_hits, 1))
        return softmax(np.hstack((zero_col, logits)))
コード例 #4
0
def deposit(output_dir, table1, table2, metadata, U, V, B, it, rep):
    """ Writes the tables, metadata, parameters and ranks into files.

    Parameters
    ----------
    output_dir : str
        output directory
    table1 : pd.DataFrame
        First count table (written to the "microbes" file).  It is
        accessed through ``.sum``, ``.loc``, ``.values``, ``.columns``
        and ``.index``; columns whose sum is not positive are dropped.
    table2 : pd.DataFrame
        Second count table (written to the "metabolites" file), handled
        the same way as `table1`.
    metadata : pd.DataFrame
        Dataframe of sample metadata
    U : np.array
        Microbial latent variables
    V : np.array
        Metabolite latent variables
    B : np.array
        Array saved as-is to the ``B`` text file.
    it : int
        iteration number
    rep : int
        repetition number; used as an index into the lowercase
        alphabet to build the file suffix.
    """
    choice = 'abcdefghijklmnopqrstuvwxyz'
    # Every output path shares the same (directory, iteration, letter).
    tag = (output_dir, it, choice[rep])
    output_microbes = "%s/table_microbes.%d_%s.biom" % tag
    output_metabolites = "%s/table_metabolites.%d_%s.biom" % tag
    output_md = "%s/metadata.%d_%s.txt" % tag
    output_U = "%s/U.%d_%s.txt" % tag
    output_V = "%s/V.%d_%s.txt" % tag
    output_B = "%s/B.%d_%s.txt" % tag
    output_ranks = "%s/ranks.%d_%s.txt" % tag

    # Keep only the columns that have a positive total.
    keep1 = table1.sum(axis=0) > 0
    keep2 = table2.sum(axis=0) > 0
    table1 = table1.loc[:, keep1]
    table2 = table2.loc[:, keep2]

    table1 = Table(table1.values.T, table1.columns, table1.index)
    table2 = Table(table2.values.T, table2.columns, table2.index)

    with biom_open(output_microbes, 'w') as f:
        table1.to_hdf5(f, generated_by='moi1')
    with biom_open(output_metabolites, 'w') as f:
        table2.to_hdf5(f, generated_by='moi2')

    # Ranks are computed from U @ V with a leading column of zeros and
    # then restricted to the retained rows/columns.
    logits = np.hstack((np.zeros((U.shape[0], 1)), U @ V))
    ranks = clr(softmax(logits))
    ranks = ranks[keep1, :][:, keep2]
    ranks = pd.DataFrame(
        ranks, index=table1.ids(axis='observation'),
        columns=table2.ids(axis='observation'))
    ranks.to_csv(output_ranks, sep='\t')
    metadata.to_csv(output_md, sep='\t', index_label='#SampleID')

    for path, values in ((output_B, B), (output_U, U), (output_V, V)):
        np.savetxt(path, values)
コード例 #5
0
def random_multimodal(num_microbes=20, num_metabolites=100, num_samples=100,
                      latent_dim=3, low=-1, high=1,
                      microbe_total=10, metabolite_total=100,
                      uB=0, sigmaB=2, sigmaQ=0.1,
                      uU=0, sigmaU=1, uV=0, sigmaV=1,
                      seed=0):
    """ Simulates paired microbe and metabolite count tables.

    All random draws, including the multinomial counts, come from the
    random state built from `seed`, so a given seed reproduces the same
    output.

    Parameters
    ----------
    num_microbes : int
       Number of microbial species to simulate
    num_metabolites : int
       Number of molecules to simulate
    num_samples : int
       Number of samples to generate
    latent_dim : int
       Number of latent dimensions
    low : float
       Lower bound of gradient
    high : float
       Upper bound of gradient
    microbe_total : int
       Number of microbial counts drawn for every sample
    metabolite_total : int
       Sets the number of metabolite counts: every microbial count
       yields ``metabolite_total // microbe_total`` metabolite counts
    uB : float
       Mean of regression coefficient distribution
    sigmaB : float
       Standard deviation of regression coefficient distribution
    sigmaQ : float
       Standard deviation of error distribution
    uU : float
       Mean of microbial input projection coefficient distribution
    sigmaU : float
       Standard deviation of microbial input projection
       coefficient distribution
    uV : float
       Mean of metabolite output projection coefficient distribution
    sigmaV : float
       Standard deviation of metabolite output projection
       coefficient distribution
    seed : int
       Random seed, passed to ``check_random_state``

    Returns
    -------
    microbe_counts : pd.DataFrame
       Count table of microbial counts (samples x microbes)
    metabolite_counts : pd.DataFrame
       Count table of metabolite counts (samples x metabolites)
    X : np.array
       Design matrix of shape (num_samples, 2): a column of ones and
       the gradient from `low` to `high`
    beta : np.array
       Regression coefficients of shape (2, num_microbes)
    U_ : np.array
       Microbial factors of shape (num_microbes, latent_dim + 2):
       a column of ones, the microbial bias and the latent variables
    V_ : np.array
       Metabolite factors of shape (latent_dim + 2, num_metabolites - 1):
       the metabolite bias, a row of ones and the latent variables
    """
    state = check_random_state(seed)
    # only have two coefficients
    beta = state.normal(uB, sigmaB, size=(2, num_microbes))
    X = np.vstack((np.ones(num_samples),
                   np.linspace(low, high, num_samples))).T

    # The proportions once derived from this draw were immediately
    # overwritten by the ilr-based proportions below.  The draw itself
    # is kept so that the random stream (and therefore U_, V_ and the
    # proportions for a given seed) stays the same as before.
    state.normal(X @ beta, sigmaQ)

    microbes = ilr_inv(state.multivariate_normal(
        mean=np.zeros(num_microbes-1), cov=np.diag([sigmaQ]*(num_microbes-1)),
        size=num_samples)
    )
    Umain = state.normal(
        uU, sigmaU, size=(num_microbes, latent_dim))
    Vmain = state.normal(
        uV, sigmaV, size=(latent_dim, num_metabolites-1))

    Ubias = state.normal(
        uU, sigmaU, size=(num_microbes, 1))
    Vbias = state.normal(
        uV, sigmaV, size=(1, num_metabolites-1))

    U_ = np.hstack(
        (np.ones((num_microbes, 1)), Ubias, Umain))
    V_ = np.vstack(
        (Vbias, np.ones((1, num_metabolites-1)), Vmain))

    # The leading column of zeros brings phi to num_metabolites columns.
    phi = np.hstack((np.zeros((num_microbes, 1)), U_ @ V_))
    probs = softmax(phi)
    microbe_counts = np.zeros((num_samples, num_microbes))
    metabolite_counts = np.zeros((num_samples, num_metabolites))
    n1 = microbe_total
    n2 = metabolite_total // microbe_total
    for n in range(num_samples):
        # Draw from the seeded state (not the global np.random) so that
        # `seed` controls the counts as well.
        otu = state.multinomial(n1, microbes[n, :])
        for i in range(num_microbes):
            ms = state.multinomial(otu[i] * n2, probs[i, :])
            metabolite_counts[n, :] += ms
        microbe_counts[n, :] += otu

    otu_ids = ['OTU_%d' % d for d in range(microbe_counts.shape[1])]
    ms_ids = ['metabolite_%d' % d for d in range(metabolite_counts.shape[1])]
    sample_ids = ['sample_%d' % d for d in range(metabolite_counts.shape[0])]

    microbe_counts = pd.DataFrame(
        microbe_counts, index=sample_ids, columns=otu_ids)
    metabolite_counts = pd.DataFrame(
        metabolite_counts, index=sample_ids, columns=ms_ids)

    return microbe_counts, metabolite_counts, X, beta, U_, V_
コード例 #6
0
def random_sigmoid_multimodal(
        num_microbes=20, num_metabolites=100, num_samples=100,
        num_latent_microbes=5, num_latent_metabolites=10,
        num_latent_shared=3, low=-1, high=1,
        microbe_total=10, metabolite_total=100,
        uB=0, sigmaB=2, sigmaQ=0.1,
        uU1=0, sigmaU1=1, uU2=0, sigmaU2=1,
        uV1=0, sigmaV1=1, uV2=0, sigmaV2=1,
        seed=0):
    """ Simulates sigmoid function for microbe-metabolite interactions.

    All random draws, including the multinomial counts, come from the
    random state built from `seed`, so a given seed reproduces the same
    output.

    Parameters
    ----------
    num_microbes : int
       Number of microbial species to simulate
    num_metabolites : int
       Number of molecules to simulate
    num_samples : int
       Number of samples to generate
    num_latent_microbes : int
       Number of latent microbial dimensions
    num_latent_metabolites : int
       Number of latent metabolite dimensions
    num_latent_shared : int
       Number of dimensions in shared representation
    low : float
       Lower bound of gradient
    high : float
       Upper bound of gradient
    microbe_total : int
       Number of trials of every microbial multinomial draw
    metabolite_total : int
       Number of trials of every metabolite multinomial draw
    uB : float
       Mean of regression coefficient distribution
    sigmaB : float
       Standard deviation of regression coefficient distribution
    sigmaQ : float
       Standard deviation of error distribution
    uU1 : float
       Mean of microbial input projection coefficient distribution
    sigmaU1 : float
       Standard deviation of microbial input projection
       coefficient distribution
    uU2 : float
       Mean of microbe output projection coefficient distribution
    sigmaU2 : float
       Standard deviation of microbe output projection
       coefficient distribution
    uV1 : float
       Mean of metabolite input projection coefficient distribution
    sigmaV1 : float
       Standard deviation of metabolite input projection
       coefficient distribution
    uV2 : float
       Mean of metabolite output projection coefficient distribution
    sigmaV2 : float
       Standard deviation of metabolite output projection
       coefficient distribution
    seed : int
       Random seed, passed to ``check_random_state``

    Returns
    -------
    microbe_counts : pd.DataFrame
       Count table of microbial counts
    metabolite_counts : pd.DataFrame
       Count table of metabolite counts
    X : np.array
       Design matrix of shape (num_samples, 2): a column of ones and
       the gradient from `low` to `high`
    Q : np.array
       Shared representation of shape (num_samples, num_latent_shared),
       the tanh of the noisy regression ``X @ beta``
    U1 : np.array
       Array of shape (num_latent_microbes, num_microbes)
    U2 : np.array
       Array of shape (num_latent_shared, num_latent_microbes)
    V1 : np.array
       Array of shape (num_latent_metabolites, num_metabolites)
    V2 : np.array
       Array of shape (num_latent_shared, num_latent_metabolites)
    """
    k = num_latent_shared
    state = check_random_state(seed)
    # only have two coefficients
    beta = state.normal(uB, sigmaB, size=(2, k))

    X = np.vstack((np.ones(num_samples),
                   np.linspace(low, high, num_samples))).T

    Q = np.tanh(state.normal(X @ beta, sigmaQ))

    U1 = state.normal(
        uU1, sigmaU1, size=(num_latent_microbes, num_microbes))
    U2 = state.normal(
        uU2, sigmaU2, size=(k, num_latent_microbes))
    V1 = state.normal(
        uV1, sigmaV1, size=(num_latent_metabolites, num_metabolites))
    V2 = state.normal(
        uV2, sigmaV2, size=(k, num_latent_metabolites))

    def multinomial(n, p):
        # One draw per row of p, taken from the seeded state (not the
        # global np.random) so that `seed` controls the counts as well.
        # The stacked draws are transposed before being returned.
        return np.vstack([state.multinomial(n, p[i, :])
                          for i in range(p.shape[0])]).T

    microbe_counts = multinomial(microbe_total, softmax((Q @ U2 @ U1).T))
    metabolite_counts = multinomial(metabolite_total, softmax((Q @ V2 @ V1).T))
    otu_ids = ['OTU_%d' % d for d in range(microbe_counts.shape[1])]
    ms_ids = ['metabolite_%d' % d for d in range(metabolite_counts.shape[1])]
    sample_ids = ['sample_%d' % d for d in range(metabolite_counts.shape[0])]

    microbe_counts = pd.DataFrame(
        microbe_counts, index=sample_ids, columns=otu_ids)
    metabolite_counts = pd.DataFrame(
        metabolite_counts, index=sample_ids, columns=ms_ids)

    return microbe_counts, metabolite_counts, X, Q, U1, U2, V1, V2
コード例 #7
0
ファイル: util.py プロジェクト: antgonza/songbird
def random_multinomial_model(num_samples,
                             num_features,
                             reps=1,
                             low=2,
                             high=10,
                             beta_mean=0,
                             beta_scale=5,
                             mu=1,
                             sigma=1,
                             seed=0):
    """ Generates a table using a random multinomial regression model.

    Here we will be simulating microbial counts given the model, and the
    corresponding model priors.

    Parameters
    ----------
    num_samples : int
        Number of samples.  It is expected to be a multiple of `reps`;
        otherwise the gradient and the intercept column have different
        lengths and cannot be stacked.
    num_features : int
        Number of features
    reps : int
        Number of times the gradient is repeated.  Each repetition
        spans ``num_samples // reps`` samples.
    low : float
        Smallest gradient value.
    high : float
        Largest gradient value.
    beta_mean : float
        Mean of beta prior (for regression coefficients)
    beta_scale : float
        Scale of beta prior (for regression coefficients)
    mu : int
        Number of multinomial trials drawn for every sample.
    sigma : float
        Not used by the current implementation.
    seed : int
        Random seed, passed to ``check_random_state``

    Returns
    -------
    table : biom.Table
        Biom representation of the count table.
    metadata : pd.DataFrame
        DataFrame containing relevant metadata (columns ``Ones`` and
        ``X``, indexed by sample id).
    beta : pd.DataFrame
        Regression coefficients, one row per non-reference feature with
        columns ``Intercept`` and ``beta``.
    """
    N = num_samples

    # generate all of the coefficients from the seeded random state
    state = check_random_state(seed)
    beta = state.normal(beta_mean, beta_scale, size=(2, num_features - 1))

    # NumPy requires a sequence here; passing a generator expression is
    # deprecated since NumPy 1.16 and rejected by recent releases.
    X = np.hstack([np.linspace(low, high, num_samples // reps)
                   for _ in range(reps)])
    X = np.vstack((np.ones(N), X)).T
    # The leading column of zeros brings phi to num_features columns.
    phi = np.hstack((np.zeros((N, 1)), X @ beta))
    probs = softmax(phi)

    # One multinomial draw per sample, transposed to features x samples.
    table = np.vstack(
        [state.multinomial(mu, probs[i, :]) for i in range(N)]).T

    samp_ids = pd.Index(['S%d' % i for i in range(num_samples)],
                        name='sampleid')
    feat_ids = ['F%d' % i for i in range(num_features)]
    balance_ids = ['L%d' % i for i in range(num_features - 1)]

    table = Table(table, feat_ids, samp_ids)
    metadata = pd.DataFrame(X, columns=['Ones', 'X'], index=samp_ids)
    beta = pd.DataFrame(beta.T,
                        columns=['Intercept', 'beta'],
                        index=balance_ids)

    return table, metadata, beta