def test_fit(self):
    """Train MMvec on the fixture data and compare it with the reference.

    Three Spearman rank correlations are computed:

    * pairwise distances of ``model.U`` against those of ``self.U``,
    * pairwise distances of ``model.V.T`` against those of ``self.V.T``,
    * the flattened softmax of ``model.ranks()`` against the flattened
      softmax of the reference log-odds built from ``self.U``,
      ``self.Ubias``, ``self.V`` and ``self.Vbias``.

    Each correlation must exceed 0.5 with a p-value below 0.05, and
    ``model.cv.eval()`` must be below 500.
    """
    np.random.seed(1)
    tf.reset_default_graph()
    _, n_input_features = self.trainX.shape
    # Not used below; the unpack still fails here if trainY is not 2-D.
    _, _n_output_features = self.trainY.shape

    with tf.Graph().as_default(), tf.Session() as session:
        set_random_seed(0)
        model = MMvec(beta_1=0.8, beta_2=0.9, latent_dim=2)
        model(session,
              coo_matrix(self.trainX.values), self.trainY.values,
              coo_matrix(self.testX.values), self.testY.values)
        model.fit(epoch=1000)

        # Reference factors augmented with a ones column/row so that
        # true_U @ true_V == Vbias + Ubias + U @ V (biases broadcast).
        ones_column = np.ones((self.U.shape[0], 1))
        ones_row = np.ones((1, self.V.shape[1]))
        true_U = np.hstack((ones_column, self.Ubias, self.U))
        true_V = np.vstack((self.Vbias, ones_row, self.V))

        u_r, u_p = spearmanr(pdist(model.U), pdist(self.U))
        v_r, v_p = spearmanr(pdist(model.V.T), pdist(self.V.T))

        observed = softmax(model.ranks())
        # A zero column is prepended to the reference log-odds before
        # the softmax, mirroring what is compared against model.ranks().
        zero_column = np.zeros((n_input_features, 1))
        expected = softmax(np.hstack((zero_column, true_U @ true_V)))
        s_r, s_p = spearmanr(np.ravel(observed), np.ravel(expected))

        for correlation in (u_r, v_r, s_r):
            self.assertGreater(correlation, 0.5)
        for p_value in (u_p, v_p, s_p):
            self.assertLess(p_value, 5e-2)

        # sanity check cross validation
        self.assertLess(model.cv.eval(), 500)
def partition_metabolites(uU, sigmaU, uV, sigmaV, num_metabolites,
                          latent_dim, microbe_partition, metabolite_in,
                          state):
    """ Split up a single chemical abundances into multiple subspecies.

    Parameters
    ----------
    uU, sigmaU : float, float
        Mean and standard deviation of the normal distribution the
        microbial latent matrix ``U`` is drawn from.
    uV, sigmaV : float, float
        Mean and standard deviation of the normal distribution the
        metabolite latent matrix ``V`` is drawn from.
    num_metabolites : int
        Number of chemicals to be represented
    latent_dim : int
        Number of latent dimensions in conditional probability matrix.
    microbe_partition : np.array
        The input microbial abundances for multiple strains.  Its second
        axis is taken as the number of strains.
    metabolite_in : np.array
        The input intensities for a single chemical.  It is reshaped to
        a column and multiplied against the rows of the partition, so it
        needs one entry per row of `microbe_partition`.
    state : numpy random state
        Random number generator

    Returns
    -------
    U: np.array
        Microbial latent variables, shape (num_microbes, latent_dim).
    V: np.array
        Metabolomic latent variables, shape (latent_dim, num_metabolites).
    metabolites_out: np.array
        Multiple chemical abundances.
    """
    # The number of strains is implied by the input rather than passed in.
    num_microbes = microbe_partition.shape[1]

    U = state.normal(uU, sigmaU, size=(num_microbes, latent_dim))
    V = state.normal(uV, sigmaV, size=(latent_dim, num_metabolites))

    # Randomly generate conditional probability matrices
    # Question : how to incorporate the existing abundances?
    probs = softmax(U @ V)

    # for each submicrobe strain, generate metabolite distribution
    # NOTE(review): `closure` is presumably a row-wise normalisation to
    # proportions -- confirm against the helper this module imports.
    metabolite_partition = closure(microbe_partition @ probs)

    # Return partitioned metabolites: scale each row of the partition by
    # the corresponding input intensity.
    metabolites_out = np.multiply(metabolite_partition,
                                  metabolite_in.reshape(-1, 1))
    return U, V, metabolites_out
def predict(self, X):
    """ Predict output proportions for an input table.

    Parameters
    ----------
    X : np.array
       Input table (likely OTUs).  It is passed through ``onehot`` and
       the first value returned is used to index rows of the
       bias-augmented ``U`` matrix.

    Returns
    -------
    np.array :
       Predicted abundances: the softmax of the log-odds for each
       indexed row, with a leading zero column prepended.
    """
    # NOTE(review): onehot presumably yields one row index per observed
    # count in X -- confirm against its definition.
    hits, _ = onehot(X)
    num_hits = hits.shape[0]

    # Augmenting with a ones column/row makes the product below equal
    # Vbias + Ubias + U @ V (biases broadcast).
    ones_column = np.ones((self.U.shape[0], 1))
    ones_row = np.ones((1, self.V.shape[1]))
    augmented_U = np.hstack((ones_column, self.Ubias, self.U))
    augmented_V = np.vstack((self.Vbias, ones_row, self.V))

    log_odds = augmented_U[hits] @ augmented_V
    zero_column = np.zeros((num_hits, 1))
    return softmax(np.hstack((zero_column, log_odds)))
def deposit(output_dir, table1, table2, metadata, U, V, B, it, rep):
    """ Writes one simulated dataset (tables, metadata, parameters) to disk.

    All file names carry the suffix ``<it>_<letter>`` where the letter is
    the `rep`-th lowercase ASCII letter.

    Parameters
    ----------
    output_dir : str
        output directory
    table1 : pd.DataFrame
        First count table (written as ``table_microbes``).  Columns are
        used as biom observation ids and the index as sample ids.
    table2 : pd.DataFrame
        Second count table (written as ``table_metabolites``), laid out
        like `table1`.
    metadata : pd.DataFrame
        Dataframe of sample metadata
    U : np.array
        Microbial latent variables
    V : np.array
        Metabolite latent variables
    B : np.array
        Array saved verbatim to ``B.<it>_<letter>.txt``.
        NOTE(review): presumably the regression coefficients -- confirm
        with the caller.
    it : int
        iteration number
    rep : int
        repetition number.  Indexes a 26-letter alphabet, so values of
        26 or more raise IndexError.
    """
    choice = 'abcdefghijklmnopqrstuvwxyz'

    def _path(template):
        # Every template takes (directory, iteration, repetition letter).
        return template % (output_dir, it, choice[rep])

    output_microbes = _path("%s/table_microbes.%d_%s.biom")
    output_metabolites = _path("%s/table_metabolites.%d_%s.biom")
    output_md = _path("%s/metadata.%d_%s.txt")
    output_U = _path("%s/U.%d_%s.txt")
    output_V = _path("%s/V.%d_%s.txt")
    output_B = _path("%s/B.%d_%s.txt")
    output_ranks = _path("%s/ranks.%d_%s.txt")

    # Drop features (columns) that have no counts in any sample.
    idx1 = table1.sum(axis=0) > 0
    idx2 = table2.sum(axis=0) > 0
    kept1 = table1.loc[:, idx1]
    kept2 = table2.loc[:, idx2]

    # biom tables are features x samples, hence the transpose.
    table1 = Table(kept1.values.T, kept1.columns, kept1.index)
    table2 = Table(kept2.values.T, kept2.columns, kept2.index)

    biom_outputs = (
        (output_microbes, table1, 'moi1'),
        (output_metabolites, table2, 'moi2'),
    )
    for path, table, generator in biom_outputs:
        with biom_open(path, 'w') as f:
            table.to_hdf5(f, generated_by=generator)

    # Ground-truth ranks from the latent factors, restricted to the
    # features that survived the filtering above.
    zero_column = np.zeros((U.shape[0], 1))
    ranks = clr(softmax(np.hstack((zero_column, U @ V))))
    ranks = ranks[idx1, :]
    ranks = ranks[:, idx2]
    ranks = pd.DataFrame(ranks,
                         index=table1.ids(axis='observation'),
                         columns=table2.ids(axis='observation'))
    ranks.to_csv(output_ranks, sep='\t')

    metadata.to_csv(output_md, sep='\t', index_label='#SampleID')

    np.savetxt(output_B, B)
    np.savetxt(output_U, U)
    np.savetxt(output_V, V)
def random_multimodal(num_microbes=20, num_metabolites=100, num_samples=100,
                      latent_dim=3, low=-1, high=1,
                      microbe_total=10, metabolite_total=100,
                      uB=0, sigmaB=2, sigmaQ=0.1,
                      uU=0, sigmaU=1, uV=0, sigmaV=1,
                      seed=0):
    """ Simulates paired microbe and metabolite count tables.

    Parameters
    ----------
    num_microbes : int
       Number of microbial species to simulate
    num_metabolites : int
       Number of molecules to simulate
    num_samples : int
       Number of samples to generate
    latent_dim : int
       Number of latent dimensions
    low : float
       Lower bound of gradient
    high : float
       Upper bound of gradient
    microbe_total : int
       Number of microbial counts drawn per sample
    metabolite_total : int
       Metabolite depth; ``metabolite_total // microbe_total``
       metabolite counts are drawn for every microbial count.
    uB : float
       Mean of regression coefficient distribution
    sigmaB : float
       Standard deviation of regression coefficient distribution
    sigmaQ : float
       Standard deviation of error distribution.  It is also used as the
       diagonal of the covariance matrix from which the microbial
       proportions are drawn.
    uU : float
       Mean of microbial input projection coefficient distribution
    sigmaU : float
       Standard deviation of microbial input projection
       coefficient distribution
    uV : float
       Mean of metabolite output projection coefficient distribution
    sigmaV : float
       Standard deviation of metabolite output projection
       coefficient distribution
    seed : int or np.random.RandomState
       Random seed, passed to ``check_random_state``.  Every random draw
       in this function, including the count sampling, uses the
       resulting generator.

    Returns
    -------
    microbe_counts : pd.DataFrame
       Count table of microbial counts (samples x microbes)
    metabolite_counts : pd.DataFrame
       Count table of metabolite counts (samples x metabolites)
    X : np.array
       Design matrix of shape (num_samples, 2): a column of ones and the
       gradient from `low` to `high`.
    beta : np.array
       Regression coefficients of shape (2, num_microbes).
    U_ : np.array
       Microbial factors augmented with a ones column and bias column,
       shape (num_microbes, latent_dim + 2).
    V_ : np.array
       Metabolite factors augmented with a bias row and ones row,
       shape (latent_dim + 2, num_metabolites - 1).

    Notes
    -----
    The microbial proportions are drawn from ``ilr_inv`` of a zero-mean
    multivariate normal; `X` and `beta` are returned but do not shape
    those proportions.
    """
    state = check_random_state(seed)

    # only have two coefficients
    beta = state.normal(uB, sigmaB, size=(2, num_microbes))
    X = np.vstack((np.ones(num_samples),
                   np.linspace(low, high, num_samples))).T

    # The result of this draw is intentionally discarded.  Earlier
    # revisions assigned it to `microbes` and immediately overwrote it;
    # the draw is kept only to advance the generator so that U_ and V_
    # stay the same for a given seed.
    state.normal(X @ beta, sigmaQ)

    microbes = ilr_inv(state.multivariate_normal(
        mean=np.zeros(num_microbes - 1),
        cov=np.diag([sigmaQ] * (num_microbes - 1)),
        size=num_samples))

    Umain = state.normal(uU, sigmaU, size=(num_microbes, latent_dim))
    Vmain = state.normal(uV, sigmaV, size=(latent_dim, num_metabolites - 1))
    Ubias = state.normal(uU, sigmaU, size=(num_microbes, 1))
    Vbias = state.normal(uV, sigmaV, size=(1, num_metabolites - 1))

    # With the ones column/row, U_ @ V_ == Vbias + Ubias + Umain @ Vmain.
    U_ = np.hstack((np.ones((num_microbes, 1)), Ubias, Umain))
    V_ = np.vstack((Vbias, np.ones((1, num_metabolites - 1)), Vmain))

    # The leading zero column restores the full num_metabolites width.
    phi = np.hstack((np.zeros((num_microbes, 1)), U_ @ V_))
    probs = softmax(phi)

    microbe_counts = np.zeros((num_samples, num_microbes))
    metabolite_counts = np.zeros((num_samples, num_metabolites))
    n1 = microbe_total
    n2 = metabolite_total // microbe_total

    for n in range(num_samples):
        # Draw from `state`, not the global np.random, so that `seed`
        # makes the simulated counts reproducible.
        otu = state.multinomial(n1, microbes[n, :])
        for i in range(num_microbes):
            ms = state.multinomial(otu[i] * n2, probs[i, :])
            metabolite_counts[n, :] += ms
        microbe_counts[n, :] += otu

    otu_ids = ['OTU_%d' % d for d in range(microbe_counts.shape[1])]
    ms_ids = ['metabolite_%d' % d for d in range(metabolite_counts.shape[1])]
    sample_ids = ['sample_%d' % d for d in range(metabolite_counts.shape[0])]

    microbe_counts = pd.DataFrame(
        microbe_counts, index=sample_ids, columns=otu_ids)
    metabolite_counts = pd.DataFrame(
        metabolite_counts, index=sample_ids, columns=ms_ids)

    return microbe_counts, metabolite_counts, X, beta, U_, V_
def random_sigmoid_multimodal(
        num_microbes=20, num_metabolites=100, num_samples=100,
        num_latent_microbes=5, num_latent_metabolites=10,
        num_latent_shared=3, low=-1, high=1,
        microbe_total=10, metabolite_total=100,
        uB=0, sigmaB=2, sigmaQ=0.1,
        uU1=0, sigmaU1=1, uU2=0, sigmaU2=1,
        uV1=0, sigmaV1=1, uV2=0, sigmaV2=1,
        seed=0):
    """ Simulates sigmoid function for microbe-metabolite interactions.

    Parameters
    ----------
    num_microbes : int
       Number of microbial species to simulate
    num_metabolites : int
       Number of molecules to simulate
    num_samples : int
       Number of samples to generate
    num_latent_microbes : int
       Number of latent microbial dimensions
    num_latent_metabolites : int
       Number of latent metabolite dimensions
    num_latent_shared : int
       Number of dimensions in shared representation
    low : float
       Lower bound of gradient
    high : float
       Upper bound of gradient
    microbe_total : int
       Number of trials of each microbial multinomial draw
    metabolite_total : int
       Number of trials of each metabolite multinomial draw
    uB : float
       Mean of regression coefficient distribution
    sigmaB : float
       Standard deviation of regression coefficient distribution
    sigmaQ : float
       Standard deviation of error distribution
    uU1 : float
       Mean of microbial input projection coefficient distribution
    sigmaU1 : float
       Standard deviation of microbial input projection
       coefficient distribution
    uU2 : float
       Mean of microbe output projection coefficient distribution
    sigmaU2 : float
       Standard deviation of microbe output projection
       coefficient distribution
    uV1 : float
       Mean of metabolite input projection coefficient distribution
    sigmaV1 : float
       Standard deviation of metabolite input projection
       coefficient distribution
    uV2 : float
       Mean of metabolite output projection coefficient distribution
    sigmaV2 : float
       Standard deviation of metabolite output projection
       coefficient distribution
    seed : int or np.random.RandomState
       Random seed, passed to ``check_random_state``.  Every random draw
       in this function, including the count sampling, uses the
       resulting generator.

    Returns
    -------
    microbe_counts : pd.DataFrame
       Count table of microbial counts (samples x microbes)
    metabolite_counts : pd.DataFrame
       Count table of metabolite counts (samples x metabolites)
    X : np.array
       Design matrix of shape (num_samples, 2): a column of ones and the
       gradient from `low` to `high`.
    Q : np.array
       Shared representation, ``tanh`` of the noisy regression output,
       shape (num_samples, num_latent_shared).
    U1 : np.array
       Shape (num_latent_microbes, num_microbes).
    U2 : np.array
       Shape (num_latent_shared, num_latent_microbes).
    V1 : np.array
       Shape (num_latent_metabolites, num_metabolites).
    V2 : np.array
       Shape (num_latent_shared, num_latent_metabolites).
    """
    k = num_latent_shared
    state = check_random_state(seed)

    # only have two coefficients
    beta = state.normal(uB, sigmaB, size=(2, k))
    X = np.vstack((np.ones(num_samples),
                   np.linspace(low, high, num_samples))).T

    Q = np.tanh(state.normal(X @ beta, sigmaQ))

    U1 = state.normal(uU1, sigmaU1, size=(num_latent_microbes, num_microbes))
    U2 = state.normal(uU2, sigmaU2, size=(k, num_latent_microbes))
    V1 = state.normal(
        uV1, sigmaV1, size=(num_latent_metabolites, num_metabolites))
    V2 = state.normal(uV2, sigmaV2, size=(k, num_latent_metabolites))

    def multinomial(n, p):
        # One draw of n trials per row of p, returned transposed.
        # Uses `state`, not the global np.random, so that `seed` makes
        # the simulated counts reproducible.
        return np.vstack([state.multinomial(n, p[i, :])
                          for i in range(p.shape[0])]).T

    # NOTE(review): the probabilities are transposed to features x
    # samples before the row-wise draws, so each draw spreads the total
    # of one feature across samples -- confirm this is intended; it also
    # depends on which axis `softmax` normalises.
    microbe_counts = multinomial(microbe_total, softmax((Q @ U2 @ U1).T))
    metabolite_counts = multinomial(metabolite_total,
                                    softmax((Q @ V2 @ V1).T))

    otu_ids = ['OTU_%d' % d for d in range(microbe_counts.shape[1])]
    ms_ids = ['metabolite_%d' % d for d in range(metabolite_counts.shape[1])]
    sample_ids = ['sample_%d' % d for d in range(metabolite_counts.shape[0])]

    microbe_counts = pd.DataFrame(
        microbe_counts, index=sample_ids, columns=otu_ids)
    metabolite_counts = pd.DataFrame(
        metabolite_counts, index=sample_ids, columns=ms_ids)

    return microbe_counts, metabolite_counts, X, Q, U1, U2, V1, V2
def random_multinomial_model(num_samples, num_features,
                             reps=1,
                             low=2, high=10,
                             beta_mean=0, beta_scale=5,
                             mu=1, sigma=1,
                             seed=0):
    """ Generates a table using a random multinomial regression model.

    Here we will be simulating microbial counts given the model, and the
    corresponding model priors.

    Parameters
    ----------
    num_samples : int
        Number of samples
    num_features : int
        Number of features
    reps : int
        Number of times the gradient from `low` to `high` is repeated.
        `num_samples` must be divisible by `reps`.
    low : float
        Smallest gradient value.
    high : float
        Largest gradient value.
    beta_mean : float
        Mean of beta prior (for regression coefficients)
    beta_scale : float
        Scale of beta prior (for regression coefficients)
    mu : int
        Number of multinomial trials drawn for every sample (it is
        passed unchanged as the multinomial ``n``).
    sigma : float
        Currently unused; kept for interface compatibility.
    seed : int or np.random.RandomState
        Random seed, passed to ``check_random_state``.

    Returns
    -------
    table : biom.Table
        Biom representation of the count table.
    metadata : pd.DataFrame
        DataFrame containing relevant metadata (columns ``Ones``, ``X``).
    beta : pd.DataFrame
        Regression coefficients, one row per non-reference feature,
        with columns ``Intercept`` and ``beta``.

    Raises
    ------
    ValueError
        If `num_samples` is not divisible by `reps`.
    """
    N = num_samples
    if num_samples % reps != 0:
        # Without this the mismatch only surfaces as a shape error when
        # the gradient is stacked with the intercept column below.
        raise ValueError(
            'num_samples (%d) must be divisible by reps (%d)'
            % (num_samples, reps))

    # generate all of the coefficients from the random state
    state = check_random_state(seed)
    beta = state.normal(beta_mean, beta_scale, size=(2, num_features - 1))

    # np.hstack / np.vstack require a real sequence; feeding them a
    # generator is deprecated in NumPy and rejected by current releases.
    gradient = np.tile(np.linspace(low, high, num_samples // reps), reps)
    X = np.vstack((np.ones(N), gradient)).T

    # The zero column makes the first feature the reference category.
    phi = np.hstack((np.zeros((N, 1)), X @ beta))
    probs = softmax(phi)

    # One multinomial draw per sample, transposed to features x samples.
    table = np.vstack([state.multinomial(mu, p) for p in probs]).T

    samp_ids = pd.Index(['S%d' % i for i in range(num_samples)],
                        name='sampleid')
    feat_ids = ['F%d' % i for i in range(num_features)]
    balance_ids = ['L%d' % i for i in range(num_features - 1)]

    table = Table(table, feat_ids, samp_ids)
    metadata = pd.DataFrame(X, columns=['Ones', 'X'], index=samp_ids)
    beta = pd.DataFrame(beta.T, columns=['Intercept', 'beta'],
                        index=balance_ids)

    return table, metadata, beta