def testNegativeBinomial():
    x_lim = 60
    burnin = 50000
    with pm.Model() as model:
        alpha = pm.Exponential('alpha', lam=0.2)
        mu = pm.Uniform('mu', lower=0, upper=100)
        # y_pred is an unobserved node: it is sampled alongside the
        # parameters, so its trace gives posterior predictive draws.
        y_pred = pm.NegativeBinomial('y_pred', mu=mu, alpha=alpha)
        # y_esti ties the model to the data: it is the likelihood node,
        # conditioned on the observed response times.
        y_esti = pm.NegativeBinomial('y_esti', mu=mu, alpha=alpha,
                                     observed=msg['time_delay_seconds'].values)
        start = pm.find_MAP()
        step = pm.Metropolis()
        trace = pm.sample(200000, step, start=start, progressbar=True)
        pm.traceplot(trace[burnin:], varnames=['alpha', 'mu'])
        # fig = plt.figure(figsize=(10, 6))
        # fig.add_subplot(211)
        # y_pred = trace[burnin:].get_values('y_pred')
        # plt.hist(y_pred, range=[0, x_lim],
        #          bins=x_lim, histtype='stepfilled', color=colors[1])
        # plt.xlim(1, x_lim)
        # plt.ylabel('Frequency')
        # plt.title('Posterior predictive distribution')
        # fig.add_subplot(212)
        # plt.hist(msg['time_delay_seconds'].values,
        #          range=[0, x_lim], bins=x_lim, histtype='stepfilled')
        # plt.xlabel('Response time in seconds')
        # plt.ylabel('Frequency')
        # plt.title('Distribution of observed data')
        # plt.tight_layout()
        # plt.show()
        return trace
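# Minimal usage sketch (assuming the global `msg` DataFrame this function
# reads): because `y_pred` has no `observed` argument, its samples are
# posterior predictive draws — the quantity the commented-out histogram
# above compares against the raw data.
trace = testNegativeBinomial()
burnin = 50000
y_pred_draws = trace[burnin:].get_values('y_pred')  # predictive samples
print('predictive mean: %.2f  observed mean: %.2f'
      % (y_pred_draws.mean(), msg['time_delay_seconds'].mean()))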
def test_HSStep_NegativeBinomial(): np.random.seed(2032) M = 5 N = 50 X = np.random.normal(size=N * M).reshape((N, M)) beta_true = np.array([1, 1, 2, 2, 0]) y_nb = pm.NegativeBinomial.dist(np.exp(X.dot(beta_true)), 1).random() N_draws = 500 with pm.Model(): beta = HorseShoe("beta", tau=1, shape=M) pm.NegativeBinomial("y", mu=at.exp(beta.dot(X.T)), alpha=1, observed=y_nb) hsstep = HSStep([beta]) trace = pm.sample( draws=N_draws, step=hsstep, chains=1, return_inferencedata=True, compute_convergence_checks=False, ) beta_samples = trace.posterior["beta"][0].values assert beta_samples.shape == (N_draws, M) np.testing.assert_allclose(beta_samples.mean(0), beta_true, atol=0.5) with pm.Model(): beta = HorseShoe("beta", tau=1, shape=M, testval=beta_true * 0.1) pm.NegativeBinomial("y", mu=beta.dot(np.abs(X.T)), alpha=1, observed=y_nb) hsstep = HSStep([beta]) trace = pm.sample( draws=N_draws, step=hsstep, chains=1, return_inferencedata=True, compute_convergence_checks=False, ) beta_samples = trace.posterior["beta"][0].values assert beta_samples.shape == (N_draws, M) with pm.Model(): beta = HorseShoe("beta", tau=1, shape=M, testval=beta_true * 0.1) eta = pm.NegativeBinomial("eta", mu=beta.dot(X.T), alpha=1, shape=N) pm.Normal("y", mu=at.exp(eta), sigma=1, observed=y_nb) with pytest.raises(NotImplementedError): HSStep([beta])
def testSepModels(): indiv_traces = {} # convert categorical variables to integer le = preprocessing.LabelEncoder() participants_idx = le.fit_transform(msg['prev_sender']) # print('participants_idx:\n', participants_idx) participants = le.classes_ print('participants:\n', participants) participants_num = len(participants) for p in participants: with pm.Model() as model: alpha = pm.Uniform('alpha', lower=0, upper=100) mu = pm.Uniform('mu', lower=0, upper=100) data = msg[msg['prev_sender'] == p]['time_delay_seconds'].values y_esti = pm.NegativeBinomial('y_esti', mu=mu, alpha=alpha, observed=data) y_pred = pm.NegativeBinomial('y_pred', mu=mu, alpha=alpha) start = pm.find_MAP() step = pm.Metropolis() trace = pm.sample(20000, step, start=start, progressbar=True) # sampling indiv_traces[p] = trace # visualize results # fig, axs = plt.subplots(3, 2, figsize=(12, 6)) # axs = axs.ravel() # obtain subplots # y_left_max = 2 # y_right_max = 2000 # x_lim = 60 # ix = [3, 4, 6] # selected samples # for i, j, p in zip([0, 1, 2], [0, 2, 4], participants[ix]): # axs[j].set_title('Observed: %s' % p) # axs[j].hist(msg[msg['prev_sender'] == p]['time_delay_seconds'].values, # range=[0, x_lim], bins=x_lim, histtype='stepfilled') # axs[j].set_ylim([0, y_left_max]) # for i, j, p in zip([0, 1, 2], [1, 3, 5], participants[ix]): # axs[j].set_title('Posterior predictive distribution: %s' % p) # axs[j].hist(indiv_traces[p].get_values('y_pred'), # range=[0, x_lim], bins=x_lim, # histtype='stepfilled', color=colors[1]) # axs[j].set_ylim([0, y_right_max]) # axs[4].set_xlabel('Response time (seconds)') # axs[5].set_xlabel('Response time (seconds)') # plt.tight_layout() # plt.show() return indiv_traces
def funcTrace24(path):  # e.g. 'data/hr_day_cnctd.xlsx'
    import numpy as np
    import pymc3 as pm
    import pandas as pd
    # When we want to understand the effect of more factors such as
    # "day of week", "time of day", etc., we can use GLMs (generalized
    # linear models) to better understand the effects of these factors.
    # Import data
    data = pd.read_excel(path, index_col='Index')
    # %% Hourly NegativeBinomial modeling
    # For each hour j and each EV connected i, we represent the model
    indiv_traces = {}
    # Convert categorical variables to integer
    hours = list(data.Hour)
    n_hours = len(hours)
    x_lim = 16
    print('---- Working -----')
    out_yPred = pd.DataFrame(np.zeros((x_lim, len(hours))),
                             columns=list(hours))
    out_yObs = pd.DataFrame(np.zeros((x_lim, len(hours))),
                            columns=list(hours))
    for h in hours:
        print('Hour: ', h)
        with pm.Model() as model:
            alpha = pm.Uniform('alpha', lower=0, upper=10)
            mu = pm.Uniform('mu', lower=0, upper=10)
            y_obs = data[data.Hour == h]['Connected'].values
            y_est = pm.NegativeBinomial('y_est', mu=mu, alpha=alpha,
                                        observed=y_obs)
            y_pred = pm.NegativeBinomial('y_pred', mu=mu, alpha=alpha)
            trace = pm.sample(10000, progressbar=True)
        indiv_traces[h] = trace
        out_yPred.loc[:, h], _ = np.histogram(
            indiv_traces[h].get_values('y_pred'), bins=x_lim)
        out_yObs.loc[:, h], _ = np.histogram(
            data[data.Hour == h]['Connected'].values, bins=x_lim)
    # Export results
    out_yPred.to_csv('out_yPred.csv')
    out_yObs.to_csv('out_yObs.csv')
    return out_yPred, out_yObs
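# Usage sketch for funcTrace24 (the path follows the comment in the
# signature; an 'Index' column plus 'Hour' and 'Connected' columns are
# assumed, as read in the function body):
y_pred_hist, y_obs_hist = funcTrace24('data/hr_day_cnctd.xlsx')
print(y_pred_hist.head())  # per-hour histograms of posterior predictive draws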
def init_model(self, target): days, counties = target.index, target.columns # extract features features = self.evaluate_features(days, counties) Y_obs = target.stack().values.astype(np.float32) T_S = features["temporal_seasonal"].values.astype(np.float32) T_T = features["temporal_trend"].values.astype(np.float32) TS = features["spatiotemporal"].values.astype(np.float32) log_exposure = np.log( features["exposure"].values.astype(np.float32).ravel()) # extract dimensions num_obs = np.prod(target.shape) num_t_s = T_S.shape[1] num_t_t = T_T.shape[1] num_ts = TS.shape[1] with pm.Model() as self.model: # interaction effects are generated externally -> flat prior IA = pm.Flat("IA", testval=np.ones( (num_obs, self.num_ia)), shape=(num_obs, self.num_ia)) # priors # δ = 1/√α δ = pm.HalfCauchy("δ", 10, testval=1.0) α = pm.Deterministic("α", np.float32(1.0) / δ) W_ia = pm.Normal("W_ia", mu=0, sd=10, testval=np.zeros( self.num_ia), shape=self.num_ia) W_t_s = pm.Normal("W_t_s", mu=0, sd=10, testval=np.zeros(num_t_s), shape=num_t_s) W_t_t = pm.Normal("W_t_t", mu=0, sd=10, testval=np.zeros(num_t_t), shape=num_t_t) W_ts = pm.Normal("W_ts", mu=0, sd=10, testval=np.zeros(num_ts), shape=num_ts) self.param_names = ["δ", "W_ia", "W_t_s", "W_t_t", "W_ts"] self.params = [δ, W_ia, W_t_s, W_t_t, W_ts] # calculate interaction effect IA_ef = tt.dot(tt.dot(IA, self.Q), W_ia) # calculate mean rates μ = pm.Deterministic( "μ", tt.exp( IA_ef + tt.dot( T_S, W_t_s) + tt.dot( T_T, W_t_t) + tt.dot( TS, W_ts) + log_exposure)) # constrain to observations pm.NegativeBinomial("Y", mu=μ, alpha=α, observed=Y_obs)
def test_HSStep_NegativeBinomial_sparse(): np.random.seed(2032) M = 5 N = 50 X = np.random.normal(size=N * M).reshape((N, M)) beta_true = np.array([1, 1, 2, 2, 0]) y_nb = pm.NegativeBinomial.dist(np.exp(X.dot(beta_true)), 1).random() X = sp.sparse.csr_matrix(X) N_draws = 500 with pm.Model(): beta = HorseShoe("beta", tau=1, shape=M) pm.NegativeBinomial("y", mu=at.exp(sp_dot(X, at.shape_padright(beta))), alpha=1, observed=y_nb) hsstep = HSStep([beta]) trace = pm.sample( draws=N_draws, step=hsstep, chains=1, return_inferencedata=True, compute_convergence_checks=False, ) beta_samples = trace.posterior["beta"][0].values assert beta_samples.shape == (N_draws, M) np.testing.assert_allclose(beta_samples.mean(0), beta_true, atol=0.5)
def _sample_pymc3(cls, dist, size, seed):
    """Sample from PyMC3."""
    import pymc3
    pymc3_rv_map = {
        'GeometricDistribution': lambda dist:
            pymc3.Geometric('X', p=float(dist.p)),
        'PoissonDistribution': lambda dist:
            pymc3.Poisson('X', mu=float(dist.lamda)),
        'NegativeBinomialDistribution': lambda dist:
            pymc3.NegativeBinomial('X',
                                   mu=float((dist.p * dist.r) / (1 - dist.p)),
                                   alpha=float(dist.r))
    }
    dist_list = pymc3_rv_map.keys()
    if dist.__class__.__name__ not in dist_list:
        return None
    with pymc3.Model():
        pymc3_rv_map[dist.__class__.__name__](dist)
        return pymc3.sample(size, chains=1, progressbar=False,
                            random_seed=seed)[:]['X']
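# The mu/alpha arguments above re-express sympy's (r, p) parameterization in
# PyMC3's mean/dispersion form. A quick scipy check of the moment matching
# (a sketch; scipy's nbinom(n, p_s) has mean n*(1 - p_s)/p_s, and
# p_s = alpha/(alpha + mu) recovers 1 - p):
import numpy as np
from scipy import stats

r, p = 5.0, 0.3
mu, alpha = p * r / (1 - p), r
samples = stats.nbinom.rvs(alpha, alpha / (alpha + mu), size=200000)
print(samples.mean(), mu)  # the two should agree closely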
def add_observations():
    with hierarchical_model.pymc_model:
        for i in range(hierarchical_model.n_groups):
            observations.append(
                pm.NegativeBinomial(f'y_{i}', mu=mu[i], alpha=alpha[i],
                                    observed=hierarchical_model.y[i]))
def phenom_model(self, method, field='deaths'):
    self.models[method] = {}
    for i, country in enumerate(self.countries):
        with pm.Model() as model:
            temp = self.data[(self.data.Country == country)].groupby(
                ['time']).mean()[field].values
            print(country, temp[0])
            # TODO: Add if not external
            # np.argmax(temp)*1/3,  # or add 0
            self.phenom_constrains = {
                'c1m': 1e-13,
                'c1M': 10,
                'c2m': np.argmax(temp) * 1 / 3,  # or add 0
                'c2M': np.argmax(temp) * 3,
                'c3m': np.max(temp),
                'c3M': 50000
            }
            print('phenom_constrains: ', self.phenom_constrains)
            const = {}
            for cn in ['c1', 'c2', 'c3']:
                const[cn] = pm.Uniform(cn,
                                       self.phenom_constrains[cn + 'm'],
                                       self.phenom_constrains[cn + 'M'])
            # used as the NegativeBinomial dispersion (alpha) below
            sigma = pm.HalfNormal('sigma', 100., shape=1)
            Nrepeat = 10
            T = np.arange(0, len(temp))
            T = np.append(T, np.repeat(T[-Nrepeat:], Nrepeat * 3))
            temp = np.append(temp, np.repeat(temp[-Nrepeat:], Nrepeat * 3))
            x = pm.Data("x", T)
            cases = pm.Data("y", temp)
            # Likelihood
            if method == 'log-model':
                pm.NegativeBinomial(
                    country,
                    const['c3'] * (1 / (1 + pm.math.exp(
                        -(const['c1'] * (x - const['c2']))))),
                    sigma,
                    observed=cases)
            if method == 'gompertz-model':
                pm.Poisson(country,
                           const['c3'] * pm.math.exp(
                               -pm.math.exp(-const['c1'] * (x - const['c2']))),
                           observed=cases)
            self.models[method][country] = model
    return self.models
def testPartialFusionModel():
    global msg
    with pm.Model() as model:
        # hyperparameters
        hyper_mu_mu = pm.Uniform('hyper_mu_mu', lower=0, upper=60)
        hyper_mu_sd = pm.Uniform('hyper_mu_sd', lower=0, upper=50)
        hyper_alpha_mu = pm.Uniform('hyper_alpha_mu', lower=0, upper=10)
        hyper_alpha_sd = pm.Uniform('hyper_alpha_sd', lower=0, upper=50)
        # participants
        le = preprocessing.LabelEncoder()
        participants_idx = le.fit_transform(msg['prev_sender'])
        participants = le.classes_
        parti_num = len(participants)
        # parameters
        mu = pm.Gamma('mu', mu=hyper_mu_mu, sd=hyper_mu_sd, shape=parti_num)
        alpha = pm.Gamma('alpha', mu=hyper_alpha_mu, sd=hyper_alpha_sd,
                         shape=parti_num)
        # likelihood and posterior predictive nodes
        y_esti = pm.NegativeBinomial('y_esti', mu=mu[participants_idx],
                                     alpha=alpha[participants_idx],
                                     observed=msg['time_delay_seconds'].values)
        y_pred = pm.NegativeBinomial('y_pred', mu=mu[participants_idx],
                                     alpha=alpha[participants_idx],
                                     shape=msg['prev_sender'].shape)
        # sampling
        start = pm.find_MAP()
        step = pm.Metropolis()
        hierarchical_trace = pm.sample(200000, step, start=start,
                                       progressbar=True)
        pm.traceplot(hierarchical_trace[120000:], varnames=[
            'mu', 'alpha', 'hyper_mu_mu', 'hyper_mu_sd', 'hyper_alpha_mu',
            'hyper_alpha_sd'
        ])
def phenom_model(self, method, field='deaths'):
    # initialise once, outside the loop, so earlier countries are kept
    self.models = {}
    self.models[method] = {}
    for i, country in enumerate(self.countries):
        with pm.Model() as model:
            print('phenom_constrains:', self.phenom_constrains, '\n')
            const = {}
            for cn in ['c1', 'c2', 'c3']:
                grp = pm.Normal(cn + 'grp',
                                self.phenom_constrains[cn + 'M'],
                                self.phenom_constrains[cn + 's'])
                # Group variance
                grp_sigma = pm.HalfNormal(cn + 'grp_sigma',
                                          self.phenom_constrains[cn + 's'])
                # Individual intercepts
                const[cn] = pm.Normal(cn, mu=grp, sigma=grp_sigma,
                                      shape=len(self.countries))
            sigma = pm.HalfNormal('sigma', 10000., shape=len(self.countries))
            temp = self.data[self.data['Country'] == country][field].values
            x = pm.Data("x", np.arange(0, len(temp)))
            cases = pm.Data("y", temp)
            # Likelihood
            if method == 'log-model':
                pm.NegativeBinomial(
                    country,
                    const['c3'][i] * (1 / (1 + pm.math.exp(
                        -(const['c1'][i] * (-const['c2'][i] + x))))),
                    sigma[i],
                    observed=cases)
            if method == 'gompertz-model':
                pm.NegativeBinomial(
                    country,
                    const['c3'][i] * pm.math.exp(
                        -pm.math.exp(-const['c1'][i] * (x - const['c2'][i]))),
                    sigma[i],
                    observed=cases)
            self.models[method][country] = model
    return self.models
def get_model(dist, data) -> pm.Model: means = data.mean(0) n_exp = data.shape[1] if dist == "Poisson": with pm.Model() as poi_model: lam = pm.Exponential("lam", lam=means, shape=(1, n_exp)) poi = pm.Poisson( "poi", mu=lam, observed=data, ) return poi_model if dist == "ZeroInflatedPoisson": with pm.Model() as zip_model: psi = pm.Uniform("psi", shape=(1, n_exp)) lam = pm.Exponential("lam", lam=means, shape=(1, n_exp)) zip = pm.ZeroInflatedPoisson( "zip", psi=psi, theta=lam, observed=data, ) return zip_model if dist == "NegativeBinomial": with pm.Model() as nb_model: gamma = pm.Gamma("gm", 0.01, 0.01, shape=(1, n_exp)) lam = pm.Exponential("lam", lam=means, shape=(1, n_exp)) nb = pm.NegativeBinomial( "nb", alpha=gamma, mu=lam, observed=data, ) return nb_model if dist == "ZeroInflatedNegativeBinomial": with pm.Model() as zinb_model: gamma = pm.Gamma("gm", 0.01, 0.01, shape=(1, n_exp)) lam = pm.Exponential("lam", lam=means, shape=(1, n_exp)) psi = pm.Uniform("psi", shape=(1, n_exp)) zinb = pm.ZeroInflatedNegativeBinomial( "zinb", psi=psi, alpha=gamma, mu=lam, observed=data, ) return zinb_model
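# Usage sketch for get_model (assuming numpy and pymc3 as pm are imported):
# fit the NegativeBinomial variant to synthetic overdispersed counts.
import numpy as np
import pymc3 as pm

rng = np.random.RandomState(0)
# gamma-mixed Poisson counts are overdispersed, so the NB model is apt
data = rng.poisson(rng.gamma(2.0, 2.0, size=(500, 3)))
nb_model = get_model("NegativeBinomial", data)
with nb_model:
    trace = pm.sample(1000, tune=1000, chains=2, return_inferencedata=True)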
def nb_model(self, N=1000, tune=1000):
    dat = self.data
    mu = self.mu
    alpha = self.alpha
    dat = np.asarray(dat)
    dat[dat > 10] = 0   # zero out values above 10 ...
    dat = dat[dat > 0]  # ... then keep only the positive counts
    print(np.max(dat))
    with pm.Model() as model_n:
        mu = pm.Uniform('mu', lower=0, upper=mu)
        alpha = pm.Uniform('alpha', lower=0, upper=alpha)
        # y_pred = pm.NegativeBinomial('y_pred', mu=mu, alpha=alpha)
        y_est = pm.NegativeBinomial('y_est', mu=mu, alpha=alpha, observed=dat)
        trace_n = pm.sample(N, tune=tune, cores=2)
    return trace_n
def mcmcNegativeBinomial(data):
    """Generate a trace for the data"""
    with pm.Model() as model:
        # No strong prior knowledge about the Negative Binomial parameters,
        # so choose uniform priors. To be safe, make the possible range
        # larger than needed.
        alpha_rv = pm.Uniform('alpha_rv', 0.0, 3.0)
        mu_rv = pm.Uniform('mu_rv', 0.1, 30.0)
        score_rv = pm.NegativeBinomial('score_rv', mu=mu_rv, alpha=alpha_rv,
                                       observed=data)
        step = pm.NUTS()
        trace = pm.sample(step=step, draws=10000, chains=4, cores=4,
                          init='adapt_diag')
    graph = pm.model_to_graphviz(model)
    graph.render(filename='model', format='png')
    return trace
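# End-to-end sketch for mcmcNegativeBinomial on synthetic counts (assuming
# scipy; PyMC3's (mu, alpha) maps to scipy's nbinom(n, p) via n = alpha,
# p = alpha / (alpha + mu)):
import numpy as np
from scipy import stats

true_mu, true_alpha = 8.0, 1.5
data = stats.nbinom.rvs(true_alpha, true_alpha / (true_alpha + true_mu),
                        size=500)
trace = mcmcNegativeBinomial(data)
print(trace.get_values('mu_rv').mean(), trace.get_values('alpha_rv').mean())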
def visualize_trace(trace, data, desc='2007'):
    """Interpret the trace"""
    print(f"Visualize the probability distribution of two parameters.")
    alphas = trace.get_values('alpha_rv')
    mus = trace.get_values('mu_rv')
    fig, ax = plt.subplots(figsize=[9, 6], nrows=2)
    ax[0].hist(alphas, bins='auto', density=True)
    ax[0].set_title(f"Probability Distribution of Alpha ({desc})")
    ax[1].hist(mus, bins='auto', density=True)
    ax[1].set_title(f"Probability Distribution of Mu ({desc})")
    plt.tight_layout()
    plt.show()
    print(
        f"Reconstruct the PHQ score distribution using the mean values and "
        f"compare with the original data to see the goodness of fit."
    )
    mu_mean = np.mean(mus)
    alpha_mean = np.mean(alphas)
    with pm.Model() as model:
        score_rv = pm.NegativeBinomial('score_rv', mu=mu_mean,
                                       alpha=alpha_mean)
        x = score_rv.random(size=10000)
    # HACK: I don't know how to bound the model, so what I can do is cut off
    # the tail after getting the data. Luckily, however, little of the data
    # is larger than the boundary.
    x = x[x <= 27]
    plot_ecdf([x, data],
              labels=[
                  f"Random Data (mu={mu_mean:.3f}, alpha={alpha_mean:.3f})",
                  desc
              ],
              alphas=[1, 0.9])
    plot_hist([x, data],
              bins=[27, 27],
              labels=[
                  f"Random Data (mu={mu_mean:.3f}, alpha={alpha_mean:.3f})",
                  desc
              ],
              alphas=[1, 0.5])
    return alpha_mean, mu_mean
def test_set_initval():
    # Make sure the dependencies between variables are maintained when
    # generating initial values
    rng = np.random.RandomState(392)
    with pm.Model(rng_seeder=rng) as model:
        eta = pm.Uniform("eta", 1.0, 2.0, size=(1, 1))
        mu = pm.Normal("mu", sd=eta, initval=[[100]])
        alpha = pm.HalfNormal("alpha", initval=100)
        value = pm.NegativeBinomial("value", mu=mu, alpha=alpha)
    assert np.array_equal(model.initial_values[model.rvs_to_values[mu]],
                          np.array([[100.0]]))
    np.testing.assert_almost_equal(
        model.initial_values[model.rvs_to_values[alpha]], np.log(100))
    assert 50 < model.initial_values[model.rvs_to_values[value]] < 150

    # `Flat` cannot be sampled, so let's make sure that doesn't break initial
    # value computations
    with pm.Model() as model:
        x = pm.Flat("x")
        y = pm.Normal("y", x, 1)
    assert model.rvs_to_values[y] in model.initial_values
def run_mcmc(
    df,
    country="US",
    days_in_future=50,
    logy=True,
    totalPop=7e9,
    tune=5000,
    draws=1200,
):
    dates = df.index
    y = by_country.loc[:, country].values
    x = (dates - np.datetime64(dates[0])).days
    xplot = np.arange(x[-1] + days_in_future)
    p0 = np.log([2.3, 46, 2000])
    x0, cov = curve_fit(logistic_model, x, y, p0=p0, maxfev=10000)

    with pm.Model() as model:

        def logistic_cdf(x, la, lb, lc):
            a, b, c = la, tt.exp(lb), tt.exp(lc)
            return c / (1 + tt.exp(-(x - b) / a))

        # growthBound = pm.Bound(pm.Normal, lower=0)
        # loga = growthBound("loga", mu=tt.log(5), sd=3)
        growthBound = pm.Bound(pm.Gamma, lower=1)
        a = growthBound("loga", alpha=3.5, beta=1)
        logb = pm.Normal("logb", mu=tt.log(150), sd=3)
        popBound = pm.Bound(
            pm.Normal, upper=tt.log(totalPop), lower=tt.log(y[-1])
        )
        logc = popBound("logc", mu=np.log(0.1 * totalPop), sd=5)
        # switching to an InvGamma prior on sd, cos its the conjugate
        # prior of the normal distribution with unknown sd
        # logsd = pm.Normal("logsd", mu=2, sd=2)
        mask = y > 50
        sd = pm.InverseGamma(
            "logsd",
            mu=np.std(y[mask] / x[mask]),
            sd=np.std(y[mask] / x[mask]) / len(x[mask]),
        )
        mod = logistic_cdf(x.values[mask], a, logb, logc)
        # pm.Normal("obs", mu=mod, sd=sd, observed=y[mask])
        # move to Negative Binomial
        obs = pm.NegativeBinomial('obs', mu=mod, alpha=sd, observed=y[mask])
        mod_eval = pm.Deterministic(
            "mod_eval", logistic_cdf(xplot, a, logb, logc)
        )
        map_params = optimize()
        trace = pm.sample(
            draws=draws,
            tune=tune,
            chains=2,
            cores=2,
            start=map_params,
            target_accept=0.9,
            progressbar=False,
        )

    q = np.percentile(trace["mod_eval"], q=[50, 90, 10], axis=0)
    if logy:
        p = plotting.figure(y_axis_type="log", x_axis_type="datetime")
        p.yaxis.formatter = FuncTickFormatter(code=code)
    else:
        p = plotting.figure(y_axis_type="linear", x_axis_type="datetime")
    # ln = p.line(
    #     [dates[0] + datetime.timedelta(days=x) for x in range(0, xplot[-1])],
    #     q[0],
    #     line_width=2,
    # )
    ln = p.line(
        [dates[0] + datetime.timedelta(days=x) for x in range(0, xplot[-1])],
        np.mean(trace["mod_eval"], axis=0),
        line_width=2,
    )
    p.line(
        [dates[0] + datetime.timedelta(days=x) for x in range(0, xplot[-1])],
        q[1],
        line_dash="dashed",
        line_width=1,
    )
    p.line(
        [dates[0] + datetime.timedelta(days=x) for x in range(0, xplot[-1])],
        q[2],
        line_dash="dashed",
        line_width=1,
    )
    p.circle(dates, y, color=colors[1])
    p.y_range = Range1d(10, 1.2 * np.max(q[1]))
    p.yaxis.formatter = FuncTickFormatter(code=code)
    legend_it = [(country, [ln])]
    legend = Legend(
        items=legend_it, location="top_right", orientation="horizontal"
    )
    legend.spacing = 17
    legend.click_policy = "hide"
    p.add_layout(legend, "above")
    label_opts = dict(
        x=dates[0] + datetime.timedelta(days=int(xplot[-1])),
        y=np.max(q[1]) * 1.1,
        text_align="right",
        text_font_size="9pt",
    )
    caption = Label(
        text=f'Created by Tom Barclay on {datetime.datetime.now().strftime("%b %d, %Y")}',
        **label_opts,
    )
    p.add_layout(caption, "below")
    script, div = components(p)
    embedfile = (
        f"_includes/{country.replace(' ', '')}_infections_mcmc_embed.html"
    )
    with open(embedfile, "w") as ff:
        ff.write(div)
        ff.write(script)
    return [
        f'{(dates[0] + datetime.timedelta(days=np.mean(np.exp(trace["logb"])))).strftime("%b %d, %Y")}',
        [
            np.mean(np.exp(trace["logc"])),
            *np.percentile(np.exp(trace["logc"]), [90, 10]),
        ],
    ]
def __init__( self, cell_state_mat: np.ndarray, X_data: np.ndarray, n_comb: int = 50, data_type: str = "float32", n_iter=20000, learning_rate=0.005, total_grad_norm_constraint=200, verbose=True, var_names=None, var_names_read=None, obs_names=None, fact_names=None, sample_id=None, gene_level_prior={ "mean": 1 / 2, "sd": 1 / 4 }, gene_level_var_prior={"mean_var_ratio": 1.0}, cell_number_prior={ "cells_per_spot": 8.0, "factors_per_spot": 7.0, "combs_per_spot": 2.5 }, cell_number_var_prior={ "cells_mean_var_ratio": 1.0, "factors_mean_var_ratio": 1.0, "combs_mean_var_ratio": 1.0 }, phi_hyp_prior={ "mean": 3.0, "sd": 1.0 }, spot_fact_mean_var_ratio=5.0, exper_gene_level_mean_var_ratio=10, ): ############# Initialise parameters ################ super().__init__( cell_state_mat, X_data, data_type, n_iter, learning_rate, total_grad_norm_constraint, verbose, var_names, var_names_read, obs_names, fact_names, sample_id, ) for k in gene_level_var_prior.keys(): gene_level_prior[k] = gene_level_var_prior[k] self.gene_level_prior = gene_level_prior self.phi_hyp_prior = phi_hyp_prior self.n_comb = n_comb self.spot_fact_mean_var_ratio = spot_fact_mean_var_ratio self.exper_gene_level_mean_var_ratio = exper_gene_level_mean_var_ratio # generate parameters for samples self.spot2sample_df = pd.get_dummies(sample_id) # convert to np.ndarray self.spot2sample_mat = self.spot2sample_df.values self.n_exper = self.spot2sample_mat.shape[1] # assign extra data to dictionary with (1) shared parameters (2) input data self.extra_data_tt = { "spot2sample": theano.shared(self.spot2sample_mat.astype(self.data_type)) } self.extra_data = { "spot2sample": self.spot2sample_mat.astype(self.data_type) } cell_number_prior["factors_per_combs"] = ( cell_number_prior["factors_per_spot"] / cell_number_prior["combs_per_spot"]) for k in cell_number_var_prior.keys(): cell_number_prior[k] = cell_number_var_prior[k] self.cell_number_prior = cell_number_prior ############# Define the model ################ self.model = pm.Model() with self.model: # =====================Gene expression level scaling======================= # # Explains difference in expression between genes and # how it differs in single cell and spatial technology # compute hyperparameters from mean and sd shape = gene_level_prior["mean"]**2 / gene_level_prior["sd"]**2 rate = gene_level_prior["mean"] / gene_level_prior["sd"]**2 shape_var = shape / gene_level_prior["mean_var_ratio"] rate_var = rate / gene_level_prior["mean_var_ratio"] self.gene_level_alpha_hyp = pm.Gamma("gene_level_alpha_hyp", mu=shape, sigma=np.sqrt(shape_var), shape=(1, 1)) self.gene_level_beta_hyp = pm.Gamma("gene_level_beta_hyp", mu=rate, sigma=np.sqrt(rate_var), shape=(1, 1)) # global gene levels self.gene_level = pm.Gamma("gene_level", self.gene_level_alpha_hyp, self.gene_level_beta_hyp, shape=(self.n_var, 1)) # scale cell state factors by gene_level self.gene_factors = pm.Deterministic("gene_factors", self.cell_state) # self.gene_factors = self.cell_state # tt.printing.Print('gene_factors sum')(gene_factors.sum(0).shape) # tt.printing.Print('gene_factors sum')(gene_factors.sum(0)) # =====================Spot factors======================= # # prior on spot factors reflects the number of cells, fraction of their cytoplasm captured, # times heterogeniety in the total number of mRNA between individual cells with each cell type self.cells_per_spot = pm.Gamma( "cells_per_spot", mu=cell_number_prior["cells_per_spot"], sigma=np.sqrt(cell_number_prior["cells_per_spot"] / 
cell_number_prior["cells_mean_var_ratio"]), shape=(self.n_obs, 1), ) self.comb_per_spot = pm.Gamma( "combs_per_spot", mu=cell_number_prior["combs_per_spot"], sigma=np.sqrt(cell_number_prior["combs_per_spot"] / cell_number_prior["combs_mean_var_ratio"]), shape=(self.n_obs, 1), ) shape = self.comb_per_spot / np.array(self.n_comb).reshape((1, 1)) rate = tt.ones((1, 1)) / self.cells_per_spot * self.comb_per_spot self.combs_factors = pm.Gamma("combs_factors", alpha=shape, beta=rate, shape=(self.n_obs, self.n_comb)) self.factors_per_combs = pm.Gamma( "factors_per_combs", mu=cell_number_prior["factors_per_combs"], sigma=np.sqrt(cell_number_prior["factors_per_combs"] / cell_number_prior["factors_mean_var_ratio"]), shape=(self.n_comb, 1), ) c2f_shape = self.factors_per_combs / np.array(self.n_fact).reshape( (1, 1)) self.comb2fact = pm.Gamma("comb2fact", alpha=c2f_shape, beta=self.factors_per_combs, shape=(self.n_comb, self.n_fact)) self.spot_factors = pm.Gamma( "spot_factors", mu=pm.math.dot(self.combs_factors, self.comb2fact), sigma=pm.math.sqrt( pm.math.dot(self.combs_factors, self.comb2fact) / self.spot_fact_mean_var_ratio), shape=(self.n_obs, self.n_fact), ) # =====================Spot-specific additive component======================= # # molecule contribution that cannot be explained by cell state signatures # these counts are distributed between all genes not just expressed genes self.spot_add_hyp = pm.Gamma("spot_add_hyp", 1, 1, shape=2) self.spot_add = pm.Gamma("spot_add", self.spot_add_hyp[0], self.spot_add_hyp[1], shape=(self.n_obs, 1)) # =====================Gene-specific additive component ======================= # # per gene molecule contribution that cannot be explained by cell state signatures # these counts are distributed equally between all spots (e.g. background, free-floating RNA) self.gene_add_hyp = pm.Gamma("gene_add_hyp", 1, 1, shape=2) self.gene_add = pm.Gamma("gene_add", self.gene_add_hyp[0], self.gene_add_hyp[1], shape=(self.n_exper, self.n_var)) # =====================Gene-specific overdispersion ======================= # self.phi_hyp = pm.Gamma("phi_hyp", mu=phi_hyp_prior["mean"], sigma=phi_hyp_prior["sd"], shape=(1, 1)) self.gene_E = pm.Exponential("gene_E", self.phi_hyp, shape=(self.n_exper, self.n_var)) # =====================Expected expression ======================= # # expected expression self.mu_biol = ( pm.math.dot(self.spot_factors, self.gene_factors.T) * self.gene_level.T + pm.math.dot(self.extra_data_tt["spot2sample"], self.gene_add) + self.spot_add) # tt.printing.Print('mu_biol')(self.mu_biol.shape) # =====================DATA likelihood ======================= # # Likelihood (sampling distribution) of observations & add overdispersion via NegativeBinomial / Poisson self.data_target = pm.NegativeBinomial( "data_target", mu=self.mu_biol, alpha=pm.math.dot(self.extra_data_tt["spot2sample"], 1 / tt.pow(self.gene_E, 2)), observed=self.x_data, total_size=self.X_data.shape, ) # =====================Compute nUMI from each factor in spots ======================= # self.nUMI_factors = pm.Deterministic( "nUMI_factors", (self.spot_factors * (self.gene_factors * self.gene_level).sum(0)))
    # mu = pm.Uniform('mu', 0, 10)
    beta = pm.Normal('beta', 0, 20, shape=companiesABC)
    beta1 = pm.Normal('beta1', 0, 20, shape=companiesABC)
    beta2 = pm.Normal('beta2', 0, 10)
    # theta = pm.Uniform('theta', lower=0, upper=10)
    muu = tt.printing.Print('beta2')(beta2)  # debug print of beta2
    mu = pm.Deterministic(
        'mu',
        tt.exp(beta[companyABC] + beta1[companyABC] * elec_year1 +
               beta2 * elec_tem1))
    # mu = tt.exp(beta + beta1 * elec_year + beta2 * elec_tem)
    # mu = pm.math.exp(theta)
    # Observed_pred = pm.NegativeBinomial("Observed_pred", mu=mu, alpha=sigma,
    #                                     shape=elec_faults.shape)  # observed values
    Observed = pm.NegativeBinomial("Observed", mu=mu, alpha=sigma,
                                   observed=elec_faults1)  # observed values
    start = pm.find_MAP()
    # step1 = pm.Slice([beta, beta1, beta2])
    # step = pm.Metropolis()
    trace2 = pm.sample(2000, start=start, tune=1000)

chain2 = trace2
varnames1 = ['beta', 'beta1', 'beta2', 'sigma', 'mu']
varnames2 = ['beta', 'beta1', 'beta2', 'sigma']
pm.plot_posterior(chain2, varnames1)
plt.show()

map_estimate = pm.find_MAP(model=unpooled_model)
print(map_estimate)
def build(self): """ Builds and returns the Generative model. Also sets self.model """ p_delay = get_delay_distribution() nonzero_days = self.observed.total.gt(0) len_observed = len(self.observed) convolution_ready_gt = self._get_convolution_ready_gt(len_observed) x = np.arange(len_observed)[:, None] coords = { "date": self.observed.index.values, "nonzero_date": self.observed.index.values[self.observed.total.gt(0)], } with pm.Model(coords=coords) as self.model: # Let log_r_t walk randomly with a fixed prior of ~0.035. Think # of this number as how quickly r_t can react. log_r_t = pm.GaussianRandomWalk("log_r_t", sigma=0.035, dims=["date"]) r_t = pm.Deterministic("r_t", pm.math.exp(log_r_t), dims=["date"]) # For a given seed population and R_t curve, we calculate the # implied infection curve by simulating an outbreak. While this may # look daunting, it's simply a way to recreate the outbreak # simulation math inside the model: # https://staff.math.su.se/hoehle/blog/2020/04/15/effectiveR0.html seed = pm.Exponential("seed", 1 / 0.02) y0 = tt.zeros(len_observed) y0 = tt.set_subtensor(y0[0], seed) outputs, _ = theano.scan( fn=lambda t, gt, y, r_t: tt.set_subtensor( y[t], tt.sum(r_t * y * gt)), sequences=[tt.arange(1, len_observed), convolution_ready_gt], outputs_info=y0, non_sequences=r_t, n_steps=len_observed - 1, ) infections = pm.Deterministic("infections", outputs[-1], dims=["date"]) # Convolve infections to confirmed positive reports based on a known # p_delay distribution. See patients.py for details on how we calculate # this distribution. test_adjusted_positive = pm.Deterministic( "test_adjusted_positive", conv2d( tt.reshape(infections, (1, len_observed)), tt.reshape(p_delay, (1, len(p_delay))), border_mode="full", )[0, :len_observed], dims=["date"]) # Picking an exposure with a prior that exposure never goes below # 0.1 * max_tests. The 0.1 only affects early values of Rt when # testing was minimal or when data errors cause underreporting # of tests. tests = pm.Data("tests", self.observed.total.values, dims=["date"]) exposure = pm.Deterministic("exposure", pm.math.clip( tests, self.observed.total.max() * 0.1, 1e9), dims=["date"]) # Test-volume adjust reported cases based on an assumed exposure # Note: this is similar to the exposure parameter in a Poisson # regression. positive = pm.Deterministic("positive", exposure * test_adjusted_positive, dims=["date"]) # Save data as part of trace so we can access in inference_data observed_positive = pm.Data("observed_positive", self.observed.positive.values, dims=["date"]) nonzero_observed_positive = pm.Data( "nonzero_observed_positive", self.observed.positive[nonzero_days.values].values, dims=["nonzero_date"]) positive_nonzero = pm.NegativeBinomial( "nonzero_positive", mu=positive[nonzero_days.values], alpha=pm.Gamma("alpha", mu=6, sigma=1), observed=nonzero_observed_positive, dims=["nonzero_date"]) return self.model
def build_model( observed: pandas.DataFrame, p_generation_time: numpy.ndarray, p_delay: numpy.ndarray, test_col: str, buffer_days: int = 10, pmodel: typing.Optional[pymc3.Model] = None, ) -> pymc3.Model: """ Builds the Rt.live PyMC3 model. Model by Kevin Systrom, Thomas Vladek and Rtlive contributors. Parameters ---------- observed : pandas.DataFrame date-indexed dataframe with column "new_cases" (daily positives) and a column of daily tests whose name is specified by parameter [test_col] p_generation_time : numpy.ndarray numpy array that describes the generation time distribution p_delay : numpy.ndarray numpy array that describes the testing delay distribution test_col : str name of column with daily new tests (predicted or actual data) buffer_days : int number of days to prepend before the beginning of the data pmodel : optional, PyMC3 model an existing PyMC3 model object to use (not context-activated) Returns ------- pmodel : pymc3.Model the (created) PyMC3 model """ observed = observed.rename(columns={test_col: "daily_tests"}) # Reindex to make sure that there are no gaps. # Also add (unobserved) buffer days at the beginning. observed = _reindex_observed(observed, buffer_days) # make boolean masks to filter for dates that have case data, testcount data or both has_cases = ~numpy.isnan(observed.new_cases).values has_testcounts = ~numpy.isnan(observed.daily_tests).values has_data = has_cases & has_testcounts # masks that can be used w.r.t. subsets of the dates. # These are used to slice tensors that are already shorter than the full length. has_data_wrt_cases = has_data[has_cases] has_data_wrt_testcounts = has_data[has_testcounts] coords = { # this is the full lenght of dates (without gaps) covered by the generative part of the model "date": observed.index.values, # these are subsets of dates where case/testcount data is available "date_with_cases": observed.index.values[has_cases], "date_with_testcounts": observed.index.values[has_testcounts], # and the dates with both case & testcount data (for the likelihood) "date_with_data": observed.index.values[has_data], } N_dates = len(coords["date"]) N_with_cases = len(coords["date_with_cases"]) N_with_testcounts = len(coords["date_with_testcounts"]) N_with_data = len(coords["date_with_data"]) _log.info( "The model describes %i days of which %i have case data and %i have testcount data. %i days have both.", N_dates, N_with_cases, N_with_testcounts, N_with_data) if not pmodel: pmodel = pymc3.Model(coords=coords) with pmodel: # Let log_r_t walk randomly with a fixed prior of ~0.035. Think # of this number as how quickly r_t can react. log_r_t = pymc3.GaussianRandomWalk("log_r_t", sigma=0.035, dims=["date"]) r_t = pymc3.Deterministic("r_t", pymc3.math.exp(log_r_t), dims=["date"]) # Save data as part of trace so we can access in inference_data t_generation_time = pymc3.Data("p_generation_time", p_generation_time) # precompute generation time interval vector to speed up tt.scan convolution_ready_gt = _to_convolution_ready_gt( p_generation_time, N_dates) # For a given seed population and R_t curve, we calculate the # implied infection curve by simulating an outbreak. 
While this may # look daunting, it's simply a way to recreate the outbreak # simulation math inside the model: # https://staff.math.su.se/hoehle/blog/2020/04/15/effectiveR0.html seed = pymc3.Exponential("seed", 1 / 0.02) y0 = tt.zeros(N_dates) y0 = tt.set_subtensor(y0[0], seed) outputs, _ = theano.scan( fn=lambda t, gt, y, r_t: tt.set_subtensor(y[t], tt.sum(r_t * y * gt )), sequences=[tt.arange(1, N_dates), convolution_ready_gt], outputs_info=y0, non_sequences=r_t, n_steps=N_dates - 1, ) infections = pymc3.Deterministic("infections", outputs[-1], dims=["date"]) t_p_delay = pymc3.Data("p_delay", p_delay) # Convolve infections to confirmed positive reports based on a known # p_delay distribution. See patients.py for details on how we calculate # this distribution. test_adjusted_positive = pymc3.Deterministic( "test_adjusted_positive", theano.tensor.signal.conv.conv2d( tt.reshape(infections, (1, N_dates)), tt.reshape(t_p_delay, (1, len(p_delay))), border_mode="full", )[0, :N_dates], dims=["date"]) # Picking an exposure with a prior that exposure never goes below # 0.1 * max_tests. The 0.1 only affects early values of Rt when # testing was minimal or when data errors cause underreporting # of tests. tests = pymc3.Data("tests", observed.daily_tests[has_testcounts], dims=["date_with_testcounts"]) exposure = pymc3.Deterministic("exposure", pymc3.math.clip( tests, observed.daily_tests.max() * 0.1, 1e9), dims=["date_with_testcounts"]) # Test-volume adjust reported cases based on an assumed exposure # Note: this is similar to the exposure parameter in a Poisson # regression. positive = pymc3.Deterministic("positive", exposure * test_adjusted_positive[has_testcounts], dims=["date_with_testcounts"]) positive_where_data = pymc3.Deterministic( "positive_where_data", positive[has_data_wrt_testcounts], dims=["date_with_data"]) observed_positive = pymc3.Data("observed_positive", observed.new_cases[has_cases], dims=["date_with_cases"]) observed_positive_where_data = pymc3.Data( "observed_positive_where_data", observed.new_cases[has_cases][has_data_wrt_cases], dims=["date_with_data"]) likelihood = pymc3.NegativeBinomial( "likelihood", mu=positive_where_data, alpha=pymc3.Gamma("alpha", mu=6, sigma=1), observed=observed_positive_where_data, dims=["date_with_data"]) return pmodel
def run(region, folder, load_trace=False, compute_sim=True, plot_posterior_dist = True): print("started ... " + region) if not os.path.exists(region): os.makedirs(region) # observed data (t_obs, datetimes, y_obs, n_pop, shutdown_day, u0, _) = data_fetcher.read_region_data(folder, region) y_obs = y_obs.astype(np.float64) u0 = u0.astype(np.float64) # set eqn eqn = Seir() eqn.population = n_pop eqn.tau = shutdown_day # set ode solver ti = t_obs[0] tf = t_obs[-1] m = 2 n_steps = m*(tf - ti) rk = RKSolverSeir(ti, tf, n_steps) rk.rk_type = "explicit_euler" rk.output_frequency = m rk.set_output_storing_flag(True) rk.equation = eqn du0_dp = np.zeros((eqn.n_components(), eqn.n_parameters())) rk.set_initial_condition(u0, du0_dp) rk.set_output_gradient_flag(True) # sample posterior with pm.Model() as model: # set prior distributions #beta = pm.Lognormal('beta', mu = math.log(0.4/n_pop), sigma = 0.4) #sigma = pm.Lognormal('sigma', mu = math.log(0.3), sigma = 0.5) #gamma = pm.Lognormal('gamma', mu = math.log(0.25), sigma = 0.5) #kappa = pm.Lognormal('kappa', mu = math.log(0.1), sigma = 0.5) #beta = pm.Normal('beta', mu = 0.4/n_pop, sigma = 0.06/n_pop) #sigma = pm.Normal('sigma', mu = 0.6, sigma = 0.1) #gamma = pm.Normal('gamma', mu = 0.3, sigma = 0.07) #kappa = pm.Normal('kappa', mu = 0.5, sigma = 0.1) #tint = pm.Lognormal('tint', mu = math.log(30), sigma = 1) beta = pm.Lognormal('beta', mu = math.log(0.1), sigma = 0.5) #math.log(0.3/n_pop), sigma = 0.5) sigma = pm.Lognormal('sigma', mu = math.log(0.05), sigma = 0.6) gamma = pm.Lognormal('gamma', mu = math.log(0.05), sigma = 0.6) kappa = pm.Lognormal('kappa', mu = math.log(0.2), sigma = 0.3) # math.log(0.001), sigma = 0.8) tint = pm.Lognormal('tint', mu = math.log(30), sigma = math.log(10)) dispersion = pm.Normal('dispersion', mu = 30., sigma = 10.) 
# set cached_sim object cached_sim = CachedSEIRSimulation(rk) # set theano model op object model = ModelOp(cached_sim) # set likelihood distribution y_sim = pm.NegativeBinomial('y_sim', mu=model((beta, sigma, gamma, kappa, tint)), alpha=dispersion, observed=y_obs) if not load_trace: # sample posterior distribution and save trace draws = 1000 #1000 tune = 500 #500 trace = pm.sample(draws=draws, tune=tune, cores=4, chains=4, nuts_kwargs=dict(target_accept=0.9), init='advi+adapt_diag') # using NUTS sampling # save trace pm.backends.text.dump(region + os.path.sep, trace) else: # load trace trace = pm.backends.text.load(region + os.path.sep) if plot_posterior_dist: # plot posterior distributions of all parameters data = az.from_pymc3(trace=trace) pm.plots.traceplot(data, legend=True) plt.savefig(region + os.path.sep + "trace_plot.pdf") az.plot_posterior(data, hdi_prob = 0.95) plt.savefig(region + os.path.sep + "post_dist.pdf") if compute_sim: #rk.set_output_gradient_flag(False) n_predictions = 7 rk.final_time = rk.final_time + n_predictions rk.n_steps = rk.n_steps + m*n_predictions y_sims = pm.sample_posterior_predictive(trace)['y_sim'][:,0,:] np.savetxt(region + os.path.sep + "y_sims.csv", y_sims, delimiter = ',') mean_y = np.mean(y_sims,axis=0) upper_y = np.percentile(y_sims,q=97.5,axis=0) lower_y = np.percentile(y_sims,q=2.5,axis=0) # plots dates = [dt.datetime.strptime(date, "%Y-%m-%d").date() for date in datetimes] pred_dates = dates + [dates[-1] + dt.timedelta(days=i) for i in range(1,1 + n_predictions)] np.savetxt(region + os.path.sep + "y_obs.csv", y_obs, delimiter = ',') dates_csv = pd.DataFrame(pred_dates).to_csv(region + os.path.sep + 'dates.csv', header=False, index=False) # linear plot font_size = 12 fig, ax = plt.subplots(figsize=(10, 10)) ax.plot(dates, y_obs, 'x', color='k', label='reported data') import matplotlib.dates as mdates ax.xaxis.set_major_formatter(mdates.DateFormatter('%d %b')) ax.xaxis.set_major_locator(mdates.DayLocator(bymonthday=(1,15))) plt.title(region[0].upper() + region[1:].lower() + "'s daily infections", fontsize = font_size) plt.xlabel('Date', fontsize = font_size) plt.ylabel('New daily infections', fontsize = font_size) ax.tick_params(axis='both', which='major', labelsize=10) # plot propagated uncertainty plt.plot(pred_dates, mean_y, color='g', lw=2, label='mean') plt.fill_between(pred_dates, lower_y, upper_y, color='darkseagreen', label='95% credible interval') plt.legend(loc='upper left') fig.autofmt_xdate() plt.savefig(region + os.path.sep + "linear.pdf") # log plot plt.yscale('log') plt.savefig(region + os.path.sep + "log.pdf") print("finished ... " + region)
def __init__(self, cell_state_mat: np.ndarray, X_data: np.ndarray, Y_data: np.ndarray, n_comb: int = 50, data_type: str = 'float32', n_iter=20000, learning_rate=0.005, total_grad_norm_constraint=200, verbose=True, var_names=None, var_names_read=None, obs_names=None, fact_names=None, sample_id=None, gene_level_prior={ 'mean': 1 / 2, 'sd': 1 / 4 }, gene_level_var_prior={'mean_var_ratio': 1}, cell_number_prior={ 'cells_per_spot': 8, 'factors_per_spot': 7, 'combs_per_spot': 2.5 }, cell_number_var_prior={ 'cells_mean_var_ratio': 1, 'factors_mean_var_ratio': 1, 'combs_mean_var_ratio': 1 }, phi_hyp_prior={ 'mean': 3, 'sd': 1 }, spot_fact_mean_var_ratio=0.5): ############# Initialise parameters ################ super().__init__(cell_state_mat, X_data, data_type, n_iter, learning_rate, total_grad_norm_constraint, verbose, var_names, var_names_read, obs_names, fact_names, sample_id) self.Y_data = Y_data self.n_npro = Y_data.shape[1] self.y_data = theano.shared(Y_data.astype(self.data_type)) self.n_rois = Y_data.shape[0] # Total number of gene counts in each region of interest, divided by 10^5: self.l_r = np.array([np.sum(X_data[i, :]) for i in range(self.n_rois) ]).reshape(self.n_rois, 1) * 10**(-5) for k in gene_level_var_prior.keys(): gene_level_prior[k] = gene_level_var_prior[k] self.gene_level_prior = gene_level_prior self.phi_hyp_prior = phi_hyp_prior self.n_comb = n_comb self.spot_fact_mean_var_ratio = spot_fact_mean_var_ratio cell_number_prior['factors_per_combs'] = ( cell_number_prior['factors_per_spot'] / cell_number_prior['combs_per_spot']) for k in cell_number_var_prior.keys(): cell_number_prior[k] = cell_number_var_prior[k] self.cell_number_prior = cell_number_prior ############# Define the model ################ self.model = pm.Model() with self.model: # ===================== Non-specific binding additive component ======================= # # Additive term for non-specific binding of gene probes are drawn from a gamma distribution with # the same mean and variance as for negative probes above. 
self.gene_add_hyp = pm.Gamma('gene_add_hyp', 1, 1, shape=2) self.gene_add = pm.Gamma('gene_add', self.gene_add_hyp[0], self.gene_add_hyp[1], shape=(self.n_genes, 1)) # =====================Gene expression level scaling======================= # # Explains difference in expression between genes and # how it differs in single cell and spatial technology # compute hyperparameters from mean and sd shape = gene_level_prior['mean']**2 / gene_level_prior['sd']**2 rate = gene_level_prior['mean'] / gene_level_prior['sd']**2 shape_var = shape / gene_level_prior['mean_var_ratio'] rate_var = rate / gene_level_prior['mean_var_ratio'] self.gene_level_alpha_hyp = pm.Gamma('gene_level_alpha_hyp', mu=shape, sigma=np.sqrt(shape_var), shape=(1, 1)) self.gene_level_beta_hyp = pm.Gamma('gene_level_beta_hyp', mu=rate, sigma=np.sqrt(rate_var), shape=(1, 1)) self.gene_level = pm.Gamma('gene_level', self.gene_level_alpha_hyp, self.gene_level_beta_hyp, shape=(self.n_genes, 1)) self.gene_factors = pm.Deterministic('gene_factors', self.cell_state) # =====================Spot factors======================= # # prior on spot factors reflects the number of cells, fraction of their cytoplasm captured, # times heterogeniety in the total number of mRNA between individual cells with each cell type self.cells_per_spot = pm.Gamma('cells_per_spot', mu=cell_number_prior['cells_per_spot'], sigma=np.sqrt(cell_number_prior['cells_per_spot'] \ / cell_number_prior['cells_mean_var_ratio']), shape=(self.n_cells, 1)) self.comb_per_spot = pm.Gamma('combs_per_spot', mu=cell_number_prior['combs_per_spot'], sigma=np.sqrt(cell_number_prior['combs_per_spot'] \ / cell_number_prior['combs_mean_var_ratio']), shape=(self.n_cells, 1)) shape = self.comb_per_spot / np.array(self.n_comb).reshape((1, 1)) rate = tt.ones((1, 1)) / self.cells_per_spot * self.comb_per_spot self.combs_factors = pm.Gamma('combs_factors', alpha=shape, beta=rate, shape=(self.n_cells, self.n_comb)) self.factors_per_combs = pm.Gamma('factors_per_combs', mu=cell_number_prior['factors_per_combs'], sigma=np.sqrt(cell_number_prior['factors_per_combs'] \ / cell_number_prior['factors_mean_var_ratio']), shape=(self.n_comb, 1)) c2f_shape = self.factors_per_combs / np.array(self.n_fact).reshape( (1, 1)) self.comb2fact = pm.Gamma('comb2fact', alpha=c2f_shape, beta=self.factors_per_combs, shape=(self.n_comb, self.n_fact)) self.spot_factors = pm.Gamma('spot_factors', mu=pm.math.dot(self.combs_factors, self.comb2fact), sigma=pm.math.sqrt(pm.math.dot(self.combs_factors, self.comb2fact) \ / self.spot_fact_mean_var_ratio), shape=(self.n_cells, self.n_fact)) # =====================Spot-specific additive component======================= # # molecule contribution that cannot be explained by cell state signatures # these counts are distributed between all genes not just expressed genes self.spot_add_hyp = pm.Gamma('spot_add_hyp', 1, 1, shape=2) self.spot_add = pm.Gamma('spot_add', self.spot_add_hyp[0], self.spot_add_hyp[1], shape=(self.n_cells, 1)) # =====================Gene-specific overdispersion ======================= # self.phi_hyp = pm.Gamma('phi_hyp', mu=phi_hyp_prior['mean'], sigma=phi_hyp_prior['sd'], shape=(1, 1)) self.gene_E = pm.Exponential('gene_E', self.phi_hyp, shape=(self.n_genes, 1)) # =====================Expected expression ======================= # # Expected counts for negative probes and gene probes concatenated into one array. Note that non-specific binding # scales linearly with the total number of counts (l_r) in this model. 
self.mu_biol = pm.math.dot(self.spot_factors, self.gene_factors.T) * self.gene_level.T \ + self.gene_add.T + self.spot_add # =====================DATA likelihood ======================= # # Likelihood (sampling distribution) of observations & add overdispersion via NegativeBinomial / Poisson self.data_target = pm.NegativeBinomial( 'data_target', mu=self.mu_biol, alpha=1 / (self.gene_E.T * self.gene_E.T), observed=self.x_data) # =====================Compute nUMI from each factor in spots ======================= # self.nUMI_factors = pm.Deterministic( 'nUMI_factors', (self.spot_factors * (self.gene_factors * self.gene_level).sum(0)))
# Convert categorical variables to integer
le = preprocessing.LabelEncoder()
data_idx = le.fit_transform(data.Hour)
hours = le.classes_
n_hours = len(hours)

for h in [8]:
    print('Hour: ', h)
    with pm.Model() as model:
        alpha = pm.Uniform('alpha', lower=0, upper=20)
        mu = pm.Uniform('mu', lower=0, upper=20)
        y_obs = data[data.Hour == h]['Connected'].values
        y_est = pm.NegativeBinomial('y_est', mu=mu, alpha=alpha,
                                    observed=y_obs)
        y_pred = pm.NegativeBinomial('y_pred', mu=mu, alpha=alpha)
        trace = pm.sample(25, progressbar=True)
    indiv_traces[h] = trace

# %% Plot NegBino traces per hour
fig, axs = plt.subplots(n_hours, 2, figsize=(10, 48))
axs = axs.ravel()
colLeft = np.arange(0, 48, 2)
colRight = np.arange(1, 48, 2)
def lohhla_clone_model(sample_ids, tree_edges, clonal_prevalence_mat, cellularity, ploidy_values, tumour_sample_reads, normal_sample_reads, integercpn_info, all_genotypes, transition_inputs, stayrate_alpha=0.9, stayrate_beta=0.1, sd=0.5, nb_alpha=0.5, iter_count=20000, tune_iters=20000, anchor_type='nb', anchor_mode='snvcn', nchains=2, njobs=2): ''' stayrate_alpha: Beta prior alpha-parameter on stayrate in clone tree Markov chain stayrate_beta: Beta prior beta-parameter on stayrate in clone tree Markov chain all_genotypes: Dataframe of genotypes, 0-indexed ''' num_nodes = clonal_prevalence_mat.shape[1] valid_transitions = transition_inputs['valid_transitions'] num_transitions = transition_inputs['num_transitions'] num_genotypes = transition_inputs['num_genotypes'] cn_genotype_matrix = transition_inputs['cn_genotype_matrix'] ## Beta-binomial dispersion (higher = less dispersed) dispersion = 200. ## Tree edges edges = tree_edges.as_matrix().astype(int) - 1 with pm.Model() as model: BoundedNormal = pm.Bound(pm.Normal, lower=0., upper=1.) stay_rate = BoundedNormal('stayrate', mu=0.75, sd=0.4) P = np.zeros(shape=(num_genotypes, num_genotypes)) P = P + tt.eye(num_genotypes) * stay_rate fill_values = tt.as_tensor((1. - stay_rate) / num_transitions) fill_values = tt.set_subtensor(fill_values[0], 0) P = P + valid_transitions * fill_values[:, np.newaxis] P = tt.set_subtensor(P[0, 0], 1.) A = tt.dmatrix('A') PA = tt.ones(shape=(num_genotypes)) / num_genotypes states = CloneTreeGenotypes('genotypes', PA=PA, P=P, edges=edges, k=num_genotypes, shape=(num_nodes)) total_cns = theano.shared(np.array(all_genotypes['total_cn'].values)) alt_cns = theano.shared(np.array(all_genotypes['alt_cn'].values)) total_cn = pm.Deterministic('total_cn', total_cns[states]) alt_cn = pm.Deterministic('alt_cn', alt_cns[states]) sample_alt_copies = tt.dot(clonal_prevalence_mat, alt_cn ) * cellularity + (1. - cellularity) * 1. vafs = sample_alt_copies / ( tt.dot(clonal_prevalence_mat, total_cn) * cellularity + (1. - cellularity) * 2.) pm.Deterministic('vafs', vafs) alphas = vafs * dispersion betas = (1 - vafs) * dispersion ## Copy number of tumour cells (aggregated over clones, but not including normal contamination) tutotalcn = pm.Deterministic('tutotalcn', tt.dot(clonal_prevalence_mat, total_cn)) ## Can't be vectorized further for j in range(len(sample_ids)): current_sample = sample_ids[j] total_counts = integercpn_info['TumorCov_type1'][ current_sample].values + integercpn_info['TumorCov_type2'][ current_sample].values alt_counts = integercpn_info['TumorCov_type2'][ current_sample].values alpha_sel = alphas[j] beta_sel = betas[j] ## Draw alternative allele counts for HLA locus for each polymorphic site alt_reads = pm.BetaBinomial('x_' + str(j), alpha=alpha_sel, beta=beta_sel, n=total_counts, observed=alt_counts) mult_factor_mean = (tumour_sample_reads[current_sample] / normal_sample_reads) ploidy = ploidy_values[j] ploidy_ratio = (tutotalcn[j] * cellularity[j] + (1 - cellularity[j]) * 2) / ( cellularity[j] * ploidy + (1 - cellularity[j]) * 2) if anchor_mode == 'snvcn': mult_factor_computed = pm.Deterministic( 'mult_factor_computed_' + str(j), 1. 
/ ploidy_ratio * (integercpn_info['Total_TumorCov'][current_sample].values / integercpn_info['Total_NormalCov'][current_sample].values) ) nloci = len( integercpn_info['Total_TumorCov'][current_sample].values) tumour_reads_observed = integercpn_info['Total_TumorCov'][ current_sample].values normal_reads_observed = integercpn_info['Total_NormalCov'][ current_sample].values elif anchor_mode == 'binmedian': binvar_tumour = 'combinedBinTumor' binvar_normal = 'combinedBinNormal' ## All within a bin are the same, so this is OK duplicated_entries = integercpn_info['binNum'][ current_sample].duplicated(keep='first') nloci = len(integercpn_info[binvar_tumour][current_sample] [~duplicated_entries].values) mult_factor_computed = pm.Deterministic( 'mult_factor_computed_' + str(j), (1. / ploidy_ratio * (integercpn_info[binvar_tumour][current_sample] [~duplicated_entries].values / integercpn_info[binvar_normal][current_sample] [~duplicated_entries].values))) tumour_reads_observed = integercpn_info[binvar_tumour][ current_sample][~duplicated_entries].values normal_reads_observed = integercpn_info[binvar_normal][ current_sample][~duplicated_entries].values else: raise Exception("Invalid option specified.") ## Draw ploidy-corrected tumour/normal locus coverage ratio for each polymorphic site if anchor_type == 'mult_factor': mult_factor = pm.Lognormal('mult_factor_' + str(j), mu=np.log(mult_factor_mean), sd=sd, observed=mult_factor_computed, shape=(nloci)) elif anchor_type == 'nb': tc_nc_ratio = pm.Deterministic( 'tc_nc_ratio_' + str(j), (tutotalcn[j] * cellularity[j] + (1 - cellularity[j]) * 2) / (ploidy * cellularity[j] + (1 - cellularity[j]) * 2)) tumoursamplecn = pm.Deterministic( 'tumoursamplecn_' + str(j), (tutotalcn[j] * cellularity[j] + (1 - cellularity[j]) * 2)) tumour_reads_mean = pm.Deterministic( 'tumour_reads_mean_' + str(j), tc_nc_ratio * mult_factor_mean * normal_reads_observed) tumour_reads = pm.NegativeBinomial( 'tumour_reads_' + str(j), mu=tumour_reads_mean, alpha=nb_alpha, observed=tumour_reads_observed) else: raise Exception('Must specify a valid model type.') pm.Deterministic('log_prob', model.logpt) step1 = pm.CategoricalGibbsMetropolis(vars=[states]) step2 = pm.Metropolis(vars=[stay_rate]) trace = pm.sample(iter_count, tune=tune_iters, step=[step1, step2], njobs=njobs, chains=nchains) return trace
def __init__( self, cell_state_mat: np.ndarray, X_data: np.ndarray, n_comb: int = 50, data_type: str = 'float32', n_iter=20000, learning_rate=0.005, total_grad_norm_constraint=200, verbose=True, var_names=None, var_names_read=None, obs_names=None, fact_names=None, sample_id=None, cell_number_prior={ 'cells_per_spot': 8, 'factors_per_spot': 7, 'combs_per_spot': 2.5 }, cell_number_var_prior={ 'cells_mean_var_ratio': 1, 'factors_mean_var_ratio': 1, 'combs_mean_var_ratio': 1 }, phi_hyp_prior={ 'mean': 3, 'sd': 1 }, spot_fact_mean_var_ratio=5, exper_gene_level_mean_var_ratio=10, ): ############# Initialise parameters ################ super().__init__(cell_state_mat, X_data, data_type, n_iter, learning_rate, total_grad_norm_constraint, verbose, var_names, var_names_read, obs_names, fact_names, sample_id) self.phi_hyp_prior = phi_hyp_prior self.n_comb = n_comb self.spot_fact_mean_var_ratio = spot_fact_mean_var_ratio self.exper_gene_level_mean_var_ratio = exper_gene_level_mean_var_ratio # generate parameters for samples self.spot2sample_df = pd.get_dummies(sample_id) # convert to np.ndarray self.spot2sample_mat = self.spot2sample_df.values self.n_exper = self.spot2sample_mat.shape[1] # assign extra data to dictionary with (1) shared parameters (2) input data self.extra_data_tt = { 'spot2sample': theano.shared(self.spot2sample_mat.astype(self.data_type)) } self.extra_data = { 'spot2sample': self.spot2sample_mat.astype(self.data_type) } cell_number_prior['factors_per_combs'] = ( cell_number_prior['factors_per_spot'] / cell_number_prior['combs_per_spot']) for k in cell_number_var_prior.keys(): cell_number_prior[k] = cell_number_var_prior[k] self.cell_number_prior = cell_number_prior ############# Define the model ################ self.model = pm.Model() with self.model: # =====================Gene expression level scaling======================= # # scale cell state factors by gene_level self.gene_factors = pm.Deterministic('gene_factors', self.cell_state) #self.gene_factors = self.cell_state # tt.printing.Print('gene_factors sum')(gene_factors.sum(0).shape) # tt.printing.Print('gene_factors sum')(gene_factors.sum(0)) # =====================Spot factors======================= # # prior on spot factors reflects the number of cells, fraction of their cytoplasm captured, # times heterogeniety in the total number of mRNA between individual cells with each cell type self.cells_per_spot = pm.Gamma('cells_per_spot', mu=cell_number_prior['cells_per_spot'], sigma=np.sqrt(cell_number_prior['cells_per_spot'] \ / cell_number_prior['cells_mean_var_ratio']), shape=(self.n_obs, 1)) self.comb_per_spot = pm.Gamma('combs_per_spot', mu=cell_number_prior['combs_per_spot'], sigma=np.sqrt(cell_number_prior['combs_per_spot'] \ / cell_number_prior['combs_mean_var_ratio']), shape=(self.n_obs, 1)) shape = self.comb_per_spot / np.array(self.n_comb).reshape((1, 1)) rate = tt.ones((1, 1)) / self.cells_per_spot * self.comb_per_spot self.combs_factors = pm.Gamma('combs_factors', alpha=shape, beta=rate, shape=(self.n_obs, self.n_comb)) self.factors_per_combs = pm.Gamma('factors_per_combs', mu=cell_number_prior['factors_per_combs'], sigma=np.sqrt(cell_number_prior['factors_per_combs'] \ / cell_number_prior['factors_mean_var_ratio']), shape=(self.n_comb, 1)) c2f_shape = self.factors_per_combs / np.array(self.n_fact).reshape( (1, 1)) self.comb2fact = pm.Gamma('comb2fact', alpha=c2f_shape, beta=self.factors_per_combs, shape=(self.n_comb, self.n_fact)) self.spot_factors = pm.Gamma('spot_factors', mu=pm.math.dot(self.combs_factors, 
self.comb2fact), sigma=pm.math.sqrt(pm.math.dot(self.combs_factors, self.comb2fact) \ / self.spot_fact_mean_var_ratio), shape=(self.n_obs, self.n_fact)) # =====================Spot-specific additive component======================= # # molecule contribution that cannot be explained by cell state signatures # these counts are distributed between all genes not just expressed genes self.spot_add_hyp = pm.Gamma('spot_add_hyp', 1, 1, shape=2) self.spot_add = pm.Gamma('spot_add', self.spot_add_hyp[0], self.spot_add_hyp[1], shape=(self.n_obs, 1)) # =====================Gene-specific additive component ======================= # # per gene molecule contribution that cannot be explained by cell state signatures # these counts are distributed equally between all spots (e.g. background, free-floating RNA) self.gene_add_hyp = pm.Gamma('gene_add_hyp', 1, 1, shape=2) self.gene_add = pm.Gamma('gene_add', self.gene_add_hyp[0], self.gene_add_hyp[1], shape=(self.n_exper, self.n_var)) # =====================Gene-specific overdispersion ======================= # self.phi_hyp = pm.Gamma('phi_hyp', mu=phi_hyp_prior['mean'], sigma=phi_hyp_prior['sd'], shape=(1, 1)) self.gene_E = pm.Exponential('gene_E', self.phi_hyp, shape=(self.n_exper, self.n_var)) # =====================Expected expression ======================= # # expected expression self.mu_biol = pm.math.dot(self.spot_factors, self.gene_factors.T) \ + pm.math.dot(self.extra_data_tt['spot2sample'], self.gene_add) + self.spot_add # tt.printing.Print('mu_biol')(self.mu_biol.shape) # =====================DATA likelihood ======================= # # Likelihood (sampling distribution) of observations & add overdispersion via NegativeBinomial / Poisson self.data_target = pm.NegativeBinomial( 'data_target', mu=self.mu_biol, alpha=pm.math.dot(self.extra_data_tt['spot2sample'], 1 / tt.pow(self.gene_E, 2)), observed=self.x_data, total_size=self.X_data.shape) # =====================Compute nUMI from each factor in spots ======================= # self.nUMI_factors = pm.Deterministic('nUMI_factors', (self.spot_factors * (self.gene_factors).sum(0)))
with pm.Model() as model:
    hyper_alpha_sd = pm.Uniform('hyper_alpha_sd', lower=0, upper=50)
    hyper_alpha_mu = pm.Uniform('hyper_alpha_mu', lower=0, upper=10)
    hyper_mu_sd = pm.Uniform('hyper_mu_sd', lower=0, upper=50)
    hyper_mu_mu = pm.Uniform('hyper_mu_mu', lower=0, upper=60)

    alpha = pm.Gamma('alpha', mu=hyper_alpha_mu, sd=hyper_alpha_sd, shape=n_participants)
    mu = pm.Gamma('mu', mu=hyper_mu_mu, sd=hyper_mu_sd, shape=n_participants)

    y_est = pm.NegativeBinomial('y_est',
                                mu=mu[participants_idx],
                                alpha=alpha[participants_idx],
                                observed=messages['time_delay_seconds'].values)
    y_pred = pm.NegativeBinomial('y_pred',
                                 mu=mu[participants_idx],
                                 alpha=alpha[participants_idx],
                                 shape=messages['prev_sender'].shape)

    start = pm.find_MAP()
    step = pm.Metropolis()
    hierarchical_trace = pm.sample(20000, step=step, progressbar=True)

_ = pm.traceplot(hierarchical_trace[12000:],
                 varnames=['mu', 'alpha',
                           'hyper_mu_mu', 'hyper_mu_sd',
                           'hyper_alpha_mu', 'hyper_alpha_sd'])
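After sampling, the per-participant parameters can be read straight off the trace. A short sketch, assuming `participants` holds the label-encoder classes as in the separate-models version:

burnin = 12000
mu_samples = hierarchical_trace['mu'][burnin:]        # shape: (draws, n_participants)
alpha_samples = hierarchical_trace['alpha'][burnin:]
for i, p in enumerate(participants):
    print('%s: posterior mean mu=%.1f, alpha=%.2f'
          % (p, mu_samples[:, i].mean(), alpha_samples[:, i].mean()))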
import numpy as np
import pymc3 as pm
from scipy.stats import nbinom

# NOTE: mu, x1, x2, Groups and NGroups are assumed to come from a
# simulation step that is not part of this snippet.
y = nbinom.rvs(mu, 0.5)

with pm.Model() as model:
    # Define priors
    alpha = pm.Uniform('alpha', 0, 100)
    sigma_a = pm.Uniform('sigma_a', 0, 10)
    beta1 = pm.Normal('beta1', 0, sd=100)
    beta2 = pm.Normal('beta2', 0, sd=100)
    beta3 = pm.Normal('beta3', 0, sd=100)

    # priors for random intercept (RI) parameters
    a_param = pm.Normal('a_param',
                        0,               # mean
                        sd=sigma_a,      # standard deviation
                        shape=NGroups)   # number of RI parameters

    eta = beta1 + beta2 * x1 + beta3 * x2 + a_param[Groups]

    # Define likelihood; named y_like so it does not shadow the observed array y
    y_like = pm.NegativeBinomial('y_like', mu=pm.math.exp(eta),
                                 alpha=alpha, observed=y)

    # Fit
    start = pm.find_MAP()          # Find starting value by optimisation
    step = pm.NUTS(scaling=start)  # Initiate sampling
    trace = pm.sample(7000, step, start=start)

# Print summary to screen
pm.summary(trace)
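A side note on parameterisation, useful when mixing scipy's nbinom with PyMC3's NegativeBinomial: pm.NegativeBinomial(mu, alpha) corresponds to scipy.stats.nbinom(n, p) with n = alpha and p = alpha / (alpha + mu). A standalone numeric check:

import numpy as np
from scipy.stats import nbinom

mu, alpha = 5.0, 2.0
n, p = alpha, alpha / (alpha + mu)
samples = nbinom.rvs(n, p, size=100000)
print(samples.mean())  # ~ mu = 5
print(samples.var())   # ~ mu + mu**2 / alpha = 17.5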
def __init__(self,
             cell_state_mat: np.ndarray,
             X_data: np.ndarray,
             n_comb: int = 50,
             data_type: str = 'float32',
             n_iter=20000,
             learning_rate=0.005,
             total_grad_norm_constraint=200,
             verbose=True,
             var_names=None, var_names_read=None,
             obs_names=None, fact_names=None, sample_id=None,
             gene_level_prior={'mean': 1 / 2, 'sd': 1 / 4},
             gene_level_var_prior={'mean_var_ratio': 1},
             cell_number_prior={'cells_per_spot': 8,
                                'factors_per_spot': 7,
                                'combs_per_spot': 2.5},
             cell_number_var_prior={'cells_mean_var_ratio': 1,
                                    'factors_mean_var_ratio': 1,
                                    'combs_mean_var_ratio': 1},
             phi_hyp_prior={'mean': 3, 'sd': 1},
             spot_fact_mean_var_ratio=5):

    ############# Initialise parameters ################
    super().__init__(cell_state_mat, X_data,
                     data_type, n_iter,
                     learning_rate, total_grad_norm_constraint,
                     verbose, var_names, var_names_read,
                     obs_names, fact_names, sample_id)

    for k in gene_level_var_prior.keys():
        gene_level_prior[k] = gene_level_var_prior[k]
    self.gene_level_prior = gene_level_prior
    self.phi_hyp_prior = phi_hyp_prior
    self.n_comb = n_comb
    self.spot_fact_mean_var_ratio = spot_fact_mean_var_ratio

    cell_number_prior['factors_per_combs'] = (cell_number_prior['factors_per_spot']
                                              / cell_number_prior['combs_per_spot'])
    for k in cell_number_var_prior.keys():
        cell_number_prior[k] = cell_number_var_prior[k]
    self.cell_number_prior = cell_number_prior

    ############# Define the model ################
    self.model = pm.Model()

    with self.model:

        # =====================Gene expression level scaling======================= #
        # Explains differences in expression between genes and how expression
        # differs between single-cell and spatial technologies

        # compute hyperparameters from mean and sd
        shape = gene_level_prior['mean'] ** 2 / gene_level_prior['sd'] ** 2
        rate = gene_level_prior['mean'] / gene_level_prior['sd'] ** 2
        shape_var = shape / gene_level_prior['mean_var_ratio']
        rate_var = rate / gene_level_prior['mean_var_ratio']

        n_g_prior = np.array(gene_level_prior['mean']).shape
        if len(n_g_prior) == 0:
            n_g_prior = 1
        else:
            n_g_prior = self.n_var

        self.gene_level_alpha_hyp = pm.Gamma('gene_level_alpha_hyp',
                                             mu=shape, sigma=np.sqrt(shape_var),
                                             shape=(n_g_prior, 1))
        self.gene_level_beta_hyp = pm.Gamma('gene_level_beta_hyp',
                                            mu=rate, sigma=np.sqrt(rate_var),
                                            shape=(n_g_prior, 1))
        self.gene_level = pm.Gamma('gene_level', self.gene_level_alpha_hyp,
                                   self.gene_level_beta_hyp, shape=(self.n_var, 1))

        # scale cell state factors by gene_level
        self.gene_factors = pm.Deterministic('gene_factors', self.cell_state)
        # tt.printing.Print('gene_factors sum')(gene_factors.sum(0).shape)
        # tt.printing.Print('gene_factors sum')(gene_factors.sum(0))

        # =====================Spot factors======================= #
        # prior on spot factors reflects the number of cells, the fraction of their cytoplasm
        # captured, and heterogeneity in the total amount of mRNA between individual cells
        # within each cell type
        self.cells_per_spot = pm.Gamma('cells_per_spot',
                                       mu=cell_number_prior['cells_per_spot'],
                                       sigma=np.sqrt(cell_number_prior['cells_per_spot']
                                                     / cell_number_prior['cells_mean_var_ratio']),
                                       shape=(self.n_obs, 1))
        self.comb_per_spot = pm.Gamma('combs_per_spot',
                                      mu=cell_number_prior['combs_per_spot'],
                                      sigma=np.sqrt(cell_number_prior['combs_per_spot']
                                                    / cell_number_prior['combs_mean_var_ratio']),
                                      shape=(self.n_obs, 1))

        shape = self.comb_per_spot / np.array(self.n_comb).reshape((1, 1))
        rate = tt.ones((1, 1)) / self.cells_per_spot * self.comb_per_spot
        self.combs_factors = pm.Gamma('combs_factors', alpha=shape, beta=rate,
                                      shape=(self.n_obs, self.n_comb))

        self.factors_per_combs = pm.Gamma('factors_per_combs',
                                          mu=cell_number_prior['factors_per_combs'],
                                          sigma=np.sqrt(cell_number_prior['factors_per_combs']
                                                        / cell_number_prior['factors_mean_var_ratio']),
                                          shape=(self.n_comb, 1))
        c2f_shape = self.factors_per_combs / np.array(self.n_fact).reshape((1, 1))
        self.comb2fact = pm.Gamma('comb2fact', alpha=c2f_shape, beta=self.factors_per_combs,
                                  shape=(self.n_comb, self.n_fact))

        self.spot_factors = pm.Gamma('spot_factors',
                                     mu=pm.math.dot(self.combs_factors, self.comb2fact),
                                     sigma=pm.math.sqrt(pm.math.dot(self.combs_factors, self.comb2fact)
                                                        / self.spot_fact_mean_var_ratio),
                                     shape=(self.n_obs, self.n_fact))

        # =====================Spot-specific additive component======================= #
        # molecule contribution that cannot be explained by cell state signatures;
        # these counts are distributed between all genes, not just expressed genes
        self.spot_add_hyp = pm.Gamma('spot_add_hyp', 1, 1, shape=2)
        self.spot_add = pm.Gamma('spot_add', self.spot_add_hyp[0], self.spot_add_hyp[1],
                                 shape=(self.n_obs, 1))

        # =====================Gene-specific additive component======================= #
        # per-gene molecule contribution that cannot be explained by cell state signatures;
        # these counts are distributed equally between all spots (e.g. background, free-floating RNA)
        self.gene_add_hyp = pm.Gamma('gene_add_hyp', 1, 1, shape=2)
        self.gene_add = pm.Gamma('gene_add', self.gene_add_hyp[0], self.gene_add_hyp[1],
                                 shape=(self.n_var, 1))

        # =====================Gene-specific overdispersion======================= #
        self.phi_hyp = pm.Gamma('phi_hyp', mu=phi_hyp_prior['mean'],
                                sigma=phi_hyp_prior['sd'], shape=(1, 1))
        self.gene_E = pm.Exponential('gene_E', self.phi_hyp, shape=(self.n_var, 1))

        # =====================Expected expression======================= #
        # expected expression
        self.mu_biol = pm.math.dot(self.spot_factors, self.gene_factors.T) * self.gene_level.T \
                       + self.gene_add.T + self.spot_add
        # tt.printing.Print('mu_biol')(self.mu_biol.shape)

        # =====================DATA likelihood======================= #
        # Likelihood (sampling distribution) of observations; overdispersion added
        # via NegativeBinomial / Poisson
        self.data_target = pm.NegativeBinomial('data_target', mu=self.mu_biol,
                                               alpha=1 / (self.gene_E.T * self.gene_E.T),
                                               observed=self.x_data,
                                               total_size=self.X_data.shape)

        # =====================Compute nUMI from each factor in spots======================= #
        self.nUMI_factors = pm.Deterministic(
            'nUMI_factors',
            (self.spot_factors * (self.gene_factors * self.gene_level).sum(0)))
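The shape/rate algebra at the top of the model block is standard moment matching for a Gamma distribution: shape = mean**2 / sd**2 and rate = mean / sd**2. A standalone check with the default gene_level_prior values:

from scipy.stats import gamma

mean, sd = 1 / 2, 1 / 4
shape = mean ** 2 / sd ** 2   # 4.0
rate = mean / sd ** 2         # 8.0
d = gamma(a=shape, scale=1 / rate)
print(d.mean(), d.std())      # 0.5 0.25 -- recovers the requested mean and sd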
def __init__(
        self,
        cell_state_mat: np.ndarray,
        X_data: np.ndarray,
        Y_data: np.ndarray,
        n_comb: int = 50,
        data_type: str = 'float32',
        n_iter=20000,
        learning_rate=0.005,
        total_grad_norm_constraint=200,
        verbose=True,
        var_names=None, var_names_read=None,
        obs_names=None, fact_names=None, sample_id=None,
        gene_level_prior={'mean': 1 / 2, 'sd': 1 / 4, 'sample_alpha': 20},
        gene_level_var_prior={'mean_var_ratio': 1},
        cell_number_prior={'cells_per_spot': 8,
                           'factors_per_spot': 7,
                           'combs_per_spot': 2.5},
        cell_number_var_prior={'cells_mean_var_ratio': 1,
                               'factors_mean_var_ratio': 1,
                               'combs_mean_var_ratio': 1},
        phi_hyp_prior={'mean': 3, 'sd': 1},
        spot_fact_mean_var_ratio=0.5
):

    ############# Initialise parameters ################
    super().__init__(cell_state_mat, X_data,
                     data_type, n_iter,
                     learning_rate, total_grad_norm_constraint,
                     verbose, var_names, var_names_read,
                     obs_names, fact_names, sample_id)

    self.Y_data = Y_data
    self.n_npro = Y_data.shape[1]
    self.y_data = theano.shared(Y_data.astype(self.data_type))
    self.n_rois = Y_data.shape[0]
    self.n_genes = X_data.shape[1]
    # Total number of gene counts in each region of interest, divided by 10^5:
    self.l_r = np.array([np.sum(X_data[i, :])
                         for i in range(self.n_rois)]).reshape(self.n_rois, 1) * 10 ** (-5)

    for k in gene_level_var_prior.keys():
        gene_level_prior[k] = gene_level_var_prior[k]
    self.gene_level_prior = gene_level_prior
    self.phi_hyp_prior = phi_hyp_prior
    self.n_comb = n_comb
    self.spot_fact_mean_var_ratio = spot_fact_mean_var_ratio

    cell_number_prior['factors_per_combs'] = (cell_number_prior['factors_per_spot']
                                              / cell_number_prior['combs_per_spot'])
    for k in cell_number_var_prior.keys():
        cell_number_prior[k] = cell_number_var_prior[k]
    self.cell_number_prior = cell_number_prior

    # generate one-hot encoded design matrix for samples
    self.spot2sample_df = pd.get_dummies(sample_id)
    # convert to np.ndarray
    self.spot2sample_mat = self.spot2sample_df.values
    self.n_exper = self.spot2sample_mat.shape[1]
    # assign extra data to a dictionary with (1) shared parameters (2) input data
    self.extra_data_tt = {'spot2sample': theano.shared(self.spot2sample_mat.astype(self.data_type))}
    self.extra_data = {'spot2sample': self.spot2sample_mat.astype(self.data_type)}

    ############# Define the model ################
    self.model = pm.Model()

    with self.model:

        # ============================ Negative Probe Binding ===================== #
        # Negative probe counts scale linearly with the total number of counts in a region
        # of interest. The linear slope is drawn from a gamma distribution whose mean and
        # variance are inferred from the data and shared with the non-specific binding
        # term for gene probes further below.
        self.b_n_hyper = pm.Gamma('b_n_hyper', alpha=np.array((3, 1)),
                                  beta=np.array((1, 1)), shape=2)
        self.b_n = pm.Gamma('b_n', mu=self.b_n_hyper[0], sigma=self.b_n_hyper[1],
                            shape=(self.n_exper, self.n_npro))
        self.y_rn = pm.math.dot(self.extra_data_tt['spot2sample'], self.b_n) * self.l_r

        # ===================== Non-specific binding additive component ======================= #
        # Additive terms for non-specific binding of gene probes are drawn from a gamma
        # distribution with the same mean and variance as for the negative probes above.
        self.gene_add = pm.Gamma('gene_add', mu=self.b_n_hyper[0], sigma=self.b_n_hyper[1],
                                 shape=(self.n_exper, self.n_genes))

        # =====================Gene expression level scaling======================= #
        # Explains differences in expression between genes and how expression
        # differs between single-cell and spatial technologies

        # compute hyperparameters from mean and sd
        shape = gene_level_prior['mean'] ** 2 / gene_level_prior['sd'] ** 2
        rate = gene_level_prior['mean'] / gene_level_prior['sd'] ** 2
        shape_var = shape / gene_level_prior['mean_var_ratio']
        rate_var = rate / gene_level_prior['mean_var_ratio']
        self.gene_level_alpha_hyp = pm.Gamma('gene_level_alpha_hyp',
                                             mu=shape, sigma=np.sqrt(shape_var),
                                             shape=(1, 1))
        self.gene_level_beta_hyp = pm.Gamma('gene_level_beta_hyp',
                                            mu=rate, sigma=np.sqrt(rate_var),
                                            shape=(1, 1))

        # global per-gene sensitivity, including platform effect
        self.gene_level = pm.Gamma('gene_level', self.gene_level_alpha_hyp,
                                   self.gene_level_beta_hyp, shape=(1, self.n_genes))
        # independent experiment-specific effect on each gene (narrow prior around 1)
        self.gene_level_independent = pm.Gamma('gene_level_independent', 100, 100,
                                               shape=(self.n_exper, self.n_genes))
        # experiment-specific capture efficiency (wide prior around 1)
        self.gene_level_e = pm.Gamma('gene_level_e',
                                     gene_level_prior['sample_alpha'],
                                     gene_level_prior['sample_alpha'],
                                     shape=(self.n_exper, 1))

        self.gene_factors = pm.Deterministic('gene_factors', self.cell_state)

        # =====================Spot factors======================= #
        # prior on spot factors reflects the number of cells, the fraction of their cytoplasm
        # captured, and heterogeneity in the total amount of mRNA between individual cells
        # within each cell type
        self.cells_per_spot = pm.Gamma('cells_per_spot',
                                       mu=cell_number_prior['cells_per_spot'],
                                       sigma=np.sqrt(cell_number_prior['cells_per_spot']
                                                     / cell_number_prior['cells_mean_var_ratio']),
                                       shape=(self.n_rois, 1))
        self.comb_per_spot = pm.Gamma('combs_per_spot',
                                      mu=cell_number_prior['combs_per_spot'],
                                      sigma=np.sqrt(cell_number_prior['combs_per_spot']
                                                    / cell_number_prior['combs_mean_var_ratio']),
                                      shape=(self.n_rois, 1))

        shape = self.comb_per_spot / np.array(self.n_comb).reshape((1, 1))
        rate = tt.ones((1, 1)) / self.cells_per_spot * self.comb_per_spot
        self.combs_factors = pm.Gamma('combs_factors', alpha=shape, beta=rate,
                                      shape=(self.n_rois, self.n_comb))

        self.factors_per_combs = pm.Gamma('factors_per_combs',
                                          mu=cell_number_prior['factors_per_combs'],
                                          sigma=np.sqrt(cell_number_prior['factors_per_combs']
                                                        / cell_number_prior['factors_mean_var_ratio']),
                                          shape=(self.n_comb, 1))
        c2f_shape = self.factors_per_combs / np.array(self.n_fact).reshape((1, 1))
        self.comb2fact = pm.Gamma('comb2fact', alpha=c2f_shape, beta=self.factors_per_combs,
                                  shape=(self.n_comb, self.n_fact))

        self.spot_factors = pm.Gamma('spot_factors',
                                     mu=pm.math.dot(self.combs_factors, self.comb2fact),
                                     sigma=pm.math.sqrt(pm.math.dot(self.combs_factors, self.comb2fact)
                                                        / self.spot_fact_mean_var_ratio),
                                     shape=(self.n_rois, self.n_fact))

        # =====================Spot-specific additive component======================= #
        # molecule contribution that cannot be explained by cell state signatures;
        # these counts are distributed between all genes, not just expressed genes
        self.spot_add_hyp = pm.Gamma('spot_add_hyp', 1, 1, shape=2)
        self.spot_add = pm.Gamma('spot_add', self.spot_add_hyp[0], self.spot_add_hyp[1],
                                 shape=(self.n_rois, 1))

        # =====================Gene-specific overdispersion======================= #
        self.phi_hyp = pm.Gamma('phi_hyp', mu=phi_hyp_prior['mean'],
                                sigma=phi_hyp_prior['sd'], shape=(1, 1))
        self.gene_E = pm.Exponential('gene_E', self.phi_hyp,
                                     shape=(self.n_exper, self.n_genes))

        # =====================Expected expression======================= #
        # Expected counts for negative probes and gene probes concatenated into one array.
        # Note that non-specific binding scales linearly with the total number of counts
        # (l_r) in this model.
        mu_biol = pm.math.dot(self.spot_factors, self.gene_factors.T) \
                  * self.gene_level \
                  * pm.math.dot(self.extra_data_tt['spot2sample'], self.gene_level_e) \
                  * pm.math.dot(self.extra_data_tt['spot2sample'], self.gene_level_independent) \
                  + pm.math.dot(self.extra_data_tt['spot2sample'], self.gene_add) * self.l_r \
                  + self.spot_add
        self.mu_biol = tt.concatenate([self.y_rn, mu_biol], axis=1)

        # =====================DATA likelihood======================= #
        # Likelihood (sampling distribution) of observations; overdispersion added
        # via NegativeBinomial / Poisson
        self.data_target = pm.NegativeBinomial(
            'data_target', mu=self.mu_biol,
            alpha=tt.concatenate([np.full((self.n_rois, self.n_npro), 10 ** 10),
                                  pm.math.dot(self.extra_data_tt['spot2sample'],
                                              1 / (self.gene_E * self.gene_E))], axis=1),
            observed=tt.concatenate([self.y_data, self.x_data], axis=1))

        # =====================Compute nUMI from each factor in spots======================= #
        self.nUMI_factors = pm.Deterministic(
            'nUMI_factors',
            (self.spot_factors * (self.gene_factors * self.gene_level.T).sum(0)))
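In the likelihood above, the negative probes get alpha = 10**10: since the NegativeBinomial(mu, alpha) variance is mu + mu**2 / alpha, a very large alpha collapses the distribution to (effectively) Poisson, so the negative-probe counts are modelled without extra overdispersion. A standalone check in scipy's (n, p) convention:

from scipy.stats import nbinom

mu, alpha = 20.0, 1e10
n, p = alpha, alpha / (alpha + mu)
mean, var = nbinom.stats(n, p, moments='mv')
print(mean, var)  # both ~ 20, i.e. Poisson-like mean == variance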