def run_factorization(self, N, S, X, K, num_cov, k, n):
    """Fit a beta-binomial matrix factorization with ADVI and save the results.

    Parameters (shapes inferred from usage in this method):
        N: number of samples (rows of k/n).
        S: number of sites (columns of k/n).
        X: (N, num_cov) covariate matrix.
        K: latent factorization dimension.
        num_cov: number of covariates.
        k: (N, S) observed allelic counts (may contain entries where n == 0).
        n: (N, S) total counts; NaNs in k/n mark missing observations.

    Side effects: pickles the fitted ADVI approximation and writes U, V and
    BETA posterior-mean matrices as tab-delimited text files prefixed by
    self.output_root.
    """
    # Smart initialization: per-site method-of-moments estimates of the
    # beta-binomial concentration and mean, clipped to safe ranges.
    rat = k/n
    nans = np.isnan(rat)
    conc_inits = np.zeros((1, S))
    beta_inits = np.zeros((num_cov, S))
    for index_s in range(S):
        column_rat = rat[:, index_s]
        column_nans = np.isnan(column_rat)
        valid_rat = column_rat[~column_nans]
        # 1/variance as a concentration proxy, capped at 1000.
        conc_init = min(1.0/np.var(valid_rat), 1000.0)
        # Mean clipped away from 0 and 1 so the logit below is finite.
        m_init = min(max(np.mean(valid_rat), 1.0/1000), 1.0-(1.0/1000))
        conc_inits[0, index_s] = conc_init
        # NOTE(review): only row 0 of beta_inits (intercept-style) is filled;
        # rows 1..num_cov-1 stay zero — confirm this is intended.
        beta_inits[0, index_s] = np.log(m_init/(1.0-m_init))
    # Run bb-mf (beta-binomial matrix factorization).
    with pm.Model() as bb_glm:
        CONC = pm.Gamma('CONC', alpha=1e-4, beta=1e-4, shape=(1,S), testval=conc_inits)
        BETA = pm.Normal('BETA', mu=0, tau=(1/1000000.0), shape=(S, num_cov), testval=beta_inits.T)
        U = pm.Normal('U', mu=0, tau=(1/1000.0), shape=(N, K), testval=np.random.randn(N, K))
        V = pm.Normal('V', mu=0, tau=(1/1000.0), shape=(S, K), testval=np.random.randn(S, K))
        # Success probability: covariate effects plus low-rank interaction.
        p = pm.math.invlogit(pm.math.dot(X, BETA.T) + pm.math.dot(U,V.T))
        # Broadcast the (1, S) concentration row to all N samples.
        conc_mat = pm.math.dot(np.ones((N,1)), CONC)
        # Likelihood over observed (non-NaN) entries only.
        R = pm.BetaBinomial('like', alpha=(p*conc_mat)[~nans], beta=((1.0-p)*conc_mat)[~nans], n=n[~nans], observed=k[~nans])
        approx = pm.fit(method='advi', n=30000)
    pickle.dump(approx, open(self.output_root + '_model', 'wb'))
    #approx = pickle.load( open(self.output_root + '_model', "rb" ) )
    # Posterior means of the variational approximation, keyed by RV name.
    means_dict = approx.bij.rmap(approx.params[0].eval())
    np.savetxt(self.output_root + '_temper_U.txt', (means_dict['U']), fmt="%s", delimiter='\t')
    np.savetxt(self.output_root + '_temper_V.txt', (means_dict['V'].T), fmt="%s", delimiter='\t')
    np.savetxt(self.output_root + '_temper_BETA.txt', (means_dict['BETA'].T), fmt="%s", delimiter='\t')
def run_factorization(self):
    """Fit the beta-binomial factorization with a per-individual random effect.

    Reads from self: allelic_counts/total_counts (N x S count matrices),
    cov (N x num_cov covariates), Z (length-N individual index into [0, I)),
    and the *_init attributes used as ADVI starting values.

    Side effects: pickles the fitted approximation and writes posterior means
    of U, V, BETA, CONC, A, MU_A, SIGMA_A plus the ELBO history to
    tab-delimited files prefixed by self.output_root.
    """
    rat = self.allelic_counts/self.total_counts
    nans = np.isnan(rat)  # NaN ratio marks a missing observation
    # Run bb-mf
    with pm.Model() as bb_glm:
        CONC = pm.HalfCauchy('CONC', beta=5, shape=(1,self.S), testval=self.conc_init)
        BETA = pm.Normal('BETA', mu=0, tau=(1/1000000.0), shape=(self.S, self.num_cov), testval=self.beta_init)
        U = pm.Normal('U', mu=0, tau=(1.0/100.0), shape=(self.N, self.K), testval=self.U_init)
        V = pm.Normal('V', mu=0, tau=(1.0/100.0), shape=(self.S, self.K), testval=self.V_init)
        # Hierarchical per-individual intercept A with per-site mean/scale.
        MU_A = pm.Normal("MU_A", mu=0., sd=100**2, shape=(1,self.S), testval=self.mu_a_init)
        SIGMA_A = pm.HalfCauchy("SIGMA_A", beta=5.0, shape=(1,self.S), testval=self.sigma_a_init)
        # Broadcast the (1, S) hyperparameters to all I individuals.
        mu_a_mat = pm.math.dot(np.ones((self.I,1)), MU_A)
        sigma_a_mat = pm.math.dot(np.ones((self.I,1)), SIGMA_A)
        A = pm.Normal('A', mu=mu_a_mat, sigma=sigma_a_mat, shape=(self.I,self.S), testval=self.A_init)
        # A[self.Z, :] maps each sample to its individual's random effect.
        p = pm.math.invlogit(pm.math.dot(self.cov, BETA.T) + pm.math.dot(U,V.T) + A[self.Z,:])
        conc_mat = pm.math.dot(np.ones((self.N,1)), CONC)
        # Likelihood over observed (non-NaN) entries only.
        R = pm.BetaBinomial('like', alpha=(p*conc_mat)[~nans], beta=((1.0-p)*conc_mat)[~nans], n=self.total_counts[~nans], observed=self.allelic_counts[~nans])
        approx = pm.fit(method='advi', n=1000)
    pickle.dump(approx, open(self.output_root + '_model', 'wb'))
    #approx = pickle.load( open(self.output_root + '_model', "rb" ) )
    means_dict = approx.bij.rmap(approx.params[0].eval())
    np.savetxt(self.output_root + '_temper_U.txt', (means_dict['U']), fmt="%s", delimiter='\t')
    np.savetxt(self.output_root + '_temper_V.txt', (means_dict['V'].T), fmt="%s", delimiter='\t')
    np.savetxt(self.output_root + '_temper_BETA.txt', (means_dict['BETA'].T), fmt="%s", delimiter='\t')
    # CONC / SIGMA_A were fit on the log scale; exponentiate to recover them.
    np.savetxt(self.output_root + '_temper_CONC.txt', np.exp(means_dict['CONC_log__']), fmt="%s", delimiter='\t')
    np.savetxt(self.output_root + '_temper_A.txt', (means_dict['A']), fmt="%s", delimiter='\t')
    np.savetxt(self.output_root + '_temper_MU_A.txt', (means_dict['MU_A']), fmt="%s", delimiter='\t')
    np.savetxt(self.output_root + '_temper_SIGMA_A.txt', np.exp(means_dict['SIGMA_A_log__']), fmt="%s", delimiter='\t')
    np.savetxt(self.output_root + '_temper_ELBO.txt', approx.hist, fmt="%s", delimiter='\t')
def build_biallelic_model3(g, n, s):
    """Build (but do not fit) a PyMC3 model for biallelic count deconvolution.

    Parameters — presumably g = number of loci, n = number of samples,
    s = number of latent strains/components (TODO confirm against callers).
    Returns the constructed pm.Model; data and hyperparameters are exposed as
    pm.Data so callers can set them after construction.
    """
    # EXPERIMENTAL: Observations overdispersed as a BetaBinom w/ concentrations
    # 10.
    a = 2  # number of alleles (biallelic)
    with pm.Model() as model:
        # Fraction: per-sample mixing proportions over the s components.
        pi = pm.Dirichlet(
            'pi',
            a=np.ones(s),
            shape=(n, s),
            transform=stick_breaking,
        )
        # Penalty strengths default to 0 (inactive) until set by the caller.
        pi_hyper = pm.Data('pi_hyper', value=0.0)
        pm.Potential('heterogeneity_penalty',
                     -(pm.math.sqrt(pi).sum(0).sum()**2) * pi_hyper)
        rho_hyper = pm.Data('rho_hyper', value=0.0)
        pm.Potential('diversity_penalty',
                     -(pm.math.sqrt(pi.sum(0)).sum()**2) * rho_hyper)
        # Genotype: one free parameter per (locus, strain); the second allele
        # probability is the complement, reshaped to (g, s, a).
        gamma_ = pm.Uniform('gamma_', 0, 1, shape=(g * s, 1))
        gamma = pm.Deterministic(
            'gamma',
            (pm.math.concatenate([gamma_, 1 - gamma_], axis=1).reshape(
                (g, s, a))))
        gamma_hyper = pm.Data('gamma_hyper', value=0.0)
        pm.Potential(
            'ambiguity_penalty',
            -(pm.math.sqrt(gamma).sum(2)**2).sum(0).sum(0) * gamma_hyper)
        # Product of fraction and genotype
        true_p = pm.Deterministic('true_p', pm.math.dot(pi, gamma))
        # Sequencing error: per-sample error rate mixes in a uniform base prob.
        epsilon_hyper = pm.Data('epsilon_hyper', value=100)
        epsilon = pm.Beta('epsilon', alpha=2, beta=epsilon_hyper, shape=n)
        epsilon_ = epsilon.reshape((n, 1, 1))
        err_base_prob = tt.ones((n, g, a)) / a
        p_with_error = (true_p * (1 - epsilon_)) + (err_base_prob * epsilon_)
        # Observation: flatten to (n*g, a) and model the first-allele count.
        _p = p_with_error.reshape((-1, a))[:, 0]
        # Overdispersion term
        # alpha = pm.Gamma('alpha', mu=100, sigma=5)
        # TODO: Figure out how to also fit this term.
        # FIXME: Do I want the default to be a valid value?
        # Realistic or close to asymptotic?
        alpha = pm.Data('alpha', value=1000)
        # Placeholder observations; caller replaces via pm.set_data.
        observed = pm.Data('observed', value=np.empty((g * n, a)))
        pm.BetaBinomial('data',
                        alpha=_p * alpha,
                        beta=(1 - _p) * alpha,
                        n=observed.reshape((-1, a)).sum(1),
                        observed=observed[:, 0])
    return model
def test_pymc3_convert_dists():
    """Just a basic check that all PyMC3 RVs will convert to and from Theano RVs."""
    tt.config.compute_test_value = "ignore"
    theano.config.cxx = ""  # avoid C compilation during the test
    with pm.Model() as model:
        norm_rv = pm.Normal("norm_rv", 0.0, 1.0, observed=1.0)
        mvnorm_rv = pm.MvNormal("mvnorm_rv", np.r_[0.0], np.c_[1.0], shape=1, observed=np.r_[1.0])
        cauchy_rv = pm.Cauchy("cauchy_rv", 0.0, 1.0, observed=1.0)
        halfcauchy_rv = pm.HalfCauchy("halfcauchy_rv", 1.0, observed=1.0)
        uniform_rv = pm.Uniform("uniform_rv", observed=1.0)
        gamma_rv = pm.Gamma("gamma_rv", 1.0, 1.0, observed=1.0)
        invgamma_rv = pm.InverseGamma("invgamma_rv", 1.0, 1.0, observed=1.0)
        exp_rv = pm.Exponential("exp_rv", 1.0, observed=1.0)
        halfnormal_rv = pm.HalfNormal("halfnormal_rv", 1.0, observed=1.0)
        beta_rv = pm.Beta("beta_rv", 2.0, 2.0, observed=1.0)
        binomial_rv = pm.Binomial("binomial_rv", 10, 0.5, observed=5)
        dirichlet_rv = pm.Dirichlet("dirichlet_rv", np.r_[0.1, 0.1], observed=np.r_[0.1, 0.1])
        poisson_rv = pm.Poisson("poisson_rv", 10, observed=5)
        bernoulli_rv = pm.Bernoulli("bernoulli_rv", 0.5, observed=0)
        betabinomial_rv = pm.BetaBinomial("betabinomial_rv", 0.1, 0.1, 10, observed=5)
        categorical_rv = pm.Categorical("categorical_rv", np.r_[0.5, 0.5], observed=1)
        multinomial_rv = pm.Multinomial("multinomial_rv", 5, np.r_[0.5, 0.5], observed=np.r_[2])

    # Convert to a Theano `FunctionGraph`
    fgraph = model_graph(model)

    rvs_by_name = {
        n.owner.inputs[1].name: n.owner.inputs[1]
        for n in fgraph.outputs
    }

    pymc_rv_names = {n.name for n in model.observed_RVs}
    assert all(
        isinstance(rvs_by_name[n].owner.op, RandomVariable)
        for n in pymc_rv_names)

    # Now, convert back to a PyMC3 model
    pymc_model = graph_model(fgraph)

    new_pymc_rv_names = {n.name for n in pymc_model.observed_RVs}
    # BUG FIX: this comparison was previously a bare expression whose result
    # was discarded, so the round-trip was never actually verified.
    assert pymc_rv_names == new_pymc_rv_names
def run_ppca_initialization(self):
    """Initialize factorization parameters via PPCA plus a short ADVI fit.

    First fits PPCA on scaled, cell-line-regressed allelic ratios to set
    self.U_init, then fits the model with U fixed to that value to initialize
    the remaining parameters (BETA, V, A, MU_A, SIGMA_A, CONC) on self.
    Side effect: pickles the initialization approximation to
    self.output_root + '_model_init'.
    """
    print('Starting PPCA initialization')
    rat = self.allelic_counts/self.total_counts
    nans = np.isnan(rat)
    # Standardize ratios, remove cell-line effects, and re-standardize the
    # residuals before running PPCA.
    scaled_rat = scale_allelic_ratios(rat)
    scaled_residual_rat = regress_out_cell_line(scaled_rat, self.Z)
    rescaled_residual_rat = scale_allelic_ratios(scaled_residual_rat)
    ppca = PPCA()
    # PPCA handles the remaining missing values; fit K components.
    ppca.fit(data=np.transpose(rescaled_residual_rat), d=self.K, verbose=True, tol=1e-6)
    # Normalize loadings to unit overall scale for use as U's fixed value.
    self.U_init = ppca.C/np.std(ppca.C)
    # Run bb-mf with U held fixed at the PPCA loadings (note: U is not a
    # random variable here — see the commented-out line below).
    with pm.Model() as bb_glm_init:
        CONC = pm.HalfCauchy('CONC', beta=5, shape=(1,self.S), testval=self.conc_init)
        BETA = pm.Normal('BETA', mu=0, tau=(1/1000000.0), shape=(self.S, self.num_cov), testval=self.beta_init)
        #U = pm.Normal('U', mu=0, tau=(1.0/1.0), shape=(N, K), testval=self.U_init)
        V = pm.Normal('V', mu=0, tau=(1.0/1.0), shape=(self.S, self.K), testval=np.zeros(self.V_init.shape))
        MU_A = pm.Normal("MU_A", mu=0., sd=100**2, shape=(1,self.S), testval=self.mu_a_init)
        SIGMA_A = pm.HalfCauchy("SIGMA_A", beta=5.0, shape=(1,self.S), testval=self.sigma_a_init)
        mu_a_mat = pm.math.dot(np.ones((self.I,1)), MU_A)
        sigma_a_mat = pm.math.dot(np.ones((self.I,1)), SIGMA_A)
        A = pm.Normal('A', mu=mu_a_mat, sigma=sigma_a_mat, shape=(self.I,self.S), testval=self.A_init)
        p = pm.math.invlogit(pm.math.dot(self.cov, BETA.T) + pm.math.dot(self.U_init,V.T) + A[self.Z,:])
        conc_mat = pm.math.dot(np.ones((self.N,1)), CONC)
        R = pm.BetaBinomial('like', alpha=(p*conc_mat)[~nans], beta=((1.0-p)*conc_mat)[~nans], n=self.total_counts[~nans], observed=self.allelic_counts[~nans])
        approx_init = pm.fit(method='advi', n=2000)
    pickle.dump(approx_init, open(self.output_root + '_model_init', 'wb'))
    # Store posterior means as initializations for the main model fit.
    init_dict = approx_init.bij.rmap(approx_init.params[0].eval())
    self.beta_init = init_dict['BETA']
    self.A_init = init_dict['A']
    # Log-scale parameters are exponentiated back to their natural scale.
    self.sigma_a_init = np.exp(init_dict['SIGMA_A_log__'])
    self.mu_a_init = init_dict['MU_A']
    self.conc_init = np.exp(init_dict['CONC_log__'])
    self.V_init = init_dict['V']
    print('Smart PPCA complete')
def mcmc_sample(self, data, bin_width):
    """Sample per-protein beta-binomial rates with MCMC.

    Builds a hierarchical model (bounded-normal mean μ, exponential
    concentration κ with a Gamma-distributed rate τ shared across proteins)
    over the counts in data[self.channel] out of data['sum'], records draws
    of μ into a histogram backend, and returns the proteins together with
    the histogram of μ.
    """
    protein_names, protein_idx = get_proteins_and_indices(data)
    n_proteins = len(protein_names)
    with pm.Model():
        tau_rv = pm.Gamma('τ', alpha=7.5, beta=1)
        # Normal restricted to [0, 1] so μ is a valid success probability.
        unit_bounded_normal = pm.Bound(pm.Normal, lower=0, upper=1)
        mu_rv = unit_bounded_normal('μ', mu=0.5, sigma=1, shape=n_proteins)
        kappa_rv = pm.Exponential('κ', tau_rv, shape=n_proteins)
        mu_sel = mu_rv[protein_idx]
        kappa_sel = kappa_rv[protein_idx]
        pm.BetaBinomial('y',
                        alpha=mu_sel * kappa_sel,
                        beta=(1.0 - mu_sel) * kappa_sel,
                        n=data['sum'],
                        observed=data[self.channel])
        # Accumulate μ draws as a histogram instead of a full trace;
        # the tuning draws are dropped.
        histogram_trace = hist_backend.Histogram(vars=[mu_rv],
                                                 bin_width=bin_width,
                                                 remove_first=self.tuning)
        pm.sample(draws=self.samples,
                  tune=self.tuning,
                  chains=self.chains,
                  cores=get_num_cores(),
                  progressbar=sys.stdout.isatty(),
                  compute_convergence_checks=False,
                  trace=histogram_trace)
    return protein_names, histogram_trace.hist['μ']
def lohhla_clone_model(sample_ids, tree_edges, clonal_prevalence_mat, cellularity,
                       ploidy_values, tumour_sample_reads, normal_sample_reads,
                       integercpn_info, all_genotypes, transition_inputs,
                       stayrate_alpha=0.9, stayrate_beta=0.1, sd=0.5, nb_alpha=0.5,
                       iter_count=20000, tune_iters=20000, anchor_type='nb',
                       anchor_mode='snvcn', nchains=2, njobs=2):
    '''
    Build and sample an HLA clone-genotype model over a clone tree.

    stayrate_alpha: Beta prior alpha-parameter on stayrate in clone tree Markov chain
    stayrate_beta: Beta prior beta-parameter on stayrate in clone tree Markov chain
    all_genotypes: Dataframe of genotypes, 0-indexed

    Returns the MCMC trace from pm.sample.
    '''
    num_nodes = clonal_prevalence_mat.shape[1]
    valid_transitions = transition_inputs['valid_transitions']
    num_transitions = transition_inputs['num_transitions']
    num_genotypes = transition_inputs['num_genotypes']
    cn_genotype_matrix = transition_inputs['cn_genotype_matrix']

    ## Beta-binomial dispersion (higher = less dispersed)
    dispersion = 200.

    ## Tree edges (convert from 1-indexed to 0-indexed).
    # FIX: DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in
    # 1.0; .values is the drop-in equivalent.
    edges = tree_edges.values.astype(int) - 1

    with pm.Model() as model:
        BoundedNormal = pm.Bound(pm.Normal, lower=0., upper=1.)
        stay_rate = BoundedNormal('stayrate', mu=0.75, sd=0.4)

        # Transition matrix: stay_rate on the diagonal, remaining mass spread
        # uniformly over the valid transitions; state 0 is absorbing.
        P = np.zeros(shape=(num_genotypes, num_genotypes))
        P = P + tt.eye(num_genotypes) * stay_rate
        fill_values = tt.as_tensor((1. - stay_rate) / num_transitions)
        fill_values = tt.set_subtensor(fill_values[0], 0)
        P = P + valid_transitions * fill_values[:, np.newaxis]
        P = tt.set_subtensor(P[0, 0], 1.)

        A = tt.dmatrix('A')
        # Uniform prior over genotypes at the root.
        PA = tt.ones(shape=(num_genotypes)) / num_genotypes
        states = CloneTreeGenotypes('genotypes', PA=PA, P=P, edges=edges,
                                    k=num_genotypes, shape=(num_nodes))

        total_cns = theano.shared(np.array(all_genotypes['total_cn'].values))
        alt_cns = theano.shared(np.array(all_genotypes['alt_cn'].values))

        total_cn = pm.Deterministic('total_cn', total_cns[states])
        alt_cn = pm.Deterministic('alt_cn', alt_cns[states])

        # Expected alt-allele copies per sample, mixing tumour and normal
        # (normal contributes 1 alt copy, 2 total copies).
        sample_alt_copies = tt.dot(clonal_prevalence_mat, alt_cn) * cellularity + (1. - cellularity) * 1.
        vafs = sample_alt_copies / (tt.dot(clonal_prevalence_mat, total_cn) * cellularity + (1. - cellularity) * 2.)
        pm.Deterministic('vafs', vafs)

        alphas = vafs * dispersion
        betas = (1 - vafs) * dispersion

        ## Copy number of tumour cells (aggregated over clones, but not including normal contamination)
        tutotalcn = pm.Deterministic('tutotalcn', tt.dot(clonal_prevalence_mat, total_cn))

        ## Can't be vectorized further
        for j in range(len(sample_ids)):
            current_sample = sample_ids[j]
            total_counts = integercpn_info['TumorCov_type1'][current_sample].values + integercpn_info['TumorCov_type2'][current_sample].values
            alt_counts = integercpn_info['TumorCov_type2'][current_sample].values
            alpha_sel = alphas[j]
            beta_sel = betas[j]
            ## Draw alternative allele counts for HLA locus for each polymorphic site
            alt_reads = pm.BetaBinomial('x_' + str(j),
                                        alpha=alpha_sel,
                                        beta=beta_sel,
                                        n=total_counts,
                                        observed=alt_counts)

            mult_factor_mean = (tumour_sample_reads[current_sample] / normal_sample_reads)
            ploidy = ploidy_values[j]
            ploidy_ratio = (tutotalcn[j] * cellularity[j] + (1 - cellularity[j]) * 2) / (
                cellularity[j] * ploidy + (1 - cellularity[j]) * 2)

            if anchor_mode == 'snvcn':
                mult_factor_computed = pm.Deterministic(
                    'mult_factor_computed_' + str(j),
                    1. / ploidy_ratio *
                    (integercpn_info['Total_TumorCov'][current_sample].values /
                     integercpn_info['Total_NormalCov'][current_sample].values))
                nloci = len(integercpn_info['Total_TumorCov'][current_sample].values)
                tumour_reads_observed = integercpn_info['Total_TumorCov'][current_sample].values
                normal_reads_observed = integercpn_info['Total_NormalCov'][current_sample].values
            elif anchor_mode == 'binmedian':
                binvar_tumour = 'combinedBinTumor'
                binvar_normal = 'combinedBinNormal'
                ## All within a bin are the same, so this is OK
                duplicated_entries = integercpn_info['binNum'][current_sample].duplicated(keep='first')
                nloci = len(integercpn_info[binvar_tumour][current_sample][~duplicated_entries].values)
                mult_factor_computed = pm.Deterministic(
                    'mult_factor_computed_' + str(j),
                    (1. / ploidy_ratio *
                     (integercpn_info[binvar_tumour][current_sample][~duplicated_entries].values /
                      integercpn_info[binvar_normal][current_sample][~duplicated_entries].values)))
                tumour_reads_observed = integercpn_info[binvar_tumour][current_sample][~duplicated_entries].values
                normal_reads_observed = integercpn_info[binvar_normal][current_sample][~duplicated_entries].values
            else:
                raise Exception("Invalid option specified.")

            ## Draw ploidy-corrected tumour/normal locus coverage ratio for each polymorphic site
            if anchor_type == 'mult_factor':
                mult_factor = pm.Lognormal('mult_factor_' + str(j),
                                           mu=np.log(mult_factor_mean),
                                           sd=sd,
                                           observed=mult_factor_computed,
                                           shape=(nloci))
            elif anchor_type == 'nb':
                tc_nc_ratio = pm.Deterministic(
                    'tc_nc_ratio_' + str(j),
                    (tutotalcn[j] * cellularity[j] + (1 - cellularity[j]) * 2) /
                    (ploidy * cellularity[j] + (1 - cellularity[j]) * 2))
                tumoursamplecn = pm.Deterministic(
                    'tumoursamplecn_' + str(j),
                    (tutotalcn[j] * cellularity[j] + (1 - cellularity[j]) * 2))
                tumour_reads_mean = pm.Deterministic(
                    'tumour_reads_mean_' + str(j),
                    tc_nc_ratio * mult_factor_mean * normal_reads_observed)
                tumour_reads = pm.NegativeBinomial('tumour_reads_' + str(j),
                                                   mu=tumour_reads_mean,
                                                   alpha=nb_alpha,
                                                   observed=tumour_reads_observed)
            else:
                raise Exception('Must specify a valid model type.')

        pm.Deterministic('log_prob', model.logpt)
        # Discrete genotype states need a categorical Gibbs step; the
        # continuous stay rate gets plain Metropolis.
        step1 = pm.CategoricalGibbsMetropolis(vars=[states])
        step2 = pm.Metropolis(vars=[stay_rate])
        trace = pm.sample(iter_count,
                          tune=tune_iters,
                          step=[step1, step2],
                          njobs=njobs,
                          chains=nchains)
    return trace
plt.ylabel("Density") # %% admit_df = pd.read_csv("data/UCBadmit.csv", sep=";") # %% with pm.Model() as m11_5: a = pm.Normal("a", 0.0, 2.0) pbar = pm.Deterministic("pbar", pm.math.sigmoid(a)) theta = pm.Exponential("theta", 1.0) admit_obs = pm.BetaBinomial( "admit_obs", pbar * theta, (1.0 - pbar) * theta, admit_df.applications.values, observed=admit_df.admit.values, ) # %% with m11_5: trace_11_5 = pm.sample(1000, tune=1000) # %% pm.summary(trace_11_5).round(2) # %% np.percentile(trace_11_5["pbar"], [2.5, 50.0, 97.5])
def run_factorization(self, N, S, X, Z, I, K, num_cov, k, n):
    """Fit the factorization variant where loadings U are Dirichlet rows.

    Each row of U lies on the K-simplex (mixture-proportion interpretation).
    Parameters: N samples, S sites, X (N, num_cov) covariates, Z length-N
    individual index into [0, I), K latent factors, k/n (N, S) allelic/total
    counts. Writes U (back-transformed from stick-breaking space), U_init, V
    and BETA to files prefixed by self.output_root.
    """
    # Smart initialization (per-site moments, clipped; see sibling variants).
    rat = k / n
    nans = np.isnan(rat)
    conc_inits = np.zeros((1, S))
    beta_inits = np.zeros((num_cov, S))
    for index_s in range(S):
        column_rat = rat[:, index_s]
        column_nans = np.isnan(column_rat)
        valid_rat = column_rat[~column_nans]
        conc_init = min(1.0 / np.var(valid_rat), 1000.0)
        m_init = min(max(np.mean(valid_rat), 1.0 / 1000), 1.0 - (1.0 / 1000))
        conc_inits[0, index_s] = conc_init
        beta_inits[0, index_s] = np.log(m_init / (1.0 - m_init))
    # Random simplex initialization for U: each row normalized to sum to 1.
    U_init = np.random.rand(N, K)
    for n_iter in range(N):
        U_init[n_iter, :] = U_init[n_iter, :] / np.sum(U_init[n_iter, :])
    # Run bb-mf
    with pm.Model() as bb_glm:
        CONC = pm.HalfCauchy('CONC', beta=5, shape=(1, S), testval=conc_inits)
        BETA = pm.Normal('BETA', mu=0, tau=(1 / 1000000.0), shape=(S, num_cov), testval=beta_inits.T)
        #U = pm.Normal('U', mu=0, tau=(1/10000.0), shape=(N, K), testval=np.random.randn(N, K))
        U = pm.Dirichlet('U', a=np.ones(K) * 1.0, shape=(N, K), testval=U_init)
        V = pm.Normal('V', mu=0, tau=(1 / 10000.0), shape=(S, K), testval=np.random.randn(S, K))
        # Hierarchical per-individual intercept (see sibling variants).
        MU_A = pm.Normal("MU_A", mu=0., sd=100**2, shape=(1, S), testval=np.zeros((1, S)))
        SIGMA_A = pm.HalfCauchy("SIGMA_A", beta=5.0, shape=(1, S), testval=np.ones((1, S)))
        mu_a_mat = pm.math.dot(np.ones((I, 1)), MU_A)
        sigma_a_mat = pm.math.dot(np.ones((I, 1)), SIGMA_A)
        A = pm.Normal('A', mu=mu_a_mat, sigma=sigma_a_mat, shape=(I, S), testval=np.zeros((I, S)))
        p = pm.math.invlogit(
            pm.math.dot(X, BETA.T) + pm.math.dot(U, V.T) + A[Z, :])
        conc_mat = pm.math.dot(np.ones((N, 1)), CONC)
        R = pm.BetaBinomial('like',
                            alpha=(p * conc_mat)[~nans],
                            beta=((1.0 - p) * conc_mat)[~nans],
                            n=n[~nans],
                            observed=k[~nans])
        approx = pm.fit(method='advi', n=30000)
    pickle.dump(approx, open(self.output_root + '_model', 'wb'))
    #approx = pickle.load( open(self.output_root + '_model', "rb" ) )
    means_dict = approx.bij.rmap(approx.params[0].eval())
    # U was fit in stick-breaking-transformed space; map back to the simplex.
    U = backward_stickbreaking(means_dict['U_stickbreaking__'])
    np.savetxt(self.output_root + '_temper_U.txt', U, fmt="%s", delimiter='\t')
    np.savetxt(self.output_root + '_temper_U_init.txt', U_init, fmt="%s", delimiter='\t')
    np.savetxt(self.output_root + '_temper_V.txt', (means_dict['V'].T), fmt="%s", delimiter='\t')
    np.savetxt(self.output_root + '_temper_BETA.txt', (means_dict['BETA'].T), fmt="%s", delimiter='\t')
26, 24, 31, 25 ]) print('Length of array: ', y.size) q = 40 # How many total questions yp = y.astype(float) / q print('marks: ', y) print('Rates (yp): ', yp) # Priors for p e.g., (alpha1,beta1) = (2.0,5.0) yields maximum at p = 0.2 alpha1 = 2.0 beta1 = 5.0 # Prior for k e.g., combination of alpha2=beta2 yields maximum at 50% knowledge, magnitudes determine dispersion alpha2 = 3.3 beta2 = 3.3 # Number of iterations for MCMC niter = 50000 with pm.Model(): # context management # define priors p = pm.Beta('p', alpha=alpha1, beta=beta1) k = pm.BetaBinomial('k', alpha=alpha2, beta=beta2, n=y) # Likelihood (sampling distribution) of observations obs = pm.Binomial('obs', n=q - k, p=p, observed=y - k) # inference trace = pm.sample(niter, return_inferencedata=False) az.plot_trace(trace) az.plot_posterior(trace, hdi_prob=0.95)
def run_non_sparse_model_for_initialization(self):
    """Fit the non-sparse factorization with a short ADVI run and stash its
    posterior means on self (*_init attributes) as starting values for the
    sparse model. No files are written by this step."""
    allelic_ratio = self.allelic_counts / self.total_counts
    observed_mask = ~np.isnan(allelic_ratio)
    # Run bb-mf
    with pm.Model() as bb_glm:
        CONC = pm.HalfCauchy('CONC', beta=5, shape=(1, self.S),
                             testval=self.conc_init)
        BETA = pm.Normal('BETA', mu=0, tau=(1 / 1000000.0),
                         shape=(self.S, self.num_cov), testval=self.beta_init)
        U = pm.Normal('U', mu=0, tau=(1.0 / 1.0),
                      shape=(self.N, self.K), testval=self.U_init)
        V = pm.Normal('V', mu=0, tau=(1.0 / 1.0),
                      shape=(self.S, self.K), testval=self.V_init)
        MU_A = pm.Normal("MU_A", mu=0., sd=100**2, shape=(1, self.S),
                         testval=self.mu_a_init)
        SIGMA_A = pm.HalfCauchy("SIGMA_A", beta=5.0, shape=(1, self.S),
                                testval=self.sigma_a_init)
        # Tile the (1, S) hyperparameter rows across all I individuals.
        ones_individuals = np.ones((self.I, 1))
        mu_a_mat = pm.math.dot(ones_individuals, MU_A)
        sigma_a_mat = pm.math.dot(ones_individuals, SIGMA_A)
        A = pm.Normal('A', mu=mu_a_mat, sigma=sigma_a_mat,
                      shape=(self.I, self.S), testval=self.A_init)
        # Linear predictor: covariates + low-rank term + individual effect.
        linear_predictor = (pm.math.dot(self.cov, BETA.T)
                            + pm.math.dot(U, V.T)
                            + A[self.Z, :])
        p = pm.math.invlogit(linear_predictor)
        conc_mat = pm.math.dot(np.ones((self.N, 1)), CONC)
        R = pm.BetaBinomial('like',
                            alpha=(p * conc_mat)[observed_mask],
                            beta=((1.0 - p) * conc_mat)[observed_mask],
                            n=self.total_counts[observed_mask],
                            observed=self.allelic_counts[observed_mask],)
        approx = pm.fit(method='advi', n=10000)
    posterior_means = approx.bij.rmap(approx.params[0].eval())
    # Set initializations for sparse model to learned values from this
    # non-sparse model; log-scale params are mapped back to natural scale.
    self.conc_init = np.exp(posterior_means['CONC_log__'])
    self.beta_init = posterior_means['BETA']
    self.U_init = posterior_means['U']
    self.V_init = posterior_means['V']
    self.mu_a_init = posterior_means['MU_A']
    self.sigma_a_init = np.exp(posterior_means['SIGMA_A_log__'])
    self.A_init = posterior_means['A']
# Binning grids for diagnostics: log-spaced for frequency/total axes,
# linear for W counts.
freq_bins = np.logspace(-6, -3, 50, base=10)
tot_bins = np.logspace(4, 6.5, 50, base=10)
w_bins = np.linspace(0, 70, 50)

# Pairwise joint plots of the simulated columns.
g = _jointplot('W', 'what', sim, ybins=None, xbins=None)
g = _jointplot('W', 'sfreq', sim, ybins=None, xbins=None)
g = _jointplot('what', 'resid', sim, ybins=None, xbins=None)
g = _jointplot('W', 'resid', sim, ybins=None, xbins=None)

# Pooled beta-binomial fit: W successes out of M trials, with exponential
# priors scaled by the totals of the data.
with pm.Model() as model:
    alpha = pm.Exponential('alpha', 1 / sim['W'].sum())
    beta = pm.Exponential('beta', 1 / (sim['M'] - sim['W']).sum())
    obs = pm.BetaBinomial('obs', alpha, beta, sim['M'], observed=sim['W'])

with model:
    # draw 500 posterior samples
    trace = pm.sample(5000, return_inferencedata=False)

az.plot_trace(trace)
az.summary(trace, round_to=2)

# Regression-style beta-binomial model with data containers.
# NOTE(review): this model block appears truncated here — only the data
# containers and the intercept prior are visible; confirm the remainder
# (likelihood, sampling) exists elsewhere.
with pm.Model() as betabinomial:
    predictor = pm.Data('predictor', sim['x'])
    trials = pm.Data('trials', sim['M'])
    intercept = pm.Normal('intercept', mu=np.log(0.1), sd=0.001)
def run_factorization(self, N, S, X, Z, I, K, num_cov, k, n):
    """Fit the factorization variant with horseshoe-style priors on U and V.

    Per-element half-Cauchy local scales (LAMBDA_*) and a global half-Cauchy
    scale (TAU_*) shrink the latent factors. Parameters as in the sibling
    variants: N samples, S sites, X covariates, Z individual index, I
    individuals, K factors, k/n count matrices. Writes U, V, BETA posterior
    means to files prefixed by self.output_root.
    """
    # Smart initialization (per-site moments, clipped; see sibling variants).
    print("STARTING")
    rat = k / n
    nans = np.isnan(rat)
    conc_inits = np.zeros((1, S))
    beta_inits = np.zeros((num_cov, S))
    for index_s in range(S):
        column_rat = rat[:, index_s]
        column_nans = np.isnan(column_rat)
        valid_rat = column_rat[~column_nans]
        conc_init = min(1.0 / np.var(valid_rat), 1000.0)
        m_init = min(max(np.mean(valid_rat), 1.0 / 1000), 1.0 - (1.0 / 1000))
        conc_inits[0, index_s] = conc_init
        beta_inits[0, index_s] = np.log(m_init / (1.0 - m_init))
    # Run bb-mf
    with pm.Model() as bb_glm:
        CONC = pm.HalfCauchy('CONC', beta=5, shape=(1, S), testval=conc_inits)
        BETA = pm.Normal('BETA', mu=0, tau=(1 / 1000000.0), shape=(S, num_cov), testval=beta_inits.T)
        #U = pm.Normal('U', mu=0, tau=(1/1.0), shape=(N, K), testval=np.random.randn(N, K))
        #U = pm.Exponential('U',lam=10.0, shape=(N, K), testval=np.abs(np.random.randn(N, K)))
        #V = pm.Normal('V', mu=0, tau=(1/100000.0), shape=(S, K), testval=np.random.randn(S, K))
        LAMBDA_U = pm.HalfCauchy('LAMBDA_U', beta=1, shape=(N, K), testval=np.ones((N, K)))
        TAU_U = pm.HalfCauchy('TAU_U', beta=1, testval=1.0)
        SIGMA_U = pm.Deterministic('SIGMA_U', TAU_U * TAU_U * LAMBDA_U * LAMBDA_U)
        # NOTE(review): SIGMA_U is tau^2 * lambda^2 (a variance) but is
        # passed as `sd` below; the standard horseshoe uses sd = tau*lambda.
        # Confirm whether this squaring is intentional.
        U = pm.Normal('U', mu=0, sd=SIGMA_U, shape=(N, K), testval=np.random.randn(N, K))
        LAMBDA_V = pm.HalfCauchy('LAMBDA_V', beta=1, shape=(S, K), testval=np.ones((S, K)))
        TAU_V = pm.HalfCauchy('TAU_V', beta=1, testval=1.0)
        SIGMA_V = pm.Deterministic('SIGMA_V', TAU_V * TAU_V * LAMBDA_V * LAMBDA_V)
        V = pm.Normal('V', mu=0, sd=SIGMA_V, shape=(S, K), testval=np.random.randn(S, K))
        # Hierarchical per-individual intercept (see sibling variants).
        MU_A = pm.Normal("MU_A", mu=0., sd=100**2, shape=(1, S), testval=np.zeros((1, S)))
        SIGMA_A = pm.HalfCauchy("SIGMA_A", beta=5.0, shape=(1, S), testval=np.ones((1, S)))
        mu_a_mat = pm.math.dot(np.ones((I, 1)), MU_A)
        sigma_a_mat = pm.math.dot(np.ones((I, 1)), SIGMA_A)
        A = pm.Normal('A', mu=mu_a_mat, sigma=sigma_a_mat, shape=(I, S), testval=np.zeros((I, S)))
        p = pm.math.invlogit(
            pm.math.dot(X, BETA.T) + pm.math.dot(U, V.T) + A[Z, :])
        conc_mat = pm.math.dot(np.ones((N, 1)), CONC)
        R = pm.BetaBinomial('like',
                            alpha=(p * conc_mat)[~nans],
                            beta=((1.0 - p) * conc_mat)[~nans],
                            n=n[~nans],
                            observed=k[~nans])
        approx = pm.fit(method='advi', n=30000)
    pickle.dump(approx, open(self.output_root + '_model', 'wb'))
    # NOTE(review): the approximation is reloaded immediately after being
    # dumped; sibling variants keep this line commented out — presumably
    # redundant here.
    approx = pickle.load(open(self.output_root + '_model', "rb"))
    means_dict = approx.bij.rmap(approx.params[0].eval())
    np.savetxt(self.output_root + '_temper_U.txt', (means_dict['U']), fmt="%s", delimiter='\t')
    np.savetxt(self.output_root + '_temper_V.txt', (means_dict['V'].T), fmt="%s", delimiter='\t')
    np.savetxt(self.output_root + '_temper_BETA.txt', (means_dict['BETA'].T), fmt="%s", delimiter='\t')
def run_factorization(self, N, S, X, Z, I, K, num_cov, k, n):
    """Fit the factorization variant using minibatch ADVI over the N samples.

    Parameters as in the sibling variants: N samples, S sites, X covariates,
    Z individual index, I individuals, K factors, k/n (N, S) count matrices.
    Writes U, V, BETA posterior means to files prefixed by self.output_root.
    """
    # Smart initialization (per-site moments, clipped; see sibling variants).
    rat = k / n
    nans = np.isnan(rat)
    conc_inits = np.zeros((1, S))
    beta_inits = np.zeros((num_cov, S))
    for index_s in range(S):
        column_rat = rat[:, index_s]
        column_nans = np.isnan(column_rat)
        valid_rat = column_rat[~column_nans]
        conc_init = min(1.0 / np.var(valid_rat), 1000.0)
        m_init = min(max(np.mean(valid_rat), 1.0 / 1000), 1.0 - (1.0 / 1000))
        conc_inits[0, index_s] = conc_init
        beta_inits[0, index_s] = np.log(m_init / (1.0 - m_init))
    # Run bb-mf with mini batch
    indices = np.asarray(range(N))
    num_mb_indices = 1000  # minibatch size (rows per ADVI update)
    # NOTE(review): each pm.Minibatch call below draws its own random slice
    # per update; as written, mb_indices/mb_X/mb_n/mb_k/mb_Z/mb_nans are not
    # guaranteed to refer to the SAME rows on a given update — confirm the
    # slices are synchronized (e.g. via a shared random seed) or restructure
    # to slice one combined array.
    mb_indices = pm.Minibatch(indices, num_mb_indices)
    mb_X = pm.Minibatch(X, num_mb_indices)
    mb_n = pm.Minibatch(n, num_mb_indices)
    mb_k = pm.Minibatch(k, num_mb_indices)
    mb_Z = pm.Minibatch(Z, num_mb_indices)
    mb_nans = pm.Minibatch(nans, num_mb_indices)
    with pm.Model() as bb_glm:
        CONC = pm.HalfCauchy('CONC', beta=5, shape=(1, S), testval=conc_inits)
        BETA = pm.Normal('BETA', mu=0, tau=(1 / 1000000.0), shape=(S, num_cov), testval=beta_inits.T)
        U = pm.Normal('U', mu=0, tau=(1 / 10000.0), shape=(N, K), testval=np.random.randn(N, K))
        V = pm.Normal('V', mu=0, tau=(1 / 10000.0), shape=(S, K), testval=np.random.randn(S, K))
        # Hierarchical per-individual intercept (see sibling variants).
        MU_A = pm.Normal("MU_A", mu=0., sd=100**2, shape=(1, S), testval=np.zeros((1, S)))
        SIGMA_A = pm.HalfCauchy("SIGMA_A", beta=5.0, shape=(1, S), testval=np.ones((1, S)))
        mu_a_mat = pm.math.dot(np.ones((I, 1)), MU_A)
        sigma_a_mat = pm.math.dot(np.ones((I, 1)), SIGMA_A)
        A = pm.Normal('A', mu=mu_a_mat, sigma=sigma_a_mat, shape=(I, S), testval=np.zeros((I, S)))
        # Linear predictor restricted to the current minibatch rows.
        p = pm.math.invlogit(
            pm.math.dot(mb_X, BETA.T) + pm.math.dot(U[mb_indices, :], V.T) + A[mb_Z, :])
        conc_mat = pm.math.dot(np.ones((num_mb_indices, 1)), CONC)
        # total_size rescales the minibatch likelihood to the full data size.
        R = pm.BetaBinomial('like',
                            alpha=(p * conc_mat)[~mb_nans],
                            beta=((1.0 - p) * conc_mat)[~mb_nans],
                            n=mb_n[~mb_nans],
                            observed=mb_k[~mb_nans],
                            total_size=(k[~nans]).shape)
        approx = pm.fit(method='advi', n=30000)
    pickle.dump(approx, open(self.output_root + '_model', 'wb'))
    #approx = pickle.load( open(self.output_root + '_model', "rb" ) )
    means_dict = approx.bij.rmap(approx.params[0].eval())
    np.savetxt(self.output_root + '_temper_U.txt', (means_dict['U']), fmt="%s", delimiter='\t')
    np.savetxt(self.output_root + '_temper_V.txt', (means_dict['V'].T), fmt="%s", delimiter='\t')
    np.savetxt(self.output_root + '_temper_BETA.txt', (means_dict['BETA'].T), fmt="%s", delimiter='\t')