def generate(
    self,
    n_samples: int = 100,
    genes: Union[list, np.ndarray] = None,
    batch_size: int = 64,
) -> Tuple[np.ndarray, np.ndarray]:
    """Create observation samples from the posterior predictive distribution.

    :param n_samples: Number of required samples for each cell
    :param genes: Indices of genes of interest
    :param batch_size: Desired batch size to generate data
    :return: Tuple (x_new, x_old)
        Where x_new has shape (n_cells, n_genes, n_samples)
        Where x_old has shape (n_cells, n_genes)
    """
    assert self.model.reconstruction_loss in ["zinb", "nb"]
    zero_inflated = self.model.reconstruction_loss == "zinb"
    x_old = []
    x_new = []
    for tensors in self.update({"batch_size": batch_size}):
        sample_batch, _, _, batch_index, labels = tensors
        outputs = self.model.inference(
            sample_batch, batch_index=batch_index, y=labels, n_samples=n_samples
        )
        px_r = outputs["px_r"]
        px_rate = outputs["px_rate"]
        px_dropout = outputs["px_dropout"]

        p = px_rate / (px_rate + px_r)
        r = px_r
        # Important remark: Gamma is parametrized by the rate = 1/scale!
        l_train = distributions.Gamma(concentration=r, rate=(1 - p) / p).sample()
        # Clamping as distributions objects can have buggy behaviors when
        # their parameters are too high
        l_train = torch.clamp(l_train, max=1e8)
        gene_expressions = distributions.Poisson(
            l_train
        ).sample()  # Shape : (n_samples, n_cells_batch, n_genes)
        if zero_inflated:
            p_zero = (1.0 + torch.exp(-px_dropout)).pow(-1)
            random_prob = torch.rand_like(p_zero)
            gene_expressions[random_prob <= p_zero] = 0

        gene_expressions = gene_expressions.permute(
            [1, 2, 0]
        )  # Shape : (n_cells_batch, n_genes, n_samples)

        x_old.append(sample_batch.cpu())
        x_new.append(gene_expressions.cpu())

    x_old = torch.cat(x_old)  # Shape (n_cells, n_genes)
    x_new = torch.cat(x_new)  # Shape (n_cells, n_genes, n_samples)
    if genes is not None:
        gene_ids = self.gene_dataset.genes_to_index(genes)
        x_new = x_new[:, gene_ids, :]
        x_old = x_old[:, gene_ids]
    return x_new.numpy(), x_old.numpy()
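# Standalone sketch (not part of the class above) verifying the Gamma-Poisson trick
# used in `generate`: sampling Poisson(Gamma(theta, theta/mu)) is equivalent to
# sampling NegativeBinomial(mu, theta). `mu`/`theta` are illustrative names here.
import torch
from torch import distributions


def sample_nb_via_gamma_poisson(mu: torch.Tensor, theta: torch.Tensor) -> torch.Tensor:
    p = mu / (mu + theta)              # NB success probability
    # Gamma is parametrized by rate = 1/scale, so rate = (1 - p) / p = theta / mu
    rate = distributions.Gamma(concentration=theta, rate=(1 - p) / p).sample()
    rate = torch.clamp(rate, max=1e8)  # same safety clamp as above
    return distributions.Poisson(rate).sample()


# empirical mean should be close to mu (here ~5.0)
draws = sample_nb_via_gamma_poisson(torch.full((10000,), 5.0), torch.full((10000,), 2.0))
print(draws.mean())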
def generate_data(self):
    if self.n_comps == 2:
        cell_type = distributions.Bernoulli(probs=self.cat).sample((self.n_cells,))
    else:
        cell_type = torch.zeros(self.n_cells)
    z = torch.zeros((self.n_cells, self.n_latent)).float()
    for idx in range(z.shape[0]):
        z[idx, :] = self.dists[int(cell_type[idx])].sample()
    self.z = z
    rate = compute_rate(self.a_mat, self.b, z)
    self.h = rate
    gene_expressions = np.expand_dims(
        distributions.Poisson(rate=rate).sample(), axis=0
    )
    labels = np.expand_dims(cell_type, axis=0) if self.n_comps == 2 else None
    gene_names = np.arange(self.n_genes).astype(str)

    self.populate_from_per_batch_list(
        gene_expressions,
        labels_per_batch=labels,
        gene_names=gene_names,
    )
def gen_data(self):
    # sample overall relative abundances of ASVs from a Dirichlet distribution
    self.ASV_rel_abundance = tdist.Dirichlet(torch.ones(self.numASVs)).sample()

    # sample spatial embedding of ASVs
    self.w = torch.zeros(self.numASVs, self.D)
    w_prior = tdist.MultivariateNormal(torch.zeros(self.D), torch.eye(self.D))
    for o in range(0, self.numASVs):
        self.w[o, :] = w_prior.sample()

    self.data = torch.zeros(self.numParticles, self.numASVs)
    num_nonempty = 0
    mu_prior = tdist.MultivariateNormal(torch.zeros(self.D), torch.eye(self.D))
    rad_prior = tdist.LogNormal(torch.tensor([self.mu_rad]),
                                torch.tensor([self.mu_std]))
    # replace with neg bin prior
    num_reads_prior = tdist.Poisson(torch.tensor([self.avgNumReadsParticle]))

    while num_nonempty < self.numParticles:
        # sample center
        mu = mu_prior.sample()
        rad = rad_prior.sample()
        zr = torch.zeros(1, self.numASVs, dtype=torch.float64)
        for o in range(0, self.numASVs):
            p = mu - self.w[o, :]
            p = torch.pow(p, 2.0) / rad
            p = (torch.sum(p)).sqrt()
            zr[0, o] = unitboxcar(p, 0.0, 2.0, self.step_approx)
        if torch.sum(zr) > 0.95:
            particle = Particle(mu, self)
            particle.zr = zr
            self.particles.append(particle)
            # renormalize particle abundances
            rn = self.ASV_rel_abundance * zr
            rn = rn / torch.sum(rn)
            # sample relative abundances for particle
            part_rel_abundance = tdist.Dirichlet(rn * self.conc).sample()
            # sample number of reads for particle
            # (replace w/ neg bin instead of Poisson)
            num_reads = num_reads_prior.sample().long().item()
            particle.total_reads = num_reads
            particle.reads = tdist.Multinomial(
                num_reads, probs=part_rel_abundance).sample()
            num_nonempty += 1
def sample_poisson(self, lam):
    # Reparameterized draw from Poisson(lam): sample without tracking gradients,
    # then re-express the sample as lam + sqrt(lam) * eps with eps held constant.
    with torch.no_grad():
        sample = distributions.Poisson(lam).sample()
        eps = (sample - lam) / torch.sqrt(lam + 1e-12)
    # The re-expression must happen outside no_grad, otherwise z carries no
    # gradient with respect to lam.
    z = torch.sqrt(lam + 1e-12) * eps + lam
    return z
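# Hedged usage sketch for `sample_poisson` above (treating it as a free function,
# i.e. passing None for `self`, which the body never uses): with the re-expression
# outside no_grad, a gradient w.r.t. the rate survives the discrete Poisson draw.
lam = torch.tensor([4.0, 9.0], requires_grad=True)
z = sample_poisson(None, lam)
z.sum().backward()
print(z, lam.grad)  # z equals the raw sample; lam.grad is eps / (2*sqrt(lam)) + 1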
def __init__(self, in_features: int, out_channels: int, num_repetitions: int = 1, dropout=0.0):
    """Create a Poisson layer.

    Args:
        in_features: Number of input features.
        out_channels: Number of parallel representations for each input feature.
        num_repetitions: Number of parallel repetitions of this layer.
        dropout: Dropout probability.
    """
    super().__init__(in_features, out_channels, num_repetitions, dropout)
    self.rate = nn.Parameter(torch.rand(1, in_features, out_channels, num_repetitions))
    self.poisson = dist.Poisson(rate=self.rate)
def __init__(self, multiplicity, in_features, dropout=0.0):
    """Create a Poisson layer.

    Args:
        multiplicity: Number of parallel representations for each input feature.
        in_features: Number of input features.
        dropout: Dropout probability.
    """
    super().__init__(multiplicity, in_features, dropout)
    self.rate = nn.Parameter(torch.rand(1, in_features, multiplicity))
    self.poisson = dist.Poisson(rate=self.rate)
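# Minimal check (independent of the base-class machinery of the two leaf layers
# above) that a Poisson distribution built from an nn.Parameter supports batched,
# broadcast log-probabilities, which is what these layers rely on downstream:
import torch
from torch import nn, distributions as dist

rate = nn.Parameter(torch.rand(1, 5, 3))      # (1, in_features, multiplicity)
poisson = dist.Poisson(rate=rate)
x = torch.poisson(2.0 * torch.ones(8, 5, 1))  # a batch of 8 observations
print(poisson.log_prob(x).shape)              # torch.Size([8, 5, 3])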
def generate_leaves(
    self,
    n_samples: int = 100,
    batch_size: int = 128,
):
    """Create observation samples from the Posterior Predictive distribution

    Parameters
    ----------
    n_samples
        Number of required samples for each cell
    batch_size
        Desired batch size to generate data (currently unused: all barcodes
        are processed as a single batch below)

    Returns
    -------
    x_new : :py:class:`numpy.ndarray`
        array with shape (n_cells, n_genes, n_samples)
    x_old : :py:class:`numpy.ndarray`
        array with shape (n_cells, n_genes)
    """
    assert self.model.reconstruction_loss in ["nb", "poisson"]
    x_old = []
    x_new = []
    for tensors in self.update({"batch_size": len(self.barcodes)}):
        sample_batch, _, _, batch_index, labels = tensors
        outputs = self.model.inference(sample_batch)
        px_r = outputs["px_r"]
        px_rate = outputs["px_rate"]

        if self.model.reconstruction_loss == "poisson":
            l_train = px_rate
            l_train = torch.clamp(l_train, max=1e8)
            dist = distributions.Poisson(l_train)
        elif self.model.reconstruction_loss == "nb":
            dist = NegativeBinomial(mu=px_rate, theta=px_r)
        else:
            raise ValueError(
                "{} reconstruction error not handled right now".format(
                    self.model.reconstruction_loss))
        gene_expressions = dist.sample()

        x_old.append(sample_batch.cpu())
        x_new.append(gene_expressions.cpu())

    x_old = torch.cat(x_old)  # Shape (n_cells, n_genes)
    x_new = torch.cat(x_new)  # Shape (n_cells, n_genes, n_samples)
    return x_new.numpy(), x_old.numpy()
def decode_x(self, w, z):
    params = self.decoder_x(torch.cat((w, z), dim=-1))

    px_wz = []
    samples = []
    for indices in self.likelihood_partition:
        data_type = self.likelihood_partition[indices]
        params_subset = params[:, indices[0]:(indices[1] + 1)]

        if data_type == 'real':
            cov_diag = self.likelihood_params['lik_var'] * torch.ones_like(
                params_subset).to(self.device)
            dist = D.Normal(loc=params_subset, scale=cov_diag.sqrt())
        elif data_type == 'categorical':
            dist = D.OneHotCategorical(logits=params_subset)
        elif data_type == 'binary':
            dist = D.Bernoulli(logits=params_subset)
        elif data_type == 'positive':
            lognormal_var = self.likelihood_params[
                'lik_var_lognormal'] * torch.ones_like(params_subset).to(
                    self.device)
            dist = D.LogNormal(loc=params_subset, scale=lognormal_var.sqrt())
        elif data_type == 'count':
            positive_params_subset = F.softplus(params_subset)
            dist = D.Poisson(rate=positive_params_subset)
        elif data_type == 'binomial':
            num_trials = self.likelihood_params['binomial_num_trials']
            dist = D.Binomial(total_count=num_trials, logits=params_subset)
        elif data_type == 'ordinal':
            h = params_subset[:, 0:1]
            thetas = torch.cumsum(F.softplus(params_subset[:, 1:]), axis=1)
            prob_lessthans = torch.sigmoid(thetas - h)
            probs = torch.cat((prob_lessthans,
                               torch.ones(len(prob_lessthans), 1)), axis=1) - \
                    torch.cat((torch.zeros(len(prob_lessthans), 1),
                               prob_lessthans), axis=1)
            dist = D.OneHotCategorical(probs=probs)
        else:
            raise NotImplementedError

        samples.append(dist.sample())
        px_wz.append(dist)

    sample_x = torch.cat(samples, axis=1)
    return params, sample_x, px_wz
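# Standalone sketch of the ordinal head used in `decode_x` (shapes here are
# illustrative): cutpoints are cumulative softplus increments, and the category
# probabilities are differences of adjacent sigmoids, so they are nonnegative
# and sum to one.
import torch
import torch.nn.functional as F

params_subset = torch.randn(4, 6)  # 1 location column + 5 cutpoint increments
h = params_subset[:, 0:1]
thetas = torch.cumsum(F.softplus(params_subset[:, 1:]), axis=1)
prob_lessthans = torch.sigmoid(thetas - h)
probs = torch.cat((prob_lessthans, torch.ones(len(prob_lessthans), 1)), axis=1) - \
        torch.cat((torch.zeros(len(prob_lessthans), 1), prob_lessthans), axis=1)
print(probs.sum(axis=1))  # tensor([1., 1., 1., 1.])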
def simulate_runs(self, coeffs, nruns=100, models=None):
    """Fits appropriation rule models and simulates optimal runs using those models"""
    if models is None:
        models = self.fit_approp_rules(coeffs)
    runs = torch.ones(nruns, self.T + 1) * self.Y0
    approps = torch.zeros(nruns, self.T + 1)
    Nts = (dist.Poisson(self.lambda_).sample((nruns, self.T))
           if self.lambda_ else torch.zeros(nruns, self.T))
    ratios = torch.exp(self.mu - self.sigma**2 / 2
                       + dist.Normal(loc=Nts * self.m,
                                     scale=torch.sqrt(self.sigma**2 + Nts * self.s2)).sample())
    for t in range(1, self.T + 1):
        approps[:, t] = models[t - 1].predict(runs[:, t - 1], ratios[:, t - 1]).flatten()
        runs[:, t] = runs[:, t - 1] * ratios[:, t - 1] - approps[:, t]
    return runs, approps
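# Standalone sketch of the transition-ratio model shared by `simulate_runs` above
# and `fit_approp_rules` below: a lognormal step with a Poisson number of jumps,
# each jump shifting the mean by m and adding s2 to the variance. All parameter
# values here are illustrative assumptions.
import torch
import torch.distributions as dist

mu, sigma, lambda_, m, s2, T, npaths = 0.05, 0.2, 0.5, -0.02, 0.01, 10, 1000
Nts = dist.Poisson(lambda_).sample((npaths, T))  # jump counts per path and step
ratios = torch.exp(mu - sigma**2 / 2
                   + dist.Normal(loc=Nts * m,
                                 scale=torch.sqrt(sigma**2 + Nts * s2)).sample())
print(ratios.shape)  # torch.Size([1000, 10])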
def fit_approp_rules(self, coeffs, npaths=100, lr=0.02, epochs_per_step=50,
                     Model=AppropRule):
    """Estimates optimal appropriation rules to maximize the manager's utility."""
    print(f'fitting approp rules with {coeffs}...')
    time0 = time()

    # Draw transition ratios
    Nts = (dist.Poisson(self.lambda_).sample((npaths, self.T))
           if self.lambda_ else torch.zeros(npaths, self.T))
    ratios = torch.exp(self.mu - self.sigma**2 / 2
                       + dist.Normal(loc=Nts * self.m,
                                     scale=torch.sqrt(self.sigma**2 + Nts * self.s2)).sample())
    # calculate [fixed] future value discounts
    discounts = torch.exp(-self.rho * torch.arange(self.T + 1))
    # calculate endpoints
    Yt = self.Y0 * ratios.prod(axis=1)

    models = [Model(self.max_approp) for _ in range(self.T)]
    for t in reversed(range(1, self.T + 1)):

        def loss_fn(approp):
            utils = npaths * (self.theta * approp
                              + poly_eval(Yt * (ratios[:, t - 1] - self.ratio) - approp, coeffs))
            for s in range(t + 1, self.T + 1):
                # note: originally written `ratios[: t:(s-1)]`, which slices
                # paths rather than time steps; `ratios[:, t:(s-1)]` is intended
                new_Yt = ratios[:, t:(s - 1)].prod(axis=1) * Yt
                approp = models[s - 1].predict(new_Yt, ratios[:, s - 1])
                utils += discounts[s - t] * (
                    self.theta * approp
                    + poly_eval(new_Yt * (ratios[:, s - 1] - self.ratio) - approp, coeffs))
            return -utils.mean()

        optimizer = torch.optim.Adam(models[t - 1].parameters(), lr=lr)
        losses = []
        for i in range(epochs_per_step):
            approp = models[t - 1].forward(Yt, ratios[:, t - 1])
            loss = loss_fn(approp)
            losses.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # exit prematurely if no progress is being made
            if i > 3 and (losses[-1] - losses[-4]) / abs(losses[-4]) > -0.001:
                break
        if PLOT_LOSSES:
            plt.plot(losses)
    if PLOT_LOSSES:
        plt.show()
    print(f'models fit in {time()-time0:.2f} s')
    self.models = models
    return models
def loglik_count(batch_data, list_type, theta, normalization_params):
    output = dict()
    epsilon = 1e-6

    # Data outputs
    data, missing_mask = batch_data
    missing_mask = missing_mask.float()

    est_lambda = theta
    est_lambda = torch.clamp(torch.nn.Softplus()(est_lambda), epsilon, 1e20)

    # log_p_x = -torch.sum(log_poisson_loss(targets=data, log_input=torch.log(est_lambda),
    #                                       compute_full_loss=True), 1)
    poisson = td.Poisson(est_lambda)
    log_p_x = poisson.log_prob(data).sum(1)
    output['log_p_x'] = torch.mul(log_p_x, missing_mask)
    output['log_p_x_missing'] = torch.mul(log_p_x, 1.0 - missing_mask)
    output['params'] = est_lambda
    output['samples'] = poisson.sample()

    return output
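# Hedged usage sketch for `loglik_count`, assuming a single count column with a
# per-sample mask (1 = observed, 0 = missing); `list_type` and
# `normalization_params` are unused by the function body above.
import torch

data = torch.poisson(torch.full((8, 1), 4.0))    # observed counts
mask = torch.bernoulli(torch.full((8,), 0.8))    # per-sample missingness mask
theta = torch.randn(8, 1)                        # raw decoder output; softplus'd into a rate inside
out = loglik_count((data, mask), list_type=None, theta=theta, normalization_params=None)
print(out['log_p_x'].shape, out['samples'].shape)  # torch.Size([8]) torch.Size([8, 1])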
def __init__(self, pi=[0.7], loc_reduce=0, n_cells=100, mu0_path='mu_0.npy',
             mu1_path='mu_2.npy', separate_reduce=False,
             sig0_path='sigma_0.npy', sig1_path='sigma_2.npy'):
    super().__init__()
    current_dir = os.path.dirname(os.path.realpath(__file__))
    mu_0 = self.load_array(os.path.join(current_dir, mu0_path))
    mu_1 = self.load_array(os.path.join(current_dir, mu1_path))
    sigma_0 = self.load_array(os.path.join(current_dir, sig0_path))
    sigma_1 = self.load_array(os.path.join(current_dir, sig1_path))

    np.random.seed(0)
    torch.manual_seed(0)

    n_genes = len(mu_0)
    if not separate_reduce:
        self.dist0 = distributions.MultivariateNormal(loc=mu_0 - loc_reduce,
                                                      covariance_matrix=sigma_0)
        self.dist1 = distributions.MultivariateNormal(loc=mu_1 - loc_reduce,
                                                      covariance_matrix=sigma_1)
    else:
        # Duplicate the gene set: the first half keeps the original means,
        # the second half has means shifted down by loc_reduce
        n_genes *= 2
        mu_0_new = torch.zeros((2 * mu_0.shape[0],))
        mu_0_new[:mu_0.shape[0]] = mu_0
        mu_0_new[-mu_0.shape[0]:] = mu_0 - loc_reduce
        mu_0_new = mu_0_new.double()

        mu_1_new = torch.zeros((2 * mu_1.shape[0],))
        mu_1_new[:mu_1.shape[0]] = mu_1
        mu_1_new[-mu_1.shape[0]:] = mu_1 - loc_reduce
        mu_1_new = mu_1_new.double()

        # Shrink off-diagonal covariance by fac while keeping the original variances
        sigma_0 = sigma_0.cpu().numpy()
        sigma_1 = sigma_1.cpu().numpy()
        fac = 4
        sigma_0 = sigma_0 / fac
        sigma_1 = sigma_1 / fac
        np.fill_diagonal(sigma_0, fac * np.diag(sigma_0))
        np.fill_diagonal(sigma_1, fac * np.diag(sigma_1))
        sigma_0 = torch.tensor(sigma_0).double()
        sigma_1 = torch.tensor(sigma_1).double()

        # Block-diagonal covariance over the duplicated gene set
        sigma_0_new = torch.zeros((2 * sigma_0.shape[0], 2 * sigma_0.shape[1]))
        sigma_0_new[:sigma_0.shape[0], :sigma_0.shape[1]] = sigma_0
        sigma_0_new[sigma_0.shape[0]:, sigma_0.shape[1]:] = sigma_0
        sigma_0_new = sigma_0_new.double()

        sigma_1_new = torch.zeros((2 * sigma_1.shape[0], 2 * sigma_1.shape[1]))
        sigma_1_new[:sigma_1.shape[0], :sigma_1.shape[1]] = sigma_1
        sigma_1_new[sigma_1.shape[0]:, sigma_1.shape[1]:] = sigma_1
        sigma_1_new = sigma_1_new.double()

        self.dist0 = distributions.MultivariateNormal(loc=mu_0_new,
                                                      covariance_matrix=sigma_0_new)
        self.dist1 = distributions.MultivariateNormal(loc=mu_1_new,
                                                      covariance_matrix=sigma_1_new)

    cell_type = distributions.Bernoulli(probs=torch.tensor(pi)).sample((n_cells,))
    zero_mask = (cell_type == 0).squeeze()
    one_mask = (cell_type == 1).squeeze()
    z = torch.zeros((n_cells, n_genes)).double()
    z[zero_mask, :] = self.dist0.sample((zero_mask.sum(),))
    z[one_mask, :] = self.dist1.sample((one_mask.sum(),))

    gene_expressions = distributions.Poisson(rate=z.exp()).sample().cpu().numpy()
    labels = cell_type.cpu().numpy()
    self.mask_zero_biological = (gene_expressions == 0)
    gene_expressions, batches = self.mask(gene_expressions, labels)
    gene_names = np.arange(n_genes).astype(str)

    # Drop cells with no counts at all
    keep_cells = (gene_expressions.sum(axis=1) > 0)
    gene_expressions = gene_expressions[keep_cells, :]
    if labels is not None:
        labels = labels[keep_cells]
    if batches is not None:
        batches = batches[keep_cells]

    self.populate_from_data(
        gene_expressions,
        labels=labels,
        gene_names=gene_names,
        batch_indices=batches,
    )
def generate(
    self,
    n_samples: int = 100,
    genes: Optional[np.ndarray] = None,
    batch_size: int = 256,
):
    """Create observation samples from the posterior predictive distribution.

    :param n_samples: Number of required samples for each cell
    :param genes: Indices of genes of interest
    :param batch_size: Desired batch size to generate data
    :return: Tuple (rna_new, rna_old, atac_new, atac_old)
        Where *_old has shape (n_cells, n_genes)
        Where *_new has shape (n_cells, n_genes, n_samples)
    """
    assert self.model.reconstruction_loss in ["zinb", "zip"]
    # both supported losses are zero-inflated, so this flag is always True here
    zero_inflated = self.model.reconstruction_loss in ["zinb", "zip"]
    rna_old = []
    rna_new = []
    atac_old = []
    atac_new = []
    for tensors in self.update({"batch_size": batch_size}):
        sample_batch, _, _, batch_index, labels = tensors
        outputs = self.model.inference(
            sample_batch, batch_index=batch_index, y=labels, n_samples=n_samples
        )
        p_rna_r = outputs["p_rna_r"]
        p_rna_rate = outputs["p_rna_rate"]
        p_rna_dropout = outputs["p_rna_dropout"]
        p_atac_mean = outputs["p_atac_mean"]
        p_atac_dropout = outputs["p_atac_dropout"]

        # Generating rna-seq data
        p = p_rna_rate / (p_rna_rate + p_rna_r)
        r = p_rna_r
        # Important remark: Gamma is parametrized by the rate = 1/scale!
        l_train_rna = distributions.Gamma(concentration=r, rate=(1 - p) / p).sample()
        # Clamping as distributions objects can have buggy behaviors when
        # their parameters are too high
        l_train_rna = torch.clamp(l_train_rna, max=1e8)
        gene_expressions = distributions.Poisson(
            l_train_rna
        ).sample()  # Shape : (n_samples, n_cells_batch, n_genes)

        # Generating atac-seq data
        l_train_atac = torch.clamp(p_atac_mean, max=1e2)
        atac_expressions = distributions.Poisson(l_train_atac).sample()

        # zero-inflate
        if zero_inflated:
            p_zero_rna = (1.0 + torch.exp(-p_rna_dropout)).pow(-1)
            random_prob_rna = torch.rand_like(p_zero_rna)
            gene_expressions[random_prob_rna <= p_zero_rna] = 0

            p_zero_atac = (1.0 + torch.exp(-p_atac_dropout)).pow(-1)
            random_prob_atac = torch.rand_like(p_zero_atac)
            atac_expressions[random_prob_atac <= p_zero_atac] = 0

        gene_expressions = gene_expressions.permute(
            [1, 2, 0]
        )  # Shape : (n_cells_batch, n_genes, n_samples)
        atac_expressions = atac_expressions.permute([1, 2, 0])

        rna_old.append(sample_batch[0].cpu())
        rna_new.append(gene_expressions.cpu())
        atac_old.append(sample_batch[1].cpu())
        atac_new.append(atac_expressions.cpu())

    rna_old = torch.cat(rna_old)  # Shape (n_cells, n_genes)
    rna_new = torch.cat(rna_new)  # Shape (n_cells, n_genes, n_samples)
    atac_old = torch.cat(atac_old)
    atac_new = torch.cat(atac_new)
    if genes is not None:
        gene_ids = self.gene_dataset.genes_to_index(genes)
        rna_new = rna_new[:, gene_ids, :]
        rna_old = rna_old[:, gene_ids]
    return rna_new.numpy(), rna_old.numpy(), atac_new.numpy(), atac_old.numpy()
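# Minimal sketch of the zero-inflation step shared by both `generate` methods
# above: dropout logits pass through a sigmoid (written as (1 + exp(-x))^-1 there)
# and counts are zeroed with that probability. Shapes here are illustrative.
import torch

counts = torch.poisson(torch.full((4, 6), 3.0))
dropout_logits = torch.randn(4, 6)
p_zero = (1.0 + torch.exp(-dropout_logits)).pow(-1)  # equals torch.sigmoid(dropout_logits)
counts[torch.rand_like(p_zero) <= p_zero] = 0
print((counts == 0).float().mean())  # fraction of zeros rises with p_zero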
@bm.random_variable  # required for s2() to yield an RVIdentifier
def s2() -> RVIdentifier:
    return dist.Poisson(theta2())
def forward(self, x: torch.Tensor) -> dist.Poisson:  # type: ignore
    """Output a Poisson distribution parameterized by ``exp(x)``."""
    _validate_input(x)
    return dist.Poisson(torch.exp(x))
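# Hedged usage of the exp-link head above: raw scores are log-rates, so exp()
# keeps the Poisson rate positive (constructed directly here, bypassing
# `_validate_input`, whose definition lives outside this snippet).
import torch
import torch.distributions as dist

log_rates = torch.tensor([[-1.0, 0.0, 2.3]])
poisson = dist.Poisson(torch.exp(log_rates))
print(poisson.mean)  # tensor([[0.3679, 1.0000, 9.9742]])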
def main(args):
    print("Loading data...")
    teams, df = load_data()
    train = df[df["split"] == "train"]

    print("Starting inference...")
    samples = bm.GlobalNoUTurnSampler().infer(
        queries=[
            alpha(),
            home(),
            sd_att(),
            sd_def(),
            attack(),
            defend(),
        ],
        observations={
            s1(): torch.tensor(train["score1"].values),
            s2(): torch.tensor(train["score2"].values),
        },
        num_samples=args.num_samples,
        num_chains=args.num_chains,
        num_adaptive_samples=args.num_warmup,
    )
    samples = samples.to_xarray()
    fit = az.InferenceData(posterior=samples)

    print("Analyse posterior...")
    az.plot_forest(
        fit,
        backend="bokeh",
    )
    az.plot_trace(
        fit,
        backend="bokeh",
    )

    # Attack and defence
    quality = teams.copy()
    quality = quality.assign(
        attack=samples[attack()].mean(axis=(0, 1)),
        attacksd=samples[attack()].std(axis=(0, 1)),
        defend=samples[defend()].mean(axis=(0, 1)),
        defendsd=samples[defend()].std(axis=(0, 1)),
    )
    quality = quality.assign(
        attack_low=quality["attack"] - quality["attacksd"],
        attack_high=quality["attack"] + quality["attacksd"],
        defend_low=quality["defend"] - quality["defendsd"],
        defend_high=quality["defend"] + quality["defendsd"],
    )
    plot_quality(quality)

    # Predicted goals and table
    predict = df[df["split"] == "predict"]

    theta1 = (samples[alpha()].expand_dims("", axis=-1).values
              + samples[home()].expand_dims("", axis=-1).values
              + samples[attack()][:, :, predict["Home_id"]].values
              - samples[defend()][:, :, predict["Away_id"]].values)
    theta1 = torch.tensor(theta1.reshape(-1, theta1.shape[-1]))

    theta2 = (samples[alpha()].expand_dims("", axis=-1).values
              + samples[attack()][:, :, predict["Away_id"]].values
              - samples[defend()][:, :, predict["Home_id"]].values)
    theta2 = torch.tensor(theta2.reshape(-1, theta2.shape[-1]))

    score1 = np.array(dist.Poisson(torch.exp(theta1)).sample())
    score2 = np.array(dist.Poisson(torch.exp(theta2)).sample())

    predicted_full = predict.copy()
    predicted_full = predicted_full.assign(
        score1=score1.mean(axis=0).round(),
        score1error=score1.std(axis=0),
        score2=score2.mean(axis=0).round(),
        score2error=score2.std(axis=0),
    )
    predicted_full = train.append(
        predicted_full.drop(columns=["score1error", "score2error"]))

    print(score_table(df))
    print(score_table(predicted_full))
def forward(iota_xfull, iota_x, iota_y, mask_x, mask_y, batch_size, niw):
    tiled_iota_x = torch.Tensor.repeat(iota_x, [niw, 1])
    tiled_tiled_iota_x = torch.Tensor.repeat(tiled_iota_x, [niw, 1])
    tiledmask_x = torch.Tensor.repeat(mask_x, [niw, 1])
    tiled_tiledmask_x = torch.Tensor.repeat(tiledmask_x, [niw, 1])
    if not draw_miss:
        tiled_iota_xfull = torch.Tensor.repeat(iota_xfull, [niw, 1])
    tiled_iota_y = torch.Tensor.repeat(iota_y, [niw, 1])
    tiled_tiled_iota_y = torch.Tensor.repeat(tiled_iota_y, [niw, 1])
    tiledmask_y = torch.Tensor.repeat(mask_y, [niw, 1])
    tiled_tiledmask_y = torch.Tensor.repeat(tiledmask_y, [niw, 1])
    if not draw_miss:
        tiled_iota_yfull = torch.Tensor.repeat(iota_yfull, [niw, 1])

    ## uncorrelated covariates
    # p_x = td.Normal(loc=mu_x, scale=torch.nn.Softplus()(scale_x)+0.001)
    ## Correlated covariates (unstructured covariance structure)
    # p_x = td.multivariate_normal.MultivariateNormal(loc=mu_x, covariance_matrix=torch.nn.Softplus()(scale_x)+0.001)
    # multiply scale_x by its transpose to make the covariance positive definite
    p_x = td.multivariate_normal.MultivariateNormal(
        loc=mu_x, covariance_matrix=torch.matmul(scale_x, scale_x.t()))

    params_x = None
    xm = iota_x
    xm_flat = torch.Tensor.repeat(iota_x, [niw, 1])  # if no missing x
    params_y = None
    ym = iota_y
    ym_flat = torch.Tensor.repeat(iota_y, [niw, 1])

    ## NN_xm: p(xm|xo,r) (if missing in x detected)
    if miss_x:
        out_NN_xm = NN_xm(torch.cat([iota_x, mask_x], 1))  # bs x p --> sample niw times
        qxmgivenxor = td.Normal(
            loc=out_NN_xm[..., :p],
            scale=torch.nn.Softplus()(out_NN_xm[..., p:(2 * p)]) + 0.001)
        # condition contribution of this term in the ELBO by miss_x
        params_xm = {'mean': out_NN_xm[..., :p],
                     'scale': torch.nn.Softplus()(out_NN_xm[..., p:(2 * p)]) + 0.001}
        if draw_miss:
            xm = qxmgivenxor.rsample([niw])
            xm_flat = xm.reshape([niw * batch_size, p])
    else:
        qxmgivenxor = None
        params_xm = None
        xm_flat = torch.Tensor.repeat(iota_x, [niw, 1])

    # organize completed (sampled) xincluded for missingness model;
    # observed values are not sampled
    if miss_x:
        if miss_y:
            tiled_xm_flat = torch.Tensor.repeat(xm_flat, [niw, 1])
            xincluded = (tiled_tiled_iota_x * tiled_tiledmask_x
                         + tiled_xm_flat * (1 - tiled_tiledmask_x))
        else:
            xincluded = tiled_iota_x * tiledmask_x + xm_flat * (1 - tiledmask_x)
    else:
        xincluded = iota_x

    ## NN_ym: p(ym|yo,x,r) (if missing in y detected)
    if miss_y:
        if not miss_x:
            out_NN_ym = NN_ym(torch.cat([iota_y, iota_x, mask_y], 1))  # bs x 1 --> sample niw times
        elif miss_x:
            out_NN_ym = NN_ym(torch.cat(
                [tiled_iota_y,
                 tiledmask_x * tiled_iota_x + (1 - tiledmask_x) * xm_flat,
                 tiledmask_y], 1))  # (niw*bs) x 1 --> sampled niw times
        if family == "Gaussian":
            qymgivenyor = td.Normal(
                loc=out_NN_ym[..., :1],
                scale=torch.nn.Softplus()(out_NN_ym[..., 1:2]) + 0.001)
            # condition contribution of this term in the ELBO by miss_y
            params_ym = {'mean': out_NN_ym[..., :1],
                         'scale': torch.nn.Softplus()(out_NN_ym[..., 1:2]) + 0.001}
        if draw_miss:
            ym = qymgivenyor.rsample([niw])
            ym_flat = ym.reshape([-1, 1])  # (niw*bs x 1) if no miss_x, (niw*niw*bs x 1) if miss_x
    else:
        qymgivenyor = None
        params_ym = None
        ym_flat = torch.Tensor.repeat(iota_y, [niw, 1])

    # organize completed (sampled) yincluded for missingness model;
    # observed values are not sampled
    if miss_y:
        if miss_x:
            yincluded = (tiled_tiled_iota_y * tiled_tiledmask_y
                         + ym_flat * (1 - tiled_tiledmask_y))
        else:
            yincluded = tiled_iota_y * tiledmask_y + ym_flat * (1 - tiledmask_y)
    else:
        if miss_x:
            yincluded = tiled_iota_y
        else:
            yincluded = iota_y

    ## NN_y: p(y|x)
    out_NN_y = NN_y(xincluded)  # niw*niw*bs x p if miss_x and miss_y, otherwise niw*bs x p
    if family == "Gaussian":
        # default: link="identity", family="Gaussian"
        mu_y = invlink(link)(out_NN_y[..., 0])
        var_y = V(mu_y, torch.nn.Softplus()(alpha) + 0.001, family)
        pygivenx = td.Normal(loc=mu_y, scale=var_y ** (1 / 2))  # scale = sd = var^(1/2)
        params_y = {'mean': mu_y.detach(), 'scale': (var_y.detach()) ** (1 / 2)}
    elif family == "Multinomial":
        probs = invlink(link)(out_NN_y[..., :C])
        pygivenx = td.OneHotCategorical(probs=probs)
        params_y = {'probs': probs.detach()}
    elif family == "Poisson":
        lambda_y = invlink(link)(out_NN_y[..., 0])  # variance is the same as mean in Poisson
        pygivenx = td.Poisson(rate=lambda_y)
        params_y = {'lambda': lambda_y.detach()}

    ## NN_r: p(r|x,y,covars): always. Include option to specify covariates in X, y,
    ## and additional covars_miss
    # Organize covariates for missingness model (NN_r)
    if covars_r_y == 1:
        if np.sum(covars_r_x) > 0:
            covars_included = torch.cat([xincluded[:, covars_r_x == 1], yincluded], 1)
        else:
            covars_included = yincluded
    elif covars_r_y == 0:
        if np.sum(covars_r_x) > 0:
            covars_included = xincluded[:, covars_r_x == 1]
        # else: ignorable here, no covariates

    if not Ignorable:
        if covars:
            # right now: just X in as covariates (Case 1);
            # (niw*niw*bs x p) for case 3, (niw*bs x p) for other cases
            out_NN_r = NN_r(torch.cat([covars_included, covars_miss]))
        else:
            out_NN_r = NN_r(covars_included)  # can additionally include covariates
        prgivenxy = td.Bernoulli(logits=out_NN_r)  # just the features with missing values
        params_r = {'probs': torch.nn.Sigmoid()(out_NN_r).detach()}
    else:
        prgivenxy = None
        params_r = None

    return (xincluded, yincluded, p_x, qxmgivenxor, qymgivenyor, pygivenx,
            prgivenxy, params_xm, params_ym, params_y, params_r)
def __init__(
    self,
    pi=[0.7],
    n_cells=100,
    mu0_path="mu_0.npy",
    mu1_path="mu_2.npy",
    sig0_path="sigma_0.npy",
    sig1_path="sigma_2.npy",
    seed=42,
    n_genes=None,
    change_means=False,
    cuda_mcmc=False,
):
    super().__init__()
    torch.manual_seed(seed)
    assert len(pi) == 1
    self.probas = torch.tensor([1.0 - pi[0], pi[0]])
    self.logprobas = np.log(self.probas)

    current_dir = os.path.dirname(os.path.realpath(__file__))
    self.mu_0 = self.load_array(os.path.join(current_dir, mu0_path), n_genes)
    self.mu_1 = self.load_array(os.path.join(current_dir, mu1_path), n_genes)
    n_genes = len(self.mu_0)
    if change_means:
        self.mu_0[:n_genes // 4] = self.mu_0[:n_genes // 4] / 1.5
        self.mu_0[n_genes // 4:n_genes // 2] = (
            self.mu_0[n_genes // 4:n_genes // 2] / 0.5)
    self.sigma_0 = self.load_array(os.path.join(current_dir, sig0_path), n_genes)
    self.sigma_1 = self.load_array(os.path.join(current_dir, sig1_path), n_genes)
    d1, d2 = self.sigma_1.shape
    assert d1 == d2
    # Regularize the covariances to keep them positive definite
    self.sigma_0 = self.sigma_0 + 2e-6 * torch.eye(d2, d2, dtype=self.sigma_0.dtype)
    self.sigma_1 = self.sigma_1 + 2e-6 * torch.eye(d2, d2, dtype=self.sigma_1.dtype)

    self.mus = torch.stack([self.mu_0, self.mu_1]).float()
    self.sigmas = torch.stack([self.sigma_0, self.sigma_1]).float()
    if cuda_mcmc:
        # .cuda() is not in-place: the results must be reassigned
        self.mus = self.mus.cuda()
        self.sigmas = self.sigmas.cuda()
        self.probas = self.probas.cuda()
        self.logprobas = self.logprobas.cuda()

    self.dist0 = distributions.MultivariateNormal(
        loc=self.mu_0, covariance_matrix=self.sigma_0)
    self.dist1 = distributions.MultivariateNormal(
        loc=self.mu_1, covariance_matrix=self.sigma_1)
    self.dist_x = distributions.Poisson

    cell_type = distributions.Bernoulli(probs=torch.tensor(pi)).sample((n_cells,))
    zero_mask = (cell_type == 0).squeeze()
    one_mask = ~zero_mask
    z = torch.zeros((n_cells, n_genes)).double()
    z[zero_mask] = self.dist0.sample((zero_mask.sum(),))
    z[one_mask] = self.dist1.sample((one_mask.sum(),))
    print(z.min(), z.max())

    rate = torch.clamp(z.exp(), max=1e5)
    gene_expressions = np.expand_dims(
        distributions.Poisson(rate=rate).sample(), axis=0)
    labels = np.expand_dims(cell_type, axis=0)
    gene_names = np.arange(n_genes).astype(str)

    print("Dataset shape: ", gene_expressions.shape)
    print("Gene expressions bounds: ",
          gene_expressions.min(), gene_expressions.max())
    self.populate_from_per_batch_list(
        gene_expressions,
        labels_per_batch=labels,
        gene_names=gene_names,
    )