def make_full_posterior(evidence, logprior):
  logprior = _complete_logprior(logprior)
  posterior = mutrel.Mutrel(
    vids=evidence.vids,
    rels=_calc_posterior_full(evidence.rels, logprior),
  )
  return posterior
def make_mutrel_from_trees_and_unique_clusterings(structs, llhs, clusterings):
  '''
  Relative to `make_mutrel_from_trees_and_single_clustering`, this function is
  slower and more memory intensive, but also more flexible. It differs in two
  respects:

  1. It doesn't assume that the user has already computed counts for all
     unique samples -- i.e., it allows duplicate samples.

  2. It allows unique clusterings for every sample.
  '''
  assert len(structs) == len(llhs) == len(clusterings)
  weights = util.softmax(llhs)
  vids = None

  for struct, clustering, weight in zip(structs, clusterings, weights):
    adjm = util.convert_parents_to_adjmatrix(struct)
    mrel = make_mutrel_from_cluster_adj(adjm, clustering)
    if vids is None:
      vids = mrel.vids
      soft_mutrel = np.zeros(mrel.rels.shape)
    else:
      assert mrel.vids == vids
    soft_mutrel += weight * mrel.rels

  soft_mutrel = fix_rounding_errors(soft_mutrel)
  return mutrel.Mutrel(
    vids=vids,
    rels=soft_mutrel,
  )
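# Illustrative sketch (not part of the original module, hypothetical values):
# the accumulation loop above is a softmax-weighted average of per-sample
# relation tensors. With two tree samples over the same two variants, the
# result is `w1*rels1 + w2*rels2`, where `(w1, w2) = softmax(llhs)`.
def _example_soft_mutrel_average():
  import numpy as np
  llhs = np.array([-2.0, -1.0])
  weights = np.exp(llhs - np.max(llhs))
  weights /= np.sum(weights)
  # Hypothetical per-sample relation tensors for 2 variants and 5 models,
  # each summing to 1 across the model axis.
  rels1 = np.zeros((2, 2, 5))
  rels2 = np.zeros((2, 2, 5))
  rels1[..., 0] = 1.  # e.g., sample 1 puts all mass on one relation
  rels2[..., 3] = 1.  # e.g., sample 2 puts all mass on another
  soft = weights[0]*rels1 + weights[1]*rels2
  # The weighted average remains a valid distribution over relation models.
  assert np.allclose(np.sum(soft, axis=2), 1.)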
def make_mutrel_from_trees_and_single_clustering(structs, llhs, counts, clustering):
  # Oftentimes, we will have many samples of the same adjacency matrix paired
  # with the same clustering. This will produce the same mutrel. As computing
  # the mutrel from adjm + clustering is expensive, we want to avoid repeating
  # this unnecessarily. Instead, we just modify the associated weight of the
  # pairing to reflect this.
  #
  # Observe that if we have `C` copies of the LLH `W`, we obtain equivalent
  # post-softmax linear-space weights under either of the following two
  # methods:
  #
  # 1. (naive) Represent the associated samples `C` separate times in the softmax
  # 2. (smart) Set `W' = W + log(C)`, as `exp(W') = C*exp(W)`
  weights = util.softmax(llhs + np.log(counts))
  vids = None

  for struct, weight in zip(structs, weights):
    adjm = util.convert_parents_to_adjmatrix(struct)
    crel = make_clustrel_from_cluster_adj(adjm)
    if vids is None:
      vids = crel.vids
      soft_clustrel = np.zeros(crel.rels.shape)
    else:
      assert crel.vids == vids
    soft_clustrel += weight * crel.rels

  soft_clustrel = fix_rounding_errors(soft_clustrel)
  clustrel = mutrel.Mutrel(rels=soft_clustrel, vids=vids)
  mrel = make_mutrel_from_clustrel(clustrel, clustering)
  return mrel
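# Illustrative sketch (not part of the original module, hypothetical values):
# a self-contained check of the weight-collapsing identity described above.
# Repeating an LLH `C` times in the softmax gives that unique sample the same
# total weight as a single entry with `log(C)` added to its LLH.
def _example_collapsed_softmax_weights():
  import numpy as np
  llhs = np.array([-3.0, -1.5, -2.2])
  counts = np.array([4, 1, 2])

  def softmax(x):
    e = np.exp(x - np.max(x))
    return e / np.sum(e)

  # Naive: repeat each LLH `C` times, then sum the resulting weights back
  # into one total per unique sample.
  naive = softmax(np.repeat(llhs, counts))
  starts = np.cumsum(counts) - counts
  naive_totals = np.add.reduceat(naive, starts)
  # Smart: a single softmax entry per unique sample, shifted by log(C).
  smart = softmax(llhs + np.log(counts))
  assert np.allclose(naive_totals, smart)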
def make_mutrel_from_clustrel(clustrel, clusters, check_sanity=True):
  mutrel.check_posterior_sanity(clustrel.rels)
  K = len(clusters)
  assert clustrel.rels.shape == (K, K, NUM_MODELS)

  vids, membership = util.make_membership_mat(clusters)
  # K: number of non-empty clusters
  M = len(membership)
  assert len(vids) == M
  assert membership.shape == (M, K)

  mrel = np.zeros((M, M, NUM_MODELS))
  for modelidx in range(NUM_MODELS):
    mut_vs_cluster = np.dot(membership, clustrel.rels[:,:,modelidx]) # MxK
    mrel[:,:,modelidx] = np.dot(mut_vs_cluster, membership.T)

  # Disable check to improve performance. Since this is called for each tree
  # (for methods that don't have a fixed clustering), it can be prohibitively
  # slow -- it was consuming >50% of the total runtime for LICHeE's output
  # conversion.
  #mutrel.check_posterior_sanity(mrel)
  return mutrel.Mutrel(
    vids=vids,
    rels=mrel,
  )
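# Illustrative sketch (not part of the original module, hypothetical values):
# the double product `membership @ rels_slice @ membership.T` copies each
# cluster-pair relation to every pair of mutations drawn from those clusters.
def _example_expand_cluster_relations():
  import numpy as np
  # Mutations 0 and 1 belong to cluster 0; mutation 2 belongs to cluster 1.
  membership = np.array([
    [1, 0],
    [1, 0],
    [0, 1],
  ])
  # One hypothetical relation slice over cluster pairs, e.g. P(A_B).
  cluster_slice = np.array([
    [0., 1.],
    [0., 0.],
  ])
  mut_slice = np.dot(np.dot(membership, cluster_slice), membership.T)
  # Every (cluster-0 mutation, cluster-1 mutation) pair inherits the value 1.
  assert np.array_equal(mut_slice, np.array([
    [0., 0., 1.],
    [0., 0., 1.],
    [0., 0., 0.],
  ]))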
def make_clustrel_from_cluster_adj(cluster_adj):
  '''
  * `K` = # of clusters (including empty first cluster)

  Arguments:
  `cluster_adj`: a `KxK` adjacency matrix, where `cluster_adj[a,b] = 1` iff
  `a = b` or `b` is a child of `a`

  Returns:
  a `KxKx5` binary mutation relation tensor
  '''
  K = len(cluster_adj)
  assert cluster_adj.shape == (K, K)
  cluster_anc = util.make_ancestral_from_adj(cluster_adj)
  # In determining A_B relations, we don't want to set pairs (i,j), where i
  # and j are in the same cluster, to 1.
  assert np.all(1 == cluster_anc[0])
  np.fill_diagonal(cluster_anc, 0)

  clustrel = np.zeros((K, K, NUM_MODELS))
  clustrel[:,:,Models.cocluster] = np.eye(K)
  clustrel[:,:,Models.A_B] = cluster_anc
  clustrel[:,:,Models.B_A] = clustrel[:,:,Models.A_B].T

  existing = (Models.cocluster, Models.A_B, Models.B_A)
  already_filled = np.sum(clustrel[:,:,existing], axis=2)
  clustrel[already_filled == 0, Models.diff_branches] = 1
  assert np.array_equal(np.ones((K,K)), np.sum(clustrel, axis=2))

  vids = ['S%s' % (idx + 1) for idx in range(K)]
  clustrel = mutrel.Mutrel(vids=vids, rels=clustrel)
  mutrel.check_posterior_sanity(clustrel.rels)
  return clustrel
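# Illustrative sketch (not part of the original module): expected relations
# for a hypothetical three-cluster tree in which cluster 0 is the root and
# clusters 1 and 2 are its children. The expected values assume, as the
# assertion above implies, that `util.make_ancestral_from_adj` marks every
# node as an ancestor of itself.
def _example_clustrel_for_small_tree():
  cluster_adj = np.array([
    [1, 1, 1],  # cluster 0: self plus children 1 and 2
    [0, 1, 0],
    [0, 0, 1],
  ])
  clustrel = make_clustrel_from_cluster_adj(cluster_adj)
  assert clustrel.vids == ['S1', 'S2', 'S3']
  assert clustrel.rels[0, 1, Models.A_B] == 1            # root is ancestral to child 1
  assert clustrel.rels[1, 0, Models.B_A] == 1            # and the reverse pair is B_A
  assert clustrel.rels[1, 2, Models.diff_branches] == 1  # siblings lie on different branches
  assert clustrel.rels[2, 2, Models.cocluster] == 1      # the diagonal is cocluster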
def merge_variants(to_merge, evidence, logprior):
  assert np.all(np.array([V for group in to_merge for V in group]) < len(evidence.vids))
  already_merged = set()

  for vidxs in to_merge:
    vidxs = set(vidxs)
    assert len(vidxs & already_merged) == 0

    M_old = len(evidence.vids)
    merged_vid = ','.join([evidence.vids[V] for V in vidxs])
    new_vids = evidence.vids + [merged_vid]

    new_evidence = mutrel.init_mutrel(new_vids)
    new_evidence.rels[:-1,:-1] = evidence.rels

    merged_row = np.sum(np.array([evidence.rels[V] for V in vidxs]), axis=0)
    assert merged_row.shape == (M_old, NUM_MODELS)
    merged_col = np.copy(merged_row)
    merged_col[:,Models.A_B] = merged_row[:,Models.B_A]
    merged_col[:,Models.B_A] = merged_row[:,Models.A_B]

    new_evidence.rels[-1,:-1] = merged_row
    new_evidence.rels[:-1,-1] = merged_col
    new_evidence.rels[-1,-1,:] = -np.inf
    new_evidence.rels[-1,-1,Models.cocluster] = 0

    already_merged |= vidxs
    evidence = new_evidence

  evidence = mutrel.remove_variants_by_vidx(evidence, already_merged)
  posterior = mutrel.Mutrel(
    vids = evidence.vids,
    rels = _calc_posterior_full(evidence.rels, logprior),
  )
  return (posterior, evidence)
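# Illustrative sketch (not part of the original module, hypothetical values):
# why the merged column swaps `A_B` and `B_A`. Pairwise evidence satisfies
# `rels[i,j,A_B] == rels[j,i,B_A]`, so the column for the new merged variant
# is its row with those two model slices exchanged.
def _example_merged_column_swap():
  import numpy as np
  # Hypothetical per-pair log evidence ordered as [A_B, B_A, other].
  merged_row = np.array([[-0.4, -2.1, -1.3]])
  merged_col = np.copy(merged_row)
  merged_col[:, 0] = merged_row[:, 1]
  merged_col[:, 1] = merged_row[:, 0]
  assert np.array_equal(merged_col, np.array([[-2.1, -0.4, -1.3]]))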
def load_mutrels(mutrel_args):
  mutrels = {}
  for mutrel_arg in mutrel_args:
    mutrel_name, mutrel_path = mutrel_arg.split('=', 1)
    assert mutrel_name not in mutrels, '%s is duplicate' % mutrel_name

    if os.path.exists(mutrel_path):
      mrel = np.load(mutrel_path)
      mutrels[mutrel_name] = mutrel.Mutrel(vids=mrel['vids'], rels=mrel['rels'])
    else:
      mutrels[mutrel_name] = None

  return mutrels
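# Illustrative usage sketch (hypothetical names and paths, not part of the
# original module): each argument has the form `NAME=PATH`, where PATH points
# to an `.npz` file containing `vids` and `rels` arrays. Paths that don't
# exist map to None rather than raising.
def _example_load_mutrels_usage():
  mutrels = load_mutrels([
    'truth=/tmp/truth.mutrel.npz',        # hypothetical path
    'mymethod=/tmp/mymethod.mutrel.npz',  # hypothetical path
  ])
  assert set(mutrels.keys()) == {'truth', 'mymethod'}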
def get_mutrel(self, name):
  data = self.get_many(['%s_%s' % (name, T) for T in ('vids', 'rels')])
  return mutrel.Mutrel(vids=data['%s_vids' % name], rels=data['%s_rels' % name])