def make_mutrel_from_trees_and_unique_clusterings(structs, llhs, clusterings):
  '''
  Relative to `make_mutrel_from_trees_and_single_clustering`, this function is
  slower and more memory intensive, but also more flexible. It differs in two
  respects:

  1. It doesn't assume that the user has already computed counts for all
     unique samples -- i.e., it allows duplicate samples.

  2. It allows unique clusterings for every sample.
  '''
  assert len(structs) == len(llhs) == len(clusterings)
  weights = util.softmax(llhs)
  vids = None

  for struct, clustering, weight in zip(structs, clusterings, weights):
    adjm = util.convert_parents_to_adjmatrix(struct)
    mrel = make_mutrel_from_cluster_adj(adjm, clustering)
    if vids is None:
      vids = mrel.vids
      soft_mutrel = np.zeros(mrel.rels.shape)
    else:
      assert mrel.vids == vids
    soft_mutrel += weight * mrel.rels

  soft_mutrel = fix_rounding_errors(soft_mutrel)
  return mutrel.Mutrel(
    vids=vids,
    rels=soft_mutrel,
  )
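# A minimal usage sketch for the function above (hypothetical inputs; assumes
# `structs` holds one parent vector per posterior sample, `llhs` the matching
# log-likelihoods, and `clusterings` a possibly distinct variant-to-cluster
# assignment for each sample):
#
#   mrel = make_mutrel_from_trees_and_unique_clusterings(structs, llhs, clusterings)
#   mrel.rels[i,j]  # posterior distribution over the pairwise relation of
#                   # variants `mrel.vids[i]` and `mrel.vids[j]`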
def make_mutrel_from_trees_and_single_clustering(structs, llhs, counts, clustering):
  # Oftentimes, we will have many samples of the same adjacency matrix paired
  # with the same clustering. This will produce the same mutrel. As computing
  # the mutrel from adjm + clustering is expensive, we want to avoid repeating
  # this work unnecessarily. Instead, we just modify the weight associated
  # with the pairing to reflect this.
  #
  # Observe that if we have `C` copies of the LLH `W`, we obtain equivalent
  # post-softmax linear-space weights under either of the following two
  # methods:
  #
  # 1. (naive) Represent the associated samples `C` separate times in the softmax
  # 2. (smart) Set `W' = W + log(C)`, as `exp(W') = C*exp(W)`
  weights = util.softmax(llhs + np.log(counts))
  vids = None

  for struct, weight in zip(structs, weights):
    adjm = util.convert_parents_to_adjmatrix(struct)
    crel = make_clustrel_from_cluster_adj(adjm)
    if vids is None:
      vids = crel.vids
      soft_clustrel = np.zeros(crel.rels.shape)
    else:
      assert crel.vids == vids
    soft_clustrel += weight * crel.rels

  soft_clustrel = fix_rounding_errors(soft_clustrel)
  clustrel = mutrel.Mutrel(rels=soft_clustrel, vids=vids)
  mrel = make_mutrel_from_clustrel(clustrel, clustering)
  return mrel
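# A quick numerical check of the log-count trick described above. With three
# copies of LLH -2 and one copy of LLH -3, the naive and smart weights agree
# (illustrative only; values chosen arbitrarily):
#
#   naive = util.softmax(np.array([-2., -2., -2., -3.]))
#   smart = util.softmax(np.array([-2., -3.]) + np.log(np.array([3, 1])))
#   np.isclose(np.sum(naive[:3]), smart[0])  # True: both give ~0.891
#   np.isclose(naive[3], smart[1])           # True: both give ~0.109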
def _compute_cna_influence(struct, cna_events, ssm_segs, ssm_pops, ssm_phases, ssm_timing):
  assert len(ssm_segs) == len(ssm_pops) == len(ssm_phases) == len(ssm_timing)
  M = len(ssm_segs)
  C = len(cna_events)

  # `cna_influence` is an `MxC` matrix, where `cna_influence[i,j] = 1` iff SSM
  # `i` is influenced by CNA `j`. That is, SSM `i` occurred in the same phase
  # on the same segment as CNA `j`, either in a population ancestral to the
  # one where `j` occurred, or in the same population as `j` with timing such
  # that `i` came before (not after) `j`.
  infl = np.zeros((M, C), dtype=np.int8)
  adjm = util.convert_parents_to_adjmatrix(struct)
  anc = util.make_ancestral_from_adj(adjm)
  np.fill_diagonal(anc, 0)

  for cna_idx, event in enumerate(cna_events):
    anc_pops = np.flatnonzero(anc[event.pop])
    assert event.pop not in anc_pops
    ancestral_ssm_mask = np.logical_and.reduce((
      np.isin(ssm_pops, anc_pops),
      ssm_segs == event.seg,
      ssm_phases == event.phase,
    ))
    before_cna_ssm_mask = np.logical_and(
      ssm_pops == event.pop,
      ssm_timing == TIMING_BEFORE,
    )
    ssm_mask = np.logical_or(ancestral_ssm_mask, before_cna_ssm_mask)
    infl[ssm_mask, cna_idx] = 1

  return infl
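# A small sketch of the same-population timing path above (hypothetical
# inputs; assumes the module's `Cna` namedtuple and `TIMING_BEFORE` constant).
# With sibling populations 1 and 2 and a single CNA in population 1, only the
# SSM in population 1 timed before the CNA is influenced:
#
#   struct = [0, 0]  # populations 1 and 2 are both children of the root
#   events = [Cna(1, 0, 0, 1)]  # pop=1, seg=0, phase=0, delta=+1
#   infl = _compute_cna_influence(
#     struct, events,
#     ssm_segs=np.array([0, 0]),
#     ssm_pops=np.array([1, 2]),
#     ssm_phases=np.array([0, 0]),
#     ssm_timing=np.array([TIMING_BEFORE, TIMING_BEFORE]),
#   )
#   # infl == [[1], [0]]: population 2 is neither the CNA's population nor
#   # related to it by ancestry, so its SSM is unaffected.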
def calc_cadi(eta, struct):
  '''
  Compute the clone and ancestor diversity index (CADI), which is the joint
  entropy of eta and the set of subclones ancestral to each clone.

  >>> eta = np.array([[0.5], [0.2], [0.2], [0.1]])
  >>> struct = [0, 1, 1]
  >>> cadi = calc_cadi(eta, struct)
  >>> np.isclose(cadi[0], 2.1219280948873624)
  True
  '''
  K, S = eta.shape
  adj = util.convert_parents_to_adjmatrix(struct)
  anc = util.make_ancestral_from_adj(adj, check_validity=True)
  assert anc.shape == (K, K)

  # `A[k]` = number of nodes ancestral to clone `k`, including `k` itself but
  # excluding the root.
  A = np.sum(anc, axis=0) - 1
  A = np.repeat(A[1:][:, np.newaxis], S, axis=1)
  assert np.all(A >= 1)

  eta = _fix_eta(eta)
  assert A.shape == eta.shape

  H_joint = -ma.sum(eta * (ma.log2(eta) - np.log2(A)), axis=0)
  assert H_joint.shape == (S,)
  return H_joint
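# Working through the doctest above: for `struct = [0, 1, 1]`, the ancestor
# counts (self-inclusive, root excluded) are A = [1, 2, 2], and -- assuming
# `_fix_eta` drops the root's eta and renormalizes, which is consistent with
# the doctest value -- eta becomes [0.4, 0.4, 0.2]. Then
#
#   H = -(0.4*log2(0.4/1) + 0.4*log2(0.4/2) + 0.2*log2(0.2/2))
#     ~= 0.5288 + 0.9288 + 0.6644
#     ~= 2.1219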
def calc_cmdi(eta, clusters, struct):
  '''Compute the clone and mutation diversity index (CMDI), which is the joint
  entropy of eta and the mutations present in a clone (i.e., the mutations
  specific to it, as well as the mutations inherited from its ancestors).'''
  K, S = eta.shape
  adj = util.convert_parents_to_adjmatrix(struct)
  anc = util.make_ancestral_from_adj(adj, check_validity=True)
  assert anc.shape == (K, K)

  vids, mutmem = util.make_membership_mat(clusters)
  M = len(vids)
  # Root node has no associated mutations.
  mutmem = np.insert(mutmem, 0, 0, axis=1)
  assert mutmem.shape == (M, K)
  assert np.sum(mutmem) == M

  # `mutanc[i,j] = 1` iff mutation `i` occurred in node `j` or a node
  # ancestral to it.
  mutanc = np.dot(mutmem, anc)
  # `mutanc_cnt[i]` = number of mutations that occurred in clone `i` and all
  # clones ancestral to it.
  mutanc_cnt = np.sum(mutanc, axis=0)
  assert mutanc_cnt[0] == 0 and np.all(mutanc_cnt[1:] > 0)

  M_k = np.repeat(mutanc_cnt[1:][:, np.newaxis], S, axis=1)
  eta = _fix_eta(eta)
  assert eta.shape == M_k.shape

  H_joint = -ma.sum(eta * (ma.log2(eta) - np.log2(M_k)), axis=0)
  assert H_joint.shape == (S,)
  return H_joint
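# An illustrative call (hypothetical variant IDs; same tree and eta as the
# `calc_cadi` doctest, and assuming `clusters[k-1]` lists the variants of
# clone `k`). Clones 1-3 carry 2, 1, and 3 variants respectively, so the
# ancestor-inclusive mutation counts are M_k = [2, 3, 5]:
#
#   eta = np.array([[0.5], [0.2], [0.2], [0.1]])
#   clusters = [['s0', 's1'], ['s2'], ['s3', 's4', 's5']]
#   cmdi = calc_cmdi(eta, clusters, struct=[0, 1, 1])
#   # Under the same `_fix_eta` assumption as in `calc_cadi`, cmdi[0] is
#   # approximately 3.0203.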
def _calc_num_pops(parents):
  # Calculate the number of populations in each subclone.
  adj = util.convert_parents_to_adjmatrix(parents)
  K = len(adj)
  assert adj.shape == (K, K)
  anc = util.make_ancestral_from_adj(adj)
  C = np.sum(anc, axis=1)
  assert C[0] == K
  return C[1:].astype(int)
def compute_parent_dist(structs, weights):
  K = len(structs[0]) + 1
  parent_dist = np.zeros((K, K))
  assert np.all(weights >= 0) and np.isclose(1, np.sum(weights))

  for struct, weight in zip(structs, weights):
    adjm = util.convert_parents_to_adjmatrix(struct)
    np.fill_diagonal(adjm, 0)
    parent_dist += weight * adjm

  assert np.all(0 == parent_dist[:, 0])
  parent_dist = parent_dist[:, 1:]
  parent_dist = evalutil.fix_rounding_errors(parent_dist)
  assert np.all(0 <= parent_dist) and np.all(parent_dist <= 1)
  assert np.allclose(1, np.sum(parent_dist, axis=0))
  return parent_dist
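# A worked example for the function above: two posterior samples over
# non-root nodes {1, 2}, weighted 0.75 and 0.25. Column `j` is the parent
# distribution of node `j+1`:
#
#   structs = [np.array([0, 1]), np.array([0, 0])]
#   weights = np.array([0.75, 0.25])
#   compute_parent_dist(structs, weights)
#   # array([[1.  , 0.25],
#   #        [0.  , 0.75],
#   #        [0.  , 0.  ]])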
def main():
  parser = argparse.ArgumentParser(
    description='LOL HI THERE',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('params_fn')
  parser.add_argument('pickle_fn')
  args = parser.parse_args()

  params = inputparser.load_params(args.params_fn)
  adjm = util.convert_parents_to_adjmatrix(params['structure'])

  with open(args.pickle_fn, 'wb') as outf:
    pickle.dump({
      'adjm': adjm,
      'clusters': params['clusters'],
      'vids_good': [V for C in params['clusters'] for V in C],
      'vids_garbage': params['garbage'],
    }, outf)
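# Example invocation (hypothetical filenames; the params JSON is expected to
# carry 'structure', 'clusters', and 'garbage' keys, and the output pickle
# holds the adjacency matrix plus variant lists):
#
#   python <this_script> params.json tree.pickle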
def generate_tree(K, S, alpha, tree_type, eta_min=1e-30):
  parents = make_parents(K, tree_type)
  #leaves = np.flatnonzero(np.sum(adjm, axis=1) == 0)
  adjm = util.convert_parents_to_adjmatrix(parents)
  Z = util.make_ancestral_from_adj(adjm) # (K+1)x(K+1)
  eta = np.random.dirichlet(alpha = (K+1)*[alpha], size=S).T # (K+1)xS

  # In general, we want etas on leaves to be more "peaked" -- that is, only a
  # few subclones come to dominate, so they should have large etas relative to
  # internal nodes. We accomplish this by using a smaller alpha for these.
  #eta[leaves] += np.random.dirichlet(alpha = len(leaves)*[1e0], size = S).T

  # Given the true phis, we want enumeration to be able to recover the true
  # tree (as well as other trees, potentially). For this to work, there needs
  # to be a well-defined ordering based on phis, which means that we can't
  # have `eta = 0` exactly. Without this minimum eta, especially given only
  # one sample, we can end up with two populations that have exactly the same
  # phi, which means their ordering is arbitrary.
  eta = np.maximum(eta_min, eta)
  eta /= np.sum(eta, axis=0)

  phi = np.dot(Z, eta) # (K+1)xS
  assert np.allclose(1, phi[0])
  return (parents, phi, eta)
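# Illustrative usage (the valid `tree_type` values are whatever `make_parents`
# accepts; 'random' here is a placeholder):
#
#   parents, phi, eta = generate_tree(K=5, S=3, alpha=0.1, tree_type='random')
#   phi.shape  # (6, 3): root plus K=5 subclones, one column per sample
#   np.allclose(1, phi[0])  # True: the root's phi is 1 in every sample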
def _make_noderels(struct):
  adjm = util.convert_parents_to_adjmatrix(struct)
  rels = util.compute_node_relations(adjm)
  return rels
def _generate_cna_events(K, H, C, ploidy, struct):
  assert len(struct) == K
  adjm = util.convert_parents_to_adjmatrix(struct)
  anc = util.make_ancestral_from_adj(adjm)

  cn_seg_probs = np.random.dirichlet(alpha = H*[5])
  cn_phase_probs = np.random.dirichlet(alpha = ploidy*[5])
  cn_pop_probs = np.random.dirichlet(alpha = K*[5])
  # Directions: 0=deletion, 1=gain
  direction_probs = np.random.dirichlet(alpha = 2*[5])

  lam = 1.5
  attempts = 0
  max_attempts = 5000*C
  events = []
  triplets = set()
  directions = {}
  deletions = {}

  while len(events) < C:
    attempts += 1
    if attempts > max_attempts:
      raise TooManyAttemptsError('Could not generate configuration without duplicates in %s attempts' % max_attempts)

    cn_seg = np.random.choice(H, p=cn_seg_probs)
    cn_phase = np.random.choice(ploidy, p=cn_phase_probs)
    # Add one so that no CNAs are assigned to the root.
    cn_pop = np.random.choice(K, p=cn_pop_probs) + 1

    triplet = (cn_seg, cn_phase, cn_pop)
    doublet = (cn_seg, cn_phase)
    if triplet in triplets:
      continue

    if doublet in directions:
      direction = directions[doublet]
    else:
      direction = np.random.choice(2, p=direction_probs)

    if direction == DIRECTION_GAIN:
      delta = np.ceil(np.random.exponential(scale=1/lam)).astype(int)
      assert delta >= 1
    else:
      # We only ever have one allele to lose, so we can never lose more than one.
      delta = -1
      if doublet in deletions:
        same_branch_nodes = set(np.flatnonzero(anc[cn_pop])) | set(np.flatnonzero(anc[:,cn_pop]))
        same_branch_deletions = deletions[doublet] & same_branch_nodes
        if len(same_branch_deletions) > 0:
          continue
      else:
        deletions[doublet] = set()
      deletions[doublet].add(cn_pop)

    triplets.add(triplet)
    if doublet not in directions:
      directions[doublet] = direction
    events.append(Cna(cn_pop, cn_seg, cn_phase, delta))

  return events
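# Illustrative call (hypothetical sizes; `DIRECTION_GAIN` and `Cna` are the
# module-level definitions assumed by the function):
#
#   events = _generate_cna_events(K=4, H=10, C=6, ploidy=2, struct=[0, 0, 1, 2])
#   # Each event is a Cna(pop, seg, phase, delta) with pop in 1..4, seg in
#   # 0..9, phase in {0, 1}, and delta >= 1 for gains or exactly -1 for
#   # deletions. Per (seg, phase) pair, all events share one direction, and
#   # deletions never recur on the same ancestral branch.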