Beispiel #1
0
def make_mutrel_from_trees_and_unique_clusterings(structs, llhs, clusterings):
    '''
    Combine per-sample mutation relations into a single weighted mutrel, where
    every tree sample is paired with its own clustering.

    Relative to `make_mutrel_from_trees_and_single_clustering`, this function
    is slower and more memory intensive, but also more flexible. It differs in
    two respects:

    1. It doesn't assume that the user has already computed counts for all
    unique samples -- i.e., it allows duplicate samples.

    2. It allows unique clusterings for every sample.

    `structs`, `llhs`, and `clusterings` must all have the same length. The
    weight given to each sample is its entry in `util.softmax(llhs)`.
    '''
    assert len(structs) == len(llhs) == len(clusterings)
    sample_weights = util.softmax(llhs)
    ref_vids = None

    for parents, sample_clustering, w in zip(structs, clusterings, sample_weights):
        sample_rel = make_mutrel_from_cluster_adj(
            util.convert_parents_to_adjmatrix(parents),
            sample_clustering,
        )
        if ref_vids is not None:
            # Every sample must describe the same variants in the same order,
            # or the element-wise accumulation below would be meaningless.
            assert sample_rel.vids == ref_vids
        else:
            # The first sample fixes the variant IDs and the accumulator shape.
            ref_vids = sample_rel.vids
            accum = np.zeros(sample_rel.rels.shape)
        accum += w * sample_rel.rels

    accum = fix_rounding_errors(accum)
    return mutrel.Mutrel(vids=ref_vids, rels=accum)
Beispiel #2
0
def make_mutrel_from_trees_and_single_clustering(structs, llhs, counts,
                                                 clustering):
    '''
    Build a mutrel from tree samples that all share one clustering.

    The per-sample cluster relations are summed with weights
    `util.softmax(llhs + np.log(counts))`; the resulting soft cluster relation
    is then passed, together with `clustering`, to
    `make_mutrel_from_clustrel`.
    '''
    # Many samples often pair the same adjacency matrix with the same
    # clustering, and so yield the same mutrel. Because deriving a mutrel from
    # adjm + clustering is expensive, repeated samples are not recomputed;
    # their multiplicity is folded into the weight of the pairing instead.
    #
    # If the LLH `W` appears `C` times, the post-softmax linear-space weights
    # are the same under either of these approaches:
    #
    # 1. (naive) Feed the sample into the softmax `C` separate times
    # 2. (smart) Use `W' = W + log(C)`, since `exp(W') = Cexp(W)`
    log_counts = np.log(counts)
    sample_weights = util.softmax(llhs + log_counts)
    ref_vids = None

    for parents, w in zip(structs, sample_weights):
        sample_rel = make_clustrel_from_cluster_adj(
            util.convert_parents_to_adjmatrix(parents)
        )

        if ref_vids is not None:
            # All samples must agree on the IDs they relate.
            assert sample_rel.vids == ref_vids
        else:
            # The first sample fixes the IDs and the accumulator shape.
            ref_vids = sample_rel.vids
            accum = np.zeros(sample_rel.rels.shape)
        accum += w * sample_rel.rels

    accum = fix_rounding_errors(accum)
    soft_clustrel = mutrel.Mutrel(rels=accum, vids=ref_vids)
    return make_mutrel_from_clustrel(soft_clustrel, clustering)
Beispiel #3
0
def _compute_cna_influence(struct, cna_events, ssm_segs, ssm_pops, ssm_phases, ssm_timing):
  '''
  Build the SSM-by-CNA influence matrix.

  Returns an `MxC` int8 matrix `infl`, where `M = len(ssm_segs)` and
  `C = len(cna_events)`, with `infl[i,j] = 1` iff SSM `i` is influenced by
  CNA `j`. SSM `i` is influenced by CNA `j` when both lie on the same segment
  in the same phase, and either

  1. `i` occurred in a population ancestral to the one where `j` occurred, or
  2. `i` occurred in the same population as `j`, with timing such that `i`
     was before (not after) `j`.

  `ssm_segs`, `ssm_pops`, `ssm_phases`, and `ssm_timing` are parallel arrays
  with one entry per SSM; they are compared element-wise against scalar event
  attributes, so they are expected to be NumPy arrays. Each element of
  `cna_events` must expose `pop`, `seg`, and `phase` attributes.
  '''
  assert len(ssm_segs) == len(ssm_pops) == len(ssm_phases) == len(ssm_timing)
  M = len(ssm_segs)
  C = len(cna_events)

  infl = np.zeros((M, C), dtype=np.int8)
  adjm = util.convert_parents_to_adjmatrix(struct)
  anc = util.make_ancestral_from_adj(adjm)
  # A population must not be counted as related to itself here; the
  # same-population case is handled separately via the timing mask.
  np.fill_diagonal(anc, 0)

  for cna_idx, event in enumerate(cna_events):
    # NOTE(review): this reads row `event.pop` of `anc`. Confirm that, under
    # the convention of `util.make_ancestral_from_adj`, this row lists the
    # populations ancestral to `event.pop` (rather than its descendants).
    anc_pops = np.flatnonzero(anc[event.pop])
    assert event.pop not in anc_pops

    # A CNA can only influence SSMs on the same segment and phase. This
    # condition applies to BOTH the ancestral case and the same-population
    # case; previously the same-population mask omitted it, so every
    # earlier-timed SSM in the CNA's population was marked as influenced,
    # regardless of segment and phase.
    same_allele_mask = np.logical_and(
      ssm_segs == event.seg,
      ssm_phases == event.phase,
    )
    ancestral_ssm_mask = np.logical_and(
      np.isin(ssm_pops, anc_pops),
      same_allele_mask,
    )
    before_cna_ssm_mask = np.logical_and.reduce((
      ssm_pops == event.pop,
      ssm_timing == TIMING_BEFORE,
      same_allele_mask,
    ))
    ssm_mask = np.logical_or(ancestral_ssm_mask, before_cna_ssm_mask)
    infl[ssm_mask, cna_idx] = 1

  return infl
Beispiel #4
0
def calc_cadi(eta, struct):
    '''
    Compute the clone and ancestor diversity index (CADI), which is the joint
    entropy of eta and the subclones ancestral to a clone.

    `eta` is a `KxS` array and `struct` is the parents vector of the tree. The
    result has one entry per column of `eta` (shape `(S,)`).

    >>> eta = np.array([[0.5], [0.2], [0.2], [0.1]])
    >>> struct = [0, 1, 1]
    >>> cadi = calc_cadi(eta, struct)
    >>> np.isclose(cadi[0], 2.1219280948873624)
    True
    '''
    num_nodes, num_samples = eta.shape

    ancestry = util.make_ancestral_from_adj(
        util.convert_parents_to_adjmatrix(struct),
        check_validity=True,
    )
    assert ancestry.shape == (num_nodes, num_nodes)

    # Column sums of the ancestral matrix, minus one per column. The first
    # (root) entry is dropped, and every remaining count must be at least 1.
    anc_counts = np.sum(ancestry, axis=0) - 1
    nonroot_counts = anc_counts[1:][:, np.newaxis]
    # Replicate the per-node counts across all samples so the array lines up
    # with the eta returned by `_fix_eta`.
    A = np.repeat(nonroot_counts, num_samples, axis=1)
    assert np.all(A >= 1)

    eta = _fix_eta(eta)
    assert A.shape == eta.shape

    log_ratio = ma.log2(eta) - np.log2(A)
    joint_entropy = -ma.sum(eta * log_ratio, axis=0)
    assert joint_entropy.shape == (num_samples, )
    return joint_entropy
Beispiel #5
0
def calc_cmdi(eta, clusters, struct):
    '''
    Compute the clone and mutation diversity index (CMDI), which is the joint
    entropy of eta and the mutations present in a clone (i.e., the mutations
    specific to it as well as the mutations inherited from its ancestors).

    `eta` is a `KxS` array, `clusters` is passed to `util.make_membership_mat`,
    and `struct` is the parents vector of the tree. The result has one entry
    per column of `eta` (shape `(S,)`).
    '''
    num_nodes, num_samples = eta.shape

    ancestry = util.make_ancestral_from_adj(
        util.convert_parents_to_adjmatrix(struct),
        check_validity=True,
    )
    assert ancestry.shape == (num_nodes, num_nodes)

    vids, membership = util.make_membership_mat(clusters)
    num_muts = len(vids)
    # The root node has no associated mutations, so prepend an all-zero column
    # for it.
    membership = np.insert(membership, 0, 0, axis=1)
    assert membership.shape == (num_muts, num_nodes)
    assert np.sum(membership) == num_muts

    # `mut_in_clone[i,j] = 1` iff mutation `i` occurred in node `j` or a node
    # ancestral to it.
    mut_in_clone = np.dot(membership, ancestry)
    # `muts_per_clone[i]` = number of mutations that occurred in clone `i` and
    # all clones ancestral to it.
    muts_per_clone = np.sum(mut_in_clone, axis=0)
    assert muts_per_clone[0] == 0 and np.all(muts_per_clone[1:] > 0)

    # Drop the root and replicate the counts across samples so the array lines
    # up with the eta returned by `_fix_eta`.
    nonroot_counts = muts_per_clone[1:][:, np.newaxis]
    M_k = np.repeat(nonroot_counts, num_samples, axis=1)
    eta = _fix_eta(eta)
    assert eta.shape == M_k.shape

    log_ratio = ma.log2(eta) - np.log2(M_k)
    joint_entropy = -ma.sum(eta * log_ratio, axis=0)
    assert joint_entropy.shape == (num_samples, )
    return joint_entropy
Beispiel #6
0
def _calc_num_pops(parents):
    '''
    Calculate number of populations in each subclone.

    Converts `parents` to an adjacency matrix, derives the ancestral matrix
    from it, and sums each row. The first row must sum to `K` (the number of
    nodes); it is dropped from the result. The remaining row sums are returned
    as an integer array.
    '''
    adj = util.convert_parents_to_adjmatrix(parents)
    K = len(adj)
    assert adj.shape == (K, K)
    anc = util.make_ancestral_from_adj(adj)
    C = np.sum(anc, axis=1)
    assert C[0] == K
    # `np.int` was a deprecated alias of the builtin `int` and has been removed
    # from NumPy (1.24+); the builtin yields the same dtype.
    return C[1:].astype(int)
Beispiel #7
0
def compute_parent_dist(structs, weights):
    '''
    Compute the weighted sum of the adjacency matrices of `structs`.

    Each adjacency matrix has its diagonal zeroed before being scaled by the
    matching entry of `weights`, which must be non-negative and sum to one.
    The first column of the sum must be all zero and is dropped, so the result
    is a `K x (K-1)` array with `K = len(structs[0]) + 1`. Every remaining
    column must sum to one.
    '''
    num_nodes = len(structs[0]) + 1
    accum = np.zeros((num_nodes, num_nodes))
    assert np.all(weights >= 0) and np.isclose(1, np.sum(weights))

    for parents, w in zip(structs, weights):
        adjacency = util.convert_parents_to_adjmatrix(parents)
        # Strip self-edges so that only parent -> child entries contribute.
        np.fill_diagonal(adjacency, 0)
        accum += w * adjacency

    # Nothing may point into node 0, so its column carries no information.
    assert np.all(accum[:, 0] == 0)
    dist = evalutil.fix_rounding_errors(accum[:, 1:])

    assert np.all(dist >= 0) and np.all(dist <= 1)
    assert np.allclose(1, np.sum(dist, axis=0))
    return dist
Beispiel #8
0
def main():
    '''
    Load a params file and write its tree and variant information to a pickle.

    Takes two positional command-line arguments: `params_fn` (read via
    `inputparser.load_params`) and `pickle_fn` (the output path, opened in
    binary write mode).
    '''
    arg_parser = argparse.ArgumentParser(
        description='LOL HI THERE',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    for positional in ('params_fn', 'pickle_fn'):
        arg_parser.add_argument(positional)
    cli = arg_parser.parse_args()

    params = inputparser.load_params(cli.params_fn)
    adjm = util.convert_parents_to_adjmatrix(params['structure'])

    with open(cli.pickle_fn, 'wb') as outf:
        payload = {
            'adjm': adjm,
            'clusters': params['clusters'],
            # Flatten the clusters into a single list of variant IDs.
            'vids_good': [vid for cluster in params['clusters'] for vid in cluster],
            'vids_garbage': params['garbage'],
        }
        pickle.dump(payload, outf)
Beispiel #9
0
def generate_tree(K, S, alpha, tree_type, eta_min=1e-30):
  '''
  Generate a random tree along with eta and phi values for it.

  The parents vector comes from `make_parents(K, tree_type)`. `eta` is drawn
  from a symmetric Dirichlet with concentration `alpha` over `K+1` nodes,
  independently for each of `S` samples, floored at `eta_min`, and
  renormalized so every sample sums to one. `phi` is the product of the
  ancestral matrix with `eta`.

  Returns the tuple `(parents, phi, eta)`.
  '''
  parents = make_parents(K, tree_type)
  ancestry = util.make_ancestral_from_adj(
    util.convert_parents_to_adjmatrix(parents)
  ) # (K+1)x(K+1)
  concentration = [alpha] * (K + 1)
  eta = np.random.dirichlet(alpha = concentration, size = S).T # (K+1)xS

  # Idea not currently in use: etas on leaves could be made more "peaked" --
  # i.e., only a few subclones come to dominate, so they have large etas
  # relative to internal nodes -- by drawing them with a smaller alpha:
  #leaves = np.flatnonzero(np.sum(adjm, axis=1) == 0)
  #eta[leaves] += np.random.dirichlet(alpha = len(leaves)*[1e0], size = S).T

  # Given the true phis, enumeration should be able to recover the true tree
  # (and potentially other trees as well). That requires a well-defined
  # ordering based on phis, so `eta = 0` exactly must be ruled out. Without
  # this floor on eta, especially with only one sample, two populations can
  # end up with exactly the same phi, making their ordering arbitrary.
  eta = np.maximum(eta_min, eta)
  eta = eta / np.sum(eta, axis=0)

  phi = np.dot(ancestry, eta) # (K+1)xS
  assert np.allclose(1, phi[0])
  return (parents, phi, eta)
Beispiel #10
0
def _make_noderels(struct):
  '''
  Return the node relations for the tree given by the parents vector `struct`,
  as computed by `util.compute_node_relations` on its adjacency matrix.
  '''
  return util.compute_node_relations(util.convert_parents_to_adjmatrix(struct))
Beispiel #11
0
def _generate_cna_events(K, H, C, ploidy, struct):
  '''
  Randomly generate `C` distinct CNA events on the tree `struct`.

  Parameters:
    K: number of non-root populations; must equal `len(struct)`.
    H: number of segments to choose from.
    C: number of CNA events to generate.
    ploidy: number of phases to choose from.
    struct: parents vector of the tree.

  Each event is a `Cna(pop, seg, phase, delta)`. Events are drawn by rejection
  sampling subject to these constraints:

  * No two events share the same (segment, phase, population) triplet.
  * All events on a given (segment, phase) pair have the same direction (gain
    or deletion), fixed by the first accepted event on that pair.
  * Gains have `delta >= 1`; deletions always have `delta = -1`.
  * Two deletions on the same (segment, phase) pair never occur on the same
    branch of the tree.

  Raises `TooManyAttemptsError` if more than `5000*C` draws are needed.
  '''
  assert len(struct) == K
  adjm = util.convert_parents_to_adjmatrix(struct)
  anc = util.make_ancestral_from_adj(adjm)

  cn_seg_probs = np.random.dirichlet(alpha = H*[5])
  cn_phase_probs = np.random.dirichlet(alpha = ploidy*[5])
  cn_pop_probs = np.random.dirichlet(alpha = K*[5])
  # Directions: 0=deletion, 1=gain
  direction_probs = np.random.dirichlet(alpha = 2*[5])
  # Rate of the exponential distribution from which gain sizes are drawn.
  lam = 1.5

  attempts = 0
  max_attempts = 5000*C

  events = []
  # (seg, phase, pop) triplets already used by an accepted event.
  triplets = set()
  # (seg, phase) -> direction shared by every event on that pair.
  directions = {}
  # (seg, phase) -> set of populations that carry a deletion on that pair.
  deletions = {}

  while len(events) < C:
    attempts += 1
    if attempts > max_attempts:
      raise TooManyAttemptsError('Could not generate configuration without duplicates in %s attempts' % max_attempts)

    cn_seg = np.random.choice(H, p=cn_seg_probs)
    cn_phase = np.random.choice(ploidy, p=cn_phase_probs)
    # Add one so that no CNAs are assigned to the root.
    cn_pop = np.random.choice(K, p=cn_pop_probs) + 1
    triplet = (cn_seg, cn_phase, cn_pop)
    doublet = (cn_seg, cn_phase)

    if triplet in triplets:
      continue

    if doublet in directions:
      direction = directions[doublet]
    else:
      direction = np.random.choice(2, p=direction_probs)

    if direction == DIRECTION_GAIN:
      # `np.int` was a deprecated alias of the builtin `int` and has been
      # removed from NumPy (1.24+); the builtin yields the same dtype.
      delta = np.ceil(np.random.exponential(scale=1/lam)).astype(int)
      assert delta >= 1
    else:
      # We only ever have one allele to lose, so can never lose more than one.
      delta = -1
      if doublet in deletions:
        # Reject the draw if the same allele was already deleted anywhere on
        # this population's branch: both row and column `cn_pop` of the
        # ancestral matrix are consulted, covering ancestors and descendants.
        same_branch_nodes = set(np.flatnonzero(anc[cn_pop])) | set(np.flatnonzero(anc[:,cn_pop]))
        same_branch_deletions = deletions[doublet] & same_branch_nodes
        if len(same_branch_deletions) > 0:
          continue
      else:
        deletions[doublet] = set()
      deletions[doublet].add(cn_pop)

    # The draw is accepted: record it so later draws honour the constraints.
    triplets.add(triplet)
    if doublet not in directions:
      directions[doublet] = direction
    events.append(Cna(cn_pop, cn_seg, cn_phase, delta))

  return events