def dup_loss_topology_prior(tree, stree, recon, birth, death, maxdoom=20,
                            events=None):
    """
    Return the log prior of a gene tree topology under the dup-loss model.

    tree    -- gene tree; implied speciation nodes are added to it while
               the prior is computed, and single-child nodes are removed
               again before returning
    stree   -- species tree
    recon   -- reconciliation mapping gene tree nodes to species tree nodes
    birth   -- birth rate handed to the birth-death helpers
    death   -- death rate handed to the birth-death helpers
    maxdoom -- upper index of the summation performed for every species
               branch (also passed to calc_doom_table)
    events  -- event label per gene tree node; computed with
               phylo.label_events when None
    """

    def gene2species(gene):
        # name of the node that recon assigns to the named gene tree node
        return recon[tree.nodes[gene]].name

    if events is None:
        events = phylo.label_events(tree, recon)

    # snapshot of the leaves taken before implied nodes are inserted
    leaves = set(tree.leaves())
    phylo.add_implied_spec_nodes(tree, stree, recon, events)

    pstree, snodes, snodelookup = spidir.make_ptree(stree)

    # get doomtable
    doomtable = calc_doom_table(stree, birth, death, maxdoom)

    logp = 0.0
    for node in tree:
        # only speciation nodes contribute a term
        if events[node] != "spec":
            continue

        for schild in recon[node].children:
            below = [child for child in node.children
                     if recon[child] == schild]

            if below:
                top = below[0]
                subleaves = get_sub_tree(top, schild, recon, events)
                nhist = birthdeath.num_topology_histories(top, subleaves)
                s = len(subleaves)
                thist = (stats.factorial(s) * stats.factorial(s - 1)
                         / 2 ** (s - 1))

                # "internal" subtree: none of its leaves is a leaf of the
                # gene tree as it was before implied nodes were added
                internal = not (set(subleaves) & leaves)
                logp += log(num_redundant_topology(
                    top, gene2species, subleaves, internal))
            else:
                # no gene lineage enters this species branch
                nhist = 1.0
                thist = 1.0
                s = 0

            # accumulate the terms for i = 0 .. maxdoom in order
            t = 0
            for i in range(maxdoom + 1):
                t += (stats.choose(s + i, i)
                      * birthdeath.prob_birth_death1(
                          s + i, schild.dist, birth, death)
                      * exp(doomtable[snodelookup[schild]]) ** i)

            logp += log(nhist) - log(thist) + log(t)

    # correct for renumbering
    nt = num_redundant_topology(tree.root, gene2species)
    logp -= log(nt)

    # strip the single-child nodes again before handing the tree back
    treelib.remove_single_children(tree)
    return logp
def count_dup_loss_coal_tree(coal_tree, extra, stree, gene2species,
                             implied=True, locus_mpr=True):
    """
    Count duplications, losses, coalescences and appearances.

    coal_tree    -- coalescent tree
    extra        -- dict providing "locus_tree" and "coal_recon"
    stree        -- species tree
    gene2species -- mapping function handed to phylo.reconcile
    implied      -- if True, implied speciation nodes are temporarily added
                    to the locus tree while coalescences are counted
    locus_mpr    -- must be True; the locus reconciliation is always
                    recomputed with phylo.reconcile

    Besides returning the totals, the per-branch coalescence count is added
    to the existing 'coal' entry in the ``data`` of the node that the locus
    reconciliation assigns to each locus branch.

    Returns the tuple (ndup, nloss, ncoal, nappear).
    Raises NotImplementedError if locus_mpr is False.
    """
    if not locus_mpr:
        # TODO: use locus_recon and locus_events rather than MPR
        # (currently, phylo.py reconciliation functions fail for non-MPR)
        raise NotImplementedError("not implemented")

    locus_tree = extra["locus_tree"]
    locus_recon = phylo.reconcile(locus_tree, stree, gene2species)
    locus_events = phylo.label_events(locus_tree, locus_recon)
    coal_recon = extra["coal_recon"]

    ndup, nloss, nappear = phylo.count_dup_loss_tree(
        locus_tree, stree, gene2species, locus_recon, locus_events)

    # add implied speciation nodes if desired
    # this must be added AFTER counting dups and losses since it affects
    # loss inference
    if implied:
        added = phylo.add_implied_spec_nodes(
            locus_tree, stree, locus_recon, locus_events)

    # count coals: each branch contributes (lineages at top - 1), floored
    # at zero
    ncoal = 0
    counts = coal.count_lineages_per_branch(coal_tree, coal_recon, locus_tree)
    for lnode, (_, count_top) in counts.items():
        n = max(count_top - 1, 0)
        locus_recon[lnode].data['coal'] += n
        ncoal += n

    # undo the temporary modification of the locus tree
    if implied:
        phylo.remove_implied_spec_nodes(
            locus_tree, added, locus_recon, locus_events)

    return ndup, nloss, ncoal, nappear
def eval_proposal(self, proposal):
    """
    Compute the cost of a proposal and record the breakdown on it.

    The cost is dupcost + losscost + coalcost, each being an event count
    multiplied by the matching weight on self (self.dupcost, self.losscost,
    self.coalcost).  A weight of 0 skips the corresponding count, which is
    then reported as None.  When phyloDLC.assert_daughters returns a false
    value for the proposal, all counts are None and every term is util.INF.

    Side effect: proposal.data is set to a dict with the keys "ndup",
    "nloss", "ncoal" and "cost".

    Returns the total cost.
    """
    # counts remain None unless they are actually computed below
    ndup = nloss = ncoal = None

    if not phyloDLC.assert_daughters(proposal.locus_events,
                                     proposal.daughters):
        # locus events (duplications) and daughters do not match
        dupcost = losscost = coalcost = util.INF
    else:
        locus_tree = proposal.locus_tree
        dupcost = losscost = coalcost = 0

        # find dup cost
        if self.dupcost != 0:
            ndup = phylo.count_dup(locus_tree, proposal.locus_events)
            dupcost = ndup * self.dupcost

        # find loss cost
        if self.losscost != 0:
            nloss = phylo.count_loss(locus_tree, self.stree,
                                     proposal.locus_recon)
            losscost = nloss * self.losscost

        # find coal cost (first ensure bounded coalescent is satisfied -
        # should always be true based on how daughters are proposed)
        phyloDLC.assert_bounded_coal(self.coal_tree, proposal.coal_recon,
                                     locus_tree, proposal.daughters)

        if self.coalcost != 0:
            use_implied = self.implied

            # implied speciation nodes must be added AFTER counting dups
            # and losses since they affect loss inference
            if use_implied:
                added = phylo.add_implied_spec_nodes(
                    locus_tree, self.stree,
                    proposal.locus_recon, proposal.locus_events)

            ncoal = phyloDLC.count_coal(self.coal_tree, proposal.coal_recon,
                                        locus_tree)
            coalcost = ncoal * self.coalcost

            # restore the locus tree to its previous shape
            if use_implied:
                phylo.remove_implied_spec_nodes(
                    locus_tree, added,
                    proposal.locus_recon, proposal.locus_events)

    # total cost
    cost = dupcost + losscost + coalcost

    # logging info
    proposal.data = {
        "ndup": ndup,
        "nloss": nloss,
        "ncoal": ncoal,
        "cost": cost,
    }
    return cost
def eval_proposal(self, proposal):
    """
    Compute the probability of a proposal.

    proposal is indexed by the keys "locus_tree", "locus_recon",
    "locus_events", "coal_recon" and "daughters".  Implied speciation nodes
    are added to the locus tree for the computation; afterwards
    single-child nodes are removed and the locus reconciliation is passed
    through phylo.subset_recon.

    Returns the value of prob_dlcoal_recon_topology for the proposal.
    """
    locus_tree = proposal["locus_tree"]
    locus_recon = proposal["locus_recon"]
    locus_events = proposal["locus_events"]

    # the implied nodes are added here, hence add_spec=False below
    phylo.add_implied_spec_nodes(locus_tree, self.stree,
                                 locus_recon, locus_events)

    # compute recon probability
    prob = prob_dlcoal_recon_topology(
        self.coal_tree, proposal["coal_recon"],
        locus_tree, locus_recon, locus_events,
        proposal["daughters"],
        self.stree, self.n, self.duprate, self.lossrate,
        self.pretime, self.premean,
        maxdoom=self.maxdoom, nsamples=self.nsamples,
        add_spec=False)

    # clean up the locus tree and its reconciliation
    treelib.remove_single_children(locus_tree)
    phylo.subset_recon(locus_tree, locus_recon)

    return prob
def add_implied_spec_nodes(tree, stree, recon, events):
    """
    Add speciation nodes to tree that are implied but are not present
    because of gene losses.

    Extends phylo.add_implied_spec_nodes to handle non-MPR.
    Only guaranteed to work for binary trees.

    Returns (added_spec, added_dup): the result of
    phylo.add_implied_spec_nodes and the list of everything returned by
    add_spec_from_dup_nodes.  Asserts that is_full_tree holds afterwards.
    """
    added_spec = phylo.add_implied_spec_nodes(tree, stree, recon, events)

    added_dup = []
    # iterate over a snapshot: nodes are inserted into the tree in the loop
    for node in list(tree):
        child_species = [recon[child] for child in node.children]

        # need at least two children ...
        if len(child_species) <= 1:
            continue
        # ... that all reconcile to the same species node ...
        if len(set(child_species)) != 1:
            continue
        # ... on a node that is not already labelled as a duplication
        if events[node] == "dup":
            continue

        added_dup.extend(add_spec_from_dup_nodes(node, tree, recon, events))

    assert is_full_tree(tree, stree, recon, events)
    return added_spec, added_dup
def count_dup_loss_coal(coal_tree, extra, stree, implied=True):
    """
    Return the sum of the duplication, loss and coalescence counts.

    coal_tree -- coalescent tree
    extra     -- dict providing "locus_tree", "locus_recon",
                 "locus_events" and "coal_recon"
    stree     -- species tree
    implied   -- if True, implied speciation nodes are temporarily added to
                 the locus tree while coalescences are counted
    """
    locus_tree = extra["locus_tree"]
    locus_recon = extra["locus_recon"]
    locus_events = extra["locus_events"]
    coal_recon = extra["coal_recon"]

    # dups and losses are counted on the unmodified locus tree
    dup_and_loss = (count_dup(locus_tree, locus_events)
                    + count_loss(locus_tree, stree, locus_recon))

    if implied:
        # implied speciation nodes must be added AFTER counting dups and
        # losses since they affect loss inference; they are removed again
        # once the coalescences have been counted
        added = phylo.add_implied_spec_nodes(
            locus_tree, stree, locus_recon, locus_events)
        ncoal = count_coal(coal_tree, coal_recon, locus_tree)
        phylo.remove_implied_spec_nodes(
            locus_tree, added, locus_recon, locus_events)
    else:
        ncoal = count_coal(coal_tree, coal_recon, locus_tree)

    return dup_and_loss + ncoal
def count_dup_loss_coal_tree(coal_tree, extra, stree, gene2species,
                             implied=True, locus_mpr=True):
    """
    Count duplications, losses, coalescences and appearances.

    coal_tree    -- coalescent tree
    extra        -- dict providing "locus_tree" and "coal_recon"
    stree        -- species tree
    gene2species -- mapping function handed to phylo.reconcile
    implied      -- if True, implied speciation nodes are temporarily added
                    to the locus tree while coalescences are counted
    locus_mpr    -- must be True; the locus reconciliation is always
                    recomputed with phylo.reconcile

    Besides returning the totals, the per-branch coalescence count is added
    to the existing 'coal' entry in the ``data`` of the node that the locus
    reconciliation assigns to each locus branch.

    Returns the tuple (ndup, nloss, ncoal, nappear).
    Raises NotImplementedError if locus_mpr is False.
    """
    if not locus_mpr:
        # TODO: use locus_recon and locus_events rather than MPR
        # (currently, phylo.py reconciliation functions fail for non-MPR)
        raise NotImplementedError("not implemented")

    locus_tree = extra["locus_tree"]
    locus_recon = phylo.reconcile(locus_tree, stree, gene2species)
    locus_events = phylo.label_events(locus_tree, locus_recon)
    coal_recon = extra["coal_recon"]

    ndup, nloss, nappear = phylo.count_dup_loss_tree(
        locus_tree, stree, gene2species, locus_recon, locus_events)

    # add implied speciation nodes if desired
    # this must be added AFTER counting dups and losses since it affects
    # loss inference
    if implied:
        added = phylo.add_implied_spec_nodes(
            locus_tree, stree, locus_recon, locus_events)

    # count coals: each branch contributes (lineages at top - 1), floored
    # at zero
    ncoal = 0
    counts = coal.count_lineages_per_branch(coal_tree, coal_recon, locus_tree)
    for lnode, (_, count_top) in counts.items():
        n = max(count_top - 1, 0)
        locus_recon[lnode].data['coal'] += n
        ncoal += n

    # undo the temporary modification of the locus tree
    if implied:
        phylo.remove_implied_spec_nodes(
            locus_tree, added, locus_recon, locus_events)

    return ndup, nloss, ncoal, nappear
def add_implied_spec_nodes(tree, stree, recon, events):
    """
    Add speciation nodes to tree that are implied but are not present
    because of gene losses.

    Extends phylo.add_implied_spec_nodes to handle non-MPR.
    Only guaranteed to work for binary trees.

    Returns (added_spec, added_dup): the result of
    phylo.add_implied_spec_nodes and the list of everything returned by
    add_spec_from_dup_nodes.  Asserts that is_full_tree holds afterwards.
    """
    added_spec = phylo.add_implied_spec_nodes(tree, stree, recon, events)

    added_dup = []
    # iterate over a snapshot: nodes are inserted into the tree in the loop
    for node in list(tree):
        child_species = [recon[child] for child in node.children]

        # need at least two children ...
        if len(child_species) <= 1:
            continue
        # ... that all reconcile to the same species node ...
        if len(set(child_species)) != 1:
            continue
        # ... on a node that is not already labelled as a duplication
        if events[node] == "dup":
            continue

        added_dup.extend(add_spec_from_dup_nodes(node, tree, recon, events))

    assert is_full_tree(tree, stree, recon, events)
    return added_spec, added_dup
def eval_proposal(self, proposal):
    """
    Compute the cost of a proposal and record the breakdown on it.

    The cost is dupcost + losscost + coalcost, each being an event count
    multiplied by the matching weight on self (self.dupcost, self.losscost,
    self.coalcost).  A weight of 0 skips the corresponding count, which is
    then reported as None.  When phyloDLC.assert_daughters returns a false
    value for the proposal, all counts are None and every term is util.INF.

    Side effect: proposal.data is set to a dict with the keys "ndup",
    "nloss", "ncoal" and "cost".

    Returns the total cost.
    """
    # counts remain None unless they are actually computed below
    ndup = nloss = ncoal = None

    if not phyloDLC.assert_daughters(proposal.locus_events,
                                     proposal.daughters):
        # locus events (duplications) and daughters do not match
        dupcost = losscost = coalcost = util.INF
    else:
        locus_tree = proposal.locus_tree
        dupcost = losscost = coalcost = 0

        # find dup cost
        if self.dupcost != 0:
            ndup = phylo.count_dup(locus_tree, proposal.locus_events)
            dupcost = ndup * self.dupcost

        # find loss cost
        if self.losscost != 0:
            nloss = phylo.count_loss(locus_tree, self.stree,
                                     proposal.locus_recon)
            losscost = nloss * self.losscost

        # find coal cost (first ensure bounded coalescent is satisfied -
        # should always be true based on how daughters are proposed)
        phyloDLC.assert_bounded_coal(self.coal_tree, proposal.coal_recon,
                                     locus_tree, proposal.daughters)

        if self.coalcost != 0:
            use_implied = self.implied

            # implied speciation nodes must be added AFTER counting dups
            # and losses since they affect loss inference
            if use_implied:
                added = phylo.add_implied_spec_nodes(
                    locus_tree, self.stree,
                    proposal.locus_recon, proposal.locus_events)

            ncoal = phyloDLC.count_coal(self.coal_tree, proposal.coal_recon,
                                        locus_tree)
            coalcost = ncoal * self.coalcost

            # restore the locus tree to its previous shape
            if use_implied:
                phylo.remove_implied_spec_nodes(
                    locus_tree, added,
                    proposal.locus_recon, proposal.locus_events)

    # total cost
    cost = dupcost + losscost + coalcost

    # logging info
    proposal.data = {
        "ndup": ndup,
        "nloss": nloss,
        "ncoal": ncoal,
        "cost": cost,
    }
    return cost
def dlcoal_recon_old(tree, stree, gene2species, n, duprate, lossrate,
                     pretime=None, premean=None, nsearch=1000,
                     maxdoom=20, nsamples=100, search=phylo.TreeSearchNni):
    """
    Perform reconciliation using the DLCoal model.

    Runs nsearch iterations.  Each one scores a copy of the current locus
    tree with prob_dlcoal_recon_topology and then asks the search object
    for a new proposal.  A candidate is kept only when it scores strictly
    higher than the best seen so far.

    Returns (maxp, maxrecon) where 'maxp' is the best score found and
    'maxrecon' is the matching reconciliation (None if no candidate was
    ever accepted), laid out as

    maxrecon = {'coal_recon': coal_recon,
                'locus_tree': locus_tree,
                'locus_recon': locus_recon,
                'locus_events': locus_events,
                'daughters': daughters}
    """

    def same_name(name):
        # coal tree and locus tree share their leaf names
        return name

    # init coal tree
    coal_tree = tree

    # init locus tree as congruent to coal tree
    # equivalent to assuming no ILS
    locus_tree = coal_tree.copy()

    best_p = -util.INF
    best_recon = None

    # init search
    locus_search = search(locus_tree)

    for _ in xrange(nsearch):
        # TODO: propose other reconciliations beside LCA
        candidate = locus_tree.copy()
        phylo.recon_root(candidate, stree, gene2species, newCopy=False)
        locus_recon = phylo.reconcile(candidate, stree, gene2species)
        locus_events = phylo.label_events(candidate, locus_recon)

        # propose daughters (TODO)
        daughters = set()

        # propose coal recon (TODO: propose others beside LCA)
        coal_recon = phylo.reconcile(coal_tree, candidate, same_name)

        # compute recon probability; implied nodes are added here, hence
        # add_spec=False in the call
        phylo.add_implied_spec_nodes(candidate, stree,
                                     locus_recon, locus_events)
        p = prob_dlcoal_recon_topology(
            coal_tree, coal_recon,
            candidate, locus_recon, locus_events, daughters,
            stree, n, duprate, lossrate,
            pretime, premean,
            maxdoom=maxdoom, nsamples=nsamples,
            add_spec=False)
        treelib.remove_single_children(candidate)

        if p > best_p:
            # accept: remember the candidate and continue the search from it
            best_p = p
            best_recon = {"coal_recon": coal_recon,
                          "locus_tree": candidate,
                          "locus_recon": locus_recon,
                          "locus_events": locus_events,
                          "daughters": daughters}
            locus_tree = candidate.copy()
            locus_search.set_tree(locus_tree)
        else:
            # reject: undo the last rearrangement
            locus_search.revert()

        # perform local rearrangement to locus tree
        locus_search.propose()

    return best_p, best_recon
def prob_dlcoal_recon_topology(coal_tree, coal_recon,
                               locus_tree, locus_recon, locus_events,
                               daughters,
                               stree, n, duprate, lossrate,
                               pretime=None, premean=None,
                               maxdoom=20, nsamples=100,
                               add_spec=True):
    """
    Probability of a reconciled gene tree in the DLCoal model.

    coal_tree    -- coalescent tree
    coal_recon   -- reconciliation of coalescent tree to locus tree
    locus_tree   -- locus tree (has dup-loss)
    locus_recon  -- reconciliation of locus tree to species tree
    locus_events -- events dict for locus tree
    daughters    -- passed through to prob_coal_recon_topology
    stree        -- species tree
    n            -- population sizes in species tree
    duprate      -- duplication rate
    lossrate     -- loss rate
    maxdoom      -- forwarded to spidir.calc_birth_death_prior
    nsamples     -- number of sampled sets of duplication times
    add_spec     -- if True, implied speciation nodes are added to the
                    locus tree here; pass False when they are already there

    You must also specify one of the following
    pretime -- starting time before species tree
    premean -- mean starting time before species tree

    The result is the sum of the dup-loss term, dups * log(.5), and
    util.safelog of the average of exp(coal_prob) over the samples.
    The branch lengths of locus_tree are overwritten for every sample.
    """
    # count duplications before any implied nodes are added
    dups = phylo.count_dup(locus_tree, locus_events)

    # ensure implicit speciations are present
    if add_spec:
        phylo.add_implied_spec_nodes(locus_tree, stree,
                                     locus_recon, locus_events)

    # init popsizes for locus tree: every locus node takes the value stored
    # for the species node it reconciles to
    stree_popsizes = coal.init_popsizes(stree, n)
    popsizes = dict((node.name, stree_popsizes[locus_recon[node].name])
                    for node in locus_tree)

    # duploss probability
    util.tic("top")
    dl_prob = spidir.calc_birth_death_prior(locus_tree, stree, locus_recon,
                                            duprate, lossrate,
                                            maxdoom=maxdoom)
    util.toc()

    # daughters probability
    d_prob = dups * log(.5)

    # integrate over duplication times using sampling
    total = 0.0
    for _ in xrange(nsamples):
        # sample duplication times
        locus_times = spidir.topology_prior.sample_dup_times(
            locus_tree, stree, locus_recon, duprate, lossrate,
            pretime, premean, events=locus_events)

        # one timestamp is expected per locus tree node
        n_times = len(locus_times)
        n_nodes = len(locus_tree.nodes)
        assert n_times == n_nodes, (n_times, n_nodes)

        treelib.set_dists_from_timestamps(locus_tree, locus_times)

        # coal topology probability
        coal_prob = prob_coal_recon_topology(coal_tree, coal_recon,
                                             locus_tree, popsizes, daughters)
        total += exp(coal_prob)

        # NOTE(review): looks like leftover debug output, printed once per
        # sample - confirm whether it is still wanted
        print(coal_prob)

    return dl_prob + d_prob + util.safelog(total / nsamples)
def dup_loss_topology_prior(tree, stree, recon, birth, death, maxdoom=20,
                            events=None):
    """
    Return the log prior of a gene tree topology under the dup-loss model.

    tree    -- gene tree; implied speciation nodes are added to it while
               the prior is computed, and single-child nodes are removed
               again before returning
    stree   -- species tree
    recon   -- reconciliation mapping gene tree nodes to species tree nodes
    birth   -- birth rate handed to the birth-death helpers
    death   -- death rate handed to the birth-death helpers
    maxdoom -- upper index of the summation performed for every species
               branch (also passed to calc_doom_table)
    events  -- event label per gene tree node; computed with
               phylo.label_events when None
    """

    def gene2species(gene):
        # name of the node that recon assigns to the named gene tree node
        return recon[tree.nodes[gene]].name

    if events is None:
        events = phylo.label_events(tree, recon)

    # snapshot of the leaves taken before implied nodes are inserted
    leaves = set(tree.leaves())
    phylo.add_implied_spec_nodes(tree, stree, recon, events)

    pstree, snodes, snodelookup = spidir.make_ptree(stree)

    # get doomtable
    doomtable = calc_doom_table(stree, birth, death, maxdoom)

    logp = 0.0
    for node in tree:
        # only speciation nodes contribute a term
        if events[node] != "spec":
            continue

        for schild in recon[node].children:
            below = [child for child in node.children
                     if recon[child] == schild]

            if below:
                top = below[0]
                subleaves = get_sub_tree(top, schild, recon, events)
                nhist = birthdeath.num_topology_histories(top, subleaves)
                s = len(subleaves)
                thist = (stats.factorial(s) * stats.factorial(s - 1)
                         / 2 ** (s - 1))

                # "internal" subtree: none of its leaves is a leaf of the
                # gene tree as it was before implied nodes were added
                internal = not (set(subleaves) & leaves)
                logp += log(num_redundant_topology(
                    top, gene2species, subleaves, internal))
            else:
                # no gene lineage enters this species branch
                nhist = 1.0
                thist = 1.0
                s = 0

            # accumulate the terms for i = 0 .. maxdoom in order
            t = 0
            for i in range(maxdoom + 1):
                t += (stats.choose(s + i, i)
                      * birthdeath.prob_birth_death1(
                          s + i, schild.dist, birth, death)
                      * exp(doomtable[snodelookup[schild]]) ** i)

            logp += log(nhist) - log(thist) + log(t)

    # correct for renumbering
    nt = num_redundant_topology(tree.root, gene2species)
    logp -= log(nt)

    # strip the single-child nodes again before handing the tree back
    treelib.remove_single_children(tree)
    return logp