def compute_cost(self, gtree):
    """Returns the weighted duplication-loss cost of gtree against self.stree"""

    # reconcile the gene tree to the species tree and label its events
    reconciliation = phylo.reconcile(gtree, self.stree, self.gene2species)
    event_labels = phylo.label_events(gtree, reconciliation)

    # gather only the enabled terms; a zero weight skips the count entirely
    terms = []
    if self.dupcost != 0:
        terms.append(phylo.count_dup(gtree, event_labels) * self.dupcost)
    if self.losscost != 0:
        terms.append(
            phylo.count_loss(gtree, self.stree, reconciliation)
            * self.losscost)

    # sum() starts from 0, so no enabled term yields a cost of 0
    return sum(terms)
def eval_proposal(self, proposal):
    """Compute cost of proposal

    The total cost is returned, and a dict with the keys "ndup", "nloss",
    "ncoal" and "cost" is stored in proposal.data.  A count is None when its
    weight is zero (the count is skipped) or when the daughters check fails.
    """

    daughters_ok = phyloDLC.assert_daughters(proposal.locus_events,
                                             proposal.daughters)

    if daughters_ok:
        ltree = proposal.locus_tree

        # duplication term
        ndup, dupcost = None, 0
        if self.dupcost != 0:
            ndup = phylo.count_dup(ltree, proposal.locus_events)
            dupcost = ndup * self.dupcost

        # loss term
        nloss, losscost = None, 0
        if self.losscost != 0:
            nloss = phylo.count_loss(ltree, self.stree, proposal.locus_recon)
            losscost = nloss * self.losscost

        # coalescence term: the bounded coalescent is checked first
        # (should always hold based on how daughters are proposed)
        phyloDLC.assert_bounded_coal(self.coal_tree, proposal.coal_recon,
                                     ltree, proposal.daughters)
        ncoal, coalcost = None, 0
        if self.coalcost != 0:
            # implied speciation nodes are added only for the coal count;
            # this must happen AFTER counting dups and losses since it
            # affects loss inference
            if self.implied:
                added = phylo.add_implied_spec_nodes(
                    ltree, self.stree,
                    proposal.locus_recon, proposal.locus_events)

            ncoal = phyloDLC.count_coal(self.coal_tree, proposal.coal_recon,
                                        ltree)
            coalcost = ncoal * self.coalcost

            # undo the temporary nodes added above
            if self.implied:
                phylo.remove_implied_spec_nodes(
                    ltree, added,
                    proposal.locus_recon, proposal.locus_events)
    else:
        # locus events (duplications) and daughters do not match
        ndup = nloss = ncoal = None
        dupcost = losscost = coalcost = util.INF

    # total cost
    cost = dupcost + losscost + coalcost

    # logging info
    proposal.data = {
        "ndup": ndup,
        "nloss": nloss,
        "ncoal": ncoal,
        "cost": cost,
    }
    return cost
def _compute_duplosscost(self, ltree):
    """Returns dup/loss cost from locus tree to species tree"""

    # with neither weight positive there is nothing to count, so the
    # reconciliation is skipped altogether
    if not (self.dupcost > 0 or self.losscost > 0):
        return 0

    reconciliation = phylo.reconcile(ltree, self.stree, self.gene2species)
    event_labels = phylo.label_events(ltree, reconciliation)

    # each term is included only when its weight is non-zero
    terms = []
    if self.dupcost != 0:
        terms.append(phylo.count_dup(ltree, event_labels) * self.dupcost)
    if self.losscost != 0:
        terms.append(
            phylo.count_loss(ltree, self.stree, reconciliation)
            * self.losscost)

    return sum(terms)
def _compute_coalcost(self, gtree, ltree):
    """Returns deep coalescent cost from coalescent tree (gene tree) to locus tree

    Note: uses Zhang (RECOMB 2000) result that C = L - 2*D
    """

    # a non-positive weight disables the term; skip the reconciliation
    if not self.coalcost > 0:
        return 0

    reconciliation = phylo.reconcile(gtree, ltree)
    event_labels = phylo.label_events(gtree, reconciliation)

    # L and D of the gene tree reconciled to the locus tree
    nloss = phylo.count_loss(gtree, ltree, reconciliation)
    ndup = phylo.count_dup(gtree, event_labels)

    return (nloss - 2 * ndup) * self.coalcost
def prescreen(self, tree):
    """Returns the weighted duplication + loss cost of tree against self.stree"""

    # the reconciliation is always computed, even if both weights are zero
    reconciliation = phylo.reconcile(tree, self.stree, self.gene2species)
    event_labels = phylo.label_events(tree, reconciliation)

    # a zero weight contributes 0 without counting anything
    dup_term = (0 if self.dupcost == 0
                else phylo.count_dup(tree, event_labels) * self.dupcost)
    loss_term = (0 if self.losscost == 0
                 else phylo.count_loss(tree, self.stree, reconciliation)
                 * self.losscost)

    return dup_term + loss_term
def eval_proposal(self, proposal):
    """Compute cost of proposal

    Returns the total (dup + loss + coal) cost and records the counts and
    the cost in proposal.data under "ndup", "nloss", "ncoal" and "cost".
    """

    # counts stay None unless they are actually computed below
    ndup = nloss = ncoal = None

    if not phyloDLC.assert_daughters(proposal.locus_events,
                                     proposal.daughters):
        # locus events (duplications) and daughters must match;
        # otherwise every term is infinite
        dupcost = losscost = coalcost = util.INF
    else:
        dupcost = losscost = coalcost = 0

        # find dup cost
        if self.dupcost != 0:
            ndup = phylo.count_dup(proposal.locus_tree,
                                   proposal.locus_events)
            dupcost = ndup * self.dupcost

        # find loss cost
        if self.losscost != 0:
            nloss = phylo.count_loss(proposal.locus_tree, self.stree,
                                     proposal.locus_recon)
            losscost = nloss * self.losscost

        # find coal cost (first ensure bounded coalescent is satisfied -
        # should always be true based on how daughters are proposed)
        phyloDLC.assert_bounded_coal(self.coal_tree, proposal.coal_recon,
                                     proposal.locus_tree, proposal.daughters)
        if self.coalcost != 0:
            # add implied speciation nodes if desired; this must be done
            # AFTER counting dups and losses since it affects loss inference
            if self.implied:
                added = phylo.add_implied_spec_nodes(
                    proposal.locus_tree, self.stree,
                    proposal.locus_recon, proposal.locus_events)

            ncoal = phyloDLC.count_coal(self.coal_tree, proposal.coal_recon,
                                        proposal.locus_tree)
            coalcost = ncoal * self.coalcost

            # take the temporary nodes back out again
            if self.implied:
                phylo.remove_implied_spec_nodes(
                    proposal.locus_tree, added,
                    proposal.locus_recon, proposal.locus_events)

    # total cost
    cost = dupcost + losscost + coalcost

    # logging info
    summary = {}
    for key, value in (("ndup", ndup), ("nloss", nloss),
                       ("ncoal", ncoal), ("cost", cost)):
        summary[key] = value
    proposal.data = summary

    return cost
def prob_dlcoal_recon_topology(coal_tree, coal_recon,
                               locus_tree, locus_recon, locus_events,
                               daughters,
                               stree, n, duprate, lossrate,
                               pretime=None, premean=None,
                               maxdoom=20, nsamples=100,
                               add_spec=True):
    """
    Probability of a reconcile gene tree in the DLCoal model.

    coal_tree    -- coalescent tree
    coal_recon   -- reconciliation of coalescent tree to locus tree
    locus_tree   -- locus tree (has dup-loss)
    locus_recon  -- reconciliation of locus tree to species tree
    locus_events -- events dict for locus tree
    daughters    -- daughters, passed on to prob_coal_recon_topology
    stree        -- species tree
    n            -- population sizes in species tree
    duprate      -- duplication rate
    lossrate     -- loss rate
    maxdoom      -- forwarded to spidir.calc_birth_death_prior
    nsamples     -- number of sampled sets of duplication times
    add_spec     -- if true, phylo.add_implied_spec_nodes is called on
                    locus_tree before anything else is computed from it

    You must also specify one of the following
    pretime      -- starting time before species tree
    premean      -- mean starting time before species tree

    Note: locus tree must have implied speciation nodes present
    (they are added here when add_spec is true).  The branch lengths of
    locus_tree are reset from every sampled set of times.

    Returns the dup-loss prior plus the daughters term plus
    util.safelog of the sample mean of exp(coal topology probability).
    """

    # duplications are counted before implied speciation nodes are added
    ndups = phylo.count_dup(locus_tree, locus_events)

    # ensure implicit speciations are present
    if add_spec:
        phylo.add_implied_spec_nodes(locus_tree, stree,
                                     locus_recon, locus_events)

    # init popsizes for locus tree: each locus node takes the population
    # size of the species node it reconciles to
    species_popsizes = coal.init_popsizes(stree, n)
    popsizes = dict(
        (lnode.name, species_popsizes[locus_recon[lnode].name])
        for lnode in locus_tree)

    # duploss probability (timed with util.tic/util.toc)
    util.tic("top")
    duploss_term = spidir.calc_birth_death_prior(
        locus_tree, stree, locus_recon,
        duprate, lossrate, maxdoom=maxdoom)
    util.toc()

    # daughters probability
    daughters_term = ndups * log(0.5)

    # integrate over duplication times using sampling
    total = 0.0
    for _ in xrange(nsamples):
        # sample duplication times
        sampled_times = spidir.topology_prior.sample_dup_times(
            locus_tree, stree, locus_recon, duprate, lossrate,
            pretime, premean,
            events=locus_events)
        n_times = len(sampled_times)
        n_nodes = len(locus_tree.nodes)
        assert n_times == n_nodes, (n_times, n_nodes)
        treelib.set_dists_from_timestamps(locus_tree, sampled_times)

        # coal topology probability
        sample_logp = prob_coal_recon_topology(
            coal_tree, coal_recon, locus_tree, popsizes, daughters)
        total += exp(sample_logp)

        # per-sample trace on stdout (same output under Python 2 and 3)
        print(sample_logp)

    return duploss_term + daughters_term + util.safelog(total / nsamples)
def prob_dlcoal_recon_topology(coal_tree, coal_recon,
                               locus_tree, locus_recon, locus_events,
                               daughters,
                               stree, n, duprate, lossrate,
                               pretime=None, premean=None,
                               nsamples=100,
                               add_spec=True, info=None):
    """
    Probability of a reconcile gene tree in the DLCoal model.

    coal_tree    -- coalescent tree
    coal_recon   -- reconciliation of coalescent tree to locus tree
    locus_tree   -- locus tree (has dup-loss)
    locus_recon  -- reconciliation of locus tree to species tree
    locus_events -- events dict for locus tree
    daughters    -- daughters, passed on to the sampling routine
    stree        -- species tree
    n            -- population sizes in species tree
    duprate      -- duplication rate
    lossrate     -- loss rate
    nsamples     -- number of samples; log(nsamples) is subtracted from
                    the sampled term
    add_spec     -- accepted but not used by this function
    info         -- optional mapping; if given, receives the keys
                    "duploss_prob", "daughters_prob", "coal_prob", "prob"

    You must also specify one of the following
    pretime      -- starting time before species tree
    premean      -- mean starting time before species tree
    """

    # init popsizes for locus tree: each locus node takes the population
    # size of the species node it reconciles to
    species_popsizes = coal.init_popsizes(stree, n)
    popsizes = dict(
        (lnode.name, species_popsizes[locus_recon[lnode].name])
        for lnode in locus_tree)

    # duploss probability
    duploss_term = duploss.prob_dup_loss(
        locus_tree, stree, locus_recon, locus_events,
        duprate, lossrate)

    # daughters probability
    ndups = phylo.count_dup(locus_tree, locus_events)
    daughters_term = ndups * log(0.5)

    # integrate over duplication times using sampling
    species_times = treelib.get_tree_timestamps(stree)
    sampled_term = prob_locus_coal_recon_topology_samples(
        coal_tree, coal_recon,
        locus_tree, locus_recon, locus_events, popsizes,
        stree, species_times,
        daughters, duprate, lossrate, nsamples,
        pretime, premean)

    # no logging requested: just return the combined value
    if info is None:
        return duploss_term + daughters_term + sampled_term - log(nsamples)

    # logging info (components are recorded before the total is computed)
    info["duploss_prob"] = duploss_term
    info["daughters_prob"] = daughters_term
    info["coal_prob"] = sampled_term
    combined = duploss_term + daughters_term + sampled_term - log(nsamples)
    info["prob"] = combined

    return combined
def prob_locus_gene_species_alignment_recon(alnfile, partfile, stree,
                                            popsizes, duprate, lossrate,
                                            subrate, beta,
                                            pretime, premean,
                                            coal_tree, coal_recon,
                                            nsamples_coal,
                                            locus_tree, locus_recon,
                                            nsamples_locus,
                                            daughters,
                                            rates, freqs, alphas,
                                            threads=1, seed=ALIGNMENT_SEED,
                                            eps=0.1, info=None):
    """
    (Log) probability of the joint probability of locus_tree, locus_recon,
    coal_tree, coal_recon, daughters and alignment.

    Mathematically, it computes (terms are added, i.e. in log space):

    P(T^G, T^L, R^G, R^L, delta^L, A | S, theta)
        = P(delta^L | T^L, R^L, S)
        + P(T^L, R^L | S, theta^S)
        + int int P(t^L | T^L, R^L, S, theta)
                  * P(T^G, R^G, t^G | t^L, T^L, daughters, R^L, theta)
                  * P(A | T^G, t^G) dt^L dt^G

    alnfile        -- alignment file
    partfile       -- partition file
    stree          -- species tree
    popsizes       -- population sizes in species tree
    duprate        -- duplication rate
    lossrate       -- loss rate
    subrate        -- substitution rate
    beta           -- regularization parameter
    pretime        -- starting time before species tree
    premean        -- mean starting time before species tree
    coal_tree      -- coalescent tree
    coal_recon     -- reconciliation of coalescent tree to locus tree
    nsamples_coal  -- number of times to sample coal times t^G
    locus_tree     -- locus tree (has dup-loss)
    locus_recon    -- reconciliation of locus tree to species tree
    nsamples_locus -- number of times to sample the locus tree times t^L
    daughters      -- daughter nodes
    rates, freqs, alphas -- optimization parameters
    threads, seed, eps, info -- forwarded unchanged to
                      prob_gene_species_alignment_recon

    Returns the sum of the dup-loss term, the daughters term
    (number of duplications * log(.5)) and the double-integral term.

    Note: Adapted from dlcoal.prob_dlcoal_recon_topology(...)
    [in __init__.py]
    """

    # duploss probability: P(T^L, R^L | S, theta)
    locus_events = phylo.label_events(locus_tree, locus_recon)
    dl_prob = duploss.prob_dup_loss(locus_tree, stree, locus_recon,
                                    locus_events, duprate, lossrate)

    # daughters probability: P(daughters | T^L, R^L, S)
    dups = phylo.count_dup(locus_tree, locus_events)
    daughter_prob = dups * log(.5)

    # double integral
    # The caller's threads/seed/eps/info are passed through; previously the
    # defaults were hard-coded here, so caller-supplied values were ignored.
    double_integral = prob_gene_species_alignment_recon(
        alnfile, partfile, stree,
        popsizes, duprate, lossrate, subrate, beta,
        pretime, premean,
        coal_tree, coal_recon, nsamples_coal,
        locus_tree, locus_recon, nsamples_locus,
        daughters,
        rates, freqs, alphas,
        threads=threads, seed=seed, eps=eps, info=info)

    return dl_prob + daughter_prob + double_integral