def setUp(self):
    """Build the tree, a simulated alignment and the model under test.

    Seeds both random number generators, parses a fixed three-tip newick
    tree, draws site-specific amino-acid preferences, simulates a codon
    alignment under an `ExpCM` with `pyvolve`, and finally builds
    `self.model` from `self.MODEL` and `self.DISTRIBUTIONMODEL`.

    Raises:
        ValueError: if `self.MODEL` or `self.DISTRIBUTIONMODEL` is not
            one of the classes handled here.
    """
    random.seed(1)
    numpy.random.seed(1)
    self.underflowfreq = 1
    models = phydmslib.models

    # tree: goes through a scratch file that is read back by path
    self.newick = '((node1:0.2,node2:0.3)node4:0.3,node3:0.5)node5:0.04;'
    treefile = '_temp.tree'
    with open(treefile, 'w') as handle:
        handle.write(self.newick)
    self.tree = Bio.Phylo.read(treefile, 'newick')
    os.remove(treefile)

    # amino-acid preferences: one dict per site, floored then renormalized
    self.nsites = 50
    floor = 0.02
    aminoacids = sorted(AA_TO_INDEX.keys())
    prefs = []
    for _site in range(self.nsites):
        draw = numpy.random.dirichlet([0.5] * N_AA)
        draw[draw < floor] = floor
        draw /= draw.sum()
        prefs.append(dict(zip(aminoacids, draw)))

    # simulate alignment with pyvolve
    self.nseqs = self.tree.count_terminals()
    sim_model = models.ExpCM(prefs)
    evolver = pyvolve.Evolver(
        partitions=phydmslib.simulate.pyvolvePartitions(sim_model),
        tree=pyvolve.read_tree(tree=self.newick),
    )
    seqfile = '_temp_simulatedalignment.fasta'
    infofile = '_temp_info.txt'
    ratefile = '_temp_ratefile.txt'
    evolver(seqfile=seqfile, infofile=infofile, ratefile=ratefile)
    self.alignment = []
    for record in Bio.SeqIO.parse(seqfile, 'fasta'):
        self.alignment.append((record.description, str(record.seq)))
    for scratch in (seqfile, infofile, ratefile):
        os.remove(scratch)
    assert len(self.alignment[0][1]) == self.nsites * 3
    assert len(self.alignment) == self.nseqs

    # define model
    if self.MODEL != models.ExpCM:
        raise ValueError("Invalid MODEL: {0}".format(self.MODEL))
    self.model = models.ExpCM(prefs)

    # optionally wrap the model in a distribution over omega
    if self.DISTRIBUTIONMODEL is not None:
        if self.DISTRIBUTIONMODEL != models.GammaDistributedOmegaModel:
            raise ValueError("Invalid DISTRIBUTIONMODEL: {0}".format(
                self.DISTRIBUTIONMODEL))
        self.model = self.DISTRIBUTIONMODEL(self.model, ncats=4)
def test_ExpCM(self):
    """Build an `ExpCM`, verify it, then re-verify after parameter updates.

    The model is created from random preferences with explicit `phi`,
    `omega`, `kappa` and `beta`; those values and the row sums of the
    stationary state are checked. Parameters are then re-drawn twice and
    the attribute, derivative and matrix-exponential checks are run after
    each update.
    """
    random.seed(1)
    numpy.random.seed(1)

    # create preferences: floored at `floor`, then renormalized per site
    self.nsites = 2
    floor = 0.01
    aminoacids = sorted(AA_TO_INDEX.keys())
    self.prefs = []
    for _site in range(self.nsites):
        draw = numpy.random.dirichlet([0.5] * N_AA)
        draw[draw < floor] = floor
        draw /= draw.sum()
        self.prefs.append(dict(zip(aminoacids, draw)))

    # create initial ExpCM and confirm it stored what it was given
    initial = {
        "phi": numpy.random.dirichlet([2] * N_NT),
        "omega": 0.7,
        "kappa": 2.5,
        "beta": 1.9,
    }
    self.expcm = phydmslib.models.ExpCM(self.prefs, **initial)
    for name, expected in initial.items():
        self.assertTrue(numpy.allclose(expected, getattr(self.expcm, name)))
    rowsums = self.expcm.stationarystate.sum(axis=1)
    self.assertTrue(numpy.allclose(numpy.ones(self.nsites), rowsums))

    # now check ExpCM attributes / derivatives, updating several times
    for _round in range(2):
        limits = self.expcm.PARAMLIMITS
        # draws happen in this fixed order so the seeded stream is stable
        new_omega = random.uniform(*limits["omega"])
        new_kappa = random.uniform(*limits["kappa"])
        new_beta = random.uniform(0.5, 2.5)
        new_eta = numpy.array(
            [random.uniform(*limits["eta"]) for _ in range(N_NT - 1)])
        new_mu = random.uniform(0.05, 3.0)
        self.params = {
            "omega": new_omega,
            "kappa": new_kappa,
            "beta": new_beta,
            "eta": new_eta,
            "mu": new_mu,
        }
        self.expcm.updateParams(self.params)
        self.check_ExpCM_attributes()
        self.check_ExpCM_derivatives()
        self.check_ExpCM_matrix_exponentials()
def test_ExpCM_empirical_phi_divpressure(self):
    """Build `ExpCM_empirical_phi_divpressure`, update it twice, re-check.

    After each parameter update the test confirms that `g` is unchanged
    and runs the empirical-phi, beta-derivative, attribute, derivative
    and matrix-exponential checks.
    """
    random.seed(1)
    numpy.random.seed(1)

    # create preferences: floored at `floor`, then renormalized per site
    self.nsites = 6
    floor = 0.01
    aminoacids = sorted(AA_TO_INDEX.keys())
    self.prefs = []
    for _site in range(self.nsites):
        draw = numpy.random.dirichlet([0.5] * N_AA)
        draw[draw < floor] = floor
        draw /= draw.sum()
        self.prefs.append(dict(zip(aminoacids, draw)))
    # per-site diversifying pressure values, each 0 or 1
    self.divpressure = numpy.random.randint(2, size=self.nsites)

    # create initial model
    g = numpy.random.dirichlet([3] * N_NT)
    self.model = phydmslib.models.ExpCM_empirical_phi_divpressure(
        self.prefs,
        g=g,
        divPressureValues=self.divpressure,
        omega=0.7,
        kappa=2.5,
        beta=1.2,
        omega2=0.2,
    )

    # now check attributes / derivatives, updating several times
    checks = (
        self.check_empirical_phi,
        self.check_dQxy_dbeta,
        self.check_dprx_dbeta,
        self.check_ExpCM_attributes,
        self.check_ExpCM_derivatives,
        self.check_ExpCM_matrix_exponentials,
    )
    for _round in range(2):
        # draws happen in this fixed order so the seeded stream is stable
        new_omega = random.uniform(0.1, 2)
        new_kappa = random.uniform(0.5, 10)
        new_beta = random.uniform(0.5, 3)
        new_mu = random.uniform(0.05, 5.0)
        new_omega2 = random.uniform(0.1, 0.3)
        self.params = {
            "omega": new_omega,
            "kappa": new_kappa,
            "beta": new_beta,
            "mu": new_mu,
            "omega2": new_omega2,
        }
        self.model.updateParams(self.params)
        self.assertTrue(numpy.allclose(g, self.model.g))
        for check in checks:
            check()
def testExpCM_spielmanwr(self):
    """Compare `spielman_wr` against a brute-force calculation.

    For every site the expected value is the ratio of two sums taken over
    all codon pairs that are single-nucleotide, nonsynonymous changes:
    stationary state times `Prxy` over stationary state times `Qxy`. The
    model must reproduce it unnormalized (`norm=False`) and, by default,
    divided by `omega`.

    Raises:
        ValueError: if `self.MODEL` is not a model class handled here.
    """
    random.seed(1)
    numpy.random.seed(1)

    # create models; `g` is drawn before the preferences
    nsites = 10
    g = numpy.random.dirichlet([5] * N_NT)
    floor = 0.01
    aminoacids = sorted(AA_TO_INDEX.keys())
    prefs = []
    for _site in range(nsites):
        draw = numpy.random.dirichlet([0.5] * N_AA)
        draw[draw < floor] = floor
        draw /= draw.sum()
        prefs.append(dict(zip(aminoacids, draw)))

    models = phydmslib.models
    if self.MODEL == models.ExpCM:
        self.model = models.ExpCM(prefs)
    elif self.MODEL == models.ExpCM_empirical_phi:
        self.model = models.ExpCM_empirical_phi(prefs, g)
    else:
        raise ValueError("Invalid MODEL: {0}".format(self.MODEL))

    # codon pairs that contribute, in the same order as a nested x/y loop
    codon_pairs = [
        (x, y)
        for x in range(N_CODON)
        for y in range(N_CODON)
        if CODON_SINGLEMUT[x][y] and CODON_NONSYN[x][y]
    ]

    # test `_spielman_wr` calculation
    expected = []
    for site in range(self.model.nsites):
        fixation_sum = 0
        mutation_sum = 0
        for (x, y) in codon_pairs:
            p_x = self.model.stationarystate[site][x]
            fixation_sum += p_x * self.model.Prxy[site][x][y]
            mutation_sum += p_x * self.model.Qxy[x][y]
        expected.append(fixation_sum / mutation_sum)
    expected = numpy.array(expected)

    unnormalized = self.model.spielman_wr(norm=False)
    self.assertTrue(numpy.allclose(expected, unnormalized, rtol=0.01))
    normalized = self.model.spielman_wr()
    self.assertTrue(
        numpy.allclose(expected / self.model.omega, normalized, rtol=0.01))
def test_ExpCM_empirical_phi(self):
    """Build `ExpCM_empirical_phi`, update it twice, re-check each time.

    `g` must equal the value passed at construction both initially and
    after every parameter update; the empirical-phi, beta-derivative,
    attribute, derivative and matrix-exponential checks run after each
    update.
    """
    random.seed(1)
    numpy.random.seed(1)

    # create preferences: floored at `floor`, then renormalized per site
    self.nsites = 7
    floor = 0.01
    aminoacids = sorted(AA_TO_INDEX.keys())
    self.prefs = []
    for _site in range(self.nsites):
        draw = numpy.random.dirichlet([0.5] * N_AA)
        draw[draw < floor] = floor
        draw /= draw.sum()
        self.prefs.append(dict(zip(aminoacids, draw)))

    # create initial model
    g = numpy.random.dirichlet([3] * N_NT)
    self.expcm = phydmslib.models.ExpCM_empirical_phi(
        self.prefs, g=g, omega=0.7, kappa=2.5, beta=1.2)
    self.assertTrue(numpy.allclose(g, self.expcm.g))

    # now check attributes / derivatives, updating several times
    checks = (
        self.check_empirical_phi,
        self.check_dQxy_dbeta,
        self.check_dprx_dbeta,
        self.check_ExpCM_attributes,
        self.check_ExpCM_derivatives,
        self.check_ExpCM_matrix_exponentials,
    )
    for _round in range(2):
        # draws happen in this fixed order so the seeded stream is stable
        new_omega = random.uniform(0.1, 2)
        new_kappa = random.uniform(0.5, 10)
        new_beta = random.uniform(0.5, 5)
        new_mu = random.uniform(0.05, 5.0)
        self.params = {
            "omega": new_omega,
            "kappa": new_kappa,
            "beta": new_beta,
            "mu": new_mu,
        }
        self.expcm.updateParams(self.params)
        self.assertTrue(numpy.allclose(g, self.expcm.g))
        for check in checks:
            check()
def setUp(self):
    """Create a single-site `self.MODEL` instance with near-equal prefs.

    The first preference is set to the second plus 1e-8 before
    renormalizing, so the model is exercised with two almost identical
    preferences. The result is stored as `self.expcm_fitprefs`.
    """
    numpy.random.seed(1)
    random.seed(1)

    nsites = 1
    floor = 0.001
    aminoacids = sorted(AA_TO_INDEX.keys())
    self.prefs = []
    for _site in range(nsites):
        draw = numpy.random.dirichlet([0.7] * N_AA)
        draw[draw < floor] = floor
        draw[0] = draw[1] + 1.0e-8  # near equal prefs handled OK
        draw /= draw.sum()
        self.prefs.append(dict(zip(aminoacids, draw)))

    phi = numpy.random.dirichlet([5] * N_NT)
    self.expcm_fitprefs = self.MODEL(
        self.prefs, prior=None, kappa=3.0, omega=0.3, phi=phi)

    # sanity checks on the dimensions of the freshly built model
    n_zeta = len(self.expcm_fitprefs.zeta.flatten())
    assert n_zeta == nsites * (N_AA - 1)
    assert self.expcm_fitprefs.nsites == nsites
def test_simulateAlignmentRandomSeed(self):
    """Check that `simulateAlignment` is reproducible for a given seed.

    Two alignments simulated with the same seed must be identical, and a
    third simulated with a different seed must differ in every sequence.
    The tree file and all simulated alignments are removed even when the
    simulation or an assertion fails.

    Raises:
        ValueError: if `self.MODEL` is not a model class handled here.
    """
    numpy.random.seed(1)
    random.seed(1)

    # define model
    nsites = 200
    prefs = []
    minpref = 0.01
    aminoacids = sorted(AA_TO_INDEX.keys())
    for _r in range(nsites):
        rprefs = numpy.random.dirichlet([1] * N_AA)
        rprefs[rprefs < minpref] = minpref
        rprefs /= rprefs.sum()
        prefs.append(dict(zip(aminoacids, rprefs)))
    kappa = 4.2
    omega = 0.4
    beta = 1.5
    mu = 0.3
    if self.MODEL == phydmslib.models.ExpCM:
        phi = numpy.random.dirichlet([7] * N_NT)
        model = phydmslib.models.ExpCM(prefs, kappa=kappa, omega=omega,
                                       beta=beta, mu=mu, phi=phi,
                                       freeparams=['mu'])
    elif self.MODEL == phydmslib.models.ExpCM_empirical_phi:
        g = numpy.random.dirichlet([7] * N_NT)
        model = phydmslib.models.ExpCM_empirical_phi(prefs, g, kappa=kappa,
                                                     omega=omega, beta=beta,
                                                     mu=mu,
                                                     freeparams=['mu'])
    elif self.MODEL == phydmslib.models.YNGKP_M0:
        e_pw = numpy.asarray(
            [numpy.random.dirichlet([7] * N_NT) for _ in range(3)])
        model = phydmslib.models.YNGKP_M0(e_pw, nsites)
    else:
        # report the class itself; `type(self.MODEL)` would only show its
        # metaclass and hide which model was rejected
        raise ValueError("Invalid MODEL: {0}".format(self.MODEL))

    # make a test tree: two sequences separated by a single branch
    t = 0.04 / model.branchScale
    newicktree = '(tip1:{0},tip2:{0});'.format(t / 2.0)
    temptree = '_temp.tree'

    def simulate(counter, seed):
        """Simulate one alignment and return it as {sequence id: sequence}."""
        prefix = "test_counter{0}_seed{1}".format(counter, seed)
        phydmslib.simulate.simulateAlignment(model, temptree, prefix, seed)
        fasta = "{0}_simulatedalignment.fasta".format(prefix)
        return {s.id: str(s.seq) for s in Bio.SeqIO.parse(fasta, "fasta")}

    try:
        with open(temptree, 'w') as f:
            f.write(newicktree)

        # alignments with the same seed number should be the same
        first = simulate(0, 1)
        repeat = simulate(1, 1)
        # guard against the comparisons below passing vacuously
        self.assertTrue(repeat, "no sequences were simulated")
        self.assertEqual(first, repeat)

        # alignments with different seed numbers should be different
        other = simulate(2, 2)
        self.assertTrue(other, "no sequences were simulated")
        for key, seq in other.items():
            self.assertNotEqual(seq, repeat[key])
    finally:
        # general clean-up, also on failure
        if os.path.isfile(temptree):
            os.remove(temptree)
        for fasta in glob.glob("test*simulatedalignment.fasta"):
            if os.path.isfile(fasta):
                os.remove(fasta)
def test_GammaDistributedOmega(self):
    """Build a `GammaDistributedOmegaModel`, check it, update, re-check.

    A base model is built from `self.BASEMODEL` and wrapped with four
    omega categories. The test then verifies, initially and after each of
    three random parameter updates, that:

    * the per-category omegas equal `DiscreteGamma` of the current
      distribution parameters;
    * non-distributed parameters are propagated to every category model;
    * `M` and `dM` agree with the per-category models;
    * `d_distributionparams` matches a numerical gradient.

    Raises:
        ValueError: if `self.BASEMODEL` is not a model class handled here.
    """
    random.seed(1)
    numpy.random.seed(1)
    nsites = 10

    if self.BASEMODEL == phydmslib.models.ExpCM:
        prefs = []
        minpref = 0.01
        aminoacids = sorted(AA_TO_INDEX.keys())
        for _r in range(nsites):
            rprefs = numpy.random.dirichlet([0.5] * N_AA)
            rprefs[rprefs < minpref] = minpref
            rprefs /= rprefs.sum()
            prefs.append(dict(zip(aminoacids, rprefs)))
        paramvalues = {
            "eta": numpy.random.dirichlet([5] * (N_NT - 1)),
            "omega": 0.7,
            "kappa": 2.5,
            "beta": 1.2,
            "mu": 0.5,
        }
        basemodel = self.BASEMODEL(prefs)
        assert set(paramvalues.keys()) == set(
            basemodel.freeparams), "{0} vs {1}".format(
                set(paramvalues.keys()), set(basemodel.freeparams))
        basemodel.updateParams(paramvalues)
    elif self.BASEMODEL == phydmslib.models.YNGKP_M0:
        e_pw = numpy.random.uniform(0.4, 0.6, size=(3, N_NT))
        e_pw = e_pw / e_pw.sum(axis=1, keepdims=True)
        basemodel = self.BASEMODEL(e_pw, nsites)
        paramvalues = {"kappa": 2.5, "omega": 0.7, "mu": 0.5}
        assert set(paramvalues.keys()) == set(basemodel.freeparams)
        basemodel.updateParams(paramvalues)
    else:
        raise ValueError("Invalid BASEMODEL: {0}".format(self.BASEMODEL))

    ncats = 4
    gammamodel = phydmslib.models.GammaDistributedOmegaModel(
        basemodel, ncats)

    def omegas_match_discrete_gamma():
        """Return True if category omegas equal the discretized gamma."""
        return numpy.allclose(
            numpy.array([m.omega for m in gammamodel._models]),
            phydmslib.models.DiscreteGamma(gammamodel.alpha_lambda,
                                           gammamodel.beta_lambda,
                                           gammamodel.ncats))

    self.assertTrue(omegas_match_discrete_gamma())
    for (param, pvalue) in paramvalues.items():
        if param != gammamodel.distributedparam:
            self.assertTrue(
                numpy.allclose(getattr(gammamodel, param), pvalue))

    # try some updates and make sure everything remains OK
    for _i in range(3):
        newvalues = {}
        for param in gammamodel.freeparams:
            (low, high) = gammamodel.PARAMLIMITS[param]
            if gammamodel.PARAMTYPES[param] == float:
                newvalues[param] = random.uniform(low, high)
            else:
                # non-float entries carry the vector length at index 1
                paramlength = gammamodel.PARAMTYPES[param][1]
                newvalues[param] = numpy.random.uniform(
                    low, high, paramlength)
        gammamodel.updateParams(newvalues)
        self.assertTrue(omegas_match_discrete_gamma())
        for (param, pvalue) in newvalues.items():
            if param != gammamodel.distributedparam:
                self.assertTrue(
                    numpy.allclose(pvalue, getattr(gammamodel, param)))
            if param not in gammamodel.distributionparams:
                self.assertTrue(
                    all(numpy.allclose(pvalue, getattr(m, param))
                        for m in gammamodel._models))

        self.assertTrue(gammamodel._models[0].branchScale
                        < gammamodel.branchScale
                        < gammamodel._models[-1].branchScale)

        t = 0.15
        for k in range(gammamodel.ncats):
            M = gammamodel.M(k, t)
            self.assertTrue(numpy.allclose(gammamodel._models[k].M(t), M))
            for param in gammamodel.freeparams:
                if param not in gammamodel.distributionparams:
                    dM = gammamodel.dM(k, t, param, M)
                    self.assertTrue(
                        numpy.allclose(
                            dM,
                            gammamodel._models[k].dM(t, param, Mt=None)))

        # Check derivatives with respect to distribution params
        d_distparams = gammamodel.d_distributionparams
        self.assertTrue((d_distparams["alpha_lambda"] > 0).all())
        self.assertTrue((d_distparams["beta_lambda"] < 0).all())
        for param in gammamodel.distributionparams:
            diffs = []
            for k in range(gammamodel.ncats):
                pvalue = getattr(gammamodel, param)

                # `param` and `k` are bound as defaults so each closure
                # keeps the values of the iteration that created it
                def func(x, param=param, k=k):
                    gammamodel.updateParams({param: x[0]})
                    return getattr(gammamodel._models[k],
                                   gammamodel.distributedparam)

                def dfunc(x, param=param, k=k):
                    gammamodel.updateParams({param: x[0]})
                    return gammamodel.d_distributionparams[param][k]

                diff = scipy.optimize.check_grad(func, dfunc,
                                                 numpy.array([pvalue]))
                # check_grad leaves the model perturbed; restore the value
                gammamodel.updateParams({param: pvalue})
                diffs.append(diff)
            diffs = numpy.array(diffs)
            # report the current values, not just the parameter names
            current = {p: getattr(gammamodel, p)
                       for p in gammamodel.distributionparams}
            self.assertTrue(
                (diffs < 1e-5).all(),
                ("Excessive diff for d_distributionparams[{0}] when "
                 "distributionparams = {1}:\n{2}".format(
                     param, current, diffs)))
def test_compare(self):
    """Check the divpressure model matches the plain one at zero pressure.

    With `divPressureValues` all zero, `ExpCM_empirical_phi_divpressure`
    must agree with `ExpCM_empirical_phi` on the stationary state, `Qxy`,
    `Frxy`, `Prxy`, `M(t)`, and for `kappa`, `omega` and `beta` on the
    parameter value, `dstationarystate` and `dM`.
    """
    random.seed(1)
    numpy.random.seed(1)

    nsites = 6
    floor = 0.01
    aminoacids = sorted(AA_TO_INDEX.keys())
    prefs = []
    for _site in range(nsites):
        draw = numpy.random.dirichlet([0.5] * N_AA)
        draw[draw < floor] = floor
        draw /= draw.sum()
        prefs.append(dict(zip(aminoacids, draw)))
    g = numpy.random.dirichlet([3] * N_NT)

    # both models share these values; only the second takes omega2
    shared = dict(omega=0.7, kappa=2.5, beta=1.2)
    expcm = phydmslib.models.ExpCM_empirical_phi(prefs, g, **shared)
    expcm_divpressure = phydmslib.models.ExpCM_empirical_phi_divpressure(
        prefs, g, divPressureValues=numpy.zeros(nsites), omega2=0.2,
        **shared)

    # array attributes that must coincide, with their failure messages
    for attr, message in (
            ("stationarystate", "stationarystate differs."),
            ("Qxy", "Qxy differs"),
            ("Frxy", "Frxy differs"),
            ("Prxy", "Prxy differs"),
    ):
        same = numpy.allclose(getattr(expcm, attr),
                              getattr(expcm_divpressure, attr))
        self.assertTrue(same, message)

    t = 0.02
    same_M = numpy.allclose(expcm.M(t), expcm_divpressure.M(t))
    self.assertTrue(same_M, "M({0}) differs".format(t))

    for param in ["kappa", "omega", "beta"]:
        same_value = numpy.allclose(getattr(expcm, param),
                                    getattr(expcm_divpressure, param))
        self.assertTrue(same_value,
                        "param values differ for {0}".format(param))

        same_dstate = numpy.allclose(
            expcm.dstationarystate(param),
            expcm_divpressure.dstationarystate(param))
        self.assertTrue(same_dstate,
                        "dstationarystate differs for {0}".format(param))

        same_dM = numpy.allclose(
            expcm.dM(t, param, expcm.M(t)),
            expcm_divpressure.dM(t, param, expcm_divpressure.M(t)))
        self.assertTrue(same_dM,
                        "dM({0}) differs for {1}".format(t, param))
def setUp(self):
    """Prepare a tree, a simulated alignment and `self.model`.

    The alignment is simulated with `pyvolve` under a `YNGKP_M0` model
    whose `e_pw` entries are all 0.25. Afterwards `self.model` is built
    from `self.MODEL` with freshly drawn parameters and, if
    `self.DISTRIBUTIONMODEL` is set, wrapped with four categories.

    Raises:
        ValueError: if `self.MODEL` or `self.DISTRIBUTIONMODEL` is not
            one of the classes handled here.
    """
    random.seed(1)
    numpy.random.seed(1)
    self.underflowfreq = 1
    models = phydmslib.models

    # define tree: goes through a scratch file that is read back by path
    self.newick = '((node1:0.2,node2:0.3)node4:0.3,node3:0.5)node5:0.04;'
    treefile = '_temp.tree'
    with open(treefile, 'w') as handle:
        handle.write(self.newick)
    self.tree = Bio.Phylo.read(treefile, 'newick')
    os.remove(treefile)

    # simulate alignment with pyvolve
    self.nsites = 50
    self.nseqs = self.tree.count_terminals()
    uniform_e_pw = numpy.full((3, N_NT), 0.25, dtype='float')
    sim_model = models.YNGKP_M0(uniform_e_pw, self.nsites)
    evolver = pyvolve.Evolver(
        partitions=phydmslib.simulate.pyvolvePartitions(sim_model),
        tree=pyvolve.read_tree(tree=self.newick),
    )
    seqfile = '_temp_simulatedalignment.fasta'
    infofile = '_temp_info.txt'
    ratefile = '_temp_ratefile.txt'
    evolver(seqfile=seqfile, infofile=infofile, ratefile=ratefile)
    self.alignment = []
    for record in Bio.SeqIO.parse(seqfile, 'fasta'):
        self.alignment.append((record.description, str(record.seq)))
    for scratch in (seqfile, infofile, ratefile):
        os.remove(scratch)
    assert len(self.alignment[0][1]) == self.nsites * 3
    assert len(self.alignment) == self.nseqs

    # define model; `g` is always drawn, before the preferences
    floor = 0.02
    g = numpy.random.dirichlet([10] * N_NT)
    aminoacids = sorted(AA_TO_INDEX.keys())
    prefs = []
    for _site in range(self.nsites):
        draw = numpy.random.dirichlet([0.5] * N_AA)
        draw[draw < floor] = floor
        draw /= draw.sum()
        prefs.append(dict(zip(aminoacids, draw)))

    if self.MODEL == models.ExpCM:
        self.model = models.ExpCM(prefs)
    elif self.MODEL == models.ExpCM_empirical_phi:
        self.model = models.ExpCM_empirical_phi(prefs, g)
    elif self.MODEL == models.ExpCM_empirical_phi_divpressure:
        # scale so the largest absolute value is 1
        divpressure = numpy.random.uniform(-1, 5, self.nsites)
        divpressure /= max(abs(divpressure))
        self.model = models.ExpCM_empirical_phi_divpressure(
            prefs, g, divpressure)
    elif self.MODEL == models.YNGKP_M0:
        raw = numpy.random.uniform(0.2, 0.8, size=(3, N_NT))
        e_pw = raw / raw.sum(axis=1, keepdims=True)
        self.model = models.YNGKP_M0(e_pw, self.nsites)
    else:
        raise ValueError("Invalid MODEL: {0}".format(self.MODEL))

    # optionally wrap the model in a distribution over omega
    if self.DISTRIBUTIONMODEL is not None:
        if self.DISTRIBUTIONMODEL != models.GammaDistributedOmegaModel:
            raise ValueError("Invalid DISTRIBUTIONMODEL: {0}".format(
                self.DISTRIBUTIONMODEL))
        self.model = self.DISTRIBUTIONMODEL(self.model, ncats=4)
def test_branchScale(self):
    """Check that scaled branch lengths match simulated substitutions.

    Two sequences joined by one branch are simulated 100 times with
    `pyvolve`. The mean number of nucleotide differences per codon site
    must be within 20% of `model.branchScale * t`, and the mean branch
    length inferred by `TreeLikelihood` must be within 20% of that
    observed count.

    Raises:
        ValueError: if `self.MODEL` is not a model class handled here.
    """
    numpy.random.seed(1)
    random.seed(1)

    # define model, only free parameter is mu for testing simulations
    nsites = 50
    floor = 0.01
    aminoacids = sorted(AA_TO_INDEX.keys())
    prefs = []
    for _site in range(nsites):
        draw = numpy.random.dirichlet([1] * N_AA)
        draw[draw < floor] = floor
        draw /= draw.sum()
        prefs.append(dict(zip(aminoacids, draw)))

    shared = dict(kappa=4.2, omega=0.4, beta=1.5, mu=0.3,
                  freeparams=['mu'])
    models = phydmslib.models
    if self.MODEL == models.ExpCM:
        phi = numpy.random.dirichlet([7] * N_NT)
        model = models.ExpCM(prefs, phi=phi, **shared)
    elif self.MODEL == models.ExpCM_empirical_phi:
        g = numpy.random.dirichlet([7] * N_NT)
        model = models.ExpCM_empirical_phi(prefs, g, **shared)
    elif self.MODEL == models.YNGKP_M0:
        e_pw = numpy.asarray(
            [numpy.random.dirichlet([7] * N_NT) for _ in range(3)])
        model = models.YNGKP_M0(e_pw, nsites)
    else:
        raise ValueError("Invalid MODEL: {0}".format(type(self.MODEL)))
    partitions = phydmslib.simulate.pyvolvePartitions(model)

    # tree is two sequences separated by a single branch
    t = 0.04 / model.branchScale
    newicktree = '(tip1:{0},tip2:{0});'.format(t / 2.0)
    pyvolvetree = pyvolve.read_tree(tree=newicktree)
    treefile = '_temp.tree'
    with open(treefile, 'w') as handle:
        handle.write(newicktree)
    biotree = Bio.Phylo.read(treefile, 'newick')
    os.remove(treefile)

    # Simulate evolution of two sequences separated by a long branch.
    # Then estimate subs per site in a heuristic way that will be
    # roughly correct for short branches. Do this all several times
    # and average results to get better accuracy.
    scratch_files = (
        '_temp_branchScale_simulatedalignment.fasta',
        '_temp_info.txt',
        '_temp_ratefile.txt',
    )
    seqfile, infofile, ratefile = scratch_files
    evolver = pyvolve.Evolver(partitions=partitions, tree=pyvolvetree)
    nsubs = 0  # subs in simulated seqs (estimate from Hamming distance)
    treedist = 0.0  # distance inferred by `TreeLikelihood`
    nreplicates = 100
    for _replicate in range(nreplicates):
        evolver(seqfile=seqfile, infofile=infofile, ratefile=ratefile)
        seqs = [(rec.description, str(rec.seq))
                for rec in Bio.SeqIO.parse(seqfile, 'fasta')]
        assert len(seqs[0][1]) == len(seqs[1][1]) == nsites * 3
        for path in scratch_files:
            if os.path.isfile(path):
                os.remove(path)
        # count differing nucleotides codon by codon
        for site in range(nsites):
            start = 3 * site
            codon1 = seqs[0][1][start:start + 3]
            codon2 = seqs[1][1][start:start + 3]
            nsubs += sum(1 for j in range(3) if codon1[j] != codon2[j])
        tl = phydmslib.treelikelihood.TreeLikelihood(biotree, seqs, model)
        tl.maximizeLikelihood()
        for tip in tl.tree.get_terminals():
            treedist += tip.branch_length
    nsubs /= float(nsites * nreplicates)
    treedist /= float(nreplicates)

    # We expect nsubs = branchScale * t, but build in some tolerance
    # with rtol since we simulated finite number of sites.
    expected = model.branchScale * t
    self.assertTrue(
        numpy.allclose(nsubs, expected, rtol=0.2),
        ("Simulated subs per site of {0} is not close to "
         "expected value of {1} (branchScale = {2}, t = {3})").format(
             nsubs, t * model.branchScale, model.branchScale, t))
    self.assertTrue(
        numpy.allclose(treedist, nsubs, rtol=0.2),
        ("Simulated subs per site of {0} is not close to inferred "
         "branch length of {1}").format(nsubs, treedist))
def setUp(self):
    """Set up a fit-prefs model, a simulated alignment and a likelihood.

    `self.model` is built from `self.MODEL` using one set of preferences,
    while the alignment is simulated under an `ExpCM` that uses a shuffled
    copy of those preferences (`self.realprefs`). Codon and amino-acid
    counts per site are tallied from the alignment, and `self.tl` holds
    the `TreeLikelihood` for the tree, alignment and model.
    """
    numpy.random.seed(1)
    random.seed(1)
    nsites = 1
    minpref = 0.001
    aminoacids = sorted(AA_TO_INDEX.keys())
    self.prefs = []
    self.realprefs = []
    for _r in range(nsites):
        rprefs = numpy.random.dirichlet([0.5] * N_AA)
        rprefs[rprefs < minpref] = minpref
        rprefs /= rprefs.sum()
        self.prefs.append(dict(zip(aminoacids, rprefs)))
        # same values assigned to different amino acids
        numpy.random.shuffle(rprefs)
        self.realprefs.append(dict(zip(aminoacids, rprefs)))
    self.kappa = 3.0
    self.omega = 3.0
    self.phi = numpy.random.dirichlet([5] * N_NT)
    self.model = self.MODEL(self.prefs, prior=None, kappa=self.kappa,
                            omega=self.omega, phi=self.phi)
    self.realmodel = phydmslib.models.ExpCM(self.realprefs,
                                            kappa=self.kappa,
                                            omega=self.omega, mu=10.0,
                                            phi=self.phi)
    treefile = os.path.abspath(
        os.path.join(os.path.dirname(__file__),
                     "./NP_data/NP_tree.newick"))
    self.tree = Bio.Phylo.read(treefile, "newick")
    self.tree.root_at_midpoint()

    # simulate alignment using realmodel
    evolver = pyvolve.Evolver(
        partitions=phydmslib.simulate.pyvolvePartitions(self.realmodel),
        tree=pyvolve.read_tree(file=treefile))
    alignmentfile = "_temp_fitprefs_simulatedalignment.fasta"
    info = "_temp_info.txt"
    rates = "_temp_ratefile.txt"
    try:
        evolver(seqfile=alignmentfile, infofile=info, ratefile=rates)
        self.alignment = phydmslib.file_io.ReadCodonAlignment(
            alignmentfile, True)
    finally:
        # remove scratch files even if simulation or parsing failed
        for f in [alignmentfile, info, rates]:
            if os.path.isfile(f):
                os.remove(f)
    assert len(self.alignment[0][1]) == nsites * 3

    self.codoncounts = {
        r: {INDEX_TO_CODON[c]: 0 for c in range(N_CODON)}
        for r in range(nsites)
    }
    self.aacounts = {r: {a: 0 for a in range(N_AA)} for r in range(nsites)}
    for (_head, seq) in self.alignment:
        # site r occupies nucleotides 3r..3r+2; iterating over sites
        # directly covers every codon for any value of `nsites`
        for r in range(nsites):
            codon = seq[3 * r:3 * r + 3]
            self.codoncounts[r][codon] += 1
            self.aacounts[r][CODON_TO_AA[CODON_TO_INDEX[codon]]] += 1

    self.tl = phydmslib.treelikelihood.TreeLikelihood(
        self.tree, self.alignment, self.model)
def setUp(self):
    """Prepare tree, branch lengths, alignment, codons and `self.model`.

    Besides the tree and a `pyvolve` alignment simulated under a uniform
    `YNGKP_M0`, this records `self.brlen` (branch length keyed by the
    trailing digit of each non-root node name) and `self.codons` (codon
    index keyed by tip number, then site). `self.model` is built from
    `self.MODEL` and optionally wrapped by `self.DISTRIBUTIONMODEL`.

    Raises:
        ValueError: if `self.MODEL` or `self.DISTRIBUTIONMODEL` is not
            one of the classes handled here.
    """
    random.seed(1)
    numpy.random.seed(1)
    models = phydmslib.models

    # define tree: goes through a scratch file that is read back by path
    self.newick = "((node1:0.2,node2:0.3)node4:0.3,node3:0.5)node5:0.04;"
    treefile = "_temp.tree"
    with open(treefile, "w") as handle:
        handle.write(self.newick)
    self.tree = Bio.Phylo.read(treefile, "newick")
    os.remove(treefile)

    # branch lengths parsed straight from the newick string, root skipped
    labelled_lengths = re.findall(
        r"(?P<name>node\d):(?P<brlen>\d+\.\d+)", self.newick)
    self.brlen = {}
    for (label, length) in labelled_lengths:
        if label == self.tree.root.name:
            continue
        self.brlen[int(label[-1])] = float(length)

    # simulate alignment with pyvolve
    self.nsites = 60
    self.nseqs = self.tree.count_terminals()
    uniform_e_pw = numpy.full((3, N_NT), 0.25, dtype="float")
    sim_model = models.YNGKP_M0(uniform_e_pw, self.nsites)
    evolver = pyvolve.Evolver(
        partitions=phydmslib.simulate.pyvolvePartitions(sim_model),
        tree=pyvolve.read_tree(tree=self.newick),
    )
    seqfile = "_temp_simulatedalignment.fasta"
    infofile = "_temp_info.txt"
    ratefile = "_temp_ratefile.txt"
    evolver(seqfile=seqfile, infofile=infofile, ratefile=ratefile)
    self.alignment = []
    for record in Bio.SeqIO.parse(seqfile, "fasta"):
        self.alignment.append((record.description, str(record.seq)))
    for scratch in (seqfile, infofile, ratefile):
        os.remove(scratch)
    assert len(self.alignment[0][1]) == self.nsites * 3
    assert len(self.alignment) == self.nseqs

    # indexed by node, site, gives codon index
    self.codons = {}
    for tip in self.tree.get_terminals():
        tipname = tip.name
        number = int(tipname[-1])
        matches = [s for (head, s) in self.alignment if tipname == head]
        tipseq = matches[0]
        self.codons[number] = {
            site: CODON_TO_INDEX[tipseq[3 * site:3 * site + 3]]
            for site in range(self.nsites)
        }

    # define model; `g` is floored at 0.1 and renormalized
    floor = 0.02
    g = numpy.random.dirichlet([5] * N_NT)
    g[g < 0.1] = 0.1
    g /= g.sum()
    aminoacids = sorted(AA_TO_INDEX.keys())
    prefs = []
    for _site in range(self.nsites):
        draw = numpy.random.dirichlet([0.5] * N_AA)
        draw[draw < floor] = floor
        draw /= draw.sum()
        prefs.append(dict(zip(aminoacids, draw)))

    if self.MODEL == models.ExpCM:
        self.model = models.ExpCM(prefs)
    elif self.MODEL == models.ExpCM_empirical_phi:
        self.model = models.ExpCM_empirical_phi(prefs, g)
    elif self.MODEL == models.ExpCM_empirical_phi_divpressure:
        # scale so the largest absolute value is 1
        divpressure = numpy.random.uniform(-1, 5, self.nsites)
        divpressure /= max(abs(divpressure))
        self.model = models.ExpCM_empirical_phi_divpressure(
            prefs, g, divpressure)
    elif self.MODEL == models.YNGKP_M0:
        raw = numpy.random.uniform(0.2, 0.8, size=(3, N_NT))
        e_pw = raw / raw.sum(axis=1, keepdims=True)
        self.model = models.YNGKP_M0(e_pw, self.nsites)
    else:
        raise ValueError("Invalid MODEL: {0}".format(self.MODEL))

    # optionally wrap the model; both supported wrappers take ncats=4
    if self.DISTRIBUTIONMODEL is not None:
        if (self.DISTRIBUTIONMODEL == models.GammaDistributedOmegaModel
                or self.DISTRIBUTIONMODEL
                == models.GammaDistributedBetaModel):
            self.model = self.DISTRIBUTIONMODEL(self.model, ncats=4)
        else:
            raise ValueError("Invalid DISTRIBUTIONMODEL: {0}".format(
                self.DISTRIBUTIONMODEL))
def test_simulateAlignment(self):
    """Check `simulateAlignment` yields the expected substitution count.

    A two-tip tree with total length `t` (in substitutions per site) is
    simulated under `ExpCM_empirical_phi_divpressure`. The observed number
    of nucleotide differences per codon site must be within 20% of `t`,
    and the branch length inferred by `TreeLikelihood` on the re-scaled
    tree must be within 20% of the observed count.

    Raises:
        ValueError: if `self.MODEL` is not
            `ExpCM_empirical_phi_divpressure`.
    """
    numpy.random.seed(1)
    random.seed(1)
    alignmentPrefix = "test"

    # define model
    nsites = 1000
    floor = 0.01
    aminoacids = sorted(AA_TO_INDEX.keys())
    prefs = []
    for _site in range(nsites):
        draw = numpy.random.dirichlet([1] * N_AA)
        draw[draw < floor] = floor
        draw /= draw.sum()
        prefs.append(dict(zip(aminoacids, draw)))

    # NOTE(review): a fresh 20-site sample is drawn for every site, so
    # each site is flagged independently rather than exactly 20 sites
    # being chosen. Kept as is because changing it would shift the seeded
    # random stream this test relies on.
    flags = []
    for site in range(nsites):
        chosen = random.sample(range(nsites), 20)
        flags.append(1 if site in chosen else 0)
    deltar = numpy.array(flags)

    if self.MODEL != phydmslib.models.ExpCM_empirical_phi_divpressure:
        raise ValueError("Invalid MODEL: {0}".format(type(self.MODEL)))
    g = numpy.random.dirichlet([7] * N_NT)
    model = phydmslib.models.ExpCM_empirical_phi_divpressure(
        prefs, g, deltar,
        kappa=4.2, omega=0.4, beta=1.5, mu=0.3,
        freeparams=['mu'], omega2=1.2)

    # make a test tree
    # tree is two sequences separated by a single branch
    # the units are in sub/site
    t = 0.04
    newicktree = '(tip1:{0},tip2:{0});'.format(t / 2.0)
    treefile = '_temp.tree'
    with open(treefile, 'w') as handle:
        handle.write(newicktree)

    # simulate the alignment
    phydmslib.simulate.simulateAlignment(model, treefile, alignmentPrefix)

    # read in the test tree, re-scale the branch lengths, remove the file
    biotree = Bio.Phylo.read(treefile, 'newick')
    os.remove(treefile)
    clades = biotree.get_terminals() + biotree.get_nonterminals()
    for clade in clades:
        if clade.branch_length:
            clade.branch_length /= model.branchScale

    # check that the simulated alignment has the expected number of subs
    fasta = '{0}_simulatedalignment.fasta'.format(alignmentPrefix)
    seqs = [(rec.description, str(rec.seq))
            for rec in Bio.SeqIO.parse(fasta, 'fasta')]
    assert len(seqs[0][1]) == len(seqs[1][1]) == nsites * 3
    if os.path.isfile(fasta):
        os.remove(fasta)

    # subs in simulated seqs (estimate from Hamming distance)
    nsubs = 0
    for site in range(nsites):
        start = 3 * site
        codon1 = seqs[0][1][start:start + 3]
        codon2 = seqs[1][1][start:start + 3]
        nsubs += sum(1 for j in range(3) if codon1[j] != codon2[j])
    nsubs /= float(nsites)

    # distance inferred by `TreeLikelihood`
    tl = phydmslib.treelikelihood.TreeLikelihood(biotree, seqs, model)
    tl.maximizeLikelihood()
    treedist = 0.0 + sum(
        tip.branch_length for tip in tl.tree.get_terminals())

    # We expect nsubs = t, but build in some tolerance
    # with rtol since we simulated finite number of sites.
    self.assertTrue(
        numpy.allclose(nsubs, t, rtol=0.2),
        ("Simulated subs per site of {0} is not close "
         "to expected value of {1} (branchScale = {2}, "
         "t = {3})").format(nsubs, t, model.branchScale, t))
    self.assertTrue(
        numpy.allclose(treedist, nsubs, rtol=0.2),
        ("Simulated subs per site of {0} is not close to inferred "
         "branch length of {1}").format(nsubs, treedist))