def finalize():
        """Evolve sequences along every pruned tree with Pyvolve and store the results.

        For each ``(root, treestr)`` pair in ``GC.pruned_newick_trees`` the tree is
        evolved from the root's sequence using ``GC.pyvolve_model``.  Pyvolve's
        rate, info and sequence files are written to ``pyvolve_output/<label>_*``
        and the sequences it returns are appended to
        ``GC.final_sequences[cn_label][sample_time]`` as ``(leaf, seq)`` tuples,
        where ``cn_label`` and ``sample_time`` are the second and third
        ``|``-separated fields of the sequence name.

        Raises:
            AssertionError: if Pyvolve rejects the tree or partition; the message
                includes the offending tree string.
        """
        if GC.random_number_seed is not None:
            from warnings import warn
            warn(
                "random_number_seed specified, but Pyvolve does not support seeding its random generator"
            )
        makedirs("pyvolve_output", exist_ok=True)
        # NOTE(review): this mapping is not used below; the call is kept in case
        # it has side effects -- confirm and remove if it does not.
        label_to_node = MF.modules['TreeNode'].label_to_node()
        for root, treestr in GC.pruned_newick_trees:
            # run Pyvolve
            treestr = treestr.strip()
            label = root.get_label()
            rootseq = root.get_seq()
            if GC.VERBOSE:
                print('[%s] Pyvolve evolving sequences on tree: %s' %
                      (datetime.now(), treestr),
                      file=stderr)
                print('[%s] Pyvolve root sequence: %s' %
                      (datetime.now(), rootseq),
                      file=stderr)
            # NOTE(review): this compares the *whole* string with '(', so in
            # practice every tree gets wrapped; possibly `treestr[0] != '('`
            # was intended -- confirm before changing.
            if treestr != '(':
                treestr = '(%s);' % treestr[:-1]

            # Imported lazily, only once there is a tree to evolve; repeated
            # imports are served from the module cache.
            import pyvolve
            try:
                tree = pyvolve.read_tree(tree=treestr)
                partition = pyvolve.Partition(models=GC.pyvolve_model,
                                              root_sequence=rootseq)
                evolver = pyvolve.Evolver(partitions=partition, tree=tree)
            except AssertionError as err:
                # Raised explicitly (not via `assert False`) so the message
                # survives `python -O`.
                raise AssertionError(
                    "Error setting up Pyvolve. Tree: %s" % treestr) from err

            # set each to None to not generate these files
            ratefile = "pyvolve_output/%s_ratefile.txt" % label
            infofile = "pyvolve_output/%s_infofile.txt" % label
            seqfile = "pyvolve_output/%s_seqfile.fasta" % label
            evolver(ratefile=ratefile, infofile=infofile, seqfile=seqfile)
            # use anc=True to get internal sequences as well
            seqs = evolver.get_sequences()

            # store leaf sequences in GlobalContext
            # GC.final_sequences[cn_node][t] = list of (label,seq) tuples
            if not hasattr(GC, 'final_sequences'):
                GC.final_sequences = {}
            for leaf, seq in seqs.items():
                _virus_label, cn_label, sample_time = leaf.split('|')
                by_time = GC.final_sequences.setdefault(cn_label, {})
                by_time.setdefault(float(sample_time), []).append((leaf, seq))
Esempio n. 2
0
    def run_u(self, tree_file, sequences_folder):
        """Simulate sequences on a gene tree after rescaling its branch lengths.

        The first line of ``tree_file`` is read as a newick string.  Every branch
        is multiplied by the gene-family multiplier (looked up from the file name)
        and by the multiplier stored for the first ``_``-separated part of the
        node name.  The rescaled tree is evolved with Pyvolve, the alignment
        (written with ``write_anc=True``) goes to ``sequences_folder``, and the
        sequence names are then corrected through ``self.correct_names``.

        Returns:
            None.  Returns early, without simulating, when the line contains no
            ``(`` or is just ``;``.
        """
        with open(tree_file) as handle:
            newick = handle.readline().strip()

        # Nothing to simulate unless the line holds at least one clade.
        if "(" not in newick or newick == ";":
            return None

        my_tree = ete3.Tree(newick, format=1)
        my_tree.get_tree_root().name = "Root"

        # Multipliers are applied per family first, then per species-tree branch.
        family = tree_file.split("_")[-2].split("/")[-1]
        gf_multiplier = self.gf_multipliers[family]

        for node in my_tree.traverse():
            branch = node.name.split("_")[0]
            node.dist = node.dist * gf_multiplier * self.st_multipliers[branch]

        scaled_tree = pyvolve.read_tree(tree=my_tree.write(format=5),
                                        scale_tree=self.parameters["SCALING"])
        name_mapping = self.get_mapping_internal_names(scaled_tree, my_tree)
        evolver = pyvolve.Evolver(
            tree=scaled_tree,
            partitions=pyvolve.Partition(models=self.model, size=self.size),
        )

        fasta_name = tree_file.split("/")[-1].replace("_completetree.nwk", "_") + "complete.fasta"
        fasta_path = os.path.join(sequences_folder, fasta_name)
        evolver(seqfile=fasta_path, ratefile=None, infofile=None, write_anc=True)

        # Correct the names
        self.correct_names(fasta_path, name_mapping)
def get_random_tree(filename, tree_string, L, kappa):
	"""Simulate one alignment on ``tree_string`` and summarise it as pi and theta.

	A nucleotide model with the given ``kappa`` and equal state frequencies is
	evolved from an ancestor produced by ``generate_ancestor(L)`` (which is also
	printed).  The evolver is called without arguments, so Pyvolve's own default
	output settings apply.

	``filename`` is accepted but not used by this function.

	Returns:
		dict with keys ``'pi'`` and ``'theta'`` holding the values computed by
		``pi_value`` and ``theta_value`` on the simulated sequences.
	"""
	equal_freqs = [0.25,0.25,0.25,0.25]

	tree = pyvolve.read_tree(tree = tree_string)
	model = pyvolve.Model('nucleotide', {'kappa':kappa, 'state_freqs':equal_freqs})

	root_sequence = generate_ancestor(L)
	print(root_sequence)

	evolver = pyvolve.Evolver(
		partitions = pyvolve.Partition(models = model, root_sequence = root_sequence),
		tree = tree,
	)
	evolver()
	sequences = evolver.get_sequences()

	return {'pi': pi_value(sequences), 'theta': theta_value(sequences)}


	
Esempio n. 4
0
    def test_OnSimulatedData(self):
        """Check that ``phydms --omegabysite`` recovers sites simulated under diversifying pressure.

        Five sites are sampled (seeded), an alignment is simulated with those
        sites passed to ``pyvolvePartitions`` together with the value 200.0,
        and ``phydms`` is run on the result.  The sampled sites must all show
        omega > 2 and P < 0.08, and at most one other site may show omega > 2
        with P < 0.05.
        """
        random.seed(1)
        divpressuresites = random.sample(range(self.nsites), 5)
        partitions = phydmslib.simulate.pyvolvePartitions(
            self.model, (200.0, divpressuresites))
        pyvolve_tree = pyvolve.read_tree(file=self.tree)
        evolver = pyvolve.Evolver(partitions=partitions, tree=pyvolve_tree)

        prefix = os.path.join(self.outdir, self.modelname)
        alignment_path = prefix + '_simulatedalignment.fasta'
        evolver(seqfile=alignment_path,
                infofile=prefix + '_temp_info.txt',
                ratefile=prefix + '_temp_ratefile.txt')

        command = ['phydms', alignment_path, self.tree, self.modelarg,
                   prefix, '--omegabysite', '--brlen', 'scale']
        subprocess.check_call(command)

        omegas = pandas.read_csv(prefix + '_omegabysite.txt',
                                 sep='\t', comment='#')
        divpressureomegas = omegas[omegas['site'].isin(divpressuresites)]
        self.assertTrue(len(divpressureomegas) == len(divpressuresites))
        self.assertTrue(
            (divpressureomegas['omega'].values > 2).all(),
            "Not all divpressure sites have omega > 2:\n{0}".format(
                divpressureomegas))
        self.assertTrue(
            (divpressureomegas['P'].values < 0.08).all(),
            "Not all divpressure sites have P < 0.08:\n{0}".format(
                divpressureomegas))

        # Sites flagged as significant although no pressure was simulated there.
        spurious = ((omegas['omega'] > 2) & (omegas['P'] < 0.05)
                    & (~omegas['site'].isin(divpressuresites)))
        nspurious = len(omegas[spurious])
        self.assertTrue(nspurious <= 1, "{0} spurious sites".format(nspurious))

        # Remove the scratch file if the simulation left it in the working directory.
        leftover = "custom_matrix_frequencies.txt"
        if os.path.isfile(leftover):
            os.remove(leftover)
Esempio n. 5
0
def evolve(newicks, sequence_size, scale_tree):
    """Simulate one nucleotide alignment per tree and concatenate them in PHYLIP form.

    Each newick string is read with the given ``scale_tree`` factor, evolved
    under a default nucleotide model for ``sequence_size`` positions, written
    as FASTA, converted with ``fasta_to_phyl`` and the FASTA file removed.
    The per-tree PHYLIP files are then appended (each followed by a newline)
    to ``temp_seq.phyl`` and deleted.

    Returns:
        The name of the combined file, ``"temp_seq.phyl"``.
    """
    model = pyvolve.Model("nucleotide")
    partition = pyvolve.Partition(models = model, size = sequence_size)

    phylip_paths = []
    for index, newick in enumerate(newicks):
        tree = pyvolve.read_tree(tree = newick, scale_tree = scale_tree)
        evolver = pyvolve.Evolver(tree = tree, partitions = partition)

        fasta_path = "temp" + str(index) + ".fasta"
        phylip_path = "temp" + str(index) + ".phyl"
        phylip_paths.append(phylip_path)

        evolver(seqfile=fasta_path, seqfmt = "fasta", ratefile = None, infofile = None)
        fasta_to_phyl(fasta_path, phylip_path)
        os.remove(fasta_path)

    phyl_output = "temp_seq.phyl"
    with open(phyl_output, 'w') as combined:
        for phylip_path in phylip_paths:
            with open(phylip_path) as part:
                combined.write(part.read())
                combined.write("\n")
            os.remove(phylip_path)

    return phyl_output
Esempio n. 6
0
def simulate_genomes(model, tree, asize, outdir, number):
    """Evolve ``asize`` positions on ``tree`` under ``model`` and return the sequences.

    A directory named after ``number`` is requested from ``mkdir`` under
    ``outdir``; only the rate file (``rate_<number>.fasta``) is written there,
    the sequence and info files are disabled.

    Returns:
        Whatever ``evolver.get_sequences()`` returns for the simulation.
    """
    run_dir = mkdir(os.path.join(outdir, str(number)))
    rate_path = os.path.join(run_dir, "rate_{}.fasta".format(number))

    evolver = pyvolve.Evolver(
        tree=tree,
        partitions=pyvolve.Partition(models=model, size=asize),
    )
    evolver(seqfile=None, ratefile=rate_path, infofile=None)
    return evolver.get_sequences()
Esempio n. 7
0
    def setUp(self):
        """Build the tree, preferences, simulated alignment and model for the tests.

        Both ``random`` and ``scipy.random`` are seeded with 1.  A fixed
        three-leaf newick tree is parsed, ``self.nsites`` preference vectors
        are drawn, an alignment is simulated with pyvolve under an ``ExpCM``
        built from those preferences, and ``self.model`` is built according
        to ``self.MODEL`` and ``self.DISTRIBUTIONMODEL``.

        Raises:
            ValueError: if ``self.MODEL`` or ``self.DISTRIBUTIONMODEL`` is not
                one of the values handled below.
        """
        random.seed(1)
        scipy.random.seed(1)

        self.underflowfreq = 1

        # define tree
        self.newick = ('((node1:0.2,node2:0.3)node4:0.3,node3:0.5)node5:0.04;')
        # Bio.Phylo.read is given a file name here, so the newick string is
        # round-tripped through a scratch file that is removed right away.
        tempfile = '_temp.tree'
        with open(tempfile, 'w') as f:
            f.write(self.newick)
        self.tree = Bio.Phylo.read(tempfile, 'newick')
        os.remove(tempfile)

        # amino-acid preferences
        self.nsites = 50
        prefs = []
        minpref = 0.02
        # NOTE(review): `g` is not used again in this method; the draw
        # presumably still advances the seeded random stream, so removing it
        # would change the preferences drawn below -- confirm before deleting.
        g = scipy.random.dirichlet([5] * N_NT)
        for r in range(self.nsites):
            rprefs = scipy.random.dirichlet([0.5] * N_AA)
            # floor small preferences at minpref, then renormalise to sum to 1
            rprefs[rprefs < minpref] = minpref
            rprefs /= rprefs.sum()
            prefs.append(dict(zip(sorted(AA_TO_INDEX.keys()), rprefs)))

        # simulate alignment with pyvolve
        pyvolvetree = pyvolve.read_tree(tree=self.newick)
        self.nseqs = self.tree.count_terminals()
        expcm = phydmslib.models.ExpCM(prefs)
        partitions = phydmslib.simulate.pyvolvePartitions(expcm)
        alignment = '_temp_simulatedalignment.fasta'
        info = '_temp_info.txt'
        rates = '_temp_ratefile.txt'
        evolver = pyvolve.Evolver(partitions=partitions, tree=pyvolvetree)
        evolver(seqfile=alignment, infofile=info, ratefile=rates)
        # keep the alignment in memory as (description, sequence) pairs
        self.alignment = [(s.description, str(s.seq))
                          for s in Bio.SeqIO.parse(alignment, 'fasta')]
        # the pyvolve output files are only needed to load the alignment
        for f in [alignment, info, rates]:
            os.remove(f)
        # sanity checks: three characters per site, one sequence per tree tip
        assert len(self.alignment[0][1]) == self.nsites * 3
        assert len(self.alignment) == self.nseqs

        # define model
        if self.MODEL == phydmslib.models.ExpCM:
            self.model = phydmslib.models.ExpCM(prefs)
        else:
            raise ValueError("Invalid MODEL: {0}".format(self.MODEL))
        # optionally wrap the model in a distribution model
        if self.DISTRIBUTIONMODEL is None:
            pass
        elif (self.DISTRIBUTIONMODEL ==
              phydmslib.models.GammaDistributedOmegaModel):
            self.model = self.DISTRIBUTIONMODEL(self.model, ncats=4)
        else:
            raise ValueError("Invalid DISTRIBUTIONMODEL: {0}".format(
                self.DISTRIBUTIONMODEL))
Esempio n. 8
0
    def test_OnSimulatedData(self):
        """Run ``phydms --diffprefsbysite`` on simulated data with both fitting methods.

        An alignment is simulated from ``self.model`` (seeded), ``phydms`` is
        run once per ``--fitprefsmethod`` value ('1' and '2'), and for every
        ``(site, amino acid)`` in ``self.targetaas`` the reported differential
        preference must be positive.  The per-site totals of absolute
        differential preferences from the two methods are plotted against
        each other and must correlate with r > 0.98.
        """
        random.seed(1)
        numpy.random.seed(1)
        partitions = phydmslib.simulate.pyvolvePartitions(self.model)
        pyvolve_tree = pyvolve.read_tree(file=self.tree)
        evolver = pyvolve.Evolver(partitions=partitions, tree=pyvolve_tree)

        prefix = os.path.join(self.outdir, self.modelname)
        alignment_path = prefix + '_simulatedalignment.fasta'
        evolver(seqfile=alignment_path,
                infofile=prefix + '_temp_info.txt',
                ratefile=prefix + '_temp_ratefile.txt')

        # One differential-preference column per amino acid.
        dpi_columns = ['dpi_{0}'.format(INDEX_TO_AA[a]) for a in range(N_AA)]

        prefsbymethod = {}
        for fitprefsmethod in ['1', '2']:
            outprefix = prefix + '_fitprefsmethod{0}'.format(fitprefsmethod)
            command = (['phydms', alignment_path, self.tree,
                        self.modelarg, outprefix,
                        '--diffprefsbysite', '--brlen', 'scale',
                        '--ncpus', '-1', '--diffprefsprior',
                        'invquadratic,150,0.5'] +
                       self.gammaomega_arg +
                       ['--fitprefsmethod', fitprefsmethod])
            subprocess.check_call(command)

            diffprefs = pandas.read_csv(outprefix + '_diffprefsbysite.txt',
                                        sep='\t', comment='#')
            diffprefs['total'] = diffprefs[dpi_columns].abs().sum(axis=1)
            for site, aa in self.targetaas.items():
                rows = diffprefs[diffprefs['site'] == site]
                self.assertTrue(len(rows) == 1, str(len(rows)))
                self.assertTrue((rows['dpi_{0}'.format(aa)] > 0).all())

            prefsbymethod[fitprefsmethod] = diffprefs

        # Compare every pair of methods once.
        ordered = sorted(prefsbymethod.items())
        for i, (method1, prefs1) in enumerate(ordered):
            total1 = prefs1['total'].values
            for method2, prefs2 in ordered[i + 1:]:
                total2 = prefs2['total'].values
                r, _pvalue = scipy.stats.pearsonr(total1, total2)
                plt.scatter(total1, total2)
                plt.xlabel('fitprefsmethod{0}'.format(method1))
                plt.ylabel('fitprefsmethod{0}'.format(method2))
                plotfile = os.path.join(
                    self.outdir, '{0}_vs_{1}.pdf'.format(method1, method2))
                plt.savefig(plotfile)
                self.assertTrue(r > 0.98, "Low correlation between "
                                "fitprefsmethods: {0}\nSee {1}"
                                .format(r, plotfile))

        # Remove the scratch file if the simulation left it in the working directory.
        leftover = "custom_matrix_frequencies.txt"
        if os.path.isfile(leftover):
            os.remove(leftover)
Esempio n. 9
0
def get_random_tree(L, species, scaled_tree_string, kappa, iteration):
    """Simulate one alignment on an already scaled tree and return its pi and theta.

    A nucleotide model with the given ``kappa`` and equal state frequencies is
    evolved from an ancestor produced by ``generate_ancestor(L)``.  Only the
    sequence file is written; its name is built from ``species`` without its
    last element and from ``iteration + 1``.  Progress messages and the parsed
    tree are printed along the way.

    Returns:
        tuple ``(pi, theta)`` as computed by ``pi_value`` and ``theta_value``
        on the simulated sequences.
    """
    phylogeny = pyvolve.read_tree(tree=scaled_tree_string)
    print('read in the tree')
    pyvolve.print_tree(phylogeny)

    model = pyvolve.Model(
        'nucleotide',
        {'kappa': kappa, 'state_freqs': [0.25, 0.25, 0.25, 0.25]},
    )

    ancestor = generate_ancestor(L)
    print('generated an ancestor')

    evolver = pyvolve.Evolver(
        partitions=pyvolve.Partition(models=model, root_sequence=ancestor),
        tree=phylogeny,
    )
    alignment_name = ("simulated_alignment_" + str(species[:-1]) +
                      "_universal_" + str(iteration + 1) + ".fasta")
    evolver(ratefile=None, infofile=None, seqfile=alignment_name)
    print('evolved the sequences')

    sequences = evolver.get_sequences()
    pi = pi_value(sequences)
    theta = theta_value(sequences)
    return pi, theta
Esempio n. 10
0
def simulateAlignment(model, treeFile, alignmentPrefix, randomSeed=False):
    """
    Simulate an alignment given a model and tree (units = subs/site).

    Simulations done using `pyvolve`.

    Args:
        `model` (`phydmslib.models.Models` object)
            The model used for the simulations. Only
            models that can be passed to `pyvolve.Partitions`
            are supported.
        `treeFile` (str)
            Name of newick file used to simulate the sequences.
            The branch lengths should be in substitutions per site,
            which is the default units for all `phydms` outputs.
        `alignmentPrefix`
            Prefix for the files created by `pyvolve`.
        `randomSeed`
            Value passed to `random.seed` before simulating. The
            default, `False`, leaves the random generator untouched.
            Any other value, including `0`, is used as the seed.

    The result of this function is a simulated FASTA alignment
    file with the name having the prefix giving by `alignmentPrefix`
    and the suffix `'_simulatedalignment.fasta'`.
    """
    # Identity check rather than `== False`, so that a seed of 0 (which
    # compares equal to False) is honoured instead of silently ignored.
    if randomSeed is not False:
        random.seed(randomSeed)

    # Transform the branch lengths by dividing by the model `branchScale`
    tree = Bio.Phylo.read(treeFile, 'newick')
    for node in tree.get_terminals() + tree.get_nonterminals():
        if (node.branch_length is None) and (node == tree.root):
            # a root without a branch length gets a tiny placeholder length
            node.branch_length = 1e-06
        else:
            node.branch_length /= model.branchScale

    # Hand the rescaled tree to pyvolve through a temporary file, which is
    # removed even if writing or parsing fails.
    fd, temp_path = mkstemp()
    os.close(fd)
    try:
        Bio.Phylo.write(tree, temp_path, 'newick')
        pyvolve_tree = pyvolve.read_tree(file=temp_path)
    finally:
        os.remove(temp_path)

    # Make the `pyvolve` partition
    partitions = pyvolvePartitions(model)

    # Simulate the alignment
    alignment = '{0}_simulatedalignment.fasta'.format(alignmentPrefix)
    info = '_temp_{0}info.txt'.format(alignmentPrefix)
    rates = '_temp_{0}_ratefile.txt'.format(alignmentPrefix)
    evolver = pyvolve.Evolver(partitions=partitions, tree=pyvolve_tree)
    try:
        evolver(seqfile=alignment, infofile=info, ratefile=rates)
    finally:
        # Only the alignment is kept; the side files are removed whether or
        # not the simulation succeeded.
        for f in [rates, info, "custom_matrix_frequencies.txt"]:
            if os.path.isfile(f):
                os.remove(f)
    assert os.path.isfile(alignment)
Esempio n. 11
0
def simulate(f, seqfile, tree, mu_dict, length):
    ''' Simulate single partition according homogeneous mutation-selection model.

    Args:
        f: state frequencies, passed to the model as ``state_freqs``.
        seqfile: name of the sequence file the evolver writes.
        tree: either the name of a file holding a newick tree or a newick
            string; it is first tried as a file name, then as a string.
        mu_dict: mutation rates, passed to the model as ``mu``.
        length: partition size.
    '''

    try:
        my_tree = pyvolve.read_tree(file=tree)
    except Exception:
        # Not readable as a file: fall back to treating `tree` as a newick
        # string.  `Exception` (not a bare `except`) so that KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        my_tree = pyvolve.read_tree(tree=tree)

    model = pyvolve.Model("MutSel", {'state_freqs': f, 'mu': mu_dict})

    part = pyvolve.Partition(size=length, models=model)
    e = pyvolve.Evolver(partitions=part, tree=my_tree)
    # only the sequence file is written
    e(seqfile=seqfile, ratefile=None, infofile=None)
Esempio n. 12
0
def get_accurate_c(L, kappa):
    """Simulate sequences (ancestors included) on a fixed six-leaf tree and set up counters.

    An ancestor from ``generate_ancestor(L)`` is evolved on a hard-coded tree
    under a nucleotide model, and per-strain / per-site bookkeeping structures
    are initialised.

    NOTE(review): ``kappa`` is not used in the visible code -- the model is
    built with a hard-coded kappa value.  The visible code also ends right
    after the bookkeeping is initialised, so what is counted or returned
    cannot be stated from here.
    """

    ancestor = generate_ancestor(L)
    print(ancestor)

    # phylogeny = pyvolve.read_tree(tree = '(  (t1:0.5,t2:0.5)i1:0.5, (t3:0.5,t4:0.5)i2:0.5 ,  (t5:0.5,t6:0.5)i3:0.5, (t7:0.5,t8:0.5)i4:0.5  ) root;')
    phylogeny = pyvolve.read_tree(
        tree=
        '(  ((t7:0.5,t8:0.5)i4:0.5,(t5:0.5,t6:0.5)i3:0.5)i1:0.5, (t3:0.5,t4:0.5)i2:0.5  ) root;'
    )

    pyvolve.print_tree(phylogeny)

    freqs = [0.25, 0.25, 0.25, 0.25]

    # NOTE(review): kappa is fixed here and the `kappa` argument is ignored.
    nuc_model = pyvolve.Model('nucleotide', {
        'kappa': 1.86836732388,
        'state_freqs': freqs
    })

    my_partition = pyvolve.Partition(models=nuc_model, root_sequence=ancestor)

    my_evolver = pyvolve.Evolver(partitions=my_partition, tree=phylogeny)
    # my_evolver()
    # write_anc / anc request the ancestral sequences in addition to the tips
    my_evolver(write_anc=True)
    # strains = my_evolver.get_sequences()
    strains = my_evolver.get_sequences(anc=True)
    strain_names = list(strains.keys())  # pre-order traversal of the tree
    n = len(strain_names)

    print(strain_names)

    # one (initially empty) list per strain name
    c_sites = {}
    for key in strain_names:
        c_sites[key] = []

    site_counts = L * [
        None
    ]  # list of dictionaries to keep track of which nucleotides are at each convergent site; index = site; key = nucleotide, value = number of strains with that nucleotide
    strains_with_site = L * [
        None
    ]  # list of the strains that have a convergent mutation at each site; index = site
    # give every site its own counter dict and its own list
    for x in range(L):
        site_counts[x] = {'A': 0, 'T': 0, 'G': 0, 'C': 0}
        strains_with_site[x] = []
    # c_list_matrix = [[{} for x in range(n)] for y in range(n)] # matrix of the convergent mutation sites; the (i,j) entry is a dictionary of the convergent mutation sites between strain i and strain j; key = site, value = nucleotide

    c = 0
    # NOTE(review): bare expression with no effect; the code appears to be cut off here.
    strain_names
Esempio n. 13
0
def execute(tree, model, length, out, numSim):
    """Simulate ``numSim`` alignments on a tree under a named Pyvolve model.

    Args:
        tree: name of the file holding the newick tree.
        model: model name passed to ``pyvolve.Model``; also part of the
            output file names.
        length: partition size (converted with ``int``).
        out: prefix of the output file names.
        numSim: number of simulations (converted with ``int``).

    Each simulation ``i`` writes its sequences to ``<out>.<model>-<i>.fa``;
    that name is printed before the simulation runs.
    """
    # read in model, tree, and define partition
    pyvolveModel = pyvolve.Model(model)
    pyvolveTree = pyvolve.read_tree(file=tree)
    pyvolvePartition = pyvolve.Partition(models=pyvolveModel, size=int(length))

    # create evolver
    my_evolver = pyvolve.Evolver(tree=pyvolveTree, partitions=pyvolvePartition)
    # NOTE(review): one extra run with Pyvolve's default output settings; kept
    # because dropping it would change which files are produced -- confirm
    # whether it is needed.
    my_evolver()

    print("Simulating sequences...")
    # create simulated sequences
    for i in range(int(numSim)):
        # Build the name once so the printed name is the file actually
        # written (the message used to omit the model part).
        seqfile = str(out) + "." + str(model) + "-" + str(i) + ".fa"
        print(seqfile)
        my_evolver(seqfile=seqfile)
Esempio n. 14
0
    def run(self, tree_file, sequences_folder):
        """Simulate sequences along the gene tree stored in ``tree_file``.

        The first line of the file is read as a newick string, evolved with
        Pyvolve using ``self.model`` and ``self.size`` (branch lengths scaled by
        ``self.parameters["SCALING"]``), and the alignment (written with
        ``write_anc=True``) goes to ``sequences_folder``.  The sequence names
        are then corrected through ``self.correct_names``.

        Returns:
            None.  Returns early, without simulating, when the line contains no
            ``(`` or is just ``;``.
        """
        with open(tree_file) as handle:
            newick = handle.readline().strip()

        # Nothing to simulate unless the line holds at least one clade.
        if "(" not in newick or newick == ";":
            return None

        my_tree = ete3.Tree(newick, format=1)

        scaled_tree = pyvolve.read_tree(tree=my_tree.write(format=5),
                                        scale_tree=self.parameters["SCALING"])
        name_mapping = self.get_mapping_internal_names(scaled_tree, my_tree)
        evolver = pyvolve.Evolver(
            tree=scaled_tree,
            partitions=pyvolve.Partition(models=self.model, size=self.size),
        )

        fasta_name = tree_file.split("/")[-1].replace("_completetree.nwk", "_complete") + ".fasta"
        fasta_path = os.path.join(sequences_folder, fasta_name)
        evolver(seqfile=fasta_path, ratefile=None, infofile=None, write_anc=True)

        # Correct the names
        self.correct_names(fasta_path, name_mapping)
Esempio n. 15
0
def exampleFastaGenerator(nwkFile, fastaOutputLocation, seqLength, rate=1):
    """Simulate a nucleotide alignment on the tree in ``nwkFile`` and write it as FASTA.

    Args:
        nwkFile: name of the newick file holding the tree.
        fastaOutputLocation: name of the sequence file to write.
        seqLength: partition size.
        rate: value used for all six nucleotide-pair mutation rates.

    Only the sequence file is written.
    """
    # The old, unused tree-name computation called `nwkFile.rindex('/')` and
    # therefore raised ValueError for any path without a '/'; it is removed.
    phylogeny = pyvolve.read_tree(file=nwkFile)

    # Same rate for every unordered nucleotide pair.
    mutationRates = {
        pair: rate for pair in ("AC", "AG", "AT", "CG", "CT", "GT")
    }

    model = pyvolve.Model("nucleotide", {"mu": mutationRates})
    partition = pyvolve.Partition(models=model, size=seqLength)

    evolver = pyvolve.Evolver(partitions=[partition], tree=phylogeny)
    evolver(seqfile=fastaOutputLocation, ratefile=None, infofile=None)
Esempio n. 16
0
def evolve_nonconvergent_partition(g):
    """Evolve the non-convergent codon sites on the background tree.

    Reads ``num_convergent_site``, ``num_simulated_site``, ``background_Q`` and
    ``background_tree`` from ``g``.  A custom Pyvolve model is built from a
    shallow copy of the background rate matrix (with stdout/stderr suppressed
    while it is built), and ``num_simulated_site - num_convergent_site`` sites
    are evolved.  Output goes to the fixed
    ``tmp.csubst.simulate_nonconvergent*`` files, without ancestral sequences.
    """
    num_convergent = g['num_convergent_site']
    num_simulated = g['num_simulated_site']

    # NOTE(review): the reported start is num_simulated - num_convergent + 1,
    # which only affects the message below -- confirm it is the intended range.
    site_start = 1 if num_convergent == 0 else num_simulated - num_convergent + 1
    site_end = num_simulated
    print('Codon site {}-{}; Non-convergent codons'.format(
        site_start, site_end))

    num_nonconvergent_site = num_simulated - num_convergent
    q_matrix = copy.copy(g['background_Q'])
    with suppress_stdout_stderr():
        model = pyvolve.Model(model_type='custom',
                              name='root',
                              parameters={'matrix': q_matrix})

    evolver = pyvolve.Evolver(
        partitions=pyvolve.Partition(models=model, size=num_nonconvergent_site),
        tree=g['background_tree'],
    )
    evolver(ratefile='tmp.csubst.simulate_nonconvergent_ratefile.txt',
            infofile='tmp.csubst.simulate_nonconvergent_infofile.txt',
            seqfile='tmp.csubst.simulate_nonconvergent.fa',
            write_anc=False)
Esempio n. 17
0
    def simulate_single_sequence(self, name, gene_length, tree_file, sequences_folder):
        """Simulate a sequence for a single tip called ``name`` and keep only that one.

        A two-tip tree (``name`` and ``B``, both with branch length 1) is evolved
        with ``self.model`` for ``gene_length`` positions.  The FASTA file, named
        after ``tree_file``, is written to ``sequences_folder`` and then
        rewritten so that it only holds the entries whose header, minus its
        first character, equals ``name``.
        """
        two_tip_newick = "(A:1,B:1);".replace("A",name)
        evolver = pyvolve.Evolver(
            tree=pyvolve.read_tree(tree=two_tip_newick),
            partitions=pyvolve.Partition(models=self.model, size=gene_length),
        )

        fasta_file = tree_file.split("/")[-1].replace("_completetree.nwk", "_complete") + ".fasta"
        fasta_path = os.path.join(sequences_folder, fasta_file)
        evolver(seqfile=fasta_path, ratefile=None, infofile=None, write_anc=True)

        # Select single sequence: read everything first, then overwrite the file.
        entries = [(n, v) for n, v in af.fasta_reader(fasta_path) if n[1:] == name]
        af.fasta_writer(fasta_path, entries)
Esempio n. 18
0
def generateTree(tns, ntaxa, seqlen):
    """Simulate a birth-death tree, evolve nucleotide sequences on it and save them.

    Args:
        tns: dendropy taxon namespace used for the tree and the character matrix.
        ntaxa: number of extant tips of the simulated tree.
        seqlen: partition size.

    The simulated sequences are written to ``evolvedsequences.fasta`` in the
    current directory.

    Returns:
        The simulated dendropy tree.
    """
    import os
    import tempfile

    # Construct the tree
    t = dendropy.simulate.treesim.birth_death_tree(birth_rate=1.0, death_rate=0, taxon_namespace=tns, num_extant_tips=ntaxa)

    # Hand the tree to pyvolve through a private temporary newick file.  The
    # fixed path used before ('/tmp/pyvt') collided between concurrent runs
    # and was never removed.
    fd, newick_path = tempfile.mkstemp(suffix='.nwk')
    os.close(fd)
    try:
        t.write(path=newick_path, schema='newick', suppress_rooting=True, suppress_internal_node_labels=True)
        pot = pyvolve.read_tree(file=newick_path)
    finally:
        os.remove(newick_path)

    # Set pyvolve data type
    m1 = pyvolve.Model("nucleotide")
    p1 = pyvolve.Partition(models=m1, size=seqlen)

    # Simulate evolution without writing a sequence file
    e1 = pyvolve.Evolver(tree=pot, partitions=p1)
    e1(seqfile=None)

    seqs = e1.get_sequences()

    ds = dendropy.DnaCharacterMatrix.from_dict(seqs, taxon_namespace=tns)
    ds.write(path="evolvedsequences.fasta", schema="fasta")
    return t
Esempio n. 19
0
def simulate(tree_index,length):
    """
        Evolve ``length`` positions on one of three stored trees under an "MG" model.

        Inputs: tree_index (integer 0-2) selecting "alpha", "beta" or "charlie",
                read from trees/<name>.tre; length, the partition size
        Outputs: list of the simulated sequences
    """
    tree_name = ["alpha","beta","charlie"][tree_index]
    my_tree = pyvolve.read_tree(file = "trees/"+tree_name+".tre")

    # Corresponds to dN/dS = 0.65 / 0.98
    my_model = pyvolve.Model("MG", {"beta": 0.65, "alpha": 0.98})

    # The partition size is the number of positions to evolve.
    my_partition = pyvolve.Partition(models = my_model, size = length)

    # Evolve without rate or info files.
    my_evolver = pyvolve.Evolver(partitions = my_partition, tree = my_tree, ratefile = None, infofile = None)
    my_evolver(ratefile = None, infofile = None)

    # Extract the sequences
    return list(my_evolver.get_sequences().values())
    def setUp(self):
        """Set up for tests.

        Seeds ``scipy.random`` and ``random`` with 1, draws one site's
        preferences (``self.prefs``) and a shuffled copy (``self.realprefs``),
        builds ``self.model`` from the former and ``self.realmodel`` (an
        ``ExpCM``) from the latter, simulates an alignment on the NP tree
        under ``self.realmodel``, tallies codon and amino-acid counts, and
        creates the ``TreeLikelihood`` stored in ``self.tl``.
        """
        scipy.random.seed(1)
        random.seed(1)

        nsites = 1
        minpref = 0.001
        self.prefs = []
        self.realprefs = []
        for r in range(nsites):
            rprefs = scipy.random.dirichlet([0.5] * N_AA)
            # floor small preferences at minpref, then renormalise to sum to 1
            rprefs[rprefs < minpref] = minpref
            rprefs /= rprefs.sum()
            self.prefs.append(dict(zip(sorted(AA_TO_INDEX.keys()), rprefs)))
            # the "real" preferences are the same values in shuffled order
            # (rprefs is shuffled in place, after self.prefs was filled)
            scipy.random.shuffle(rprefs)
            self.realprefs.append(dict(zip(sorted(AA_TO_INDEX.keys()),
                                           rprefs)))
        self.kappa = 3.0
        self.omega = 3.0
        self.phi = scipy.random.dirichlet([5] * N_NT)
        self.model = self.MODEL(self.prefs,
                                prior=None,
                                kappa=self.kappa,
                                omega=self.omega,
                                phi=self.phi)
        self.realmodel = phydmslib.models.ExpCM(self.realprefs,
                                                kappa=self.kappa,
                                                omega=self.omega,
                                                mu=10.0,
                                                phi=self.phi)

        # tree file located relative to this test module
        treefile = os.path.abspath(
            os.path.join(os.path.dirname(__file__),
                         './NP_data/NP_tree.newick'))
        self.tree = Bio.Phylo.read(treefile, 'newick')
        self.tree.root_at_midpoint()

        # simulate alignment using realmodel
        evolver = pyvolve.Evolver(
            partitions=phydmslib.simulate.pyvolvePartitions(self.realmodel),
            tree=pyvolve.read_tree(file=treefile))
        alignmentfile = '_temp_fitprefs_simulatedalignment.fasta'
        info = '_temp_info.txt'
        rates = '_temp_ratefile.txt'
        evolver(seqfile=alignmentfile, infofile=info, ratefile=rates)
        self.alignment = phydmslib.file_io.ReadCodonAlignment(
            alignmentfile, True)
        # three characters per simulated site
        assert len(self.alignment[0][1]) == nsites * 3
        # the pyvolve output files are only needed to load the alignment
        for f in [alignmentfile, info, rates]:
            os.remove(f)
        # zero-initialised counters: site -> codon -> count and site -> aa index -> count
        self.codoncounts = dict([(r,
                                  dict([(INDEX_TO_CODON[c], 0)
                                        for c in range(N_CODON)]))
                                 for r in range(nsites)])
        self.aacounts = dict([(r, dict([(a, 0) for a in range(N_AA)]))
                              for r in range(nsites)])
        # NOTE(review): `r` is not rebound by this loop; it still holds the
        # value left by the earlier loops over range(nsites) (0, as nsites is
        # 1), and each whole sequence is used as a codon key.  This only
        # works for a single site -- confirm before raising nsites.
        for (head, seq) in self.alignment:
            self.codoncounts[r][seq] += 1
            self.aacounts[r][CODON_TO_AA[CODON_TO_INDEX[seq]]] += 1

        self.tl = phydmslib.treelikelihood.TreeLikelihood(
            self.tree, self.alignment, self.model)
Esempio n. 21
0
def main():
    """Main body of script.

    Reads the command line arguments (``ParseArguments``), the model
    parameters (``ReadParams``) and the amino-acid preferences
    (``dms_tools.file_io.ReadPreferences``).  For every site it builds a
    custom codon substitution matrix and a single-site pyvolve partition,
    evolves all partitions on the tree given on the command line, and finally
    rewrites the simulated alignment keeping only the first occurrence of
    each distinct sequence.
    """
    genetics = pyvolve.genetics.Genetics()
    codons = genetics.codons
    codon_dict = genetics.codon_dict
    purines = genetics.purines

    args = vars(ParseArguments().parse_args())
    print("Read the following command line arguments:")
    print("\n\t{0}".format("\n\t".join(
        ["{0} = {1}".format(key, value) for (key, value) in args.items()])))

    print("\nPerforming simulation with pyvolve version {0}".format(
        pyvolve.__version__))

    print("\nReading model params from {0}".format(args['modelparams']))
    params = ReadParams(args['modelparams'])
    for (param, paramvalue) in params.items():
        print("The value of {0} is {1}".format(param, paramvalue))

    print("\nReading preferences from {0}".format(args['prefs']))
    tup = dms_tools.file_io.ReadPreferences(args['prefs'])
    (sites, pis) = (tup[0], tup[2])
    print("\nRead amino-acid preferences for {0} sites".format(len(pis)))

    tree = pyvolve.read_tree(file=args['tree'])

    # create models for simulation
    partitions = []
    for r in sites:
        # Sites listed as diversifying get their own omega; all others use 1.
        # The site/omega pair is printed through a pre-formatted string so the
        # statement is valid, and prints the same text, on Python 2 and 3.
        if params['diversifyingsitesA'] and (int(r)
                                             in params['diversifyingsitesA']):
            omega = params['diversifyingomegaA']
            print("{0} {1}".format(r, omega))
        elif params['diversifyingsitesB'] and (
                int(r) in params['diversifyingsitesB']):
            omega = params['diversifyingomegaB']
            print("{0} {1}".format(r, omega))
        else:
            omega = 1.0
        matrix = []  # matrix[x][y] is rate of substitution from x to y
        for (xi, x) in enumerate(codons):
            row = []
            for y in codons:
                ntdiffs = [(x[j], y[j]) for j in range(3) if x[j] != y[j]]
                if len(ntdiffs) == 0:
                    assert x == y
                    # will later be adjusted to make row sum to zero
                    row.append(0)
                elif len(ntdiffs) > 1:
                    # multi-nucleotide codon change
                    row.append(0)
                else:
                    # single nucleotide change
                    (xnt, ynt) = ntdiffs[0]
                    if (xnt in purines) == (ynt in purines):
                        # transition
                        qxy = params['kappa'] * params['phi{0}'.format(ynt)]
                    else:
                        # transversion
                        qxy = params['phi{0}'.format(ynt)]
                    (xaa, yaa) = (codon_dict[x], codon_dict[y])
                    if xaa == yaa:
                        # synonymous change
                        fxy = 1.0
                    else:
                        pix = pis[r][xaa]**params['stringencyparameter']
                        piy = pis[r][yaa]**params['stringencyparameter']
                        if abs(pix - piy) < 1e-6:
                            # (near-)equal preferences: avoids dividing by
                            # (almost) zero in the expression below
                            fxy = omega
                        else:
                            fxy = omega * math.log(
                                piy / pix) / (1.0 - pix / piy)
                    row.append(qxy * fxy * params['scalerate'])
            assert len(row) == len(codons)
            # diagonal entry makes the row sum to zero
            row[xi] = -sum(row)
            matrix.append(row)
        model = pyvolve.Model("custom", {"matrix": matrix})
        partitions.append(pyvolve.Partition(models=model, size=1))

    print("\nSimulating evolution, writing to {0}...".format(
        args['simulatedalignment']))
    basename = os.path.splitext(args['simulatedalignment'])[0]
    evolver = pyvolve.Evolver(partitions=partitions, tree=tree)
    evolver(
        seqfile=args['simulatedalignment'],
        infofile='{0}_infofile.txt'.format(basename),
        ratefile='{0}_ratefile.txt'.format(basename),
    )
    print("Finished simulation")

    # Keep only the first record for each distinct sequence string.
    uniqueseqs = set()
    uniquealignment = []
    ninitial = 0
    for seq in Bio.SeqIO.parse(args['simulatedalignment'], 'fasta'):
        ninitial += 1
        seqstr = str(seq.seq)
        if seqstr not in uniqueseqs:
            uniqueseqs.add(seqstr)
            uniquealignment.append(seq)
    print(
        "\nAfter removing redundant sequences, we have shrunk {0} from {1} to {2} sequences"
        .format(args['simulatedalignment'], ninitial, len(uniquealignment)))
    Bio.SeqIO.write(uniquealignment, args['simulatedalignment'], 'fasta')
# This example script demonstrates how to evolve according to a nucleotide model with several partitions.
# In this example, the first partition has gamma-distributed sitewise rate heterogeneity, the second partition is homogeneous, and the third partition has custom sitewise rate heterogeneity.
# All models use default mutation-rate parameters.

import pyvolve

# Define a phylogeny, from a file containing a newick tree
my_tree = pyvolve.read_tree(file="file_with_tree.tre")

# Define first model and partition. This partition has a length of 50 positions
model1 = pyvolve.Model("nucleotide", alpha=0.7, num_categories=4)
part1 = pyvolve.Partition(models=model1, size=50)

# Define second model and partition. This partition has a length of 20 positions
model2 = pyvolve.Model("nucleotide")
part2 = pyvolve.Partition(models=model2, size=20)

# Define third model and partition. This partition has a length of 100 positions
model3 = pyvolve.Model("nucleotide",
                       rate_factors=[0.5, 1.6, 4.1],
                       rate_probs=[0.75, 0.2, 0.05])
part3 = pyvolve.Partition(models=model3, size=100)

# Provide all partitions *in the order in which they should be evolved* to Evolver and evolve
my_evolver = pyvolve.Evolver(partitions=[part1, part2, part3], tree=my_tree)
# Called without arguments, so no output file names are overridden here.
my_evolver()
Esempio n. 23
0
    # Command-line driver: simulate a nucleotide alignment along a Newick tree
    # and then run the downstream tree-distance and ANI reports on the result.
    usage ='''
    python pyvolve-genseq.py <tree.nwk> <seq-size> [<scale> default=1 (no scale)]
    '''
    if len(sys.argv) < 3:
        sys.exit(usage)

    tree_f = sys.argv[1]
    outfiles = tree_f  # output files are named after the input tree file
    size = sys.argv[2]
    # The scale factor is optional. Fall back to 1.0 (no scaling), as the
    # usage text promises; previously the fallback was None, which discarded
    # the intended default and handed a non-numeric scale to the calls below.
    scale = float(sys.argv[3]) if len(sys.argv) > 3 else 1.0

    print("Reading tree..")
    my_tree = pyvolve.read_tree(file = tree_f, scale_tree=scale)
    my_model = pyvolve.Model("nucleotide")
    my_partition = pyvolve.Partition(models = my_model, size = int(size))

    print("Simulating sequences..")
    my_evolver = pyvolve.Evolver(tree = my_tree, partitions = my_partition)
    my_evolver(ratefile = "%s.%s.ratefile.txt" % (outfiles, size),
               infofile = "%s.%s.infofile.txt" % (outfiles, size),
               seqfile = "%s.%s.seqfile.fasta" % (outfiles, size) )

    print("Tree info..")
    tree_distances_info(tree_f, scale, int(size))

    print("Running ANI on sequences..")
    pyani_seq("%s.%s.seqfile.fasta" % (outfiles, size))


Esempio n. 24
0
    def setUp(self):
        """Prepare the tree, a simulated alignment, and the model under test.

        Both random number generators are seeded first. A fixed Newick tree
        is parsed with Bio.Phylo and Pyvolve, a codon alignment is simulated
        under a uniform-frequency YNGKP_M0 model, and finally `self.model` is
        built according to the class attributes `MODEL` and
        `DISTRIBUTIONMODEL`.

        Raises:
            ValueError: if `MODEL` or `DISTRIBUTIONMODEL` is not one of the
                classes handled here.
        """
        random.seed(1)
        scipy.random.seed(1)

        self.underflowfreq = 1

        # Bio.Phylo reads from a file, so round-trip the Newick string
        # through a scratch file.
        self.newick = '((node1:0.2,node2:0.3)node4:0.3,node3:0.5)node5:0.04;'
        treefile = '_temp.tree'
        with open(treefile, 'w') as handle:
            handle.write(self.newick)
        self.tree = Bio.Phylo.read(treefile, 'newick')
        os.remove(treefile)

        # Simulate an alignment with Pyvolve under YNGKP_M0 with all
        # position-specific nucleotide frequencies set to 0.25.
        pyvolvetree = pyvolve.read_tree(tree=self.newick)
        self.nsites = 50
        self.nseqs = self.tree.count_terminals()
        uniform_e_pw = scipy.ndarray((3, N_NT), dtype='float')
        uniform_e_pw.fill(0.25)
        simulation_model = phydmslib.models.YNGKP_M0(uniform_e_pw, self.nsites)
        partitions = phydmslib.simulate.pyvolvePartitions(simulation_model)
        scratch = {
            'seqfile': '_temp_simulatedalignment.fasta',
            'infofile': '_temp_info.txt',
            'ratefile': '_temp_ratefile.txt',
        }
        evolver = pyvolve.Evolver(partitions=partitions, tree=pyvolvetree)
        evolver(**scratch)
        self.alignment = [
            (record.description, str(record.seq))
            for record in Bio.SeqIO.parse(scratch['seqfile'], 'fasta')
        ]
        for key in ('seqfile', 'infofile', 'ratefile'):
            os.remove(scratch[key])
        assert len(self.alignment[0][1]) == self.nsites * 3
        assert len(self.alignment) == self.nseqs

        # Draw random nucleotide frequencies and per-site amino-acid
        # preferences; preferences below `minpref` are raised to it and the
        # vector is renormalized.
        minpref = 0.02
        aminoacids = sorted(AA_TO_INDEX.keys())
        g = scipy.random.dirichlet([5] * N_NT)
        prefs = []
        for _ in range(self.nsites):
            site_prefs = scipy.random.dirichlet([0.5] * N_AA)
            site_prefs[site_prefs < minpref] = minpref
            site_prefs /= site_prefs.sum()
            prefs.append(dict(zip(aminoacids, site_prefs)))

        # Build the substitution model requested by the concrete test class.
        model_cls = self.MODEL
        if model_cls == phydmslib.models.ExpCM:
            self.model = phydmslib.models.ExpCM(prefs)
        elif model_cls == phydmslib.models.ExpCM_empirical_phi:
            self.model = phydmslib.models.ExpCM_empirical_phi(prefs, g)
        elif model_cls == phydmslib.models.ExpCM_empirical_phi_divpressure:
            divpressure = scipy.random.uniform(-1, 5, self.nsites)
            divpressure /= max(abs(divpressure))
            self.model = phydmslib.models.ExpCM_empirical_phi_divpressure(
                prefs, g, divpressure)
        elif model_cls == phydmslib.models.YNGKP_M0:
            random_e_pw = scipy.random.uniform(0.2, 0.8, size=(3, N_NT))
            random_e_pw = random_e_pw / random_e_pw.sum(axis=1, keepdims=True)
            self.model = phydmslib.models.YNGKP_M0(random_e_pw, self.nsites)
        else:
            raise ValueError("Invalid MODEL: {0}".format(self.MODEL))

        # Optionally wrap the model in a distribution model.
        if self.DISTRIBUTIONMODEL is not None:
            if (self.DISTRIBUTIONMODEL ==
                    phydmslib.models.GammaDistributedOmegaModel):
                self.model = self.DISTRIBUTIONMODEL(self.model, ncats=4)
            else:
                raise ValueError("Invalid DISTRIBUTIONMODEL: {0}".format(
                    self.DISTRIBUTIONMODEL))
Esempio n. 25
0
    def setUp(self):
        """Prepare the tree, branch lengths, simulated codons, and the model.

        Both random number generators are seeded first. A fixed Newick tree
        is parsed, its branch lengths are indexed by node number, a codon
        alignment is simulated with Pyvolve under a uniform-frequency
        YNGKP_M0 model, and `self.model` is built according to the class
        attributes `MODEL` and `DISTRIBUTIONMODEL`.

        Raises:
            ValueError: if `MODEL` or `DISTRIBUTIONMODEL` is not one of the
                classes handled here.
        """
        random.seed(1)
        scipy.random.seed(1)

        # Bio.Phylo reads from a file, so round-trip the Newick string
        # through a scratch file.
        self.newick = '((node1:0.2,node2:0.3)node4:0.3,node3:0.5)node5:0.04;'
        treefile = '_temp.tree'
        with open(treefile, 'w') as handle:
            handle.write(self.newick)
        self.tree = Bio.Phylo.read(treefile, 'newick')
        os.remove(treefile)

        # Branch length for every non-root node, keyed by the node's
        # single-digit number (the last character of its name).
        self.brlen = {
            int(nodename[-1]): float(length)
            for (nodename, length) in re.findall(
                r'(?P<name>node\d):(?P<brlen>\d+\.\d+)', self.newick)
            if nodename != self.tree.root.name
        }

        # Simulate an alignment with Pyvolve under YNGKP_M0 with all
        # position-specific nucleotide frequencies set to 0.25.
        pyvolvetree = pyvolve.read_tree(tree=self.newick)
        self.nsites = 60
        self.nseqs = self.tree.count_terminals()
        uniform_e_pw = scipy.ndarray((3, N_NT), dtype='float')
        uniform_e_pw.fill(0.25)
        simulation_model = phydmslib.models.YNGKP_M0(uniform_e_pw, self.nsites)
        partitions = phydmslib.simulate.pyvolvePartitions(simulation_model)
        scratch = {
            'seqfile': '_temp_simulatedalignment.fasta',
            'infofile': '_temp_info.txt',
            'ratefile': '_temp_ratefile.txt',
        }
        evolver = pyvolve.Evolver(partitions=partitions, tree=pyvolvetree)
        evolver(**scratch)
        self.alignment = [
            (record.description, str(record.seq))
            for record in Bio.SeqIO.parse(scratch['seqfile'], 'fasta')
        ]
        for key in ('seqfile', 'infofile', 'ratefile'):
            os.remove(scratch[key])
        assert len(self.alignment[0][1]) == self.nsites * 3
        assert len(self.alignment) == self.nseqs

        # codons[node number][site] -> codon index of the simulated tip
        # sequence at that site.
        self.codons = {}
        for tip in self.tree.get_terminals():
            tipname = tip.name
            tipseq = [s for (header, s) in self.alignment
                      if tipname == header][0]
            self.codons[int(tipname[-1])] = {
                r: CODON_TO_INDEX[tipseq[3 * r:3 * r + 3]]
                for r in range(self.nsites)
            }

        # Draw random nucleotide frequencies (floored at 0.1) and per-site
        # amino-acid preferences (floored at `minpref`), renormalizing each.
        minpref = 0.02
        aminoacids = sorted(AA_TO_INDEX.keys())
        g = scipy.random.dirichlet([5] * N_NT)
        g[g < 0.1] = 0.1
        g /= g.sum()
        prefs = []
        for _ in range(self.nsites):
            site_prefs = scipy.random.dirichlet([0.5] * N_AA)
            site_prefs[site_prefs < minpref] = minpref
            site_prefs /= site_prefs.sum()
            prefs.append(dict(zip(aminoacids, site_prefs)))

        # Build the substitution model requested by the concrete test class.
        model_cls = self.MODEL
        if model_cls == phydmslib.models.ExpCM:
            self.model = phydmslib.models.ExpCM(prefs)
        elif model_cls == phydmslib.models.ExpCM_empirical_phi:
            self.model = phydmslib.models.ExpCM_empirical_phi(prefs, g)
        elif model_cls == phydmslib.models.ExpCM_empirical_phi_divpressure:
            divpressure = scipy.random.uniform(-1, 5, self.nsites)
            divpressure /= max(abs(divpressure))
            self.model = phydmslib.models.ExpCM_empirical_phi_divpressure(
                prefs, g, divpressure)
        elif model_cls == phydmslib.models.YNGKP_M0:
            random_e_pw = scipy.random.uniform(0.2, 0.8, size=(3, N_NT))
            random_e_pw = random_e_pw / random_e_pw.sum(axis=1, keepdims=True)
            self.model = phydmslib.models.YNGKP_M0(random_e_pw, self.nsites)
        else:
            raise ValueError("Invalid MODEL: {0}".format(self.MODEL))

        # Optionally wrap the model in one of the supported distribution
        # models; both are constructed the same way.
        if self.DISTRIBUTIONMODEL is not None:
            if (self.DISTRIBUTIONMODEL ==
                    phydmslib.models.GammaDistributedOmegaModel
                    or self.DISTRIBUTIONMODEL ==
                    phydmslib.models.GammaDistributedBetaModel):
                self.model = self.DISTRIBUTIONMODEL(self.model, ncats=4)
            else:
                raise ValueError("Invalid DISTRIBUTIONMODEL: {0}".format(
                    self.DISTRIBUTIONMODEL))
Esempio n. 26
0
import sys, os
import pyvolve
import glob
from mungo.fasta import FastaReader
from collections import defaultdict
# Command-line arguments: a protein FASTA used to estimate amino-acid
# frequencies, a Newick tree file, and the path of the FASTA to write.
input_fasta = sys.argv[1]
input_tree_txt = sys.argv[2]
output_seqs = sys.argv[3]

# Estimate equilibrium amino-acid frequencies from the input FASTA (earlier
# revisions hard-coded the path to this file).
f = pyvolve.ReadFrequencies("amino_acid", file=input_fasta)
frequencies = f.compute_frequencies()

# Simulate 200 positions under MTMAM along the tree, with branch lengths
# scaled by 0.5. Only the sequence file is written.
my_tree_1 = pyvolve.read_tree(file=input_tree_txt, scale_tree=0.5)
my_model_1 = pyvolve.Model("MTMAM", {"state_freqs": frequencies})
my_partition_1 = pyvolve.Partition(size=200, models=my_model_1)
my_evolver_1 = pyvolve.Evolver(tree=my_tree_1, partitions=my_partition_1)
my_evolver_1(seqfile=output_seqs, ratefile=None, infofile=None)

# Read the simulated sequences back and give them sequential IDs
# (seq0, seq1, ...) in file order.
seqs = {}
seq_list = []
for h, s in FastaReader(output_seqs):
    new_id = "seq" + str(len(seq_list))
    seq_list.append(new_id)
    seqs[new_id] = s
count = len(seq_list)

# Overwrite the output file with the renamed records.
with open(output_seqs, 'w') as outfile:
    for s in seq_list:
        outfile.write(">" + s + "\n" + seqs[s] + "\n")
Esempio n. 27
0
# Read the phylogeny from a file containing a Newick tree.
my_tree = pyvolve.read_tree(file="file_with_tree.tre")

# Define an amino-acid model as a pyvolve.Model object. This example uses the
# WAG model with its default parameters; see the example script
# custom_aminoacid.py for other options.
#
# Rate heterogeneity can be specified in either of two ways:
#   1) Custom rates: pass a list `rate_factors` when defining the Model. By
#      default each factor is assigned to sites with equal probability; pass
#      `rate_probs` to change the probabilities.
#   2) Gamma rates: pass `num_categories` and `alpha` when defining the Model.
#      <num_categories> rates are taken from a gamma distribution governed by
#      <alpha>; they are equiprobable unless overridden by `rate_probs`.
#
# Several model definitions follow (the first argument may be any other
# model, as desired).

# Custom rates: each of the four factors applies to 25% of sites.
my_model1 = pyvolve.Model("WAG", rate_factors=[0.3, 0.8, 1.5, 2.45])

# Custom rates with explicit probabilities: 70% of sites evolve with rate 0.3,
# 20% with rate 0.8, 5% with rate 1.5, and 5% with rate 2.45.
my_model2 = pyvolve.Model(
    "WAG",
    rate_probs=[0.7, 0.2, 0.05, 0.05],
    rate_factors=[0.3, 0.8, 1.5, 2.45],
)

# Gamma rates.
my_model3 = pyvolve.Model("WAG", num_categories=5, alpha=0.6)

# Assign the chosen model to a pyvolve.Partition of 250 positions.
my_partition = pyvolve.Partition(size=250, models=my_model2)

# Evolve!
my_evolver = pyvolve.Evolver(tree=my_tree, partitions=my_partition)
my_evolver()
Esempio n. 28
0
def _count_convergent_mutations(strains, ancestor, L):
    """Count (site, derived state) pairs shared by at least two strains.

    Args:
        strains: mapping of strain name to its sequence (indexable by site).
        ancestor: the root sequence the strains evolved from.
        L: number of sites to examine.

    Returns:
        The number of (site, state) combinations in which two or more
        strains carry the same state and that state differs from the
        ancestral one at that site.
    """
    convergent = 0
    for site in range(L):
        ancestral_state = ancestor[site]
        # Tally, per derived state, how many strains carry it at this site.
        # Each strain contributes exactly once, so strains that happen to
        # have identical sequences are still counted separately.
        derived_counts = {}
        for sequence in strains.values():
            state = sequence[site]
            if state != ancestral_state:
                derived_counts[state] = derived_counts.get(state, 0) + 1
        convergent += sum(1 for k in derived_counts.values() if k >= 2)
    return convergent


def get_c(L, kappa):
    """Simulate four strains from a random ancestor and count convergent mutations.

    An ancestor of length `L` is generated, evolved with Pyvolve along a
    fixed four-tip tree under a nucleotide model with equal state
    frequencies, and the simulated tip sequences are scanned for sites where
    two or more strains share the same non-ancestral nucleotide.

    The ancestor, the tree, and the resulting count are printed as a side
    effect, and calling the evolver with no arguments lets Pyvolve write its
    default output files.

    Args:
        L: length of the ancestral sequence (number of sites).
        kappa: currently unused.
            NOTE(review): the model below uses a hard-coded kappa of
            1.86836732388 regardless of this argument -- confirm whether the
            argument was meant to be passed through.

    Returns:
        The number of (site, nucleotide) combinations that are convergent,
        i.e. shared by at least two strains and different from the ancestor.
    """
    ancestor = generate_ancestor(L)
    print(ancestor)

    phylogeny = pyvolve.read_tree(
        tree='((t1:0.5,t2:0.5)i1:0.5,(t3:0.5,t4:0.5)i2:0.5)root;')
    pyvolve.print_tree(phylogeny)

    freqs = [0.25, 0.25, 0.25, 0.25]

    nuc_model = pyvolve.Model('nucleotide', {
        'kappa': 1.86836732388,
        'state_freqs': freqs
    })

    my_partition = pyvolve.Partition(models=nuc_model, root_sequence=ancestor)

    my_evolver = pyvolve.Evolver(partitions=my_partition, tree=phylogeny)
    my_evolver()
    strains = my_evolver.get_sequences()

    # Previously strains were de-duplicated by comparing sequence contents,
    # so two distinct strains with identical sequences were counted as one
    # and their shared mutations were missed. Counting per strain fixes that.
    c = _count_convergent_mutations(strains, ancestor, L)
    print(c)
    return c
Esempio n. 29
0
    def test_branchScale(self):
        """Check that simulated substitutions agree with the model's branch scale.

        Sequences are simulated repeatedly along a two-tip tree. The mean
        per-site Hamming distance between the tips is compared with
        `branchScale * t`, and with the mean total branch length inferred by
        `TreeLikelihood`, each within a relative tolerance of 0.2.
        """
        scipy.random.seed(1)
        random.seed(1)

        # Define the model; `mu` is the only free parameter so that the
        # simulation can be tested.
        nsites = 50
        minpref = 0.01
        aminoacids = sorted(AA_TO_INDEX.keys())
        prefs = []
        for _ in range(nsites):
            site_prefs = scipy.random.dirichlet([1] * N_AA)
            site_prefs[site_prefs < minpref] = minpref
            site_prefs /= site_prefs.sum()
            prefs.append(dict(zip(aminoacids, site_prefs)))
        expcm_kwargs = {
            'kappa': 4.2,
            'omega': 0.4,
            'beta': 1.5,
            'mu': 0.3,
            'freeparams': ['mu'],
        }
        if self.MODEL == phydmslib.models.ExpCM:
            phi = scipy.random.dirichlet([7] * N_NT)
            model = phydmslib.models.ExpCM(prefs, phi=phi, **expcm_kwargs)
        elif self.MODEL == phydmslib.models.ExpCM_empirical_phi:
            g = scipy.random.dirichlet([7] * N_NT)
            model = phydmslib.models.ExpCM_empirical_phi(
                prefs, g, **expcm_kwargs)
        elif self.MODEL == phydmslib.models.YNGKP_M0:
            e_pw = scipy.asarray(
                [scipy.random.dirichlet([7] * N_NT) for _ in range(3)])
            model = phydmslib.models.YNGKP_M0(e_pw, nsites)
        else:
            raise ValueError("Invalid MODEL: {0}".format(type(self.MODEL)))
        partitions = phydmslib.simulate.pyvolvePartitions(model)

        # The tree is two tips joined through the root; the total path
        # length `t` is split evenly between the two branches.
        t = 0.04 / model.branchScale
        newicktree = '(tip1:{0},tip2:{0});'.format(t / 2.0)
        pyvolvetree = pyvolve.read_tree(tree=newicktree)
        treefile = '_temp.tree'
        with open(treefile, 'w') as handle:
            handle.write(newicktree)
        biotree = Bio.Phylo.read(treefile, 'newick')
        os.remove(treefile)

        # Simulate the two sequences many times. For each replicate, add the
        # Hamming distance between the tips (a heuristic estimate of the
        # number of substitutions that is roughly right for short branches)
        # and the total tip branch length inferred by `TreeLikelihood`;
        # averaging over replicates improves accuracy.
        scratch = {
            'seqfile': '_temp_branchScale_simulatedalignment.fasta',
            'infofile': '_temp_info.txt',
            'ratefile': '_temp_ratefile.txt',
        }
        evolver = pyvolve.Evolver(partitions=partitions, tree=pyvolvetree)
        nsubs = 0
        treedist = 0.0
        nreplicates = 100
        for _ in range(nreplicates):
            evolver(**scratch)
            a = [(record.description, str(record.seq))
                 for record in Bio.SeqIO.parse(scratch['seqfile'], 'fasta')]
            assert len(a[0][1]) == len(a[1][1]) == nsites * 3
            for key in ('seqfile', 'infofile', 'ratefile'):
                if os.path.isfile(scratch[key]):
                    os.remove(scratch[key])
            first_seq, second_seq = a[0][1], a[1][1]
            for r in range(nsites):
                codon1 = first_seq[3 * r:3 * r + 3]
                codon2 = second_seq[3 * r:3 * r + 3]
                nsubs += sum(1 for (x, y) in zip(codon1, codon2) if x != y)
            tl = phydmslib.treelikelihood.TreeLikelihood(biotree, a, model)
            tl.maximizeLikelihood()
            treedist += sum(
                tip.branch_length for tip in tl.tree.get_terminals())
        nsubs /= float(nsites * nreplicates)
        treedist /= float(nreplicates)

        # We expect nsubs = branchScale * t; `rtol` allows for the noise of
        # simulating a finite number of sites.
        self.assertTrue(
            scipy.allclose(nsubs, model.branchScale * t, rtol=0.2),
            ("Simulated subs per site of {0} is not close "
             "to expected value of {1} (branchScale = {2}, t = {3})").format(
                 nsubs, t * model.branchScale, model.branchScale, t))
        self.assertTrue(
            scipy.allclose(treedist, nsubs, rtol=0.2),
            ("Simulated subs per site of {0} is not close to inferred "
             "branch length of {1}").format(nsubs, treedist))
Esempio n. 30
0
def main(strain, seedFilepath, gffFilepath):
    """Simulate four genomes from a seed genome, evolving only coding regions.

    The first record of the seed FASTA is split into alternating non-coding
    and coding segments using the GFF annotations. Non-coding segments are
    copied unchanged; coding segments (minus their first and last codon) are
    evolved with Pyvolve under the "ECMrest" codon model along the tree in
    "tmp.tree". The segments are concatenated back together and one FASTA
    file per simulated genome is written under ../data/dnaseq/.

    Args:
        strain: name used as the prefix of the output genome IDs.
        seedFilepath: path to a FASTA file; only its first record is used.
        gffFilepath: path to the annotation file passed to `read_gff`, whose
            rows are read for "start", "end" and "strand".

    Raises:
        ValueError: if the seed FASTA contains no records.
    """
    # Only the first record of the seed file is used.
    for record in SeqIO.parse(seedFilepath, "fasta"):
        seedRec = record
        break
    else:
        raise ValueError(
            "No sequence records found in {}".format(seedFilepath))
    gff_df = read_gff(gffFilepath)

    # Build the ordered list of segments as (category, start, end, strand):
    # "nc" for the gap before each annotated feature, "c" for the feature
    # itself, and a final "nc" segment up to the end of the seed sequence.
    prv = 0
    pos_lst = []
    for _, row in gff_df.iterrows():
        pos_lst.append(("nc", prv, row["start"] - 1, "+"))
        pos_lst.append(("c", row["start"] - 1, row["end"], row["strand"]))
        prv = row["end"]
    pos_lst.append(("nc", prv, len(seedRec), "+"))

    # configuration for evolution
    treeFilepath = "tmp.tree"
    mytree = pyvolve.read_tree(file=treeFilepath)
    ncm = pyvolve.Model("nucleotide")  # non-coding model (currently unused)
    cm = pyvolve.Model("ECMrest")  # coding model

    outputSeq_lst = [Seq("") for _ in range(4)]  # assuming tree has 4 nodes
    for pos in pos_lst:
        category, start, end, strand = pos

        # get rootSeq according to start, end, strand info
        rootSeq = seedRec.seq[start:end]
        if strand == "-":
            rootSeq = rootSeq.reverse_complement()
        rootSeq = str(rootSeq)

        # get simulated sequences
        if category == "nc":
            # Non-coding segments are not evolved; every simulated genome
            # receives an identical copy of the seed segment.
            rec_lst = [SeqRecord(Seq(rootSeq)) for _ in range(4)]
        elif category == "c":
            partition = pyvolve.Partition(
                models=cm,
                root_sequence=rootSeq[3:-3])  # remove start & stop codon
            # Evolver takes its partition(s) through the `partitions`
            # keyword; the previous `partition=` spelling was not the
            # keyword used everywhere else in this file.
            evolver = pyvolve.Evolver(partitions=partition, tree=mytree)
            rec_lst = get_evolved(evolver)
            for rec in rec_lst:
                # add the original start and stop codons back
                rec.seq = rootSeq[:3] + rec.seq + rootSeq[-3:]
        assert len(rec_lst) == len(outputSeq_lst)

        # concat to outputSeq_lst
        for i, rec in enumerate(rec_lst):
            simSeq = rec.seq
            if strand == "-":
                # restore the orientation of the seed genome
                simSeq = simSeq.reverse_complement()
            outputSeq_lst[i] += simSeq

    for i, outputSeq in enumerate(outputSeq_lst):
        genomeId = "{}_sim{}".format(strain, i + 1)
        outFilepath = "../data/dnaseq/{}.dnaseq".format(genomeId)
        with open(outFilepath, "w") as f:
            seqname = "{}:seq".format(genomeId)
            rec = SeqRecord(outputSeq, id=seqname, description="")
            SeqIO.write(rec, f, "fasta")
        print("DONE: output {}".format(outFilepath))