def _evolve_sequence(tree, L, gtr):
    """
    Produce random sequence of a given length L, evolve it on a given tree
    using the given gtr model.

    Args:
     - tree: tree to evolve the sequence on. A ``str`` is handed to
       ``Phylo.read(tree, 'newick')``; any other object is used as the
       tree directly.
     - L: length of the random root sequence.
     - gtr: substitution model. The code reads ``gtr.alphabet``, ``gtr.Pi``
       and ``gtr.profile_map`` and calls ``gtr.propagate_profile``.

    Returns:
     - root_seq: the sequence drawn for the root of the tree.
     - aln: multiple sequence alignment built from the sequences of the
       terminal nodes of the tree.

    Side effects:
     - every node gets a ``ref_seq`` attribute and every non-root node an
       ``up`` attribute pointing to its parent.
    """
    if isinstance(tree, str):
        tree = Phylo.read(tree, 'newick')

    # Draw the root sequence with the requested length (this used to be
    # hard-coded to 1000, which silently ignored L).
    root_seq = np.random.choice(gtr.alphabet, p=gtr.Pi, size=L)
    tree.root.ref_seq = root_seq

    print("Started sequence evolution...")
    for node in tree.find_clades():
        # Link the children to their parent before they are visited
        # themselves, so that node.up is available further down.
        for c in node.clades:
            c.up = node

        # Nodes that already carry a sequence (the root) are not re-sampled.
        if hasattr(node, 'ref_seq'):
            continue

        t = node.branch_length
        p = gtr.propagate_profile(
            treetime.seq_utils.seq2prof(node.up.ref_seq, gtr.profile_map), t)

        # normalize profile: every row has to sum to one to be usable as a
        # probability vector below
        p = (p.T / p.sum(axis=1)).T

        # sample mutations randomly: one state index per row of the profile
        ref_seq_idxs = np.array([
            int(np.random.choice(np.arange(p.shape[1]), p=p[k]))
            for k in np.arange(p.shape[0])
        ])
        node.ref_seq = np.array([gtr.alphabet[k] for k in ref_seq_idxs])

    records = [
        Align.SeqRecord(Align.Seq("".join(k.ref_seq)), id=k.name, name=k.name)
        for k in tree.get_terminals()
    ]
    aln = Align.MultipleSeqAlignment(records)
    print("Sequence evolution done...")
    return root_seq, aln
genome, gene = entry.name.split('|') else: genome, gene = entry.name.split('_') if genome in genomes[aln]: sys.exit('\t**Error, duplicated genome in %s: %s' %(aln, genome)) genomes[aln].add(genome) genome_union = set.union(*genomes.values()) missing_genes = {} # just to keep track of the number of missing marker genes in each genome concatenation = {} for genome in genome_union: missing_genes[genome] = 0 concatenation[genome] = Align.SeqRecord( Align.Seq('', aln_alphabet) ) concatenation[genome].name = genome concatenation[genome].id = genome concatenation[genome].description = genome # # fill the handles with the marker sequences from each genome total_genes = 0.0 # keep track of the number of genes added to the concatenation current_position = 1 partitions = open('%s/concatenated_partitions' %output_folder, 'wb') for aln in os.listdir(aln_folder): tmp_aln = AlignIO.read( '%s/%s' %(aln_folder, aln), 'fasta' ) aln_length = tmp_aln.get_alignment_length() # get the expected size of the alignment so you can compare if all have the same size total_genes += aln_length
#!/usr/bin/env python
import Bio.Align as align, Bio.SeqIO as sio, Bio, Bio.AlignIO as aio
import sys
import subprocess as spc

# Two unaligned toy sequences that are piped through muscle below.
seqs = [
    align.SeqRecord(align.Seq('ATGATGGGGGATGATG')),
    align.SeqRecord(align.Seq('ATGATGATGATG')),
]

# Start muscle with CLUSTAL output (-clw). The command is passed as an
# argument list, so no shell is involved. universal_newlines=True opens the
# pipes in text mode, which the FASTA writer / CLUSTAL parser need on
# Python 3 and which is harmless on Python 2.
m_proc = spc.Popen(
    ['muscle', '-clw'],
    stdin=spc.PIPE,
    stdout=spc.PIPE,
    stderr=spc.PIPE,
    universal_newlines=True,
)

# Feed the sequences as FASTA and close stdin to signal end of input.
sio.write(seqs, m_proc.stdin, "fasta")
m_proc.stdin.close()

# Parse the alignment from the child's stdout. The result gets its own name
# so that the `align` module alias is not overwritten.
alignment = aio.read(m_proc.stdout, "clustal")

# Release the remaining pipes and reap the child process. stderr is
# captured but its content is not used.
m_proc.stdout.close()
m_proc.stderr.close()
m_proc.wait()

print(alignment)
def evolve_seq(treefile, basename, mu=0.0001, L=1000, mygtr=treetime.GTR.standard('jc')):
    """
    Generate a random sequence of a given length, and evolve it on the tree

    Args:
     - treefile: filename for the tree, on which a sequence should be evolved.
     - basename: filename prefix to save alignments.
     - mu: mutation rate. The units of the mutation rate should be consistent
       with the tree branch length
     - L: sequence length.
     - mygtr: GTR model for sequence evolution. NOTE: the model is modified
       in place (its ``mu`` attribute is overwritten with ``mu``); this also
       applies to the shared default instance.

    Returns:
     - aln: alignment built from the sequences of the terminal nodes.
     - full_aln: second alignment; currently built from the same terminal
       nodes as ``aln``.
     - mu_real: summed per-site fraction of changed positions over all
       branches, divided by the summed branch length.

    Side effects:
     - writes ``basename + '.aln.ev.fasta'`` and
       ``basename + '.aln.ev_full.fasta'``.
     - every node of the tree gets a ``ref_seq`` attribute; every non-root
       node additionally gets ``up`` and ``ref_mutations``.
    """
    from treetime import seq_utils
    from Bio import Phylo, AlignIO
    import numpy as np
    # `Align` is not imported here; it is taken from the enclosing module scope.

    mygtr.mu = mu
    tree = Phylo.read(treefile, 'newick')
    tree.root.ref_seq = np.random.choice(mygtr.alphabet, p=mygtr.Pi, size=L)

    print("Started sequence evolution...")
    mu_real = 0.0
    total_branch_length = 0
    for node in tree.find_clades():
        # Link the children to their parent before they are visited
        # themselves, so that node.up is available further down.
        for c in node.clades:
            c.up = node

        # Nodes that already carry a sequence (the root) are not re-sampled.
        if hasattr(node, 'ref_seq'):
            continue

        t = node.branch_length
        p = mygtr.propagate_profile(
            seq_utils.seq2prof(node.up.ref_seq, mygtr.profile_map), t)

        # normalize profile: every row has to sum to one to be usable as a
        # probability vector below
        p = (p.T / p.sum(axis=1)).T

        # sample mutations randomly: one state index per row of the profile
        ref_seq_idxs = np.array([
            int(np.random.choice(np.arange(p.shape[1]), p=p[k]))
            for k in np.arange(p.shape[0])
        ])
        node.ref_seq = np.array([mygtr.alphabet[k] for k in ref_seq_idxs])

        # (ancestral state, position, derived state) for every changed site.
        # The built-in zip replaces itertools.izip, which does not exist on
        # Python 3; the resulting pairs are the same.
        node.ref_mutations = [
            (anc, pos, der)
            for pos, (anc, der) in enumerate(zip(node.up.ref_seq, node.ref_seq))
            if anc != der
        ]

        mu_real += 1.0 * (node.ref_seq != node.up.ref_seq).sum() / L
        total_branch_length += t

    # Changes per site per unit of branch length.
    mu_real /= total_branch_length
    print("Mutation rate is {}".format(mu_real))

    records = [
        Align.SeqRecord(Align.Seq("".join(k.ref_seq)), id=k.name, name=k.name)
        for k in tree.get_terminals()
    ]
    # NOTE(review): identical to `records` (terminal nodes only) -- confirm
    # whether the "full" alignment was meant to cover more nodes.
    full_records = [
        Align.SeqRecord(Align.Seq("".join(k.ref_seq)), id=k.name, name=k.name)
        for k in tree.get_terminals()
    ]
    aln = Align.MultipleSeqAlignment(records)
    full_aln = Align.MultipleSeqAlignment(full_records)
    print("Sequence evolution done...")

    # save results
    AlignIO.write(aln, basename + '.aln.ev.fasta', 'fasta')
    AlignIO.write(full_aln, basename + '.aln.ev_full.fasta', 'fasta')
    return aln, full_aln, mu_real