def finalize():
    """Simulate sequences along every pruned tree with Pyvolve and store the leaves.

    For each ``(root, treestr)`` pair in ``GC.pruned_newick_trees`` the Newick
    string is evolved from ``root.get_seq()`` under ``GC.pyvolve_model``.
    Pyvolve's rate, info and sequence files are written to
    ``pyvolve_output/<root label>_*``.  Every returned sequence is appended to
    ``GC.final_sequences[cn_label][sample_time]`` as a ``(leaf, seq)`` tuple;
    leaf labels are split on ``|`` into ``virus|cn_label|sample_time``.

    Raises:
        AssertionError: if Pyvolve rejects a tree while it is being set up.
    """
    if GC.random_number_seed is not None:
        from warnings import warn
        warn("random_number_seed specified, but Pyvolve does not support seeding its random generator")
    makedirs("pyvolve_output", exist_ok=True)
    label_to_node = MF.modules['TreeNode'].label_to_node()
    for root, treestr in GC.pruned_newick_trees:
        # Lazy function-scope import: Pyvolve is only needed when there is at
        # least one tree to simulate (repeated imports are cached).
        import pyvolve

        # run Pyvolve
        treestr = treestr.strip()
        label = root.get_label()
        rootseq = root.get_seq()
        if GC.VERBOSE:
            print('[%s] Pyvolve evolving sequences on tree: %s' % (datetime.now(), treestr), file=stderr)
            print('[%s] Pyvolve root sequence: %s' % (datetime.now(), rootseq), file=stderr)

        # Only a tree that is not already parenthesised needs wrapping.  The
        # previous test compared the *whole* string with '(' and was therefore
        # always true, nesting every tree in a redundant pair of parentheses.
        if not treestr.startswith('('):
            treestr = '(%s);' % treestr[:-1]  # drop the trailing ';' before wrapping

        try:
            tree = pyvolve.read_tree(tree=treestr)
            partition = pyvolve.Partition(models=GC.pyvolve_model, root_sequence=rootseq)
            evolver = pyvolve.Evolver(partitions=partition, tree=tree)
        except AssertionError as err:
            # Raise explicitly (instead of `assert False`) so the message
            # survives `python -O`; the exception type is unchanged.
            raise AssertionError("Error setting up Pyvolve. Tree: %s" % treestr) from err

        ratefile = "pyvolve_output/%s_ratefile.txt" % label  # set each to None to not generate these files
        infofile = "pyvolve_output/%s_infofile.txt" % label
        seqfile = "pyvolve_output/%s_seqfile.fasta" % label
        evolver(ratefile=ratefile, infofile=infofile, seqfile=seqfile)
        seqs = evolver.get_sequences()  # use anc=True to get internal sequences as well

        # store leaf sequences in GlobalContext
        if not hasattr(GC, 'final_sequences'):
            # GC.final_sequences[cn_node][t] = list of (label, seq) tuples
            GC.final_sequences = {}
        for leaf, seq in seqs.items():
            virus_label, cn_label, sample_time = leaf.split('|')
            by_time = GC.final_sequences.setdefault(cn_label, {})
            by_time.setdefault(float(sample_time), []).append((leaf, seq))
def run_u(self, tree_file, sequences_folder):
    """Simulate sequences on one gene tree after applying rate multipliers.

    The first line of ``tree_file`` is read as a Newick string.  When it holds
    no tree (no ``(`` or just ``;``) the method returns ``None`` without
    simulating.  Otherwise every branch length is multiplied by the gene-family
    multiplier and by the species-tree-branch multiplier, the tree is evolved
    with Pyvolve (ancestral sequences included), and the FASTA written to
    ``sequences_folder`` is passed to ``self.correct_names``.
    """
    with open(tree_file) as handle:
        newick = handle.readline().strip()

    if "(" not in newick or newick == ";":
        return None

    my_tree = ete3.Tree(newick, format=1)
    my_tree.get_tree_root().name = "Root"

    # First the multiplier of the gene family, then the multiplier of the
    # species tree branch each node belongs to.
    family = tree_file.split("_")[-2].split("/")[-1]
    gf_multiplier = self.gf_multipliers[family]
    for node in my_tree.traverse():
        branch = node.name.split("_")[0]
        node.dist = node.dist * gf_multiplier * self.st_multipliers[branch]

    tree = pyvolve.read_tree(tree=my_tree.write(format=5),
                             scale_tree=self.parameters["SCALING"])
    name_mapping = self.get_mapping_internal_names(tree, my_tree)

    evolver = pyvolve.Evolver(
        tree=tree,
        partitions=pyvolve.Partition(models=self.model, size=self.size),
    )

    fasta_file = tree_file.split("/")[-1].replace("_completetree.nwk", "_") + "complete.fasta"
    fasta_path = os.path.join(sequences_folder, fasta_file)
    evolver(seqfile=fasta_path, ratefile=None, infofile=None, write_anc=True)

    # Correct the names
    self.correct_names(fasta_path, name_mapping)
def evolve(newicks, sequence_size, scale_tree):
    """Evolve nucleotide sequences on each Newick tree and merge them into one file.

    Every tree in ``newicks`` is simulated with a default nucleotide model and
    ``sequence_size`` positions, written as ``temp<i>.fasta``, converted to
    ``temp<i>.phyl`` with ``fasta_to_phyl`` and the FASTA is removed.  The
    PHYLIP files are then concatenated (each followed by a newline) into
    ``temp_seq.phyl`` and deleted.

    Returns:
        The name of the merged file, ``"temp_seq.phyl"``.
    """
    my_model = pyvolve.Model("nucleotide")
    partition = pyvolve.Partition(models=my_model, size=sequence_size)

    phy_files = []
    for index, newick in enumerate(newicks):
        tree = pyvolve.read_tree(tree=newick, scale_tree=scale_tree)
        my_evolver = pyvolve.Evolver(tree=tree, partitions=partition)

        prefix = "temp" + str(index)
        fasta_seqfile = prefix + ".fasta"
        phylip_seqfile = prefix + ".phyl"
        phy_files.append(phylip_seqfile)

        my_evolver(seqfile=fasta_seqfile, seqfmt="fasta", ratefile=None, infofile=None)
        fasta_to_phyl(fasta_seqfile, phylip_seqfile)
        os.remove(fasta_seqfile)

    phyl_output = "temp_seq.phyl"
    with open(phyl_output, 'w') as outfile:
        for fname in phy_files:
            with open(fname) as infile:
                outfile.write(infile.read())
            outfile.write("\n")
            os.remove(fname)
    return phyl_output
def get_random_tree(filename, tree_string, L, kappa):
    """Simulate strains on ``tree_string`` and summarise their diversity.

    A root sequence from ``generate_ancestor(L)`` is printed and then evolved
    along the tree under a nucleotide model with the given ``kappa`` and
    uniform state frequencies.  ``filename`` is accepted but not used here.

    Returns:
        ``{'pi': pi_value(strains), 'theta': theta_value(strains)}``.
    """
    phylogeny = pyvolve.read_tree(tree=tree_string)

    uniform_freqs = [0.25, 0.25, 0.25, 0.25]
    nuc_model = pyvolve.Model('nucleotide',
                              {'kappa': kappa, 'state_freqs': uniform_freqs})

    ancestor = generate_ancestor(L)
    print(ancestor)

    my_evolver = pyvolve.Evolver(
        partitions=pyvolve.Partition(models=nuc_model, root_sequence=ancestor),
        tree=phylogeny,
    )
    my_evolver()

    simulated_strains = my_evolver.get_sequences()
    summary = {}
    summary['pi'] = pi_value(simulated_strains)
    summary['theta'] = theta_value(simulated_strains)
    return summary
def make_partition_set(cat_sizes, root_freq_set, model_set, model_assignment):
    """Build one ``pyvolve.Partition`` per site category.

    Args:
        cat_sizes: number of sites in each category.
        root_freq_set: ``None`` to let Pyvolve choose the root sequence, or one
            codon-frequency vector per category from which a root sequence of
            the matching size is drawn with ``np.random.choice``.
        model_set: the model(s) for each category, paired positionally with
            ``cat_sizes``.
        model_assignment: accepted for interface compatibility; not used here.

    Returns:
        A list of partitions whose root model is named ``"bp0"``.  Pairing
        stops at the shorter of the paired inputs.
    """
    # The builtin zip replaces itertools.izip, which only exists on Python 2;
    # zip behaves the same here on both major versions.
    if root_freq_set is None:
        return [
            pyvolve.Partition(models=ms, size=nk, root_model_name="bp0")
            for (ms, nk) in zip(model_set, cat_sizes)
        ]

    root_seqs = [
        ''.join(np.random.choice(MOLECULES.codons, size=nk, p=freqs).flat)
        for (nk, freqs) in zip(cat_sizes, root_freq_set)
    ]
    return [
        pyvolve.Partition(models=ms, root_sequence=root, root_model_name="bp0")
        for (ms, root) in zip(model_set, root_seqs)
    ]
def simulate_genomes(model, tree, asize, outdir, number):
    """Evolve ``asize`` positions on ``tree`` under ``model`` and return the sequences.

    The directory ``<outdir>/<number>`` is created through ``mkdir``.  The
    simulation is run with ``seqfile=None`` and ``infofile=None``; the result
    of ``evolver.get_sequences()`` is returned.
    """
    run_dir = os.path.join(outdir, str(number))
    path = mkdir(run_dir)  # kept for its side effect; the path is not used below

    evolver = pyvolve.Evolver(
        tree=tree,
        partitions=pyvolve.Partition(models=model, size=asize),
    )
    evolver(seqfile=None, infofile=None)
    return evolver.get_sequences()
def get_random_tree(L, species, scaled_tree_string, kappa, iteration):
    """Simulate strains on an already-scaled tree and return ``(pi, theta)``.

    The tree is parsed and printed, a root sequence from
    ``generate_ancestor(L)`` is evolved under a nucleotide model with the given
    ``kappa`` and uniform state frequencies, and the alignment is written to
    ``simulated_alignment_<species[:-1]>_universal_<iteration + 1>.fasta``.

    Returns:
        A tuple ``(pi_value(strains), theta_value(strains))``.
    """
    phylogeny = pyvolve.read_tree(tree=scaled_tree_string)
    print('read in the tree')
    pyvolve.print_tree(phylogeny)

    model_params = {'kappa': kappa, 'state_freqs': [0.25, 0.25, 0.25, 0.25]}
    nuc_model = pyvolve.Model('nucleotide', model_params)

    ancestor = generate_ancestor(L)
    print('generated an ancestor')

    my_evolver = pyvolve.Evolver(
        partitions=pyvolve.Partition(models=nuc_model, root_sequence=ancestor),
        tree=phylogeny,
    )
    alignment_file = ("simulated_alignment_" + str(species[:-1]) +
                      "_universal_" + str(iteration + 1) + ".fasta")
    my_evolver(ratefile=None, infofile=None, seqfile=alignment_file)
    print('evolved the sequences')

    simulated_strains = my_evolver.get_sequences()
    pi = pi_value(simulated_strains)
    theta = theta_value(simulated_strains)
    return pi, theta
def simulate(f, seqfile, tree, mu_dict, length):
    '''
    Simulate single partition according homogeneous mutation-selection model.

    ``tree`` is first tried as a path to a tree file and, if Pyvolve cannot
    read it that way, as a Newick string.  ``f`` is passed as the ``MutSel``
    state frequencies and ``mu_dict`` as its mutation rates; ``length``
    positions are evolved and written to ``seqfile`` (no rate/info files).
    '''
    try:
        my_tree = pyvolve.read_tree(file=tree)
    except Exception:
        # Not readable as a file: fall back to treating it as a Newick string.
        # `Exception` rather than a bare `except` so that KeyboardInterrupt
        # and SystemExit are no longer swallowed by the fallback.
        my_tree = pyvolve.read_tree(tree=tree)

    model = pyvolve.Model("MutSel", {'state_freqs': f, 'mu': mu_dict})
    part = pyvolve.Partition(size=length, models=model)
    e = pyvolve.Evolver(partitions=part, tree=my_tree)
    e(seqfile=seqfile, ratefile=None, infofile=None)
def get_accurate_c(L, kappa):
    """Simulate a fixed eight-tip tree with ancestral sequences and set up convergence bookkeeping.

    NOTE(review): the visible code stops after the initialisation below (it
    ends in a bare ``strain_names`` expression), so the remainder of the
    computation is not shown here.  ``kappa`` is accepted, but the model below
    uses the hard-coded value 1.86836732388.
    """
    ancestor = generate_ancestor(L)
    print(ancestor)
    #     phylogeny = pyvolve.read_tree(tree = '( (t1:0.5,t2:0.5)i1:0.5, (t3:0.5,t4:0.5)i2:0.5 , (t5:0.5,t6:0.5)i3:0.5, (t7:0.5,t8:0.5)i4:0.5 ) root;')
    phylogeny = pyvolve.read_tree(
        tree=
        '( ((t7:0.5,t8:0.5)i4:0.5,(t5:0.5,t6:0.5)i3:0.5)i1:0.5, (t3:0.5,t4:0.5)i2:0.5 ) root;'
    )
    pyvolve.print_tree(phylogeny)
    freqs = [0.25, 0.25, 0.25, 0.25]
    nuc_model = pyvolve.Model('nucleotide', {
        'kappa': 1.86836732388,
        'state_freqs': freqs
    })
    my_partition = pyvolve.Partition(models=nuc_model, root_sequence=ancestor)
    my_evolver = pyvolve.Evolver(partitions=my_partition, tree=phylogeny)
    #     my_evolver()
    my_evolver(write_anc=True)
    #     strains = my_evolver.get_sequences()
    # anc=True: internal-node sequences are requested along with the tips.
    strains = my_evolver.get_sequences(anc=True)
    strain_names = list(strains.keys())  # pre-order traversal of the tree
    n = len(strain_names)
    print(strain_names)
    # One (initially empty) list per sequence name.
    c_sites = {}
    for key in strain_names:
        c_sites[key] = []
    site_counts = L * [
        None
    ]  # list of dictionaries to keep track of which nucleotides are at each convergent site; index = site; key = nucleotide, value = number of strains with that nucleotide
    strains_with_site = L * [
        None
    ]  # list of the strains that have a convergent mutation at each site; index = site
    for x in range(L):
        site_counts[x] = {'A': 0, 'T': 0, 'G': 0, 'C': 0}
        strains_with_site[x] = []
    #     c_list_matrix = [[{} for x in range(n)] for y in range(n)] # matrix of the convergent mutation sites; the (i,j) entry is a dictionary of the convergent mutation sites between strain i and strain j; key = site, value = nucleotide
    c = 0
    # NOTE(review): bare expression with no effect -- looks like the start of a
    # statement that was cut off; confirm against the full source.
    strain_names
def execute(tree, model, length, out, numSim):
    """Simulate ``numSim`` alignments on the tree file ``tree`` under ``model``.

    Args:
        tree: path to a file holding the Newick tree.
        model: model name passed to ``pyvolve.Model``.
        length: number of positions to evolve (converted with ``int``).
        out: prefix of the output files.
        numSim: number of simulations to run (converted with ``int``).

    Each simulation ``i`` is written to ``<out>.<model>-<i>.fa``; that name is
    printed before the file is produced.
    """
    # read in model, tree, and define partition
    pyvolveModel = pyvolve.Model(model)
    pyvolveTree = pyvolve.read_tree(file=tree)
    pyvolvePartition = pyvolve.Partition(models=pyvolveModel, size=int(length))

    # create evolver
    my_evolver = pyvolve.Evolver(tree=pyvolveTree, partitions=pyvolvePartition)
    # NOTE(review): this extra run uses Pyvolve's default output names and is
    # not part of the numbered simulations; kept so existing outputs do not
    # change -- confirm whether it is still wanted.
    my_evolver()

    print("Simulating sequences...")
    # create simulated sequences
    for i in range(int(numSim)):
        seqfile = str(out) + "." + str(model) + "-" + str(i) + ".fa"
        # Report the file that is actually written; the message used to omit
        # the model name and so never matched the real output path.
        print(seqfile)
        my_evolver(seqfile=seqfile)
def run(self, tree_file, sequences_folder):
    """Simulate sequences on the gene tree stored in ``tree_file``.

    The first line of the file is read as a Newick string.  When it holds no
    tree (no ``(`` or just ``;``) the method returns ``None`` without
    simulating.  Otherwise the tree is evolved with Pyvolve (ancestral
    sequences included), the FASTA is written to ``sequences_folder`` and then
    handed to ``self.correct_names``.
    """
    with open(tree_file) as handle:
        newick = handle.readline().strip()

    if "(" not in newick or newick == ";":
        return None

    my_tree = ete3.Tree(newick, format=1)
    tree = pyvolve.read_tree(tree=my_tree.write(format=5),
                             scale_tree=self.parameters["SCALING"])
    name_mapping = self.get_mapping_internal_names(tree, my_tree)

    evolver = pyvolve.Evolver(
        tree=tree,
        partitions=pyvolve.Partition(models=self.model, size=self.size),
    )

    fasta_file = tree_file.split("/")[-1].replace("_completetree.nwk", "_complete") + ".fasta"
    fasta_path = os.path.join(sequences_folder, fasta_file)
    evolver(seqfile=fasta_path, ratefile=None, infofile=None, write_anc=True)

    # Correct the names
    self.correct_names(fasta_path, name_mapping)
def simulate_single_sequence(self, name, gene_length, tree_file, sequences_folder):
    """Simulate one sequence called ``name`` by evolving a two-leaf tree.

    The tree ``(<name>:1,B:1);`` is evolved for ``gene_length`` positions and
    written (ancestors included) to the FASTA derived from ``tree_file`` inside
    ``sequences_folder``.  The file is then rewritten so that it only holds the
    entries whose header, minus its first character, equals ``name``.
    """
    two_leaf_newick = "({}:1,B:1);".format(name)
    tree = pyvolve.read_tree(tree=two_leaf_newick)

    evolver = pyvolve.Evolver(
        tree=tree,
        partitions=pyvolve.Partition(models=self.model, size=gene_length),
    )

    fasta_file = tree_file.split("/")[-1].replace("_completetree.nwk", "_complete") + ".fasta"
    fasta_path = os.path.join(sequences_folder, fasta_file)
    evolver(seqfile=fasta_path, ratefile=None, infofile=None, write_anc=True)

    # Select single sequence
    entries = [
        (header, sequence)
        for header, sequence in af.fasta_reader(fasta_path)
        if header[1:] == name
    ]
    af.fasta_writer(fasta_path, entries)
def exampleFastaGenerator(nwkFile, fastaOutputLocation, seqLength, rate=1):
    """Simulate a nucleotide alignment on the tree stored in ``nwkFile``.

    Args:
        nwkFile: path to a Newick tree file.
        fastaOutputLocation: path of the sequence file to write.
        seqLength: number of positions to evolve.
        rate: value used for each of the six nucleotide-pair mutation rates.

    No rate or info file is produced.
    """
    # The tree name used to be derived with nwkFile.rindex('/') but was never
    # used; that lookup raised ValueError for any path without a '/', so it
    # has been dropped.

    # Tree.
    phylogony = pyvolve.read_tree(file=nwkFile)

    # Rates: the same value for every unordered nucleotide pair.
    mutationRates = {pair: rate for pair in ("AC", "AG", "AT", "CG", "CT", "GT")}

    # Model.
    model = pyvolve.Model("nucleotide", {"mu": mutationRates})
    partition = pyvolve.Partition(models=model, size=seqLength)

    # Evolver.
    evolver = pyvolve.Evolver(partitions=[partition], tree=phylogony)
    evolver(seqfile=fastaOutputLocation, ratefile=None, infofile=None)
def evolve_nonconvergent_partition(g):
    """Simulate the non-convergent codon sites under the background Q matrix.

    Reads ``num_simulated_site``, ``num_convergent_site``, ``background_Q`` and
    ``background_tree`` from ``g``, prints the reported codon-site range and
    evolves ``num_simulated_site - num_convergent_site`` sites with a single
    custom model named ``root``.  Output goes to the
    ``tmp.csubst.simulate_nonconvergent*`` files, without ancestral sequences.
    """
    num_simulated = g['num_simulated_site']
    num_convergent = g['num_convergent_site']

    site_start = 1 if num_convergent == 0 else num_simulated - num_convergent + 1
    site_end = num_simulated
    print('Codon site {}-{}; Non-convergent codons'.format(
        site_start, site_end))

    num_nonconvergent_site = num_simulated - num_convergent
    q_matrix = copy.copy(g['background_Q'])
    # Pyvolve's console output is silenced while the model is built.
    with suppress_stdout_stderr():
        model = pyvolve.Model(model_type='custom',
                              name='root',
                              parameters={'matrix': q_matrix})

    evolver = pyvolve.Evolver(
        partitions=pyvolve.Partition(models=model, size=num_nonconvergent_site),
        tree=g['background_tree'],
    )
    evolver(ratefile='tmp.csubst.simulate_nonconvergent_ratefile.txt',
            infofile='tmp.csubst.simulate_nonconvergent_infofile.txt',
            seqfile='tmp.csubst.simulate_nonconvergent.fa',
            write_anc=False)
def generateTree(tns, ntaxa, seqlen):
    """Simulate a pure-birth tree, evolve sequences on it and write them out.

    Args:
        tns: dendropy taxon namespace used for the tree and the alignment.
        ntaxa: number of extant tips of the simulated tree.
        seqlen: number of nucleotide positions to evolve.

    Returns:
        The simulated dendropy tree.  The evolved sequences are written to
        ``evolvedsequences.fasta`` in the working directory.
    """
    import os
    import tempfile

    #Construct the tree
    t = dendropy.simulate.treesim.birth_death_tree(birth_rate=1.0,
                                                   death_rate=0,
                                                   taxon_namespace=tns,
                                                   num_extant_tips=ntaxa)

    #Set pyvolve data type
    m1 = pyvolve.Model("nucleotide")
    p1 = pyvolve.Partition(models=m1, size=seqlen)

    # Hand the tree to pyvolve through a Newick file.  A unique temporary file
    # replaces the fixed '/tmp/pyvt' path, which concurrent runs overwrote,
    # which is not portable, and which was never cleaned up.
    fd, tree_path = tempfile.mkstemp(prefix='pyvt', suffix='.nwk')
    os.close(fd)
    try:
        t.write(path=tree_path,
                schema='newick',
                suppress_rooting=True,
                suppress_internal_node_labels=True)
        #Read tree from dendropy
        pot = pyvolve.read_tree(file=tree_path)
    finally:
        os.remove(tree_path)

    #Simulate evolution with no save file
    e1 = pyvolve.Evolver(tree=pot, partitions=p1)
    e1(seqfile=None)
    seqs = e1.get_sequences()
    ds = dendropy.DnaCharacterMatrix.from_dict(seqs, taxon_namespace=tns)
    ds.write(path="evolvedsequences.fasta", schema="fasta")
    return t
def simulate(tree_index, length):
    """Evolve codon sequences on one of the bundled trees.

    Inputs:
        tree_index: index into ("alpha", "beta", "charlie"); the tree is read
            from ``trees/<name>.tre``.
        length: number of positions (codons) to evolve.
    Outputs:
        list of the simulated sequences (the values of ``get_sequences()``).
    """
    tree_names = ["alpha", "beta", "charlie"]
    tree_path = "trees/" + tree_names[tree_index] + ".tre"
    my_tree = pyvolve.read_tree(file=tree_path)

    # MG codon model; beta / alpha corresponds to dN/dS = 0.65 / 0.98
    my_model = pyvolve.Model("MG", {"beta": 0.65, "alpha": 0.98})

    # For a codon alignment, size counts codons (three nucleotides each).
    my_partition = pyvolve.Partition(models=my_model, size=length)

    # Evolve!
    my_evolver = pyvolve.Evolver(partitions=my_partition,
                                 tree=my_tree,
                                 ratefile=None,
                                 infofile=None)
    my_evolver(ratefile=None, infofile=None)

    # Extract the sequences
    return list(my_evolver.get_sequences().values())
import sys
import os
import pyvolve
import glob
from mungo.fasta import FastaReader
from collections import defaultdict

# Usage: <script> INPUT_FASTA INPUT_TREE OUTPUT_SEQS
input_fasta = sys.argv[1]
input_tree_txt = sys.argv[2]
output_seqs = sys.argv[3]

# Amino-acid state frequencies are estimated from the input alignment.
f = pyvolve.ReadFrequencies("amino_acid", file=input_fasta)
frequencies = f.compute_frequencies()

# Evolve 200 positions under MTMAM on the input tree (branch lengths halved).
my_tree_1 = pyvolve.read_tree(file=input_tree_txt, scale_tree=0.5)
my_model_1 = pyvolve.Model("MTMAM", {"state_freqs": frequencies})
my_partition_1 = pyvolve.Partition(models=my_model_1, size=200)
my_evolver_1 = pyvolve.Evolver(partitions=my_partition_1, tree=my_tree_1)
my_evolver_1(ratefile=None, infofile=None, seqfile=output_seqs)

# Read the simulated sequences back and relabel them seq0, seq1, ... in file
# order; everything is loaded before the file is reopened for writing.
seqs = {}
seq_list = []
for _header, sequence in FastaReader(output_seqs):
    seq_id = "seq" + str(len(seq_list))
    seqs[seq_id] = sequence
    seq_list.append(seq_id)

##organize the seq ID name
with open(output_seqs, 'w') as outfile:
    for seq_id in seq_list:
        outfile.write(">" + seq_id + "\n" + seqs[seq_id] + "\n")
# Phylogeny, read from a file containing a newick tree.
my_tree = pyvolve.read_tree(file="file_with_tree.tre")

# The models below use WAG with default parameters (any other model name can
# be given as first argument).  Rate heterogeneity can be added in two ways:
#   1) Custom rates: pass `rate_factors`; the factors are assigned to sites
#      with equal probability unless `rate_probs` is also given.
#   2) Gamma rates: pass `num_categories` and `alpha`; that many rates are
#      drawn from a gamma distribution whose shape and scale both equal
#      `alpha`, equiprobable unless overridden with `rate_probs`.

# custom rates
custom_factors = [0.3, 0.8, 1.5, 2.45]
custom_probs = [0.7, 0.2, 0.05, 0.05]
# 25% of sites will have each factor.
my_model1 = pyvolve.Model("WAG", rate_factors=custom_factors)
# 70% of sites evolve with rate 0.3, 20% with 0.8, 5% with 1.5, 5% with 2.45.
my_model2 = pyvolve.Model("WAG",
                          rate_factors=custom_factors,
                          rate_probs=custom_probs)

# gamma rates
my_model3 = pyvolve.Model("WAG", alpha=0.6, num_categories=5)

# Evolve 250 positions under the second model.
my_partition = pyvolve.Partition(models=my_model2, size=250)

# Evolve!
my_evolver = pyvolve.Evolver(partitions=my_partition, tree=my_tree)
my_evolver()
@author: david
"""
import pyvolve

# The bare string statements below are used as section headings; they have no
# effect at run time.
"User defined params"
mut_rate = 0.005  # passed to read_tree as scale_tree
freqs = [0.25, 0.25, 0.25, 0.25]  # nucleotide state frequencies
seq_length = 1000  # number of positions to evolve
kappa = 2.75

"Read in phylogeny along which Pyvolve should simulate"
"Scale_tree sets absolute mutation rate"
my_tree = pyvolve.read_tree(file = "AMR-sim.tre", scale_tree = mut_rate)
#pyvolve.print_tree(my_tree) # Print the parsed phylogeny

"Specify nucleotide substitution model with custom rates"
#custom_mu = {"AC":0.5, "AG":0.25, "AT":1.23, "CG":0.55, "CT":1.22, "GT":0.47}
#nuc_model = pyvolve.Model( "nucleotide", {"mu":custom_mu, "state_freqs":freqs} )

"Or just use an HKY model with kappa"
nuc_model = pyvolve.Model( "nucleotide", {"kappa":kappa, "state_freqs":freqs})

"Define a Partition object which evolves set # of positions according to my_model"
my_partition = pyvolve.Partition(models = nuc_model, size = seq_length)
#my_partition = pyvolve.Partition(models = nuc_model, root_sequence = "GATAGAAC") # Or with a root seq

"Define an Evolver instance to evolve a single partition"
my_evolver = pyvolve.Evolver(partitions = my_partition, tree = my_tree)

"Evolve sequences with custom file names"
my_evolver(ratefile = "AMR_ratefile.txt", infofile = "AMR_infofile.txt", seqfile = "AMR-seqsim.fasta" )
#!/bin/python3
import pyvolve
import sys

# Positional arguments: tree file, root sequence, model name, omega value.
tree_variable = sys.argv[1]
anc_seq_variable = sys.argv[2]
model_type = sys.argv[3]
omega_value = float(sys.argv[4])

# Simulation: evolve the given root sequence along the tree under the chosen
# model with the requested omega.
model_parameters = {"omega": omega_value}
my_tree = pyvolve.read_tree(file=tree_variable)
my_model = pyvolve.Model(model_type, model_parameters)
my_partition = pyvolve.Partition(models=my_model,
                                 root_sequence=anc_seq_variable)
my_evolver = pyvolve.Evolver(tree=my_tree, partitions=my_partition)
my_evolver()
def _site_codon_matrix(codons, codon_dict, purines, params, site_prefs, omega):
    """Return the codon rate matrix (list of rows) for one site.

    ``matrix[x][y]`` is the rate of substitution from codon x to codon y.
    Only single-nucleotide changes get a non-zero rate: the mutation term is
    ``phi<target nt>`` (times ``kappa`` for transitions) and non-synonymous
    changes are weighted by ``omega`` and the site's stringency-scaled
    preferences.  Each diagonal entry makes its row sum to zero.
    """
    matrix = []
    for (xi, x) in enumerate(codons):
        row = []
        for (yi, y) in enumerate(codons):
            ntdiffs = [(x[j], y[j]) for j in range(3) if x[j] != y[j]]
            if len(ntdiffs) == 0:
                assert x == y
                row.append(0)  # adjusted below to make the row sum to zero
            elif len(ntdiffs) > 1:
                # multi-nucleotide codon change
                row.append(0)
            else:
                # single nucleotide change
                (xnt, ynt) = ntdiffs[0]
                if (xnt in purines) == (ynt in purines):
                    # transition
                    qxy = params['kappa'] * params['phi{0}'.format(ynt)]
                else:
                    # transversion
                    qxy = params['phi{0}'.format(ynt)]
                (xaa, yaa) = (codon_dict[x], codon_dict[y])
                if xaa == yaa:
                    fxy = 1.0
                else:
                    pix = site_prefs[xaa]**params['stringencyparameter']
                    piy = site_prefs[yaa]**params['stringencyparameter']
                    if abs(pix - piy) < 1e-6:
                        # equal preferences: the expression below would be 0/0
                        fxy = omega
                    else:
                        fxy = omega * math.log(piy / pix) / (1.0 - pix / piy)
                row.append(qxy * fxy * params['scalerate'])
        assert len(row) == len(codons)
        row[xi] = -sum(row)
        matrix.append(row)
    return matrix


def main():
    """Main body of script.

    Reads the command-line arguments, model parameters, amino-acid preferences
    and tree, builds one single-site partition per preference site, simulates
    an alignment with pyvolve (plus ``<basename>_infofile.txt`` and
    ``<basename>_ratefile.txt``) and finally rewrites the alignment with
    duplicate sequences removed.
    """
    # One Genetics instance provides all the lookup tables needed here.
    genetics = pyvolve.genetics.Genetics()
    codons = genetics.codons
    codon_dict = genetics.codon_dict
    purines = genetics.purines

    args = vars(ParseArguments().parse_args())
    print("Read the following command line arguments:")
    print("\n\t{0}".format("\n\t".join(
        ["{0} = {1}".format(key, value) for (key, value) in args.items()])))

    print("\nPerforming simulation with pyvolve version {0}".format(
        pyvolve.__version__))

    print("\nReading model params from {0}".format(args['modelparams']))
    params = ReadParams(args['modelparams'])
    for (param, paramvalue) in params.items():
        print("The value of {0} is {1}".format(param, paramvalue))

    print("\nReading preferences from {0}".format(args['prefs']))
    tup = dms_tools.file_io.ReadPreferences(args['prefs'])
    (sites, pis) = (tup[0], tup[2])
    print("\nRead amino-acid preferences for {0} sites".format(len(pis)))

    tree = pyvolve.read_tree(file=args['tree'])

    # create models for simulation
    partitions = []
    for r in sites:
        # The site/omega report is built with str.format so that it is valid
        # (and prints the same text) on both Python 2 and Python 3; the
        # former `print r, omega` statement was a SyntaxError on Python 3.
        if params['diversifyingsitesA'] and (
                int(r) in params['diversifyingsitesA']):
            omega = params['diversifyingomegaA']
            print("{0} {1}".format(r, omega))
        elif params['diversifyingsitesB'] and (
                int(r) in params['diversifyingsitesB']):
            omega = params['diversifyingomegaB']
            print("{0} {1}".format(r, omega))
        else:
            omega = 1.0
        matrix = _site_codon_matrix(codons, codon_dict, purines, params,
                                    pis[r], omega)
        model = pyvolve.Model("custom", {"matrix": matrix})
        partitions.append(pyvolve.Partition(models=model, size=1))

    print("\nSimulating evolution, writing to {0}...".format(
        args['simulatedalignment']))
    basename = os.path.splitext(args['simulatedalignment'])[0]
    evolver = pyvolve.Evolver(partitions=partitions, tree=tree)
    evolver(
        seqfile=args['simulatedalignment'],
        infofile='{0}_infofile.txt'.format(basename),
        ratefile='{0}_ratefile.txt'.format(basename),
    )
    print("Finished simulation")

    # Keep only the first occurrence of every distinct sequence.
    uniqueseqs = set()
    uniquealignment = []
    ninitial = 0
    for seq in Bio.SeqIO.parse(args['simulatedalignment'], 'fasta'):
        ninitial += 1
        seqstr = str(seq.seq)
        if seqstr not in uniqueseqs:
            uniqueseqs.add(seqstr)
            uniquealignment.append(seq)
    print(
        "\nAfter removing redundant sequences, we have shrunk {0} from {1} to {2} sequences"
        .format(args['simulatedalignment'], ninitial, len(uniquealignment)))
    Bio.SeqIO.write(uniquealignment, args['simulatedalignment'], 'fasta')
# Example: evolve according to a custom model with a custom code (state set).

import pyvolve
import numpy as np

# Phylogeny, read from a file containing a newick tree.
my_tree = pyvolve.read_tree(file="file_with_tree.tre")

# Custom states.  The code is a list because, in theory, a state may be spelled
# with more than one character.
code = ["0", "1", "2"]

# Custom rate matrix.  It must be square, with one row/column per entry of
# the code above.
matrix = np.array([
    [-0.5, 0.25, 0.25],
    [0.25, -0.5, 0.25],
    [0.25, 0.25, -0.5],
])

custom_parameters = {"matrix": matrix, "code": code}
my_model = pyvolve.Model("custom", custom_parameters)

# Evolve a single position.
my_partition = pyvolve.Partition(models=my_model, size=1)
my_evolver = pyvolve.Evolver(partitions=my_partition, tree=my_tree)
my_evolver()
# NOTE(review): this fragment relies on names defined outside it
# (categoryProbs, nCat, categoryRates, mutMatrix, ref, pathSimu, treeFile,
# outputFile, args, time), and its original nesting could not be recovered --
# confirm the block structure against the full script.

# Normalise the site-category probabilities so that they sum to 1.
# NOTE(review): `sum` shadows the builtin of the same name from here on.
sum = 0.0
for i in categoryProbs:
    sum += i
for i in range(nCat):
    categoryProbs[i] = categoryProbs[i] / sum
# Only report the normalisation when the original total was not already ~1.
if sum > 1.000001 or sum < 0.999999:
    print(
        "\n Normalizing probabilities of site categories. New probabilities:")
    print(categoryProbs)

#run pyvolve
print("Starting pyvolve timer")
import pyvolve
start = time.time()
pyvolveTree = pyvolve.read_tree(file=pathSimu + treeFile,
                                scale_tree=args.scale)
#pyvolveTree = pyvolve.read_tree(tree = tString2, scale_tree = args.scale)
# Custom model built from mutMatrix, with gamma rate heterogeneity: alpha is
# fixed at 0.5 and there is one rate category per entry of categoryRates.
nucModel = pyvolve.Model("custom", {"matrix": mutMatrix},
                         alpha=0.5,
                         num_categories=len(categoryRates))
partitions = pyvolve.Partition(models=nucModel, root_sequence=ref)
my_evolver = pyvolve.Evolver(tree=pyvolveTree, partitions=partitions)
my_evolver(seqfile=pathSimu + outputFile,
           algorithm=1)  # Algorithm = 1 uses the Gillespie algorithm.
# Elapsed wall-clock time of the pyvolve run, in seconds.
time2 = time.time() - start
print("Pyvolve timer ended")
print(time2)

# The script terminates here once the pyvolve run has finished.
exit()
# Example: evolve a nucleotide alignment made of several partitions.
# The first partition has gamma-distributed sitewise rate heterogeneity, the
# second is homogeneous, and the third has custom sitewise rate heterogeneity.
# All models use default mutation-rate parameters.

import pyvolve

# Phylogeny, read from a file containing a newick tree.
my_tree = pyvolve.read_tree(file="file_with_tree.tre")

# First model and partition: gamma rates, 50 positions.
model1 = pyvolve.Model("nucleotide", alpha=0.7, num_categories=4)
part1 = pyvolve.Partition(models=model1, size=50)

# Second model and partition: homogeneous rates, 20 positions.
model2 = pyvolve.Model("nucleotide")
part2 = pyvolve.Partition(models=model2, size=20)

# Third model and partition: custom rate factors, 100 positions.
model3 = pyvolve.Model(
    "nucleotide",
    rate_factors=[0.5, 1.6, 4.1],
    rate_probs=[0.75, 0.2, 0.05],
)
part3 = pyvolve.Partition(models=model3, size=100)

# Partitions are given to Evolver *in the order in which they should be evolved*.
ordered_partitions = [part1, part2, part3]
my_evolver = pyvolve.Evolver(partitions=ordered_partitions, tree=my_tree)
my_evolver()
def evolve_convergent_partitions(g):
    """Simulate the convergent codon sites, one single-site partition each.

    For every convergent site a set of biased amino acids is chosen, a biased
    Q matrix is derived from ``g['background_Q']`` and the share of
    non-synonymous rate directed at the biased amino acids is printed before
    and after the bias.  Each partition carries a ``root`` model using the
    background Q and one model ``m1``, ``m2``, ... per foreground lineage
    using the biased Q.  The partitions are evolved on
    ``g['foreground_tree']`` into the ``tmp.csubst.simulate_convergent*``
    files, without ancestral sequences.
    """
    num_fl = foreground.get_num_foreground_lineages(tree=g['tree'])
    model_names = ['root']
    model_names += ['m' + str(i + 1) for i in range(num_fl)]

    codon_table = g['codon_table']
    codon_orders = g['pyvolve_codon_orders']

    convergent_partitions = []
    biased_substitution_fractions = []
    site_indices = numpy.arange(g['num_convergent_site'])
    for current_site, _partition_index in enumerate(site_indices, start=1):
        biased_aas = get_biased_amino_acids(g['convergent_amino_acids'],
                                            codon_table)
        print('Codon site {}; Biased amino acids = {}; '.format(
            current_site, ''.join(biased_aas)), end='')
        biased_nsy_sub_index = get_biased_nonsynonymous_substitution_index(
            biased_aas, codon_table, codon_orders)
        biased_Q = apply_percent_biased_sub(
            mat=g['background_Q'],
            percent_biased_sub=g['percent_biased_sub'],
            target_index=biased_nsy_sub_index,
            biased_aas=biased_aas,
            codon_table=codon_table,
            codon_orders=codon_orders,
            all_nsy_cdn_index=g['all_nsy_cdn_index'],
            all_syn_cdn_index=g['all_syn_cdn_index'],
            foreground_omega=g['foreground_omega'],
        )

        # Share of the non-synonymous rate aimed at the biased amino acids,
        # after (biased Q) and before (background Q) introducing the bias.
        total_nsy_Q = get_total_Q(biased_Q, g['all_nsy_cdn_index'])
        total_biased_Q = get_total_biased_Q(biased_Q, biased_aas,
                                            codon_table, codon_orders)
        fraction_biased_Q = total_biased_Q / total_nsy_Q
        bg_total_nsy_Q = get_total_Q(g['background_Q'],
                                     g['all_nsy_cdn_index'])
        bg_total_biased_Q = get_total_biased_Q(g['background_Q'], biased_aas,
                                               codon_table, codon_orders)
        bg_fraction_biased_Q = bg_total_biased_Q / bg_total_nsy_Q
        txt = 'Total in Q toward the codons before and after the bias introduction = ' \
              '{:,.1f}% ({:,.1f}/{:,.1f}) and {:,.1f}% ({:,.1f}/{:,.1f})'
        print(
            txt.format(bg_fraction_biased_Q * 100, bg_total_biased_Q,
                       bg_total_nsy_Q, fraction_biased_Q * 100,
                       total_biased_Q, total_nsy_Q))
        biased_substitution_fractions.append(fraction_biased_Q)

        models = []
        for model_name in model_names:
            # The root model keeps the background Q; every other model gets
            # its own copy of the biased Q.
            source_Q = g['background_Q'] if model_name == 'root' else biased_Q
            q_matrix = copy.copy(source_Q)
            with suppress_stdout_stderr():
                model = pyvolve.Model(model_type='custom',
                                      name=model_name,
                                      parameters={'matrix': q_matrix})
            models.append(model)
        convergent_partitions.append(
            pyvolve.Partition(models=models, size=1, root_model_name='root'))

    if biased_substitution_fractions:
        mean_biased_substitution_fraction = numpy.array(
            biased_substitution_fractions).mean()
    else:
        mean_biased_substitution_fraction = 0

    txt = '{:,.2f}% of substitutions in {} sites in the foreground branches are ' \
          'expected to result from the introduced bias in Q matrix.'
    fraction_convergent_site = g['num_convergent_site'] / g[
        'num_simulated_site']
    print(
        txt.format(
            mean_biased_substitution_fraction * fraction_convergent_site * 100,
            g['num_simulated_site']))
    txt = '{:,.2f}% of substitutions in {} convergent sites in the foreground branches are ' \
          'expected to result from the introduced bias in Q matrix.'
    print(
        txt.format(mean_biased_substitution_fraction * 100,
                   g['num_convergent_site']))

    evolver = pyvolve.Evolver(partitions=convergent_partitions,
                              tree=g['foreground_tree'])
    evolver(ratefile='tmp.csubst.simulate_convergent_ratefile.txt',
            infofile='tmp.csubst.simulate_convergent_infofile.txt',
            seqfile='tmp.csubst.simulate_convergent.fa',
            write_anc=False)
def pyvolvePartitions(model, divselection=None):
    """Get list of `pyvolve` partitions for `model`.

    Args:
        `model` (`phydmslib.models.Models` object)
            The model used for the simulations. Currently only
            certain `Models` are supported (e.g., `YNGKP`,
            `ExpCM`)
        `divselection` (`None` or 2-tuple `(divomega, divsites)`)
            Set this option if you want to simulate a subset of sites
            as under diversifying selection (e.g., an `omega` different
            than that used by `model`. In this case, `divomega` is
            the omega for this subset of sites, and `divsites` is a list
            of the sites in 1, 2, ... numbering.

    Returns:
        `partitions` (`list` of `pyvolve.Partition` objects)
            Can be fed into `pyvolve.Evolver` to simulate evolution.

    Raises:
        `ValueError` if the type of `model` is not one of the supported
        model classes.
    """
    # Function-scope import: `scipy.zeros` was only a deprecated alias of
    # `numpy.zeros`, so the matrix is now allocated with NumPy directly.
    import numpy

    # One Genetics instance provides all the lookup tables needed here.
    genetics = pyvolve.genetics.Genetics()
    codons = genetics.codons
    codon_dict = genetics.codon_dict
    purines = genetics.purines

    if divselection:
        (divomega, divsites) = divselection
    else:
        divsites = []
    assert all([1 <= r <= model.nsites for r in divsites])

    # Exact type matches are intentional: other model classes (including
    # subclasses of these) are rejected with ValueError below.
    expcm_types = [
        phydmslib.models.ExpCM,
        phydmslib.models.ExpCM_empirical_phi,
        phydmslib.models.ExpCM_empirical_phi_divpressure,
    ]

    partitions = []
    for r in range(model.nsites):
        matrix = numpy.zeros((len(codons), len(codons)), dtype='float')
        for (xi, x) in enumerate(codons):
            for (yi, y) in enumerate(codons):
                ntdiffs = [(x[j], y[j]) for j in range(3) if x[j] != y[j]]
                if len(ntdiffs) == 1:
                    # Only single-nucleotide changes get a non-zero rate.
                    (xnt, ynt) = ntdiffs[0]
                    qxy = 1.0
                    if (xnt in purines) == (ynt in purines):
                        # transition
                        qxy *= model.kappa
                    (xaa, yaa) = (codon_dict[x], codon_dict[y])
                    fxy = 1.0
                    if xaa != yaa:
                        if type(model) == phydmslib.models.ExpCM_empirical_phi_divpressure:
                            fxy *= model.omega * (
                                1 + model.omega2 * model.deltar[r])
                        elif r + 1 in divsites:
                            # divsites uses 1, 2, ... numbering
                            fxy *= divomega
                        else:
                            fxy *= model.omega
                    if type(model) in expcm_types:
                        qxy *= model.phi[NT_TO_INDEX[ynt]]
                        pix = model.pi[r][AA_TO_INDEX[xaa]]**model.beta
                        piy = model.pi[r][AA_TO_INDEX[yaa]]**model.beta
                        # When the preferences are (almost) equal the factor
                        # is left at its current value; this also avoids 0/0.
                        if abs(pix - piy) > ALMOST_ZERO:
                            fxy *= math.log(piy / pix) / (1.0 - pix / piy)
                    elif type(model) == phydmslib.models.YNGKP_M0:
                        for p in range(3):
                            qxy *= model.phi[p][NT_TO_INDEX[y[p]]]
                    else:
                        raise ValueError("Can't handle model type {0}".format(
                            type(model)))
                    matrix[xi][yi] = model.mu * qxy * fxy
            # The diagonal makes every row sum to zero.
            matrix[xi][xi] = -matrix[xi].sum()

        # create model in way that captures annoying print statements in pyvolve
        old_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')
        try:
            m = pyvolve.Model("custom", {"matrix": matrix})
        finally:
            sys.stdout.close()
            sys.stdout = old_stdout
        partitions.append(pyvolve.Partition(models=m, size=1))

    return partitions
def _count_convergent_sites(ancestor, sequences, length):
    """Count (site, nucleotide) pairs where at least two sequences share a
    nucleotide that differs from the ancestor at that site.

    Each entry of `sequences` is tallied separately, so two strains whose
    whole sequences happen to be identical are still counted as two strains.
    """
    total = 0
    for site in range(length):
        ancestral = ancestor[site]
        # number of strains carrying each non-ancestral nucleotide here
        tallies = {'A': 0, 'T': 0, 'G': 0, 'C': 0}
        for seq in sequences:
            base = seq[site]
            if base != ancestral:
                tallies[base] += 1
        total += sum(1 for count in tallies.values() if count >= 2)
    return total


def get_c(L, kappa):
    """Simulate four strains from a random ancestor and count convergent
    mutations.

    An ancestor of length `L` is obtained from `generate_ancestor`, evolved
    with pyvolve along a fixed four-tip tree under a nucleotide model with
    equal state frequencies, and the tip sequences are compared with the
    ancestor. The ancestor, the tree and the final count are printed.

    Args:
        L: Number of sites in the ancestral sequence.
        kappa: Value passed as the `kappa` parameter of the pyvolve
            nucleotide model.

    Returns:
        The number of (site, nucleotide) pairs at which two or more strains
        carry the same nucleotide and that nucleotide differs from the
        ancestor.
    """
    ancestor = generate_ancestor(L)
    print(ancestor)

    phylogeny = pyvolve.read_tree(
        tree='((t1:0.5,t2:0.5)i1:0.5,(t3:0.5,t4:0.5)i2:0.5)root;')
    pyvolve.print_tree(phylogeny)

    freqs = [0.25, 0.25, 0.25, 0.25]
    # Use the caller's kappa; it was previously ignored in favour of a
    # hard-coded constant.
    nuc_model = pyvolve.Model('nucleotide', {
        'kappa': kappa,
        'state_freqs': freqs
    })
    my_partition = pyvolve.Partition(models=nuc_model, root_sequence=ancestor)
    my_evolver = pyvolve.Evolver(partitions=my_partition, tree=phylogeny)
    my_evolver()

    # tip sequences only; get_sequences(anc=True) would add internal nodes
    strains = my_evolver.get_sequences()

    c = _count_convergent_sites(ancestor, list(strains.values()), L)
    print(c)
    return c
def main(strain, seedFilepath, gffFilepath):
    """Simulate genomes from a seed sequence and write them as FASTA files.

    The first record of `seedFilepath` is split into coding ("c") and
    non-coding ("nc") regions using the rows of the GFF table. Coding
    regions (minus their first and last three bases) are evolved with
    pyvolve along the tree in "tmp.tree"; non-coding regions are copied
    unchanged. The regions are concatenated per simulated genome and each
    genome is written to "../data/dnaseq/<strain>_sim<i>.dnaseq".

    Args:
        strain: Name used as the prefix of the output genome ids.
        seedFilepath: Path of the FASTA file holding the seed sequence.
        gffFilepath: Path of the GFF file; its rows must provide "start",
            "end" and "strand".

    Raises:
        ValueError: If `seedFilepath` contains no FASTA record.
    """
    # Only the first record is used as the seed.
    seedRec = next(SeqIO.parse(seedFilepath, "fasta"), None)
    if seedRec is None:
        raise ValueError(
            "no FASTA record found in {}".format(seedFilepath))
    gff_df = read_gff(gffFilepath)

    # get all the shuffle region
    # "start" is decremented before slicing, i.e. treated as 1-based.
    prv = 0
    pos_lst = []
    for _, row in gff_df.iterrows():
        pos_lst.append(("nc", prv, row["start"] - 1, "+"))
        pos_lst.append(("c", row["start"] - 1, row["end"], row["strand"]))
        prv = row["end"]
    pos_lst.append(("nc", prv, len(seedRec), "+"))

    # configuration for evolution
    treeFilepath = "tmp.tree"
    mytree = pyvolve.read_tree(file=treeFilepath)
    cm = pyvolve.Model("ECMrest")  # coding model

    nSim = 4  # assuming tree has 4 nodes
    outputSeq_lst = [Seq("") for _ in range(nSim)]
    for category, start, end, strand in pos_lst:
        # get rootSeq according to start, end, strand info
        rootSeq = seedRec.seq[start:end]
        if strand == "-":
            rootSeq = rootSeq.reverse_complement()
        rootSeq = str(rootSeq)

        # get simulated sequences
        if category == "nc":
            # non-coding regions are not evolved; every genome gets a copy
            rec_lst = [SeqRecord(Seq(rootSeq)) for _ in range(nSim)]
        elif category == "c":
            # remove start & stop codon before evolving
            partition = pyvolve.Partition(
                models=cm, root_sequence=rootSeq[3:-3])
            # pyvolve's keyword is `partitions`, not `partition`
            evolver = pyvolve.Evolver(partitions=partition, tree=mytree)
            # presumably one SeqRecord per tip -- helper defined elsewhere
            rec_lst = get_evolved(evolver)
            for rec in rec_lst:
                # add the start and stop codons back
                rec.seq = rootSeq[:3] + rec.seq + rootSeq[-3:]
        assert len(rec_lst) == len(outputSeq_lst)

        # concat to outputSeq_lst
        for i, rec in enumerate(rec_lst):
            simSeq = rec.seq
            if strand == "-":
                # restore the original orientation of the region
                simSeq = simSeq.reverse_complement()
            outputSeq_lst[i] += simSeq

    for i, outputSeq in enumerate(outputSeq_lst):
        genomeId = "{}_sim{}".format(strain, i + 1)
        outFilepath = "../data/dnaseq/{}.dnaseq".format(genomeId)
        with open(outFilepath, "w") as f:
            seqname = "{}:seq".format(genomeId)
            rec = SeqRecord(outputSeq, id=seqname, description="")
            SeqIO.write(rec, f, "fasta")
        print("DONE: output {}".format(outFilepath))
# This example script demonstrates how to evolve according to a nucleotide
# model with *branch* rate heterogeneity. The approach is the same for
# non-nucleotide models.

import pyvolve

# Define a phylogeny. For clarity, we define this tree with a string. The tree
# contains model flags for branches which should evolve according to new
# models. Flags are represented as _name_, where underscores surround the name.
# (The parentheses of the Newick string must balance: three opening, three
# closing.)
my_tree = pyvolve.read_tree(
    tree="((t1:0.5, t2:0.5):0.5_m1_,(t3:0.5, t4:0.5):0.5_m2_);")

# Define a model for each flag. Models should be given names with the keyword
# argument `name`. These names *MUST* have correspondingly named flags in the
# tree!
model1 = pyvolve.Model("nucleotide", {"kappa": 3.5}, name="m1")
model2 = pyvolve.Model("nucleotide", {"kappa": 4.75}, name="m2")

# We can also define, if we want, a model for the ROOT of the tree that is
# separate from either of these models.
rootmodel = pyvolve.Model("nucleotide", name="root")

# Define a partition with all models as a list. Include the argument
# `root_model_name` to indicate the NAME ATTRIBUTE of the model that should be
# used at the root of the tree. This name's corresponding object must be in the
# `models` list. Note that a separate root model is not needed - you could
# easily just start with _m1_ at the root, but you'd still need to give "m1" to
# `root_model_name`.
my_partition = pyvolve.Partition(models=[model1, model2, rootmodel],
                                 size=250,
                                 root_model_name="root")

# Evolve!
my_evolver = pyvolve.Evolver(partitions=my_partition, tree=my_tree)
my_evolver()
# have a relevant (and low) chance of back mutations (i.e. two changes at the same site)
# as these are very rare for the organism studied.

# Rescale all branch lengths so that the largest recorded `dist_to_root`
# corresponds to `prop_bases_mutated`, then save the rescaled tree.
max_rtt = max(entry['dist_to_root'] for entry in t2n)
scaling_factor = prop_bases_mutated / max_rtt
for branch in bdtree.traverse():
    branch.dist *= scaling_factor
bdtree.write(outfile=tree_filename, format=3)

# now we use the pyvolve module, see http://sjspielman.org/pyvolve/
# Spielman, SJ and Wilke, CO. 2015.
# Pyvolve: A flexible Python module for simulating sequences along phylogenies. PLOS ONE. 10(9): e0139047.
t = pyvolve.read_tree(tree=bdtree.write(format=3))
m = pyvolve.Model("nucleotide")
p = pyvolve.Partition(models=m, root_sequence=miniseq)
e = pyvolve.Evolver(partitions=p, tree=t)

# Run the simulation and collect the resulting sequences.
e()
simulated_sequences = e.get_sequences()

# Write the sequences, ordered by name, first as tab-separated lines and
# then in FASTA layout.
ordered_names = sorted(simulated_sequences)

with open(sequence_filename, 'wt') as table_handle:
    table_handle.writelines(
        "{0}\t{1}\n".format(name, simulated_sequences[name])
        for name in ordered_names)

with open(fasta_filename, 'wt') as fasta_handle:
    fasta_handle.writelines(
        ">{0}\n{1}\n".format(name, simulated_sequences[name])
        for name in ordered_names)