def invFels(y, x, L=1000, alphabet='nuc_nogap'):
    '''
    Simulate an alignment on a fixed four-taxon tree.

    The tree is ``(A, B, (C, D))``: leaves A and B and the internal branch
    leading to the (C, D) clade have length ``y``; leaves C and D have
    length ``x``.

    Parameters
    ----------
    y : float
        Branch length used for A, B and the internal branch.
    x : float
        Branch length used for C and D.
    L : int
        Value assigned to the model's ``seq_len`` attribute.
    alphabet : str
        Alphabet name handed to the ``GTR`` constructor.

    Returns
    -------
    tuple
        ``(tree, alignment)`` where ``alignment`` is whatever
        ``SeqGen.get_aln()`` returns after evolving along ``tree``.
    '''
    from io import StringIO

    # newick template: the five %f slots are A, B, C, D and the (C,D) stem
    branch_lengths = (y, y, x, x, y)
    newick = "(A:%f,B:%f,(C:%f,D:%f):%f);" % branch_lengths
    T = Phylo.read(StringIO(newick), "newick")

    model = GTR(alphabet=alphabet)
    model.seq_len = L

    simulator = SeqGen(gtr=model, tree=T)
    simulator.evolve()
    return T, simulator.get_aln()
def infer_gene_gain_loss(rates = [1.0, 1.0], path_to_pangenome_dir = '/ebio/abt6_projects9/Pseudomonas_diversity/data/post_assembly_analysis/pan_genome/Ta1524/'):
    '''code nabbed and edited from panX

    Reconstruct ancestral gene presence/absence states on the strain tree
    with a two-state ('0'/'1') GTR model.

    Parameters
    ----------
    rates : sequence of two floats
        Their sum becomes the overall rate ``mu`` and their normalised
        values the equilibrium frequencies ``pi`` of the model.
    path_to_pangenome_dir : str
        Directory containing ``vis/strain_tree.nwk`` and
        ``geneCluster/genePresence.aln``.

    Returns
    -------
    The ``ta.TreeAnc`` instance; every clade additionally carries a
    ``genepresence`` attribute referencing its reconstructed sequence.
    '''
    # two-state model parameters derived from the supplied rates
    total_rate = np.sum(rates)
    equilibrium = np.array(rates)/total_rate
    gain_loss_model = GTR.custom(
        pi=equilibrium,
        mu=total_rate,
        W=np.ones((2,2)),
        alphabet=np.array(['0','1']),
    )
    # '-' denotes an unknown state: both states are equally compatible
    gain_loss_model.profile_map['-'] = np.ones(2)

    # pseudo alignment of presence/absence patterns (e.g. 001001010110)
    # and the strain tree it is mapped onto
    presence_alignment = path_to_pangenome_dir+"/geneCluster/genePresence.aln"
    strain_tree = path_to_pangenome_dir+"/vis/strain_tree.nwk"

    t = ta.TreeAnc(strain_tree, gtr=gain_loss_model, verbose=2)

    # Bio.Phylo parses purely numeric leaf names as confidence values,
    # so recover the name from there when it is missing
    unnamed_leaves = [leaf for leaf in t.tree.get_terminals() if leaf.name is None]
    for leaf in unnamed_leaves:
        leaf.name = str(leaf.confidence)

    t.aln = presence_alignment
    t.tree.root.branch_length=0.0001
    t.reconstruct_anc(method='ml')

    # expose the reconstructed state under a descriptive attribute name
    for clade in t.tree.find_clades():
        clade.genepresence = clade.sequence
    return t
def infer_gene_gain_loss(path, rates=(1.0, 1.0)):
    '''
    Reconstruct ancestral gene presence/absence states on the strain tree
    with a two-state ('0'/'1') GTR model.

    Parameters
    ----------
    path : str
        Directory that contains ``geneCluster/genePresence.aln`` and
        ``geneCluster/strain_tree.nwk``; trailing '/' characters are ignored.
    rates : sequence of two floats
        Their sum becomes the overall rate ``mu`` and their normalised
        values the equilibrium frequencies ``pi`` of the model.

    Returns
    -------
    The ``ta.TreeAnc`` instance; every clade additionally carries a
    ``genepresence`` attribute referencing its reconstructed sequence.
    '''
    # initialize GTR model from the supplied rates
    mu = np.sum(rates)
    gene_pi = np.array(rates) / mu
    gain_loss_model = GTR.custom(pi=gene_pi, mu=mu,
                                 W=np.ones((2, 2)),
                                 alphabet=np.array(['0', '1']))
    # add "unknown" state to profile: both states are equally compatible
    gain_loss_model.profile_map['-'] = np.ones(2)

    sep = '/'
    base = path.rstrip(sep)
    # pseudo alignment of presence/absence patterns as in 001001010110
    fasta = sep.join([base, 'geneCluster', 'genePresence.aln'])
    # strain tree based on core gene SNPs
    nwk = sep.join([base, 'geneCluster', 'strain_tree.nwk'])

    # instantiate treetime with custom GTR
    t = ta.TreeAnc(nwk, gtr=gain_loss_model, verbose=2)

    # fix leaf names since Bio.Phylo interprets numeric leaf names as confidence
    for leaf in t.tree.get_terminals():
        if leaf.name is None:
            leaf.name = str(leaf.confidence)

    t.aln = fasta
    t.tree.root.branch_length = 0.0001
    t.reconstruct_anc(method='ml')

    # expose the reconstructed state under a descriptive attribute name
    for n in t.tree.find_clades():
        n.genepresence = n.sequence
    return t
def geo_inference(self, attr):
    '''
    Infer discrete-trait states (e.g. locations) for all nodes by encoding
    each observed value of ``attr`` as a one-letter sequence state and
    reusing the GTR inference and ancestral reconstruction of ``self.tt``.

    Parameters
    ----------
    attr : str
        Name of the node attribute holding the trait. Terminal nodes
        lacking it are treated as missing data ('-').

    Side effects
    ------------
    * every clade gets ``attr`` set to its reconstructed value,
    * ``self.tt.sequence_gtr`` keeps the original sequence model and
      ``self.tt.geogtr`` the inferred trait model,
    * ``node.sequence`` is restored from ``node.nuc_sequence`` for nodes
      that had a sequence before the call; nodes that had none end up
      without one again.
    '''
    from treetime.gtr import GTR

    # collect the observed trait values and stash the real sequences
    places = set()
    for node in self.tree.find_clades():
        if hasattr(node, attr):
            places.add(getattr(node, attr))
        if hasattr(node, 'sequence'):
            node.nuc_sequence = node.sequence

    # map each place to a single character starting at 'A'
    places = sorted(places)
    alphabet = {chr(65+i): place for i, place in enumerate(places)}
    # .items() instead of .iteritems(): the latter does not exist on Python 3
    alphabet_rev = {place: letter for letter, place in alphabet.items()}

    self.tt.sequence_gtr = self.tt.gtr
    nc = len(places)
    # flat starting model; '-' is compatible with every state
    myGeoGTR = GTR.custom(pi=np.ones(nc, dtype=float)/nc,
                          W=np.ones((nc, nc)),
                          alphabet=np.array(sorted(alphabet.keys())))
    myGeoGTR.profile_map['-'] = np.ones(nc)

    # leaves carry their trait as a length-one sequence
    for node in self.tree.get_terminals():
        if hasattr(node, attr):
            node.sequence = np.array([alphabet_rev[getattr(node, attr)]])
        else:
            node.sequence = np.array(['-'])
    # internal states are to be reconstructed, so drop any existing ones;
    # guarded because an internal node may not have a sequence yet
    for node in self.tree.get_nonterminals():
        if hasattr(node, 'sequence'):
            delattr(node, 'sequence')

    # reconstruct, re-estimate the model from that reconstruction, reconstruct again
    self.tt._gtr = myGeoGTR
    self.tt.reconstruct_anc(method='ml')
    self.tt.infer_gtr(print_raw=True)
    self.tt.reconstruct_anc(method='ml')

    # keep the trait model and put the sequence model back in place
    self.tt.geogtr = self.tt.gtr
    self.tt._gtr = self.tt.sequence_gtr

    for node in self.tree.find_clades():
        setattr(node, attr, alphabet[node.sequence[0]])
        if hasattr(node, 'nuc_sequence'):
            node.sequence = node.nuc_sequence
        else:
            # the node had no sequence before; don't leave the trait letter behind
            delattr(node, 'sequence')
def tt_from_file(self, infile, root='none'):
    '''
    Build ``self.tt`` from a newick file and decorate its leaves.

    The alignment ``self.aln`` is attached to the leaves, numeric dates are
    set for every sequence whose attributes contain 'date', and the tree is
    optionally rerooted. Finally all attributes of the matching entry in
    ``self.sequence_lookup`` are copied onto each terminal node ('date' is
    stored as a '%Y-%m-%d' string).

    Parameters
    ----------
    infile : str
        Newick file passed to ``io.treetime_from_newick``.
    root : str
        'midpoint' roots at the midpoint, 'oldest' calls
        ``reroot_to_oldest``; any other value leaves the root untouched.
    '''
    from treetime.gtr import GTR
    from treetime import io, utils

    self.tt = io.treetime_from_newick(GTR.standard(), infile)
    io.set_seqs_to_leaves(self.tt, self.aln)

    # numeric dates for all sequences that come with a date
    dated = {}
    for record in self.aln:
        if 'date' in record.attributes:
            dated[record.id] = utils.numeric_date(record.attributes['date'])
    io.set_node_dates_from_dic(self.tt, dated)

    self.tree = self.tt.tree
    if root=='midpoint':
        self.tt.tree.root_at_midpoint()
        self.tt.set_additional_tree_params()
    elif root=='oldest':
        self.tt.reroot_to_oldest()

    # copy sequence meta data onto the leaves
    for leaf in self.tree.get_terminals():
        if node_missing(leaf, self.sequence_lookup) if False else leaf.name not in self.sequence_lookup:
            continue
        record = self.sequence_lookup[leaf.name]
        for key in record.attributes:
            if key == 'date':
                leaf.date = record.attributes['date'].strftime('%Y-%m-%d')
            else:
                setattr(leaf, key, record.attributes[key])
from __future__ import print_function, division import numpy as np from Bio import Phylo if __name__ == '__main__': from treetime.seq_utils import normalize_profile, prof2seq, seq2prof from treetime.gtr import GTR gtr = GTR.standard('JC69') dummy_prof = np.random.random(size=(10000, 5)) # used a lot (300us) norm_prof = normalize_profile(dummy_prof)[0] # used less but still a lot (50us) gtr.evolve(norm_prof, 0.1) # used less but still a lot (50us) gtr.propagate_profile(norm_prof, 0.1) # used only in final, sample_from_prof=False speeds it up (600us or 300us) seq, p, seq_ii = prof2seq(norm_prof, gtr, sample_from_prof=True, normalize=False) # used only initially (slow, 5ms) tmp_prof = seq2prof(seq, gtr.profile_map)
def date_from_seq_name(name):
    '''
    Convert the date embedded in a sequence name into a fractional year.

    The name is split on '|'; the third field (stripped of whitespace) is
    parsed with ``str2date_time``. The result is the year plus the day of
    the year divided by 365.25.
    '''
    date_field = name.split('|')[2].strip()
    parsed = str2date_time(date_field)
    day_of_year = parsed.timetuple().tm_yday
    return parsed.year + day_of_year / 365.25


if __name__=='__main__':
    # input files live next to this script under ../data
    root_dir = os.path.dirname(os.path.realpath(__file__))
    file_base = '../data/H3N2_NA_allyears_NA.200'
    fasta, nwk, mdf = (os.path.join(root_dir, file_base+suffix)
                       for suffix in ('.fasta', '.nwk', '.metadata.csv'))

    # read tree from file
    gtr = GTR.standard()
    t = io.treetime_from_newick(gtr, nwk)

    # attach the alignment and the metadata to the tree
    alignment = AlignIO.read(fasta, 'fasta')
    io.set_seqs_to_leaves(t, alignment)
    io.read_metadata(t, mdf)

    # root, set up the date constraints and optimise
    t.reroot_to_best_root(infer_gtr=True)
    t.init_date_constraints()
    t.ml_t()

    # prepare the tree for plotting
    t._score_branches()
    t.tree.ladderize()

    # NOTE(review): `polytomies` is not defined in the visible part of this
    # script -- confirm it is set elsewhere before this point.
    if polytomies:
        # work on a copy so the unresolved tree stays available
        t1 = copy.deepcopy(t)
        t1.resolve_polytomies()
def date_from_seq_name(name):
    '''
    Convert the date embedded in a sequence name into a fractional year.

    The name is split on '|'; the third field (stripped of whitespace) is
    parsed with ``str2date_time``. The result is the year plus the day of
    the year divided by 365.25.
    '''
    date_field = name.split('|')[2].strip()
    parsed = str2date_time(date_field)
    day_of_year = parsed.timetuple().tm_yday
    return parsed.year + day_of_year / 365.25


if __name__ == '__main__':
    # input files live next to this script under ../data
    root_dir = os.path.dirname(os.path.realpath(__file__))
    fasta = os.path.join(root_dir, '../data/H3N2_NA_allyears_NA.20.fasta')
    nwk = os.path.join(root_dir, '../data/H3N2_NA_allyears_NA.20.nwk')
    mdf = os.path.join(root_dir, '../data/H3N2_NA_allyears_NA.20.metadata.csv')

    # read tree from file
    gtr = GTR.standard()
    t = io.treetime_from_newick(gtr, nwk)

    # attach the alignment and the metadata to the tree
    alignment = AlignIO.read(fasta, 'fasta')
    io.set_seqs_to_leaves(t, alignment)
    io.read_metadata(t, mdf)

    # root, set up the date constraints and optimise
    t.reroot_to_best_root(infer_gtr=True)
    t.init_date_constraints()
    t.ml_t()

    # prepare the tree for plotting
    t._score_branches()
    t.tree.ladderize()

    # resolve polytomies on a copy so the unresolved tree stays available
    t1 = copy.deepcopy(t)
    t1.resolve_polytomies()
    t1.tree.ladderize()
def geo_inference(self, attr):
    '''
    infer a "mugration" model by pretending each region corresponds to a
    sequence state and repurposing the GTR inference and ancestral
    reconstruction

    Parameters
    ----------
    attr : str
        Key in each node's ``attr`` dictionary that holds the trait
        (e.g. a region). Terminal nodes without it are treated as missing
        data ('-').

    Returns
    -------
    None. If fewer than 2 or more than 180 distinct values are found a
    message is printed and the tree is left untouched.

    Side effects (on success)
    -------------------------
    * ``node.attr[attr]`` is set on every clade to the reconstructed value,
    * non-root nodes get ``<attr>_transitions`` holding the trait changes,
    * ``self.tt.geogtr`` holds the inferred model, with the letter -> value
      map in ``alphabet_to_location``,
    * ``self.tt.tree.geo_LH`` holds the trait likelihood (if available),
    * ``attr`` is appended to ``self.dump_attr``,
    * nucleotide sequences, mutations, ``sequence_LH``, the sequence GTR and
      ``use_mutation_length`` are restored to their previous values.
    '''
    from treetime.gtr import GTR

    # Determine alphabet and remember the reconstructed ancestral sequences
    places = set()
    nuc_seqs = {}
    nuc_muts = {}
    nuc_seq_LH = None
    if hasattr(self.tt.tree, 'sequence_LH'):
        nuc_seq_LH = self.tt.tree.sequence_LH
    for node in self.tree.find_clades():
        if hasattr(node, 'attr') and attr in node.attr:
            places.add(node.attr[attr])
        if hasattr(node, 'sequence'):
            nuc_seqs[node] = node.sequence
        if hasattr(node, 'mutations'):
            nuc_muts[node] = node.mutations

    # validate BEFORE modifying any node, so that bailing out leaves the
    # tree (in particular node.mutations) exactly as it was
    places = sorted(places)
    nc = len(places)
    if nc < 2 or nc > 180:
        print("geo_inference: can't have less than 2 or more than 180 places!")
        return

    # the inference below writes fresh mutations; remove the saved ones first
    for node in nuc_muts:
        delattr(node, 'mutations')

    # construct GTR (flat for now). The missing DATA symbol is a '-' (ord('-')==45)
    alphabet = {chr(65 + i): place for i, place in enumerate(places)}
    # .items() instead of .iteritems(): the latter does not exist on Python 3
    alphabet_rev = {place: letter for letter, place in alphabet.items()}
    sequence_gtr = self.tt.gtr
    myGeoGTR = GTR.custom(pi=np.ones(nc, dtype=float) / nc,
                          W=np.ones((nc, nc)),
                          alphabet=np.array(sorted(alphabet.keys())))
    myGeoGTR.profile_map['-'] = np.ones(nc)

    # set geo info to nodes as one letter sequence; every leaf without the
    # trait is marked as missing so no nucleotide sequence leaks into the
    # trait model
    for node in self.tree.get_terminals():
        if hasattr(node, 'attr') and attr in node.attr:
            node.sequence = np.array([alphabet_rev[node.attr[attr]]])
        else:
            node.sequence = np.array(['-'])
    # internal states are to be reconstructed; guarded because an internal
    # node may not carry a sequence yet
    for node in self.tree.get_nonterminals():
        if hasattr(node, 'sequence'):
            delattr(node, 'sequence')

    # set custom GTR model, run inference
    self.tt._gtr = myGeoGTR
    tmp_use_mutation_length = self.tt.use_mutation_length
    self.tt.use_mutation_length = False
    self.tt.infer_ancestral_sequences(method='ml', infer_gtr=True,
                                      store_compressed=False, pc=5.0,
                                      marginal=True)

    # restore the nucleotide sequence and mutations to maintain expected behavior
    self.tt.geogtr = self.tt.gtr
    self.tt.geogtr.alphabet_to_location = alphabet
    self.tt._gtr = sequence_gtr
    self.dump_attr.append(attr)
    if hasattr(self.tt.tree, 'sequence_LH'):
        self.tt.tree.geo_LH = self.tt.tree.sequence_LH
        self.tt.tree.sequence_LH = nuc_seq_LH
    for node in self.tree.find_clades():
        node.attr[attr] = alphabet[node.sequence[0]]
        if node in nuc_seqs:
            node.sequence = nuc_seqs[node]
        if node.up is not None:
            # the "mutations" found by the trait inference are the transitions
            setattr(node, attr + '_transitions', node.mutations)
        if node in nuc_muts:
            node.mutations = nuc_muts[node]
    self.tt.use_mutation_length = tmp_use_mutation_length