def infer_branch_length_twoside(tree, aln, marginal=False, alphabet='nuc_nogap'):
    """Optimize branch lengths from two starting points and keep the better run.

    The tree is loaded into a ``TreeAnc`` together with the alignment. Branch
    lengths are then optimized twice: once after ``set_internal(tt.tree, 1e-10)``
    (small internal branches) and once after restoring the original lengths and
    calling ``set_internal(tt.tree, 1)`` (large internal branches). The run
    with the strictly higher likelihood is returned; on a tie the
    large-start run wins.

    Parameters
    ----------
    tree : object
        passed unchanged to ``TreeAnc(tree=...)``
    aln : object
        passed unchanged to ``TreeAnc(aln=...)``
    marginal : bool, optional
        if True use ``branch_length_mode='marginal'`` and score the run with
        ``tt.sequence_LH()``; otherwise use ``'joint'`` and score it with
        ``tt.tree.sequence_joint_LH``
    alphabet : str, optional
        alphabet handed to ``TreeAnc``

    Returns
    -------
    tuple
        ``(lengths, LH)`` where ``lengths`` is a numpy array with one
        ``[before, after]`` row per clade name and ``LH`` is the likelihood
        of the selected run.
    """
    tt = TreeAnc(tree=tree, aln=aln, alphabet=alphabet, verbose=0)

    # Branch lengths as found right after loading, keyed by clade name.
    # They serve both as the "before" column and as the restore point
    # between the two optimization runs.
    before = {n.name: n.branch_length for n in tt.tree.find_clades()}
    bl_mode = 'marginal' if marginal else 'joint'

    def restore_original_bls():
        """Reset every clade to the branch length recorded in ``before``."""
        for n in tt.tree.find_clades():
            n.branch_length = before[n.name]

    def optimize():
        """Run one optimization and return ``(before/after array, LH)``."""
        tt.prepare_tree()
        tt.optimize_sequences_and_branch_length(
            branch_length_mode=bl_mode, prune_short=False, max_iter=20)
        after = {n.name: n.branch_length for n in tt.tree.find_clades()}
        LH = tt.sequence_LH() if marginal else tt.tree.sequence_joint_LH
        return np.array([[before[k], after[k]] for k in before]), LH

    # First run things with a small internal branch.
    set_internal(tt.tree, 1e-10)
    small_start = optimize()

    # Next run things with a large internal branch.
    restore_original_bls()
    set_internal(tt.tree, 1)
    large_start = optimize()

    # Keep whichever starting point reached the higher likelihood.
    if small_start[1] > large_start[1]:
        return small_start
    return large_start
def infer_branch_length(tree, aln, distance_scale=1.0, marginal=False, IC=None,
                        alphabet='nuc_nogap'):
    """Optimize branch lengths of a tree and report them next to the originals.

    Parameters
    ----------
    tree : object
        passed unchanged to ``TreeAnc(tree=...)``
    aln : object
        passed unchanged to ``TreeAnc(aln=...)``
    distance_scale : float, optional
        factor by which ``branch_length`` and ``mutation_length`` of every
        clade are multiplied before anything else happens
    marginal : bool, optional
        if True use ``branch_length_mode='marginal'`` and score the result
        with ``tt.sequence_LH()``; otherwise use ``'joint'`` and score it with
        ``tt.tree.sequence_joint_LH``
    IC : callable, optional
        if truthy, called as ``IC(tt.tree)`` after the scaled lengths have
        been recorded, to perturb the starting branch lengths
    alphabet : str, optional
        alphabet handed to ``TreeAnc``

    Returns
    -------
    tuple
        ``(lengths, LH)`` where ``lengths`` is a numpy array with one
        ``[before, after]`` row per clade name (``before`` being the scaled
        length, recorded prior to calling ``IC``) and ``LH`` is the
        likelihood after optimization.
    """
    tt = TreeAnc(tree=tree, aln=aln, alphabet=alphabet, verbose=0)

    before = {}
    for n in tt.tree.find_clades():
        n.branch_length *= distance_scale
        n.mutation_length *= distance_scale
        before[n.name] = n.branch_length

    # mess up branch length prior to optimization
    if IC:
        IC(tt.tree)

    tt.prepare_tree()
    tt.optimize_sequences_and_branch_length(
        branch_length_mode='marginal' if marginal else 'joint',
        prune_short=False, max_iter=20)

    after = {n.name: n.branch_length for n in tt.tree.find_clades()}
    LH = tt.sequence_LH() if marginal else tt.tree.sequence_joint_LH
    return np.array([[before[k], after[k]] for k in before]), LH
'_aa' if args.aa else '') aln_freq_name = hiv_out + os.path.basename( aln[:-5]) + 'alignment_frequencies%s' % ('_aa' if args.aa else '') gtr = None aln_freq = None if not args.redo: try: aln_freq = np.loadtxt(aln_freq_name) gtr = load_model(model_name) except: pass if gtr is None: tt = TreeAnc(tree=tree, aln=aln, compress=False, alphabet=alphabet, verbose=3) # tt.optimize_tree(branch_length_mode='marginal', max_iter=10, # infer_gtr=True, site_specific_gtr=True, pc=pc) tt.infer_gtr_iterative(max_iter=10, site_specific=True, pc=pc) gtr = tt.gtr save_model(gtr, model_name) if aln_freq is None: aln_freq = p_from_aln(AlignIO.read(aln, 'fasta'), alphabet=alphabet) np.savetxt(aln_freq_name, aln_freq) fabio_fitness = load_fitness_landscape(args.gene,
def analyze(ana, tree, aln, alphabet, prefix, params, true_model):
    """Infer a model using the strategy named by ``ana`` and compare it to
    the true model.

    The tree is read from the newick file ``tree``. The returned dict holds
    the sequence likelihood (``'LH'``), the total branch length
    (``'tree_length'``) and whatever ``assess_reconstruction`` reports for
    the inferred versus the true model.
    """
    phylo_tree = Phylo.read(tree, format="newick")

    # if the true tree is used, rescale its branches with the mutation rate
    # to convert them to evolutionary distance
    if tree == tree_name(prefix, params):
        for clade in phylo_tree.find_clades():
            clade.branch_length *= params['m']

    tt = TreeAnc(tree=phylo_tree, aln=aln, compress=False, alphabet=alphabet)

    # (marginal, site_specific) settings of the analyses that boil down to
    # a single infer_gtr call
    single_pass = {
        "unspecific": (False, False),
        "ml_reconstruction": (False, True),
        "marginalize": (True, True),
    }

    if ana in single_pass:
        use_marginal, per_site = single_pass[ana]
        tt.infer_gtr(marginal=use_marginal, normalized_rate=False,
                     site_specific=per_site, pc=pc)
    elif ana == "iterative":
        # this performs iterative estimation of the model and ancestral
        # sequences using marginal ancestral reconstruction
        tt.infer_gtr_iterative(normalized_rate=False, site_specific=True,
                               pc=pc, max_iter=10)
    elif ana == 'optimize_tree':
        tt.optimize_tree(branch_length_mode='marginal', max_iter=10,
                         infer_gtr=True, site_specific_gtr=True, pc=pc)

    # calculate likelihood
    tt.infer_ancestral_sequences(marginal=True)

    inferred = tt.gtr
    np.fill_diagonal(inferred.W, 0)

    result = dict(LH=tt.sequence_LH(),
                  tree_length=tt.tree.total_branch_length())
    result.update(assess_reconstruction(inferred, true_model))
    return result
def get_LH(tname, aln, gtr):
    """Report the sequence likelihood of an alignment on a tree for a given
    GTR model, after marginal reconstruction of the ancestral sequences.
    """
    anc = TreeAnc(tree=tname, aln=aln, gtr=gtr, compress=False)
    anc.infer_ancestral_sequences(marginal=True)
    likelihood = anc.sequence_LH()
    return likelihood
def simplex(params, out_prefix=None, yule=True, n_model=5, n_seqgen=5, JC=False,
            alphabet='nuc_nogap', alpha=1.0, rate_alpha=1.5,
            W_dirichlet_alpha=2.0):
    """Generate a tree and random GTR model with frequency parameters sampled
    from a Dirichlet distribution on the simplex

    Parameters
    ----------
    params : dict
        dictionary with parameters of the evolutionary process, sample size
        etc. The keys ``'n'``, ``'L'`` and ``'m'`` are read; the keys
        ``'model'`` and ``'seqgen'`` are overwritten in place with the
        current loop indices.
    out_prefix : None, optional
        save the generated data using this prefix and otherwise standardized
        file names. Nothing is written when this is falsy.
    yule : bool, optional
        generate a Yule tree instead of a Kingman Coalescent tree
    n_model : int, optional
        number of distinct models to draw for each tree
    n_seqgen : int, optional
        number of times sequences are evolved for each tree/model combination
    JC : bool, optional
        Use a Jukes Cantor model for the preference but include rate variation
    alphabet : str, optional
        alphabet of the GTR model
    alpha : float, optional
        parameter of the Dirichlet distribution for frequencies
    rate_alpha : float, optional
        parameter of the rate distribution (Gamma)
    W_dirichlet_alpha : float, optional
        parameter of the Dirichlet distribution of W matrix elements

    Notes
    -----
    The raw alignment is compressed by invoking the external ``gzip``
    program with an argument list (no shell), so file names containing
    spaces or shell metacharacters are handled correctly. A non-zero exit
    status of ``gzip`` is ignored.
    """
    import subprocess

    from Bio import AlignIO

    # generate the tree
    T = betatree(params['n'], alpha=2.0)
    T.yule = yule
    T.coalesce()

    # ladderize the tree and name internal nodes via loading into TreeAnc
    T.BioTree.ladderize()
    tt = TreeAnc(tree=T.BioTree)
    if out_prefix:
        Phylo.write(tt.tree, tree_name(out_prefix, params), 'newick')

    for mi in range(n_model):
        params['model'] = mi

        # draw a random site-specific model
        if JC:
            myGTR = GTR_site_specific.random(
                L=params['L'], alphabet=alphabet,
                pi_dirichlet_alpha=False, W_dirichlet_alpha=False,
                mu_gamma_alpha=rate_alpha)
        else:
            myGTR = GTR_site_specific.random(
                L=params['L'], alphabet=alphabet,
                pi_dirichlet_alpha=alpha, mu_gamma_alpha=rate_alpha,
                W_dirichlet_alpha=W_dirichlet_alpha)

        # scale the rates of the model by params['m']
        myGTR.mu *= params['m']

        if out_prefix:
            save_model(myGTR, model_name(out_prefix, params))

        for si in range(n_seqgen):
            params['seqgen'] = si

            # generate sequences
            mySeq = SeqGen(params['L'], gtr=myGTR, tree=T.BioTree)
            mySeq.evolve()

            if not out_prefix:
                continue

            save_mutation_count(mySeq, mutation_count_name(out_prefix, params))
            with open(alignment_name_raw(out_prefix, params), 'wt') as fh:
                AlignIO.write(mySeq.get_aln(), fh, 'fasta')

            reconstruct_tree(out_prefix, params, aa='aa' in alphabet)

            # compress the raw alignment; pass the path as a separate
            # argument instead of splicing it into a shell command string
            subprocess.run(['gzip', alignment_name_raw(out_prefix, params)],
                           check=False)