コード例 #1
0
def infer_branch_length_twoside(tree,
                                aln,
                                marginal=False,
                                alphabet='nuc_nogap'):
    """Optimize branch lengths from two starting points and keep the better run.

    The tree is loaded into a ``TreeAnc`` object and optimized twice: first
    after ``set_internal(tt.tree, 1e-10)`` and then, with the originally
    loaded branch lengths restored, after ``set_internal(tt.tree, 1)``.
    The first run is returned only if its likelihood is strictly larger than
    that of the second run; otherwise the second run is returned.

    Parameters
    ----------
    tree, aln
        Passed unchanged to ``TreeAnc``.
    marginal : bool, optional
        If true, optimize with ``branch_length_mode='marginal'`` and score the
        run with ``tt.sequence_LH()``; otherwise optimize with ``'joint'`` and
        score it with ``tt.tree.sequence_joint_LH``.
    alphabet : str, optional
        Alphabet passed to ``TreeAnc``.

    Returns
    -------
    tuple
        ``(lengths, LH)``: ``lengths`` is an array holding one
        ``[before, after]`` pair per node name and ``LH`` is the likelihood of
        the selected run.
    """
    tt = TreeAnc(tree=tree, aln=aln, alphabet=alphabet, verbose=0)
    # Branch lengths as loaded, keyed by node name; used both for reporting
    # and for resetting the tree between the two optimization runs.
    before = {n.name: n.branch_length for n in tt.tree.find_clades()}
    mode = 'marginal' if marginal else 'joint'

    def restore_original_bls():
        for n in tt.tree.find_clades():
            n.branch_length = before[n.name]

    def optimize():
        tt.prepare_tree()
        tt.optimize_sequences_and_branch_length(branch_length_mode=mode,
                                                prune_short=False,
                                                max_iter=20)

        # Collected per run so that each result reports only its own lengths.
        after = {n.name: n.branch_length for n in tt.tree.find_clades()}

        LH = tt.sequence_LH() if marginal else tt.tree.sequence_joint_LH

        return np.array([[before[k], after[k]] for k in before]), LH

    # First run things with a small internal branch.
    set_internal(tt.tree, 1e-10)
    small_start = optimize()
    # Next run things with a large internal branch.
    restore_original_bls()
    set_internal(tt.tree, 1)
    large_start = optimize()

    if small_start[1] > large_start[1]:
        return small_start
    return large_start
コード例 #2
0
def infer_branch_length(tree,
                        aln,
                        distance_scale=1.0,
                        marginal=False,
                        IC=None,
                        alphabet='nuc_nogap'):
    """Rescale a tree's branch lengths, optionally perturb them, and re-optimize.

    Parameters
    ----------
    tree, aln
        Passed unchanged to ``TreeAnc``.
    distance_scale : float, optional
        Factor multiplied into ``branch_length`` and ``mutation_length`` of
        every node before anything else happens.
    marginal : bool, optional
        If true, optimize with ``branch_length_mode='marginal'`` and score with
        ``tt.sequence_LH()``; otherwise optimize with ``'joint'`` and score
        with ``tt.tree.sequence_joint_LH``.
    IC : callable, optional
        If given (and truthy), called as ``IC(tt.tree)`` after rescaling and
        before ``tt.prepare_tree()`` to set the initial condition of the
        optimization.
    alphabet : str, optional
        Alphabet passed to ``TreeAnc``.

    Returns
    -------
    tuple
        ``(lengths, LH)``: ``lengths`` is an array holding one
        ``[before, after]`` pair per node name, where ``before`` is the
        rescaled length recorded prior to calling ``IC``; ``LH`` is the
        likelihood after optimization.
    """
    tt = TreeAnc(tree=tree, aln=aln, alphabet=alphabet, verbose=0)
    before = {}
    for n in tt.tree.find_clades():
        n.branch_length *= distance_scale
        n.mutation_length *= distance_scale
        before[n.name] = n.branch_length

    # mess up branch lengths prior to optimization
    if IC:
        IC(tt.tree)
    tt.prepare_tree()

    mode = 'marginal' if marginal else 'joint'
    tt.optimize_sequences_and_branch_length(branch_length_mode=mode,
                                            prune_short=False,
                                            max_iter=20)

    after = {n.name: n.branch_length for n in tt.tree.find_clades()}

    LH = tt.sequence_LH() if marginal else tt.tree.sequence_joint_LH

    return np.array([[before[k], after[k]] for k in before]), LH
コード例 #3
0
                                                    '_aa' if args.aa else '')
    aln_freq_name = hiv_out + os.path.basename(
        aln[:-5]) + 'alignment_frequencies%s' % ('_aa' if args.aa else '')

    gtr = None
    aln_freq = None
    if not args.redo:
        try:
            aln_freq = np.loadtxt(aln_freq_name)
            gtr = load_model(model_name)
        except:
            pass
    if gtr is None:
        tt = TreeAnc(tree=tree,
                     aln=aln,
                     compress=False,
                     alphabet=alphabet,
                     verbose=3)
        # tt.optimize_tree(branch_length_mode='marginal', max_iter=10,
        #                  infer_gtr=True, site_specific_gtr=True, pc=pc)

        tt.infer_gtr_iterative(max_iter=10, site_specific=True, pc=pc)

        gtr = tt.gtr
        save_model(gtr, model_name)

    if aln_freq is None:
        aln_freq = p_from_aln(AlignIO.read(aln, 'fasta'), alphabet=alphabet)
        np.savetxt(aln_freq_name, aln_freq)

    fabio_fitness = load_fitness_landscape(args.gene,
コード例 #4
0
def analyze(ana, tree, aln, alphabet, prefix, params, true_model):
    """Infer a model as selected by the analysis type ``ana`` and compare it
    to the true model.

    ``tree`` is read with ``Phylo.read`` as newick. When it equals
    ``tree_name(prefix, params)`` every branch length is multiplied by
    ``params['m']``. Recognized values of ``ana`` are ``"iterative"``,
    ``"unspecific"``, ``"ml_reconstruction"``, ``"marginalize"`` and
    ``'optimize_tree'``; for any other value no inference step is run before
    the likelihood is evaluated.

    Returns a dict with the keys ``'LH'`` and ``'tree_length'``, updated with
    the result of ``assess_reconstruction(model, true_model)``. The diagonal
    of the model's ``W`` is set to zero in place before that comparison.
    """
    phylo_tree = Phylo.read(tree, format="newick")

    # if the true tree is used, rescale its branches with the mutation rate
    # to convert them to evolutionary distance
    if tree == tree_name(prefix, params):
        for clade in phylo_tree.find_clades():
            clade.branch_length *= params['m']

    anc = TreeAnc(tree=phylo_tree, aln=aln, compress=False, alphabet=alphabet)

    # analysis types handled by a single infer_gtr call:
    # (name, marginal, site_specific)
    single_pass_modes = (
        ("unspecific", False, False),
        ("ml_reconstruction", False, True),
        ("marginalize", True, True),
    )

    if ana == "iterative":
        # iterative estimation of the model and ancestral sequences
        # using marginal ancestral reconstruction
        anc.infer_gtr_iterative(normalized_rate=False,
                                site_specific=True,
                                pc=pc,
                                max_iter=10)
    elif ana == 'optimize_tree':
        anc.optimize_tree(branch_length_mode='marginal',
                          max_iter=10,
                          infer_gtr=True,
                          site_specific_gtr=True,
                          pc=pc)
    else:
        for mode_name, use_marginal, per_site in single_pass_modes:
            if ana == mode_name:
                anc.infer_gtr(marginal=use_marginal,
                              normalized_rate=False,
                              site_specific=per_site,
                              pc=pc)
                break

    # calculate likelihood
    anc.infer_ancestral_sequences(marginal=True)
    fitted = anc.gtr
    np.fill_diagonal(fitted.W, 0)

    likelihood = anc.sequence_LH()
    tree_length = anc.tree.total_branch_length()
    summary = {'LH': likelihood, 'tree_length': tree_length}
    summary.update(assess_reconstruction(fitted, true_model))

    return summary
コード例 #5
0
def get_LH(tname, aln, gtr):
    """Return the sequence likelihood under a given GTR model.

    A ``TreeAnc`` is built from the tree ``tname``, the alignment ``aln`` and
    the model ``gtr`` (with ``compress=False``); ancestral sequences are then
    inferred with ``marginal=True`` and the value of ``sequence_LH()`` is
    returned.
    """
    anc_kwargs = dict(tree=tname, aln=aln, gtr=gtr, compress=False)
    reconstruction = TreeAnc(**anc_kwargs)
    reconstruction.infer_ancestral_sequences(marginal=True)
    likelihood = reconstruction.sequence_LH()
    return likelihood
コード例 #6
0
def simplex(params,
            out_prefix=None,
            yule=True,
            n_model=5,
            n_seqgen=5,
            JC=False,
            alphabet='nuc_nogap',
            alpha=1.0,
            rate_alpha=1.5,
            W_dirichlet_alpha=2.0):
    """Generate a tree and random GTR models with frequency parameters sampled
    from a Dirichlet distribution on the simplex, and evolve sequences on it.

    Nothing is returned; results are only written to disk, and only when
    ``out_prefix`` is given. ``params`` is modified in place: ``params['model']``
    and ``params['seqgen']`` are set to the current model and replicate index
    so that the file-name helpers produce distinct names.

    Parameters
    ----------
    params : dict
        dictionary with parameters of the evolutionary process, sample size etc.
        The keys ``'n'``, ``'L'`` and ``'m'`` are read here.
    out_prefix : None, optional
        save the generated data using this prefix and otherwise standardized file names
    yule : bool, optional
        generate a Yule tree instead of a Kingman Coalescent tree
    n_model : int, optional
        number of distinct models to draw for each tree
    n_seqgen : int, optional
        number of times sequences are evolved for each tree/model combination
    JC : bool, optional
        Use a Jukes Cantor model for the preference but include rate variation
    alphabet : str, optional
        alphabet of the GTR model
    alpha : float, optional
        parameter of the Dirichlet distribution for frequencies
    rate_alpha : float, optional
        parameter of the rate distribution (Gamma)
    W_dirichlet_alpha : float, optional
        parameter of the Dirichlet distribution of W matrix elements
    """
    import subprocess
    import warnings

    from Bio import AlignIO
    # generate a model
    # NOTE: the betatree alpha is fixed at 2.0; the ``alpha`` argument of this
    # function is only used for the frequency distribution below.
    T = betatree(params['n'], alpha=2.0)
    T.yule = yule
    T.coalesce()
    # ladderize the tree and name internal nodes via loading into TreeAnc
    T.BioTree.ladderize()
    tt = TreeAnc(tree=T.BioTree)
    if out_prefix:
        Phylo.write(tt.tree, tree_name(out_prefix, params), 'newick')

    for mi in range(n_model):
        params['model'] = mi
        if JC:
            myGTR = GTR_site_specific.random(L=params['L'],
                                             alphabet=alphabet,
                                             pi_dirichlet_alpha=False,
                                             W_dirichlet_alpha=False,
                                             mu_gamma_alpha=rate_alpha)
        else:
            myGTR = GTR_site_specific.random(
                L=params['L'],
                alphabet=alphabet,
                pi_dirichlet_alpha=alpha,
                mu_gamma_alpha=rate_alpha,
                W_dirichlet_alpha=W_dirichlet_alpha)

        myGTR.mu *= params['m']

        if out_prefix:
            save_model(myGTR, model_name(out_prefix, params))

        for si in range(n_seqgen):
            params['seqgen'] = si
            # generate sequences
            mySeq = SeqGen(params['L'], gtr=myGTR, tree=T.BioTree)
            mySeq.evolve()

            if out_prefix:
                save_mutation_count(mySeq,
                                    mutation_count_name(out_prefix, params))
                raw_aln = alignment_name_raw(out_prefix, params)
                with open(raw_aln, 'wt') as fh:
                    AlignIO.write(mySeq.get_aln(), fh, 'fasta')
                reconstruct_tree(out_prefix, params, aa='aa' in alphabet)
                # Compress the raw alignment. An argument list (no shell) is
                # used so that prefixes containing spaces or shell
                # metacharacters are passed to gzip verbatim.
                try:
                    subprocess.call(['gzip', raw_aln])
                except OSError as err:
                    # Compression is best-effort: keep generating data even
                    # if gzip cannot be launched.
                    warnings.warn("could not run gzip on %s: %s" %
                                  (raw_aln, err))