Esempio n. 1
0
def invFels(y, x, L=1000, alphabet='nuc_nogap'):
    """Build a fixed four-leaf tree and simulate an alignment on it.

    The tree is ``(A:y, B:y, (C:x, D:x):y)``: leaves A and B and the internal
    branch above the (C, D) pair have length ``y``; leaves C and D have
    length ``x``.

    Parameters
    ----------
    y : float
        Branch length used for A, B and the branch leading to the (C, D) pair.
    x : float
        Branch length used for C and D.
    L : int
        Value assigned to ``seq_len`` of the GTR model.
    alphabet : str
        Alphabet name handed to ``GTR``.

    Returns
    -------
    tuple
        ``(tree, alignment)`` -- the parsed tree and the result of
        ``SeqGen.get_aln()`` after evolving sequences along it.
    """
    from io import StringIO

    # branch lengths are written with %f, i.e. six decimal places
    newick = "(A:%f,B:%f,(C:%f,D:%f):%f);" % (y, y, x, x, y)
    tree = Phylo.read(StringIO(newick), "newick")

    model = GTR(alphabet=alphabet)
    model.seq_len = L

    simulator = SeqGen(gtr=model, tree=tree)
    simulator.evolve()
    return tree, simulator.get_aln()
def infer_gene_gain_loss(rates = [1.0, 1.0], path_to_pangenome_dir = '/ebio/abt6_projects9/Pseudomonas_diversity/data/post_assembly_analysis/pan_genome/Ta1524/'):
    '''
    Reconstruct ancestral gene presence/absence patterns on the strain tree
    (adapted from panX).

    Parameters
    ----------
    rates : sequence of two floats
        Their sum is used as the overall rate ``mu`` of the two-state model
        and their normalised values as its equilibrium frequencies ``pi``.
    path_to_pangenome_dir : str
        Directory expected to contain ``vis/strain_tree.nwk`` and
        ``geneCluster/genePresence.aln``.

    Returns
    -------
    The ``TreeAnc`` object after maximum-likelihood ancestral reconstruction;
    every clade additionally carries ``genepresence`` (a reference to its
    reconstructed ``sequence``).
    '''
    # two-state ('0'/'1') model with a flat exchange matrix
    total_rate = np.sum(rates)
    state_freqs = np.array(rates)/total_rate
    gain_loss_model = GTR.custom(
        pi=state_freqs,
        mu=total_rate,
        W=np.ones((2,2)),
        alphabet=np.array(['0','1']),
    )
    # '-' stands for "unknown": equal weight on both states
    gain_loss_model.profile_map['-'] = np.ones(2)

    # input files: strain tree and the pseudo alignment of presence/absence
    # patterns (strings such as 001001010110)
    tree_file = path_to_pangenome_dir+"/vis/strain_tree.nwk"
    presence_aln = path_to_pangenome_dir+"/geneCluster/genePresence.aln"

    tree_anc = ta.TreeAnc(tree_file, gtr=gain_loss_model, verbose=2)

    # Bio.Phylo parses purely numeric leaf names as confidence values,
    # so recover the name from there when it is missing
    for tip in tree_anc.tree.get_terminals():
        if tip.name is not None:
            continue
        tip.name = str(tip.confidence)

    tree_anc.aln = presence_aln
    tree_anc.tree.root.branch_length = 0.0001
    tree_anc.reconstruct_anc(method='ml')

    # expose the reconstructed pattern under a descriptive attribute name
    for clade in tree_anc.tree.find_clades():
        clade.genepresence = clade.sequence

    return tree_anc
Esempio n. 3
0
def infer_gene_gain_loss(path, rates=[1.0, 1.0]):
    """Reconstruct ancestral gene presence/absence patterns on the strain tree.

    Parameters
    ----------
    path : str
        Directory expected to contain ``geneCluster/genePresence.aln`` and
        ``geneCluster/strain_tree.nwk``. Trailing ``/`` characters are ignored.
    rates : sequence of two floats
        Their sum is used as the overall rate ``mu`` of the two-state model
        and their normalised values as its equilibrium frequencies ``pi``.
        The default list is only read, never modified.

    Returns
    -------
    The ``TreeAnc`` object after maximum-likelihood ancestral reconstruction;
    every clade additionally carries ``genepresence`` (a reference to its
    reconstructed ``sequence``).
    """
    # initialize a two-state ('0'/'1') GTR model with a flat exchange matrix
    mu = np.sum(rates)
    gene_pi = np.array(rates) / mu
    gain_loss_model = GTR.custom(pi=gene_pi,
                                 mu=mu,
                                 W=np.ones((2, 2)),
                                 alphabet=np.array(['0', '1']))
    # add "unknown" state to profile: equal weight on both states
    gain_loss_model.profile_map['-'] = np.ones(2)

    # define file names; paths are joined with '/' explicitly
    sep = '/'
    base = path.rstrip(sep)
    # pseudo alignment of presence/absence patterns as in 001001010110
    fasta = sep.join([base, 'geneCluster', 'genePresence.aln'])
    # strain tree based on core gene SNPs
    nwk = sep.join([base, 'geneCluster', 'strain_tree.nwk'])

    # instantiate treetime with custom GTR
    t = ta.TreeAnc(nwk, gtr=gain_loss_model, verbose=2)
    # fix leaves names since Bio.Phylo interprets numeric leaf names as confidence
    for leaf in t.tree.get_terminals():
        if leaf.name is None:
            leaf.name = str(leaf.confidence)
    t.aln = fasta
    t.tree.root.branch_length = 0.0001
    t.reconstruct_anc(method='ml')

    # expose the reconstructed pattern under a descriptive attribute name
    for n in t.tree.find_clades():
        n.genepresence = n.sequence

    return t
Esempio n. 4
0
    def geo_inference(self, attr):
        """Infer ancestral values of a discrete node attribute.

        Each distinct value of ``attr`` found on the tree is mapped to a
        one-letter state ('A', 'B', ...). Leaves get that letter as a
        one-character sequence ('-' when the attribute is missing), and the
        sequence machinery of ``self.tt`` is reused to infer a GTR model and
        reconstruct the states of all nodes.

        Side effects:
          * every node gets ``attr`` set to its reconstructed value;
          * the nucleotide sequence of every node is kept in
            ``node.nuc_sequence`` and put back into ``node.sequence``;
          * ``self.tt.sequence_gtr`` holds the model that was active before
            the call (and is re-installed afterwards), ``self.tt.geogtr``
            the model in place after the inference.

        NOTE(review): internal nodes are expected to carry a ``sequence``
        when this is called, and every node is expected to have had one
        (otherwise the delete / restore steps raise AttributeError).

        Parameters
        ----------
        attr : str
            Name of the node attribute to reconstruct.
        """
        from treetime.gtr import GTR

        # collect the observed values and stash the nucleotide sequences
        places = set()
        for node in self.tree.find_clades():
            if hasattr(node, attr):
                places.add(getattr(node, attr))
            if hasattr(node, 'sequence'):
                node.nuc_sequence = node.sequence
        places = sorted(places)

        # one capital letter (starting at 'A') per place, plus the reverse map
        alphabet = {chr(65+i): place for i, place in enumerate(places)}
        # dict.items() exists on Python 2 and 3; iteritems() is Python 2 only
        alphabet_rev = {v: k for k, v in alphabet.items()}

        # flat model over the places; '-' means "unknown" (equal weights)
        self.tt.sequence_gtr = self.tt.gtr
        nc = len(places)
        myGeoGTR = GTR.custom(pi = np.ones(nc, dtype=float)/nc, W=np.ones((nc,nc)),
                              alphabet = np.array(sorted(alphabet)))
        myGeoGTR.profile_map['-'] = np.ones(nc)

        # encode the attribute of each leaf as a one-letter sequence
        for node in self.tree.get_terminals():
            if hasattr(node, attr):
                node.sequence = np.array([alphabet_rev[getattr(node, attr)]])
            else:
                node.sequence = np.array(['-'])
        # internal nodes must not carry the old nucleotide sequence
        for node in self.tree.get_nonterminals():
            delattr(node, 'sequence')

        # reconstruct, re-estimate the model, reconstruct again with it
        self.tt._gtr = myGeoGTR
        self.tt.reconstruct_anc(method='ml')
        self.tt.infer_gtr(print_raw=True)
        self.tt.reconstruct_anc(method='ml')

        # keep the geo model, put the sequence model back
        self.tt.geogtr = self.tt.gtr
        self.tt._gtr = self.tt.sequence_gtr
        # decode the reconstructed letter and restore the nucleotide sequence
        for node in self.tree.find_clades():
            setattr(node, attr, alphabet[node.sequence[0]])
            node.sequence = node.nuc_sequence
Esempio n. 5
0
    def tt_from_file(self, infile, root='none'):
        """Create ``self.tt`` from a newick file and decorate its leaves.

        The tree is read with a standard GTR model, the sequences of
        ``self.aln`` are attached to the leaves, and numeric dates are set
        for every alignment record that has a 'date' attribute. Afterwards
        the attributes of the matching entry in ``self.sequence_lookup`` are
        copied onto each leaf ('date' as a ``%Y-%m-%d`` string).

        Parameters
        ----------
        infile : str
            Newick file passed to ``treetime.io.treetime_from_newick``.
        root : str
            'midpoint' roots the tree at its midpoint, 'oldest' calls
            ``reroot_to_oldest``; any other value leaves the rooting as read.
        """
        from treetime.gtr import GTR
        from treetime import io, utils

        self.tt = io.treetime_from_newick(GTR.standard(), infile)
        io.set_seqs_to_leaves(self.tt, self.aln)

        # numeric dates for all records that carry a date
        numeric_dates = {seq.id: utils.numeric_date(seq.attributes['date'])
                         for seq in self.aln if 'date' in seq.attributes}
        io.set_node_dates_from_dic(self.tt, numeric_dates)

        self.tree = self.tt.tree
        if root == 'midpoint':
            self.tt.tree.root_at_midpoint()
            self.tt.set_additional_tree_params()
        elif root == 'oldest':
            self.tt.reroot_to_oldest()

        # copy per-sequence meta data onto the leaves
        for leaf in self.tree.get_terminals():
            if node_missing(leaf, self.sequence_lookup) if False else leaf.name not in self.sequence_lookup:
                continue
            record = self.sequence_lookup[leaf.name]
            for key in record.attributes:
                if key == 'date':
                    leaf.date = record.attributes['date'].strftime('%Y-%m-%d')
                else:
                    setattr(leaf, key, record.attributes[key])
from __future__ import print_function, division
import numpy as np
from Bio import Phylo

# Script entry point: runs the profile helpers of treetime once on a random
# profile. The timings in parentheses are the original author's notes and
# are not measured by this script.
if __name__ == '__main__':
    from treetime.seq_utils import normalize_profile, prof2seq, seq2prof
    from treetime.gtr import GTR

    gtr = GTR.standard('JC69')
    # random input of shape (10000, 5)
    dummy_prof = np.random.random(size=(10000, 5))

    # used a lot (300us); only the first element of the result is kept
    norm_prof = normalize_profile(dummy_prof)[0]

    # used less but still a lot (50us); result is discarded
    gtr.evolve(norm_prof, 0.1)

    # used less but still a lot (50us); result is discarded
    gtr.propagate_profile(norm_prof, 0.1)

    # used only in final, sample_from_prof=False speeds it up (600us or 300us)
    seq, p, seq_ii = prof2seq(norm_prof,
                              gtr,
                              sample_from_prof=True,
                              normalize=False)

    # used only initially (slow, 5ms); converts the sequence obtained above
    # back using the model's profile map
    tmp_prof = seq2prof(seq, gtr.profile_map)
Esempio n. 7
0
def date_from_seq_name(name):
    """Return the date encoded in a sequence name as a fractional year.

    The date is taken from the third '|'-separated field of *name*
    (surrounding whitespace removed) and parsed with ``str2date_time``.
    The result is ``year + day_of_year / 365.25``.
    """
    date_field = name.split('|')[2].strip()
    parsed = str2date_time(date_field)
    day_of_year = parsed.timetuple().tm_yday
    return parsed.year + day_of_year / 365.25


# Script entry point: loads the 200-sequence H3N2 NA example data set
# (tree, alignment, metadata) located relative to this file and runs the
# rerooting and ml_t steps on it.
if __name__=='__main__':
    root_dir = os.path.dirname(os.path.realpath(__file__))
    file_base = '../data/H3N2_NA_allyears_NA.200'
    fasta = os.path.join(root_dir, file_base+'.fasta')
    nwk = os.path.join(root_dir, file_base+'.nwk')
    mdf = os.path.join(root_dir, file_base+'.metadata.csv')

    # read tree from file
    gtr = GTR.standard()
    t = io.treetime_from_newick(gtr, nwk)
    # set alignment to the tree
    io.set_seqs_to_leaves(t, AlignIO.read(fasta, 'fasta'))
    # attach the metadata from the csv file
    io.read_metadata(t, mdf)
    t.reroot_to_best_root(infer_gtr=True)
    t.init_date_constraints()
    t.ml_t()
    # prepare the tree for display (the actual draw call below is disabled)
    t._score_branches()
    t.tree.ladderize()

    # NOTE(review): `polytomies` is not defined in the visible code --
    # confirm it is set elsewhere in the module
    if polytomies:
        #Phylo.draw(t.tree, label_func = lambda x:'', show_confidence=False, branch_labels='')
        # work on a deep copy so that `t` keeps its polytomies
        t1 = copy.deepcopy(t)
        t1.resolve_polytomies()
Esempio n. 8
0

def date_from_seq_name(name):
    """Convert the date embedded in a sequence name to a fractional year.

    *name* is split on '|'; its third field, stripped of whitespace, is
    parsed with ``str2date_time``. Returns ``year + day_of_year / 365.25``.
    """
    fields = name.split('|')
    when = str2date_time(fields[2].strip())
    year_fraction = when.timetuple().tm_yday / 365.25
    return when.year + year_fraction


# Script entry point: loads the 20-sequence H3N2 NA example data set
# (tree, alignment, metadata) located relative to this file, runs the
# rerooting and ml_t steps, then resolves polytomies on a copy.
if __name__ == '__main__':
    root_dir = os.path.dirname(os.path.realpath(__file__))
    fasta = os.path.join(root_dir, '../data/H3N2_NA_allyears_NA.20.fasta')
    nwk = os.path.join(root_dir, '../data/H3N2_NA_allyears_NA.20.nwk')
    mdf = os.path.join(root_dir, '../data/H3N2_NA_allyears_NA.20.metadata.csv')

    # read tree from file
    gtr = GTR.standard()
    t = io.treetime_from_newick(gtr, nwk)
    # set alignment to the tree
    io.set_seqs_to_leaves(t, AlignIO.read(fasta, 'fasta'))
    # attach the metadata from the csv file
    io.read_metadata(t, mdf)
    t.reroot_to_best_root(infer_gtr=True)
    t.init_date_constraints()
    t.ml_t()
    # prepare the tree for display (the actual draw call below is disabled)
    t._score_branches()
    t.tree.ladderize()

    #Phylo.draw(t.tree, label_func = lambda x:'', show_confidence=False, branch_labels='')
    # work on a deep copy so that `t` keeps its polytomies
    t1 = copy.deepcopy(t)
    t1.resolve_polytomies()
    t1.tree.ladderize()
Esempio n. 9
0
    def geo_inference(self, attr):
        '''
        Infer a "mugration" model by pretending each region corresponds to a
        sequence state and repurposing the GTR inference and ancestral
        reconstruction.

        Every distinct value of ``node.attr[attr]`` becomes a one-letter
        state ('A', 'B', ...). Leaves are given that letter as a
        one-character sequence ('-' when the value is missing) and
        ``self.tt.infer_ancestral_sequences`` is run with a flat starting
        model over these states.

        Side effects on success:
          * ``node.attr[attr]`` is set to the reconstructed value on every node;
          * every non-root node gets ``<attr>_transitions`` (the ``mutations``
            produced by the reconstruction);
          * nucleotide sequences, mutations and ``sequence_LH`` that existed
            before the call are put back;
          * ``self.tt.geogtr`` holds the model in place after the inference,
            with ``alphabet_to_location`` mapping letters to places, and
            ``self.tt.tree.geo_LH`` the corresponding ``sequence_LH`` if the
            tree has one;
          * ``attr`` is appended to ``self.dump_attr``.

        If fewer than 2 or more than 180 distinct values are found, a message
        is printed and the method returns ``None`` without modifying the tree.

        Parameters
        ----------
        attr : str
            Key in the per-node ``attr`` dictionary to reconstruct.
        '''
        from treetime.gtr import GTR

        # Determine the alphabet from the values present on the tree
        places = set()
        for node in self.tree.find_clades():
            if hasattr(node, 'attr') and attr in node.attr:
                places.add(node.attr[attr])

        # construct GTR (flat for now). The missing DATA symbol is a '-' (ord('-')==45)
        places = sorted(places)
        nc = len(places)
        # validate before any node state is touched, so that the early return
        # cannot leave nodes stripped of their mutations
        if nc < 2 or nc > 180:
            print(
                "geo_inference: can't have less than 2 or more than 180 places!"
            )
            return

        # store nucleotide sequences, mutations and likelihood for later restore
        nuc_seqs = {}
        nuc_muts = {}
        nuc_seq_LH = getattr(self.tt.tree, 'sequence_LH', None)
        for node in self.tree.find_clades():
            if hasattr(node, 'sequence'):
                nuc_seqs[node] = node.sequence
            if hasattr(node, 'mutations'):
                nuc_muts[node] = node.mutations
                delattr(node, 'mutations')

        alphabet = {chr(65 + i): place for i, place in enumerate(places)}
        # dict.items() exists on Python 2 and 3; iteritems() is Python 2 only
        alphabet_rev = {v: k for k, v in alphabet.items()}
        sequence_gtr = self.tt.gtr
        myGeoGTR = GTR.custom(pi=np.ones(nc, dtype=float) / nc,
                              W=np.ones((nc, nc)),
                              alphabet=np.array(sorted(alphabet)))
        myGeoGTR.profile_map['-'] = np.ones(nc)

        # set geo info to nodes as one letter sequence; a leaf without the
        # value (no attr dict, or the key is absent) is marked as missing so
        # that it never keeps its nucleotide sequence during the inference
        for node in self.tree.get_terminals():
            if hasattr(node, 'attr') and attr in node.attr:
                node.sequence = np.array([alphabet_rev[node.attr[attr]]])
            else:
                node.sequence = np.array(['-'])
        # internal nodes must not carry the old nucleotide sequence
        for node in self.tree.get_nonterminals():
            if hasattr(node, 'sequence'):
                delattr(node, 'sequence')

        # set custom GTR model, run inference
        self.tt._gtr = myGeoGTR
        tmp_use_mutation_length = self.tt.use_mutation_length
        self.tt.use_mutation_length = False
        self.tt.infer_ancestral_sequences(method='ml',
                                          infer_gtr=True,
                                          store_compressed=False,
                                          pc=5.0,
                                          marginal=True)

        # restore the nucleotide sequence and mutations to maintain expected behavior
        self.tt.geogtr = self.tt.gtr
        self.tt.geogtr.alphabet_to_location = alphabet
        self.tt._gtr = sequence_gtr
        self.dump_attr.append(attr)
        if hasattr(self.tt.tree, 'sequence_LH'):
            self.tt.tree.geo_LH = self.tt.tree.sequence_LH
            self.tt.tree.sequence_LH = nuc_seq_LH
        for node in self.tree.find_clades():
            # nodes without an attr dict were accepted above, so give them
            # one here instead of failing on the write-back
            if not hasattr(node, 'attr'):
                node.attr = {}
            node.attr[attr] = alphabet[node.sequence[0]]
            if node in nuc_seqs:
                node.sequence = nuc_seqs[node]
            if node.up is not None:
                # the "mutations" of the geo reconstruction are the transitions
                setattr(node, attr + '_transitions', node.mutations)
                if node in nuc_muts:
                    node.mutations = nuc_muts[node]

        self.tt.use_mutation_length = tmp_use_mutation_length