コード例 #1
0
    def setUp(self):
        """Create the named and unnamed sequence/alignment fixtures."""
        degen_gapped = RNA.Alphabets.DegenGapped

        # Sequences that carry an explicit Name.
        self.rna1 = RnaSequence('UCAGGG', Name='rna1')
        self.rna2 = RnaSequence('YCU-RG', Name='rna2')
        self.rna3 = RnaSequence('CAA-NR', Name='rna3')
        self.model1 = ModelSequence('UCAGGG', Name='rna1',
                                    Alphabet=degen_gapped)
        self.model2 = ModelSequence('YCU-RG', Name='rna2',
                                    Alphabet=degen_gapped)
        self.model3 = ModelSequence('CAA-NR', Name='rna3',
                                    Alphabet=degen_gapped)

        named_rnas = [self.rna1, self.rna2, self.rna3]
        named_models = [self.model1, self.model2, self.model3]
        self.aln = Alignment(named_rnas, MolType=RNA)
        self.da = DenseAlignment(named_models, MolType=RNA,
                                 Alphabet=degen_gapped)

        # The same sequence data, but without names.
        self.nn_rna1 = RnaSequence('UCAGGG')
        self.nn_rna2 = RnaSequence('YCU-RG')
        self.nn_rna3 = RnaSequence('CAA-NR')

        self.nn_model1 = ModelSequence('UCAGGG', Alphabet=degen_gapped)
        self.nn_model2 = ModelSequence('YCU-RG', Alphabet=degen_gapped)
        self.nn_model3 = ModelSequence('CAA-NR', Alphabet=degen_gapped)

        unnamed_rnas = [self.nn_rna1, self.nn_rna2, self.nn_rna3]
        unnamed_models = [self.nn_model1, self.nn_model2, self.nn_model3]
        self.nn_aln = Alignment(unnamed_rnas, MolType=RNA)
        self.nn_da = DenseAlignment(unnamed_models, MolType=RNA,
                                    Alphabet=degen_gapped)
コード例 #2
0
def align_two_alignments(aln1, aln2, moltype, params=None):
    """Returns an Alignment object from two existing Alignments.

    aln1, aln2: cogent.core.alignment.Alignment objects, or data that can be
    used to build them.

    moltype: MolType handed to every Alignment constructed here.

    params: dict of parameters to pass in to the Clustal app controller.

    The temporary profile files and the app result files are removed even
    when Clustalw or the parser raises.
    """
    #Build Alignment objects and swap full IDs for abbreviated ones
    aln1 = Alignment(aln1, MolType=moltype)
    aln1_int_map, aln1_int_keys = aln1.getIntMap()
    aln1_int_map = Alignment(aln1_int_map, MolType=moltype)

    #Second alignment gets its own prefix so the abbreviated IDs do not clash
    aln2 = Alignment(aln2, MolType=moltype)
    aln2_int_map, aln2_int_keys = aln2.getIntMap(prefix='seqn_')
    aln2_int_map = Alignment(aln2_int_map, MolType=moltype)

    #One lookup table from abbreviated ID to original ID for both inputs
    aln1_int_keys.update(aln2_int_keys)

    #Create Clustalw app in profile-alignment mode.
    app = Clustalw(InputHandler='_input_as_multiline_string',
        params=params,
        SuppressStderr=True)
    app.Parameters['-align'].off()
    app.Parameters['-infile'].off()
    app.Parameters['-profile'].on()

    profile_paths = []
    try:
        #Write each alignment to a temp file and register it as a profile
        for flag, int_map in (('-profile1', aln1_int_map),
                              ('-profile2', aln2_int_map)):
            path = app._tempfile_as_multiline_string(int_map.toFasta())
            profile_paths.append(path)
            app.Parameters[flag].on(path)

        res = app()
        try:
            #Get alignment as dict out of results
            alignment = dict(ClustalParser(res['Align'].readlines()))
        finally:
            res.cleanUp()
    finally:
        for path in profile_paths:
            remove(path)

    #Make new dict mapping original IDs
    new_alignment = {}
    for k, v in alignment.items():
        new_alignment[aln1_int_keys[k]] = v

    return Alignment(new_alignment, MolType=moltype)
コード例 #3
0
def align_two_alignments(aln1, aln2, moltype, params=None):
    """Returns an Alignment object from two existing Alignments.

    aln1, aln2: cogent.core.alignment.Alignment objects, or data that can be
    used to build them.
        - Mafft profile alignment only works with aligned sequences. Alignment
        object used to handle unaligned sequences.

    moltype: MolType handed to every Alignment constructed here.

    params: dict of parameters to pass in to the Mafft app controller.

    Temporary input files and the app result files are removed even when
    Mafft or the parser raises. The 'pre' and 'trace' files in the current
    directory are removed when present.
    """
    #Build Alignment objects and swap full IDs for abbreviated ones
    aln1 = Alignment(aln1,MolType=moltype)
    aln1_int_map, aln1_int_keys = aln1.getIntMap()
    aln1_int_map = Alignment(aln1_int_map,MolType=moltype)

    #Second alignment gets its own prefix so the abbreviated IDs do not clash
    aln2 = Alignment(aln2,MolType=moltype)
    aln2_int_map, aln2_int_keys = aln2.getIntMap(prefix='seqn_')
    aln2_int_map = Alignment(aln2_int_map,MolType=moltype)

    #One lookup table from abbreviated ID to original ID for both inputs
    aln1_int_keys.update(aln2_int_keys)

    #Create Mafft app and switch it to the profile-alignment executable.
    app = Mafft(InputHandler='_input_as_paths',
        params=params,
        SuppressStderr=False)
    app._command = 'mafft-profile'

    filepaths = []
    try:
        for int_map in (aln1_int_map, aln2_int_map):
            filepaths.append(
                app._tempfile_as_multiline_string(int_map.toFasta()))

        #Get results using the two temp files as input to app
        res = app(filepaths)
        try:
            #Get alignment as dict out of results
            alignment = dict(MinimalFastaParser(res['StdOut'].readlines()))
        finally:
            res.cleanUp()
    finally:
        for path in filepaths:
            remove(path)
        #A missing scratch file must not discard a finished alignment
        #or mask an earlier error.
        for scratch in ('pre', 'trace'):
            try:
                remove(scratch)
            except OSError:
                pass

    #Make new dict mapping original IDs; drop the '_seed_' marker found in
    #the output labels before the lookup
    new_alignment = {}
    for k, v in alignment.items():
        key = k.replace('_seed_','')
        new_alignment[aln1_int_keys[key]] = v

    return Alignment(new_alignment,MolType=moltype)
コード例 #4
0
ファイル: dotur.py プロジェクト: yatisht/pycogent
def dotur_from_alignment(aln, moltype, distance_function, params=None):
    """Returns dotur results given an alignment and distance function.
    
        - aln: An Alignment object or something that behaves like one.
            Sequences must be aligned.
        - moltype: cogent.core.moltype object.
        - distance_function: function that can be passed to distanceMatrix()
            method of SequenceCollection.  Must be able to find distance
            between two sequences.
        - params: parameters handed to the Dotur app controller.
        
        - NOTE:  This function will only return the parsed *.list file, as
            it contains the OTU identities.
            Dotur generates 23 output files, so if this is not the one you
            are looking for, check out the documentation and add the others
            to the result path.
    """
    #construct Alignment object.  This will handle unaligned sequences.
    aln = Alignment(aln, MolType=moltype)

    #need to make int map.
    int_map, int_keys = aln.getIntMap()
    #construct Alignment object from int map to use object functionality
    int_map = Alignment(int_map, MolType=moltype)
    order = sorted(int_map.Names)

    #Build distance matrix, with rows and columns in sorted name order.
    d_matrix_dict = int_map.distanceMatrix(f=distance_function)
    d_matrix_dict.RowOrder = order
    d_matrix_dict.ColOrder = order

    #Get distance matrix in list form; cells must be strings to use
    #phylipMatrix.  Real lists are built (not lazy map objects) so each row
    #can be measured and traversed more than once.
    d_matrix_list = [[str(cell) for cell in row]
                     for row in d_matrix_dict.toLists()]

    #Get phylip formatted string.
    phylip_matrix_string = phylipMatrix(rows=d_matrix_list, names=order)

    working_dir = get_tmp_filename(suffix='')
    app = Dotur(InputHandler='_input_as_multiline_string',
        WorkingDir=working_dir,params=params)

    res = app(phylip_matrix_string)

    otu_list = OtuListParser(res['List'].readlines())

    #remap sequence names (third field of every OTU record) to original IDs
    for i, otu in enumerate(otu_list):
        otu_list[i][2] = remap_seq_names(otu[2], int_keys)

    shutil.rmtree(app.WorkingDir)

    return otu_list
コード例 #5
0
    def setUp(self):
        """Create the alignments used by the Voronoi tests."""
        # Alignments built from bare sequence lists.
        self.aln1 = Alignment(['ABC', 'BCC', 'BAC'])
        self.aln5 = Alignment(['ABBA', 'ABCA', 'CBCB'])

        # Alignments built from dicts, with the sequence order pinned by Names.
        four_seqs = {
            'seq1': 'GYVGS',
            'seq2': 'GFDGF',
            'seq3': 'GYDGF',
            'seq4': 'GYQGG',
        }
        self.aln2 = Alignment(four_seqs,
                              Names=['seq1', 'seq2', 'seq3', 'seq4'])

        three_seqs = {'seq1': 'AA', 'seq2': 'AA', 'seq3': 'BB'}
        self.aln3 = Alignment(three_seqs, Names=['seq1', 'seq2', 'seq3'])

        five_seqs = {
            'seq1': 'AA',
            'seq2': 'AA',
            'seq3': 'BB',
            'seq4': 'BB',
            'seq5': 'CC',
        }
        self.aln4 = Alignment(five_seqs,
                              Names=['seq1', 'seq2', 'seq3', 'seq4', 'seq5'])
コード例 #6
0
 def setUp(self):
     """Setup for Fasta tests."""
     self.strings = ['AAAA', 'CCCC', 'gggg', 'uuuu']
     self.labels = ['1st', '2nd', '3rd', '4th']
     self.infos = ["Dog", "Cat", "Mouse", "Rat"]
     # Real lists rather than map(): a lazy map object would be consumed
     # by the zip() loop below and leave the tests with empty fixtures.
     self.sequences_with_labels = [Sequence(s) for s in self.strings]
     self.sequences_with_names = [Sequence(s) for s in self.strings]
     for l, sl, sn in zip(self.labels, self.sequences_with_labels,
                          self.sequences_with_names):
         sl.Label = l
         sn.Name = l
     # Expected FASTA renderings.
     self.fasta_no_label = '>0\nAAAA\n>1\nCCCC\n>2\ngggg\n>3\nuuuu'
     self.fasta_with_label=\
             '>1st\nAAAA\n>2nd\nCCCC\n>3rd\nGGGG\n>4th\nUUUU'
     self.fasta_with_label_lw2=\
         '>1st\nAA\nAA\n>2nd\nCC\nCC\n>3rd\nGG\nGG\n>4th\nUU\nUU'
     self.alignment_dict = {
         '1st': 'AAAA',
         '2nd': 'CCCC',
         '3rd': 'GGGG',
         '4th': 'UUUU'
     }
     self.alignment_object = Alignment(self.alignment_dict)
     # Attach a species Info to every sequence in the alignment.
     for label, info in zip(self.labels, self.infos):
         self.alignment_object.NamedSeqs[label].Info = Info(species=info)
     self.fasta_with_label_species=\
           '>1st:Dog\nAAAA\n>2nd:Cat\nCCCC\n>3rd:Mouse\nGGGG\n>4th:Rat\nUUUU'
     self.alignment_object.RowOrder = ['1st', '2nd', '3rd', '4th']
コード例 #7
0
    def test_subset_seqs_Alignment(self):
        # takeSeqs on a locally built alignment must match an alignment
        # constructed directly from the same two sequences.
        seqs = [RnaSequence(seq, Name=name) for seq, name in
                (('UCG', 'rna1'), ('YCG', 'rna2'), ('CAR', 'rna3'))]

        expected = Alignment(seqs[1:], MolType=RNA)
        full = Alignment(seqs, MolType=RNA)
        observed = full.takeSeqs(['rna2', 'rna3'])

        self.assertEqual(observed, expected)
        self.assertEqual(str(observed), str(expected))

        # Selected sequences should be in specified order?
        reversed_pick = self.aln.takeSeqs(['rna3', 'rna2'])
        forward_pick = self.aln.takeSeqs(['rna2', 'rna3'])
        self.assertNotEqual(str(reversed_pick), str(forward_pick))
コード例 #8
0
ファイル: test_profile.py プロジェクト: miklou/pycogent
    def test_randomSequence(self):
        """randomSequence: 99% of new frequencies should be within 3*SD"""
        r_num, c_num = 100,20
        num_elements = r_num*c_num
        alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        r = random([r_num,c_num])
        p = Profile(r,alpha[:c_num])
        p.normalizePositions()
        d = p.Data
        n = 1000
        
        #Test only works on normalized profile, b/c of 1-d below
        means = n*d
        three_stds = sqrt(d*(1-d)*n)*3

        a = Alignment([p.randomSequence() for x in range(n)])

        def absoluteProfile(alignment,char_order):
            """Count matrix: one row per column, one column per character."""
            # Use the argument, not the enclosing-scope alignment.
            f = alignment.columnFreqs()
            res = zeros([len(f),len(char_order)])
            for row, freq in enumerate(f):
                for i in freq:
                    col = char_order.index(i)
                    res[row, col] = freq[i]
            return res

        ap = absoluteProfile(a,p.CharOrder)
        failure = abs(ap-means) > three_stds
        # float() forces true division; integer division would floor the
        # ratio to 0 and make the check pass regardless of the data.
        failed_fraction = float(sum(sum(failure)))/num_elements
        assert failed_fraction <= 0.01
コード例 #9
0
ファイル: test_profile.py プロジェクト: miklou/pycogent
    def test_randomIndices(self):
        """randomIndices: 99% of new frequencies should be within 3*SD
        """
        r_num, c_num = 100,20
        num_elements = r_num*c_num
        r = random([r_num,c_num])
        p = Profile(r,"A"*c_num)
        p.normalizePositions()
        d = p.Data
        n = 1000
        
        #Test only works on normalized profile, b/c of 1-d below
        means = n*d
        three_stds = sqrt(d*(1-d)*n)*3
        result = [p.randomIndices() for x in range(n)]
        a = Alignment(transpose(result))

        def absoluteProfile(alignment,char_order):
            """Count matrix: one row per column, indexed by ord() of the item."""
            # Use the argument, not the enclosing-scope alignment.
            f = alignment.columnFreqs()
            res = zeros([len(f),len(char_order)])
            for row, freq in enumerate(f):
                for i in freq:
                    res[row, ord(i)] = freq[i]
            return res

        ap = absoluteProfile(a,p.CharOrder)
        failure = abs(ap-means) > three_stds
        # float() forces true division; integer division would floor the
        # ratio to 0 and make the check pass regardless of the data.
        failed_fraction = float(sum(sum(failure)))/num_elements
        assert failed_fraction <= 0.01
コード例 #10
0
    def setUp(self):
        """Load the query seqs and write the reference FASTA and tree files.

        Every file written here is recorded in self._paths_to_clean_up
        (presumably consumed by tearDown -- not visible in this block).
        """
        # create a list of files to cleanup
        self._paths_to_clean_up = []
        self._dirs_to_clean_up = []

        # load query seqs
        self.seqs = Alignment(MinimalFastaParser(QUERY_SEQS.split()))

        # generate temp filename
        tmp_dir = '/tmp'
        self.outfile = get_tmp_filename(tmp_dir)
        base_path = splitext(self.outfile)[0]

        # create and write out reference sequence file; the with-block
        # closes the handle even if the write fails
        self.outfasta = base_path + '.fasta'
        with open(self.outfasta, 'w') as fastaout:
            fastaout.write(REF_SEQS)
        self._paths_to_clean_up.append(self.outfasta)

        # create and write out starting tree file
        self.outtree = base_path + '.tree'
        with open(self.outtree, 'w') as treeout:
            treeout.write(REF_TREE)
        self._paths_to_clean_up.append(self.outtree)
コード例 #11
0
    def test_AlignmentToProfile_weighted(self):
        """AlignmentToProfile: should work when sequences are weighted
        """
        # The alignment holds plain strings (no alphabet); the weights are an
        # ordinary dict keyed by sequence name.
        seqs = {'seq1': 'TCAG', 'seq2': 'TAR-', 'seq3': 'YAG-'}
        a = Alignment(seqs, Names=['seq1', 'seq2', 'seq3'])
        w = {'seq1': 0.5, 'seq2': .25, 'seq3': .25}

        # Every character of the sequences is listed in the CharOrder, so
        # nothing is ignored and split_degenerates makes no difference: a
        # character present in the CharOrder is counted as itself.
        full_order = array([[0.75, 0, 0, 0, 0, .25, 0],
                            [0, 0.5, 0.5, 0, 0, 0, 0],
                            [0, 0.5, 0, 0.25, 0.25, 0, 0],
                            [0, 0, 0, 0.5, 0, 0, 0.5]])
        for split in (False, True):
            profile = AlnToProfile(a, DNA, char_order="TACGRY-",
                                   weights=w, split_degenerates=split)
            self.assertEqual(profile.Data.tolist(), full_order.tolist())

        # Only non-degenerate symbols in the CharOrder: degenerates are
        # split and gaps are ignored.
        plain_order = array([[0.875, 0, 0.125, 0],
                             [0, 0.5, 0.5, 0],
                             [0, 0.625, 0, 0.375],
                             [0, 0, 0, 1]])
        profile = AlnToProfile(a, DNA, char_order="TACG",
                               weights=w, split_degenerates=True)
        self.assertEqual(profile.Data.tolist(), plain_order.tolist())

        # An error is raised if all chars in an alignment column are
        # ignored; here CharOrder=AT with split_degenerates=True.
        self.assertRaises(ValueError, AlnToProfile, a, DNA,
                          char_order="AT", weights=w, split_degenerates=True)
コード例 #12
0
def align_unaligned_seqs(seqs, moltype, params=None):
    """Align seqs with Clustalw and return the result as an Alignment.

    seqs: cogent.core.alignment.SequenceCollection object, or data that can be
    used to build one.

    moltype: a MolType object.  DNA, RNA, or PROTEIN.

    params: dict of parameters to pass in to the Clustal app controller.

    Result will be a cogent.core.alignment.Alignment object labelled with
    the original sequence IDs.
    """
    # Clustalw is fed abbreviated IDs; id_lookup maps them back afterwards.
    abbreviated, id_lookup = \
        SequenceCollection(seqs, MolType=moltype).getIntMap()
    abbreviated = SequenceCollection(abbreviated, MolType=moltype)

    clustal = Clustalw(InputHandler='_input_as_multiline_string',
                       params=params)
    result = clustal(abbreviated.toFasta())

    # Parse the aligned output, then restore the original IDs.
    parsed = dict(ClustalParser(result['Align'].readlines()))
    restored = dict((id_lookup[short_id], seq)
                    for short_id, seq in parsed.items())
    aligned = Alignment(restored, MolType=moltype)

    # Remove the files produced by the app run.
    result.cleanUp()

    return aligned
コード例 #13
0
ファイル: rfam.py プロジェクト: wangdi2014/for_qiime_scripts
def load_from_clustal(data, seq_constructor=Sequence, strict=True):
    """Parse Clustal data into an Alignment or a SequenceCollection.

    data: input handed to ClustalParser.
    seq_constructor: callable applied to every parsed sequence.
    strict: passed through to ClustalParser.

    Returns an Alignment (MolType=BYTES) when at least one record was parsed
    and all sequences have the same length, otherwise a SequenceCollection.
    """
    recs = []
    for name, seq in ClustalParser(data, strict):
        recs.append((name, seq_constructor(seq)))

    # Exactly one distinct length means: not empty, and all equally long.
    distinct_lengths = set(len(seq) for _, seq in recs)
    if len(distinct_lengths) == 1:
        return Alignment(recs, MolType=BYTES)
    return SequenceCollection(recs, MolType=BYTES)
コード例 #14
0
def build_tree_from_alignment(aln, moltype, best_tree=False, params={}):
    """Returns a tree from Alignment object aln.
    
    aln: an xxx.Alignment object, or data that can be used to build one.
    
    moltype: cogent.core.moltype.MolType object; used to pick a default
    "-m" model when params does not supply one.

    best_tree: best_tree suppport is currently not implemented; passing a
    true value raises NotImplementedError.
    
    params: dict of parameters to pass in to the RAxML app controller.
    The dict is copied, so neither the caller's dict nor the shared default
    is modified.
    
    The result is the tree parsed from the RAxML 'Bootstrap' output, with
    tip names mapped back to the original sequence names.

    Raises ValueError when no "-m" is given and moltype is not DNA, RNA or
    PROTEIN.
    """
    if best_tree:
        raise NotImplementedError

    # Work on a private copy: this function writes several keys below, and
    # writing into the default dict would leak e.g. "-m" into later calls.
    params = dict(params) if params else {}

    if '-m' not in params:
        if moltype == DNA or moltype == RNA:
            # GTRMIX is no longer supported as of version 7.2.3
            # (http://www.phylo.org/tools/raxmlhpc2.html)
            params["-m"] = 'GTRGAMMA'
        elif moltype == PROTEIN:
            # NOTE(review): 'matrixName' looks like a placeholder for a real
            # substitution matrix name -- confirm against the RAxML manual.
            params["-m"] = 'PROTGAMMAmatrixName'
        else:
            raise ValueError("Moltype must be either DNA, RNA, or PROTEIN")

    if not hasattr(aln, 'toPhylip'):
        aln = Alignment(aln)
    seqs, align_map = aln.toPhylip()

    # generate temp filename for output
    params["-w"] = "/tmp/"
    params["-n"] = get_tmp_filename().split("/")[-1]
    params["-k"] = True
    params["-p"] = randint(1, 100000)
    params["-x"] = randint(1, 100000)

    raxml_app = Raxml(params=params,
                      InputHandler='_input_as_multiline_string',
                      WorkingDir=None,
                      SuppressStderr=True,
                      SuppressStdout=True)

    raxml_result = raxml_app(seqs)
    try:
        tree = DndParser(raxml_result['Bootstrap'], constructor=PhyloNode)

        # Tips carry the phylip labels; map them back to the original names.
        for node in tree.tips():
            node.Name = align_map[node.Name]
    finally:
        # Remove the result files even when parsing or renaming fails.
        raxml_result.cleanUp()

    return tree
コード例 #15
0
    def test_distance_matrix(self):
        """distance_matrix should obey Names of alignment"""
        # Names left as constructed (Names=None).
        expected_default = array([[0, 2, 2], [2, 0, 1], [2, 1, 0]])
        self.assertEqual(distance_matrix(self.aln1), expected_default)

        # Same sequences with Names reordered: the matrix must follow.
        reordered = Alignment(self.aln1.NamedSeqs)
        reordered.Names = ['seq_1', 'seq_2', 'seq_0']
        expected_reordered = array([[0, 1, 2], [1, 0, 2], [2, 2, 0]])
        self.assertEqual(distance_matrix(reordered), expected_reordered)
コード例 #16
0
def alignment_traceback(seqs, aligned_positions, word_length):
    """Build an Alignment from (name, seq) pairs and their aligned positions.

    The starts, ends and maps returned by map_traceback are scaled by
    word_length before each sequence is sliced and wrapped in an Aligned.
    """
    starts, ends, maps = map_traceback(aligned_positions)
    aligneds = [
        (name, Aligned(amap * word_length,
                       seq[start * word_length:end * word_length]))
        for start, end, amap, (name, seq) in zip(starts, ends, maps, seqs)
    ]
    return Alignment(MolType=None, data=aligneds)
コード例 #17
0
    def setUp(self):
        """Setup for Clustal tests."""
        self.unaligned_dict = {
            '1st': 'AAA',
            '2nd': 'CCCC',
            '3rd': 'GGGG',
            '4th': 'UUUU'
        }
        self.alignment_dict = {
            '1st': 'AAAA',
            '2nd': 'CCCC',
            '3rd': 'GGGG',
            '4th': 'UUUU'
        }
        #create alignment change order.
        self.alignment_object = Alignment(self.alignment_dict)
        self.alignment_order = ['2nd', '4th', '3rd', '1st']
        self.alignment_object.RowOrder = self.alignment_order

        self.clustal_with_label=\
"""CLUSTAL

1st    AAAA
2nd    CCCC
3rd    GGGG
4th    UUUU
"""
        self.clustal_with_label_lw2=\
"""CLUSTAL

1st    AA
2nd    CC
3rd    GG
4th    UU

1st    AA
2nd    CC
3rd    GG
4th    UU
"""

        self.clustal_with_label_reordered=\
"""CLUSTAL

2nd    CCCC
4th    UUUU
3rd    GGGG
1st    AAAA
"""

        self.clustal_with_label_lw2_reordered=\
"""CLUSTAL
コード例 #18
0
    def test_subset_positions_Alignment(self):
        # Taking columns 0, 1 and 5 of the shared fixture should give the
        # three-column alignment built here.
        expected = Alignment([
            RnaSequence('UCG', Name='rna1'),
            RnaSequence('YCG', Name='rna2'),
            RnaSequence('CAR', Name='rna3'),
        ], MolType=RNA)

        observed = self.aln.takePositions([0, 1, 5])
        self.assertEqual(observed, expected)
        self.assertNotEqual(observed, self.aln)
        # The string forms should match too.  Noted by the original author
        # as currently failing because sequence order is not maintained;
        # see the separate test for that.
        self.assertEqual(str(observed), str(expected))
コード例 #19
0
ファイル: pplacer.py プロジェクト: cxhernandez/pycogent
def insert_sequences_into_tree(aln, moltype, params={},
                                           write_log=True):
    """Returns a placement tree built by pplacer and guppy.

    aln: phylip-formatted alignment text; it is wrapped in StringIO and
    parsed with get_align_for_phylip.

    moltype: cogent.core.moltype.MolType object (not used by this function).

    params: dict of parameters to pass in to the Pplacer app controller.
    params["--out-dir"] is required: it receives the guppy output and, when
    write_log is true, the log file.

    write_log: when true, pplacer's stdout is written to a
    'log_pplacer_<tmpname>' file inside params["--out-dir"].

    The result is whatever build_tree_from_json_using_params returns for the
    pplacer json output.
    """

    # convert aln to phy since seq_names need fixed to run through pplacer
    new_aln = get_align_for_phylip(StringIO(aln))

    # convert aln to fasta in case it is not already a fasta file
    seqs = Alignment(new_aln).toFasta()

    pplacer_app = Pplacer(params=params,
                      InputHandler='_input_as_multiline_string',
                      WorkingDir=None,
                      SuppressStderr=False,
                      SuppressStdout=False)

    pplacer_result = pplacer_app(seqs)
    try:
        # write a log file; the with-block closes it even if the write fails
        if write_log:
            log_fp = join(params["--out-dir"], 'log_pplacer_' +
                          split(get_tmp_filename())[-1])
            with open(log_fp, 'w') as log_file:
                log_file.write(pplacer_result['StdOut'].read())

        # use guppy to convert json file into a placement tree
        guppy_params = {'tog': None}
        new_tree = build_tree_from_json_using_params(
            pplacer_result['json'].name,
            output_dir=params['--out-dir'],
            params=guppy_params)
    finally:
        # Remove the pplacer result files even when logging or guppy fails.
        pplacer_result.cleanUp()

    return new_tree
コード例 #20
0
 def likelyAncestralSeqs(self, locus=None):
     """Returns the most likely reconstructed ancestral sequences as an
     alignment.
     
     Arguments:
         - locus: a named locus, passed on to reconstructAncestralSeqs"""
     prob_array = self.reconstructAncestralSeqs(locus=locus)
     moltype = self.model.MolType
     ancestral = []
     for edge, probs in prob_array.items():
         # For each position keep the state of the largest (p, state) pair.
         states = [max((p, state) for state, p in row.items())[1]
                   for row in probs]
         ancestral.append((edge, moltype.makeSequence("".join(states))))
     return Alignment(data=ancestral, MolType=moltype)
コード例 #21
0
def LoadSeqs(filename=None, format=None, data=None, moltype=None,
            name=None, aligned=True, label_to_name=None, parser_kw={},
            constructor_kw={}, **kw):
    """Build an alignment or sequence collection from a file or from data.
    
    Exactly one of filename and data must be supplied.
    
    Arguments:
    - filename: name of the sequence file
    - format: format of the sequence file; only allowed with filename
    - data: optional explicit provision of sequences
    - moltype: the MolType, eg DNA, PROTEIN
    - name: passed to the constructor as Name
    - aligned: a true, non-callable value means the sequences are already
      aligned and of equal length, and an Alignment is returned. A false
      value returns a SequenceCollection. A callable is used as the
      constructor itself (e.g. DenseAlignment or CodonAlignment).
    - label_to_name: function for converting original name into another
      name. Default behavior is to preserve the original FASTA label and
      comment. 
      To remove all FASTA label comments, and pass in only the label, pass in: 
            label_to_name=lambda x: x.split()[0]
      To look up names in a dict, pass in:
            label_to_name = lambda x: d.get(x, default_name)
      ...where d is a dict that's in scope, and default_name is what you want
      to assign any sequence that isn't in the dict.
    - parser_kw: extra keyword arguments for FromFilenameParser
    - constructor_kw: extra keyword arguments for the chosen constructor
    
    If format is None, will attempt to infer format from the filename
    suffix. If label_to_name is None, will attempt to infer correct
    conversion from the format.
    """
    
    if filename is not None:
        assert data is None, (filename, data)
        data = list(FromFilenameParser(filename, format, **parser_kw))
    else:
        assert data is not None
        assert format is None
        assert not kw, kw

    # the following is a temp hack until we have the load API sorted out.
    if not aligned:
        # generic case: return SequenceCollection
        return SequenceCollection(data, MolType=moltype, Name=name,
            label_to_name=label_to_name, **constructor_kw)

    # A callable 'aligned' is the constructor; any other true value
    # selects Alignment.
    if hasattr(aligned, '__call__'):
        constructor = aligned
    else:
        constructor = Alignment
    return constructor(data=data, MolType=moltype, Name=name,
        label_to_name=label_to_name, **constructor_kw)
コード例 #22
0
ファイル: rnaalifold.py プロジェクト: mikerobeson/pycogent
def rnaalifold_from_alignment(aln, moltype=RNA, params=None):
    """Returns the parsed RNAalifold output for an alignment.

    aln: Alignment object, or data that can be used to build one.

    moltype: currently ignored -- the alignment is always built with
    MolType=RNA.  NOTE(review): confirm whether this is intentional.

    params: parameters handed to the RNAalifold app controller.

    The result is whatever MinimalRnaalifoldParser returns for the app's
    stdout.  RNAalifold is run on the abbreviated sequence IDs.
    """
    #Create Alignment object.  Object will handle if seqs are unaligned.
    aln = Alignment(aln, MolType=RNA)
    int_map, int_keys = aln.getIntMap()

    app = RNAalifold(WorkingDir='/tmp',
        InputHandler='_input_as_multiline_string',params=params)
    res = app(clustal_from_alignment(int_map))
    try:
        pairs_list = MinimalRnaalifoldParser(res['StdOut'].readlines())
    finally:
        #Remove the result files even when the parser raises.
        res.cleanUp()

    return pairs_list
コード例 #23
0
ファイル: test_util.py プロジェクト: cxhernandez/pycogent
 def test_AlignmentToProfile_ignore(self):
     """AlignmentToProfile: should raise an error if too many chars ignored
     """
     #Same conditions as the basic test, but in the last column 
     #there are only gaps, so normalization will fail at that position.
     #No expected profile is needed: the call must raise.
     a = Alignment({'a':RnaSequence('UCAGRYN-'),'b':RnaSequence('ACUGAAA-')})
     self.assertRaises(ValueError,AlnToProfile,a,alphabet=RNA,\
         split_degenerates=True)
コード例 #24
0
def get_align_for_phylip(data, id_map=None):
    """
    Convenience function to return an alignment object from phylip data

    data: sequence of lines in phylip format (an open file, list, etc)
    id_map: optional id mapping from external ids to phylip labels; it is
        handed straight to MinimalPhylipParser

    returns Alignment object
    """
    # Materialise every record the parser yields before building the result.
    records = list(MinimalPhylipParser(data, id_map))
    return Alignment(records)
コード例 #25
0
    def test_insert_sequences_into_tree(self):
        """Inserts sequences into Tree using params - test handles tree-insertion"""

        # generate temp filename for output
        outfname = splitext(get_tmp_filename('/tmp/'))[0]

        # create starting tree
        outtreefname = outfname + '.tre'
        with open(outtreefname, 'w') as outtree:
            outtree.write(REF_TREE)

        # The finally clause removes the starting tree even when an
        # assertion below fails.
        try:
            # set params for tree-insertion
            params = {}
            params["-w"] = "/tmp/"
            params["-n"] = get_tmp_filename().split("/")[-1]
            params["-f"] = 'v'
            params["-t"] = outtreefname
            params["-m"] = 'GTRGAMMA'

            aln_ref_query = get_align_for_phylip(
                StringIO(PHYLIP_FILE_DNA_REF_QUERY))
            aln = Alignment(aln_ref_query)
            seqs, align_map = aln.toPhylip()

            tree = insert_sequences_into_tree(seqs,
                                              DNA,
                                              params=params,
                                              write_log=False)

            # Strip the 'QUERY___' prefix and the '___<digits>' suffix from
            # the tip names, then map them back to the original names.
            for node in tree.tips():
                removed_query_str = re.sub('QUERY___', '', str(node.Name))
                new_node_name = re.sub(r'___\d+', '', str(removed_query_str))
                if new_node_name in align_map:
                    node.Name = align_map[new_node_name]

            self.assertTrue(isinstance(tree, PhyloNode))
            self.assertEqual(tree.getNewick(with_distances=True), RESULT_TREE)
            self.assertEqual(len(tree.tips()), 7)
            self.assertRaises(NotImplementedError, build_tree_from_alignment, \
                             self.align1, RNA, True)
        finally:
            remove(outtreefname)
コード例 #26
0
def align_unaligned_seqs(seqs,moltype,params=None,accurate=False):
    """Aligns unaligned sequences with Mafft and returns an Alignment.

    seqs: SequenceCollection object, or data that can be used to build one.
    moltype: MolType object; its upper-cased label selects the Mafft
        moltype flag through MOLTYPE_MAP.
    params: parameters handed to the Mafft app controller.
    accurate: boolean. if True, '--globalpair' is switched on and
        '--maxiterate' is set to 1000.
    """
    #Mafft is fed abbreviated IDs; id_lookup maps them back afterwards
    collection = SequenceCollection(seqs,MolType=moltype)
    abbreviated, id_lookup = collection.getIntMap()
    abbreviated = SequenceCollection(abbreviated,MolType=moltype)

    mafft = Mafft(InputHandler='_input_as_multiline_string',params=params)
    settings = mafft.Parameters

    #Turn on correct moltype
    settings[MOLTYPE_MAP[moltype.label.upper()]].on()

    #Do not report progress
    settings['--quiet'].on()

    #More accurate alignment, sacrificing performance.
    if accurate:
        settings['--globalpair'].on()
        settings['--maxiterate'].Value=1000

    result = mafft(abbreviated.toFasta())

    #Parse the aligned FASTA output, then restore the original IDs
    parsed = dict(MinimalFastaParser(result['StdOut'].readlines()))
    restored = dict((id_lookup[short_id], seq)
                    for short_id, seq in parsed.items())
    aligned = Alignment(restored,MolType=moltype)

    #Remove the files produced by the app run
    result.cleanUp()

    return aligned
コード例 #27
0
def insert_sequences_into_tree(aln, moltype, params={}):
    """Returns a tree from placement of sequences

    aln: phylip-formatted alignment text (it is wrapped in StringIO).
    moltype: not used by this function.
    params: parameters handed to the ParsInsert app controller.
    """
    # Parse as phylip first since seq_names need fixed to run through
    # parsinsert, then re-emit as FASTA in case it is not already one.
    phylip_aln = get_align_for_phylip(StringIO(aln))
    fasta_text = Alignment(phylip_aln).toFasta()

    result = ParsInsert(params=params)(fasta_text)

    # Parse the tree before the result files are removed.
    placed_tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
    result.cleanUp()

    return placed_tree
コード例 #28
0
    def getAlignment(self,
                     feature_types=None,
                     where_feature=None,
                     omit_redundant=True):
        """Returns a DNA Alignment of the members' aligned sequences, or
        None when no member provides one.

        Arguments:
            - feature_types: annotations to be applied to the returned
              sequences
            - where_feature: passed with feature_types to each member's
              getAnnotatedAligned
            - omit_redundant: exclude redundant gap positions"""
        seqs = []

        for member in self.Members:
            if feature_types:
                seq = member.getAnnotatedAligned(feature_types, where_feature)
            else:
                seq = member.AlignedSeq
            if seq is None:
                continue
            name = seq.Name

            if self._rc:  # names should reflect change to strand
                loc = member.Location.copy()
                loc.Strand *= -1
                name = str(loc)

            seq.Name = seq.data.Name = name
            seqs.append((name, seq))

        # seqs is always a list, so test for emptiness rather than None.
        if not seqs:
            return None

        aln = Alignment(data=seqs, MolType=DNA)

        if self._rc:
            aln = aln.rc()

        if omit_redundant:
            # drop the columns that consist solely of gaps
            aln = aln.filtered(lambda x: set(x) != set('-'))

        return aln
コード例 #29
0
ファイル: test_util.py プロジェクト: cxhernandez/pycogent
 def test_AlignmentToProfile_basic(self):
     """AlignmentToProfile: should work under basic conditions
     """
     # Conditions covered here:
     #  - the sequences are unweighted
     #  - the alphabet is that of the sequences (RNA)
     #  - degenerate bases are split up and gaps are ignored
     #  - every column has at least one character in the CharOrder
     seqs = {'a': RnaSequence('UCAGRYN-'), 'b': RnaSequence('ACUGAAAA')}
     aln = Alignment(seqs)
     expected = array([
         [.5, 0, .5, 0],
         [0, 1, 0, 0],
         [.5, 0, .5, 0],
         [0, 0, 0, 1],
         [0, 0, .75, .25],
         [.25, .25, .5, 0],
         [.125, .125, .625, .125],
         [0, 0, 1, 0],
     ])
     profile = AlnToProfile(aln, alphabet=RNA, split_degenerates=True)
     self.assertEqual(profile.Data.tolist(), expected.tolist())
コード例 #30
0
def align_unaligned_seqs(seqs, moltype, params=None):
    """Returns an Alignment object from seqs.

    seqs: SequenceCollection object, or data that can be used to build one.
    
    moltype: a MolType object.  DNA, RNA, or PROTEIN.

    params: dict of parameters to pass in to the Muscle app controller.
    The dict is copied before '-out' is added, so the caller's dict is left
    untouched.
    
    Result will be an Alignment object.
    """
    #Private copy: '-out' is added below and must not leak to the caller
    params = dict(params) if params else {}
    #create SequenceCollection object from seqs
    seq_collection = SequenceCollection(seqs, MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs
    int_map, int_keys = seq_collection.getIntMap()
    #Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map, MolType=moltype)
    #get temporary filename
    params['-out'] = get_tmp_filename()
    #Create Muscle app.
    app = Muscle(InputHandler='_input_as_multiline_string',
                 params=params)
    #Get results using int_map as input to app
    res = app(int_map.toFasta())
    #Get alignment as dict out of results
    alignment = dict(MinimalFastaParser(res['MuscleOut'].readlines()))
    #Make new dict mapping original IDs
    new_alignment = {}
    for k, v in alignment.items():
        new_alignment[int_keys[k]] = v
    #Create an Alignment object from alignment dict
    new_alignment = Alignment(new_alignment, MolType=moltype)
    #Clean up
    res.cleanUp()

    return new_alignment