コード例 #1
0
    def setUp(self):
        """Build the RNA sequence and alignment fixtures shared by the tests.

        The strings 'UCAGGG', 'YCU-RG' and 'CAA-NR' are turned into
        RnaSequence objects and into ModelSequence objects (using
        RNA.Alphabets.DegenGapped), once named 'rna1'..'rna3' and once
        without names.  The RnaSequence groups are wrapped in Alignment
        objects and the ModelSequence groups in DenseAlignment objects.
        """
        degen_gapped = RNA.Alphabets.DegenGapped

        # named sequences
        self.rna1 = RnaSequence('UCAGGG', Name='rna1')
        self.rna2 = RnaSequence('YCU-RG', Name='rna2')
        self.rna3 = RnaSequence('CAA-NR', Name='rna3')
        self.model1 = ModelSequence('UCAGGG', Name='rna1',
                                    Alphabet=degen_gapped)
        self.model2 = ModelSequence('YCU-RG', Name='rna2',
                                    Alphabet=degen_gapped)
        self.model3 = ModelSequence('CAA-NR', Name='rna3',
                                    Alphabet=degen_gapped)

        named_rnas = [self.rna1, self.rna2, self.rna3]
        named_models = [self.model1, self.model2, self.model3]
        self.aln = Alignment(named_rnas, MolType=RNA)
        self.da = DenseAlignment(named_models, MolType=RNA,
                                 Alphabet=degen_gapped)

        # seqs no name
        self.nn_rna1 = RnaSequence('UCAGGG')
        self.nn_rna2 = RnaSequence('YCU-RG')
        self.nn_rna3 = RnaSequence('CAA-NR')

        self.nn_model1 = ModelSequence('UCAGGG', Alphabet=degen_gapped)
        self.nn_model2 = ModelSequence('YCU-RG', Alphabet=degen_gapped)
        self.nn_model3 = ModelSequence('CAA-NR', Alphabet=degen_gapped)

        unnamed_rnas = [self.nn_rna1, self.nn_rna2, self.nn_rna3]
        unnamed_models = [self.nn_model1, self.nn_model2, self.nn_model3]
        self.nn_aln = Alignment(unnamed_rnas, MolType=RNA)
        self.nn_da = DenseAlignment(unnamed_models, MolType=RNA,
                                    Alphabet=degen_gapped)
コード例 #2
0
    def setUp(self):
        """Load the query alignment and write the reference files.

        REF_SEQS and REF_TREE are written to '.fasta' and '.tree' files that
        share the stem of a temp filename generated under /tmp.  Both paths
        are appended to self._paths_to_clean_up.
        """
        # create a list of files to cleanup
        self._paths_to_clean_up = []
        self._dirs_to_clean_up = []

        # load query seqs
        self.seqs = Alignment(MinimalFastaParser(QUERY_SEQS.split()))

        # generate temp filename
        tmp_dir = '/tmp'
        self.outfile = get_tmp_filename(tmp_dir)
        stem = splitext(self.outfile)[0]

        # create and write out reference sequence file; the with-block
        # closes the handle even if the write raises
        self.outfasta = stem + '.fasta'
        with open(self.outfasta, 'w') as fastaout:
            fastaout.write(REF_SEQS)
        self._paths_to_clean_up.append(self.outfasta)

        # create and write out starting tree file
        self.outtree = stem + '.tree'
        with open(self.outtree, 'w') as treeout:
            treeout.write(REF_TREE)
        self._paths_to_clean_up.append(self.outtree)
コード例 #3
0
ファイル: clustalw.py プロジェクト: pombredanne/pycogent-1
def add_seqs_to_alignment(seqs, aln, moltype, params=None):
    """Returns an Alignment object from seqs and existing Alignment.

    seqs: a cogent.core.alignment.SequenceCollection object, or data that can
    be used to build one.

    aln: a cogent.core.alignment.Alignment object, or data that can be used to
    build one

    moltype: MolType passed to every SequenceCollection/Alignment built here.

    params: dict of parameters to pass in to the Clustal app controller.

    The temporary profile files and the Clustalw result are cleaned up even
    when running Clustalw or parsing its output raises.
    """
    # Build a SequenceCollection from seqs and relabel it with abbreviated
    # IDs; seq_int_keys maps abbreviated ID -> original ID.
    seq_collection = SequenceCollection(seqs, MolType=moltype)
    seq_int_map, seq_int_keys = seq_collection.getIntMap()
    seq_int_map = SequenceCollection(seq_int_map, MolType=moltype)

    # Same for the existing alignment, using the "seqn_" prefix for its
    # abbreviated IDs.
    aln = Alignment(aln, MolType=moltype)
    aln_int_map, aln_int_keys = aln.getIntMap(prefix="seqn_")
    aln_int_map = Alignment(aln_int_map, MolType=moltype)

    # One lookup table covering both inputs.
    seq_int_keys.update(aln_int_keys)

    # Create Clustalw app.
    app = Clustalw(InputHandler="_input_as_multiline_string", params=params,
                   SuppressStderr=True)
    app.Parameters["-align"].off()
    app.Parameters["-infile"].off()
    app.Parameters["-sequences"].on()

    temp_paths = []
    res = None
    try:
        # Existing alignment goes in as profile1.
        profile1_path = app._tempfile_as_multiline_string(
            aln_int_map.toFasta())
        temp_paths.append(profile1_path)
        app.Parameters["-profile1"].on(profile1_path)

        # New sequences go in as profile2.
        profile2_path = app._tempfile_as_multiline_string(
            seq_int_map.toFasta())
        temp_paths.append(profile2_path)
        app.Parameters["-profile2"].on(profile2_path)

        # All input is supplied through parameters, so the app is called
        # without data.
        res = app()

        # Get alignment as dict out of results
        alignment = dict(ClustalParser(res["Align"].readlines()))

        # Map the abbreviated IDs back to the original IDs.
        new_alignment = dict((seq_int_keys[k], v)
                             for k, v in alignment.items())
        return Alignment(new_alignment, MolType=moltype)
    finally:
        # Clean up, whether or not the run succeeded.
        if res is not None:
            res.cleanUp()
        for path in temp_paths:
            remove(path)
コード例 #4
0
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Returns a tree built by the RAxML app controller from alignment aln.

    aln: an xxx.Alignment object, or data that can be used to build one.

    moltype: cogent.core.moltype.MolType object; used to pick a default
    model when params has no '-m' entry.

    best_tree: best_tree support is currently not implemented; a true value
    raises NotImplementedError.

    params: dict of parameters to pass in to the RAxML app controller.  The
    dict is copied, so neither the caller's dict nor a shared default is
    modified by the options set here.

    The result is the tree parsed by DndParser (constructor=PhyloNode) from
    the 'Bootstrap' result, with tip names mapped back through the name map
    returned by aln.toPhylip().

    Raises ValueError if '-m' is missing and moltype is not DNA, RNA or
    PROTEIN.
    """
    if best_tree:
        raise NotImplementedError

    # Private copy: '-m', '-w', '-n', ... set below must not persist between
    # calls or leak into the caller's dict.
    params = dict(params) if params else {}

    if '-m' not in params:
        if moltype == DNA or moltype == RNA:
            # Previously 'GTRMIX'.  In version 7.2.3, GTRMIX is no longer
            # supported but says GTRCAT behaves like GTRMIX
            # (http://www.phylo.org/tools/raxmlhpc2.html)
            params["-m"] = 'GTRGAMMA'
        elif moltype == PROTEIN:
            # NOTE(review): 'PROTGAMMAmatrixName' looks like a placeholder
            # rather than a concrete model name -- confirm against RAxML.
            params["-m"] = 'PROTGAMMAmatrixName'
        else:
            raise ValueError("Moltype must be either DNA, RNA, or PROTEIN")

    if not hasattr(aln, 'toPhylip'):
        aln = Alignment(aln)
    seqs, align_map = aln.toPhylip()

    # generate temp filename for output
    params["-w"] = "/tmp/"
    params["-n"] = get_tmp_filename().split("/")[-1]
    params["-k"] = True
    # fresh random integers for -p and -x on every call
    params["-p"] = randint(1, 100000)
    params["-x"] = randint(1, 100000)

    raxml_app = Raxml(params=params,
                      InputHandler='_input_as_multiline_string',
                      WorkingDir=None,
                      SuppressStderr=True,
                      SuppressStdout=True)

    raxml_result = raxml_app(seqs)
    try:
        tree = DndParser(raxml_result['Bootstrap'], constructor=PhyloNode)

        # restore the original sequence names on the tips
        for node in tree.tips():
            node.Name = align_map[node.Name]
    finally:
        # remove the result files even if parsing or renaming failed
        raxml_result.cleanUp()

    return tree
コード例 #5
0
ファイル: raxml_v730.py プロジェクト: cxhernandez/pycogent
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Returns a tree built by the RAxML app controller from alignment aln.

    aln: an xxx.Alignment object, or data that can be used to build one.

    moltype: cogent.core.moltype.MolType object; used to pick a default
    model when params has no '-m' entry.

    best_tree: best_tree support is currently not implemented; a true value
    raises NotImplementedError.

    params: dict of parameters to pass in to the RAxML app controller.  The
    dict is copied, so neither the caller's dict nor a shared default is
    modified by the options set here.

    The result is the tree parsed by DndParser (constructor=PhyloNode) from
    the 'Bootstrap' result, with tip names mapped back through the name map
    returned by aln.toPhylip().

    Raises ValueError if '-m' is missing and moltype is not DNA, RNA or
    PROTEIN.
    """
    if best_tree:
        raise NotImplementedError

    # Private copy: '-m', '-w', '-n', ... set below must not persist between
    # calls or leak into the caller's dict.
    if params:
        params = dict(params)
    else:
        params = {}

    if '-m' not in params:
        if moltype == DNA or moltype == RNA:
            # Previously 'GTRMIX'.  In version 7.2.3, GTRMIX is no longer
            # supported but says GTRCAT behaves like GTRMIX
            # (http://www.phylo.org/tools/raxmlhpc2.html)
            params["-m"] = 'GTRGAMMA'
        elif moltype == PROTEIN:
            # NOTE(review): 'PROTGAMMAmatrixName' looks like a placeholder
            # rather than a concrete model name -- confirm against RAxML.
            params["-m"] = 'PROTGAMMAmatrixName'
        else:
            raise ValueError("Moltype must be either DNA, RNA, or PROTEIN")

    if not hasattr(aln, 'toPhylip'):
        aln = Alignment(aln)
    seqs, align_map = aln.toPhylip()

    # generate temp filename for output
    params["-w"] = "/tmp/"
    params["-n"] = get_tmp_filename().split("/")[-1]
    params["-k"] = True
    # fresh random integers for -p and -x on every call
    params["-p"] = randint(1, 100000)
    params["-x"] = randint(1, 100000)

    ih = '_input_as_multiline_string'

    raxml_app = Raxml(params=params,
                      InputHandler=ih,
                      WorkingDir=None,
                      SuppressStderr=True,
                      SuppressStdout=True)

    raxml_result = raxml_app(seqs)
    try:
        tree = DndParser(raxml_result['Bootstrap'], constructor=PhyloNode)

        # restore the original sequence names on the tips
        for node in tree.tips():
            node.Name = align_map[node.Name]
    finally:
        # remove the result files even if parsing or renaming failed
        raxml_result.cleanUp()

    return tree
コード例 #6
0
    def test_distance_matrix(self):
        """distance_matrix should obey Names of alignment"""
        # Names left as they are on self.aln1
        expected_default = array([[0, 2, 2], [2, 0, 1], [2, 1, 0]])
        observed_default = distance_matrix(self.aln1)
        self.assertEqual(observed_default, expected_default)

        # Same sequences, but with the Names order changed
        reordered = Alignment(self.aln1.NamedSeqs)
        reordered.Names = ['seq_1', 'seq_2', 'seq_0']
        expected_reordered = array([[0, 1, 2], [1, 0, 2], [2, 2, 0]])
        observed_reordered = distance_matrix(reordered)
        self.assertEqual(observed_reordered, expected_reordered)
コード例 #7
0
ファイル: test_util.py プロジェクト: miklou/pycogent
 def test_distance_matrix(self):
     """distance_matrix should obey Names of alignment"""
     # Names left as they are on self.aln1
     expected_default = array([[0, 2, 2], [2, 0, 1], [2, 1, 0]])
     observed_default = distance_matrix(self.aln1)
     self.assertEqual(observed_default, expected_default)

     # Same sequences, but with the Names order changed
     reordered = Alignment(self.aln1.NamedSeqs)
     reordered.Names = ['seq_1', 'seq_2', 'seq_0']
     expected_reordered = array([[0, 1, 2], [1, 0, 2], [2, 2, 0]])
     observed_reordered = distance_matrix(reordered)
     self.assertEqual(observed_reordered, expected_reordered)
コード例 #8
0
ファイル: dotur.py プロジェクト: GavinHuttley/pycogent
def dotur_from_alignment(aln,moltype,distance_function,params=None):
    """Returns dotur results given an alignment and distance function.

        - aln: An Alignment object or something that behaves like one.
            Sequences must be aligned.
        - moltype: cogent.core.moltype object.
        - distance_function: function that can be passed to distanceMatrix()
            method of SequenceCollection.  Must be able to find distance
            between two sequences.
        - params: parameters passed to the Dotur app controller.

        - NOTE:  This function will only return the parsed *.list file, as
            it contains the OTU identities.
            Dotur generates 23 output files, so if this is not the one you
            are looking for, check out the documentation and add the others
            to the result path.

        The app's working directory is removed before returning, also when
        running Dotur or parsing its output raises.
    """
    #construct Alignment object.  This will handle unaligned sequences.
    aln = Alignment(aln, MolType=moltype)

    #need to make int map; int_keys is used below to restore the names.
    int_map, int_keys = aln.getIntMap()
    #construct Alignment object from int map to use object functionality
    int_map = Alignment(int_map, MolType=moltype)
    order = sorted(int_map.Names)

    #Build distance matrix, with rows and columns in sorted name order.
    d_matrix_dict = int_map.distanceMatrix(f=distance_function)
    d_matrix_dict.RowOrder=order
    d_matrix_dict.ColOrder=order

    #Get distance matrix in list form; entries must be strings to use
    #phylipMatrix.  Real lists are built so this also holds where map()
    #returns an iterator.
    d_matrix_list = [[str(value) for value in row]
                     for row in d_matrix_dict.toLists()]

    #Get phylip formatted string.
    phylip_matrix_string = phylipMatrix(rows=d_matrix_list,names=order)

    working_dir = get_tmp_filename(suffix='')
    app = Dotur(InputHandler='_input_as_multiline_string',
        WorkingDir=working_dir,params=params)

    try:
        res = app(phylip_matrix_string)

        otu_list = OtuListParser(res['List'].readlines())

        #remap sequence names
        for i,otu in enumerate(otu_list):
            otu_list[i][2]=remap_seq_names(otu[2], int_keys)
    finally:
        #always drop the working directory, even on failure
        shutil.rmtree(app.WorkingDir)

    return otu_list
コード例 #9
0
    def setUp(self):
        """Set up for Voronoi tests"""
        # alignments built from plain lists of strings
        self.aln1 = Alignment(['ABC', 'BCC', 'BAC'])

        # alignments built from dicts, with an explicit Names order
        seqs2 = {'seq1':'GYVGS','seq2':'GFDGF','seq3':'GYDGF',
                 'seq4':'GYQGG'}
        self.aln2 = Alignment(seqs2, Names=['seq1','seq2','seq3','seq4'])

        seqs3 = {'seq1':'AA', 'seq2':'AA', 'seq3':'BB'}
        self.aln3 = Alignment(seqs3, Names=['seq1','seq2','seq3'])

        seqs4 = {'seq1':'AA', 'seq2':'AA', 'seq3':'BB',
                 'seq4':'BB','seq5':'CC'}
        self.aln4 = Alignment(seqs4,
                              Names=['seq1','seq2','seq3','seq4','seq5'])

        self.aln5 = Alignment(['ABBA', 'ABCA', 'CBCB'])
コード例 #10
0
ファイル: mafft.py プロジェクト: ElDeveloper/brokit
def align_two_alignments(aln1, aln2, moltype, params=None):
    """Returns an Alignment object from two existing Alignments.

    aln1, aln2: cogent.core.alignment.Alignment objects, or data that can be
    used to build them.
        - Mafft profile alignment only works with aligned sequences. Alignment
        object used to handle unaligned sequences.

    moltype: MolType passed to every Alignment built here.

    params: dict of parameters to pass in to the Mafft app controller.

    Temporary input files, the app result and the 'pre'/'trace' files in the
    current directory are cleaned up even when mafft-profile or the parser
    raises; a scratch file that does not exist is skipped.
    """
    #create Alignment object from aln1
    aln1 = Alignment(aln1,MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs
    aln1_int_map, aln1_int_keys = aln1.getIntMap()
    #Create Alignment from int_map.
    aln1_int_map = Alignment(aln1_int_map,MolType=moltype)

    #create Alignment object from aln2
    aln2 = Alignment(aln2,MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs, using the
    #'seqn_' prefix for the abbreviated IDs
    aln2_int_map, aln2_int_keys = aln2.getIntMap(prefix='seqn_')
    #Create Alignment from int_map.
    aln2_int_map = Alignment(aln2_int_map,MolType=moltype)

    #Update aln1_int_keys with aln2_int_keys
    aln1_int_keys.update(aln2_int_keys)

    #Create Mafft app, switched to the profile-alignment command.
    app = Mafft(InputHandler='_input_as_paths',
        params=params,
        SuppressStderr=False)
    app._command = 'mafft-profile'

    temp_paths = []
    res = None
    try:
        aln1_path = app._tempfile_as_multiline_string(aln1_int_map.toFasta())
        temp_paths.append(aln1_path)
        aln2_path = app._tempfile_as_multiline_string(aln2_int_map.toFasta())
        temp_paths.append(aln2_path)

        #Get results using the two temp files as input to app
        res = app([aln1_path,aln2_path])

        #Get alignment as dict out of results
        alignment = dict(parse_fasta(res['StdOut']))

        #Make new dict mapping original IDs; '_seed_' is stripped from the
        #labels before the lookup
        new_alignment = {}
        for k,v in alignment.items():
            key = k.replace('_seed_','')
            new_alignment[aln1_int_keys[key]]=v
        #Create an Alignment object from alignment dict
        return Alignment(new_alignment,MolType=moltype)
    finally:
        #Clean up
        if res is not None:
            res.cleanUp()
        #Best-effort removal: a missing scratch file must not mask the
        #result or the original error.
        #NOTE(review): 'pre' and 'trace' are presumably left in the current
        #directory by mafft-profile -- confirm.
        for path in temp_paths + ['pre', 'trace']:
            try:
                remove(path)
            except OSError:
                pass
コード例 #11
0
ファイル: clustalw.py プロジェクト: jairideout/brokit
def align_two_alignments(aln1, aln2, moltype, params=None):
    """Returns an Alignment object from two existing Alignments.

    aln1, aln2: cogent.core.alignment.Alignment objects, or data that can be
    used to build them.

    moltype: MolType passed to every Alignment built here.

    params: dict of parameters to pass in to the Clustal app controller.

    The temporary profile files and the Clustalw result are cleaned up even
    when running Clustalw or parsing its output raises.
    """
    #create Alignment object from aln1
    aln1 = Alignment(aln1,MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs
    aln1_int_map, aln1_int_keys = aln1.getIntMap()
    #Create Alignment from int_map.
    aln1_int_map = Alignment(aln1_int_map,MolType=moltype)

    #create Alignment object from aln2
    aln2 = Alignment(aln2,MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs, using the
    #'seqn_' prefix for the abbreviated IDs
    aln2_int_map, aln2_int_keys = aln2.getIntMap(prefix='seqn_')
    #Create Alignment from int_map.
    aln2_int_map = Alignment(aln2_int_map,MolType=moltype)

    #Update aln1_int_keys with aln2_int_keys
    aln1_int_keys.update(aln2_int_keys)

    #Create Clustalw app.
    app = Clustalw(InputHandler='_input_as_multiline_string',
        params=params,
        SuppressStderr=True)
    app.Parameters['-align'].off()
    app.Parameters['-infile'].off()
    app.Parameters['-profile'].on()

    temp_paths = []
    res = None
    try:
        #Add aln1_int_map as profile1
        profile1_path = app._tempfile_as_multiline_string(
            aln1_int_map.toFasta())
        temp_paths.append(profile1_path)
        app.Parameters['-profile1'].on(profile1_path)

        #Add aln2_int_map as profile2
        profile2_path = app._tempfile_as_multiline_string(
            aln2_int_map.toFasta())
        temp_paths.append(profile2_path)
        app.Parameters['-profile2'].on(profile2_path)

        #All input is supplied through parameters, so the app is called
        #without data.
        res = app()

        #Get alignment as dict out of results
        alignment = dict(ClustalParser(res['Align'].readlines()))

        #Make new dict mapping original IDs
        new_alignment = dict((aln1_int_keys[k], v)
                             for k,v in alignment.items())
        #Create an Alignment object from alignment dict
        return Alignment(new_alignment,MolType=moltype)
    finally:
        #Clean up, whether or not the run succeeded.
        if res is not None:
            res.cleanUp()
        for path in temp_paths:
            remove(path)
コード例 #12
0
ファイル: pplacer.py プロジェクト: biocore/burrito-fillings
def insert_sequences_into_tree(aln, moltype, params=None,
                                           write_log=True):
    """Returns a placement tree for aln, built with pplacer and guppy.

    aln: alignment text; it is wrapped in StringIO and handed to
    get_align_for_phylip.

    moltype: cogent.core.moltype.MolType object (accepted for interface
    compatibility; not used by this function).

    params: dict of parameters to pass in to the Pplacer app controller.
    It must contain '--out-dir', which is used for the log file and for the
    guppy output.

    write_log: when true, pplacer's StdOut is written to a 'log_pplacer_*'
    file in the '--out-dir' directory.

    The result is whatever build_tree_from_json_using_params returns for
    pplacer's json output.

    Raises KeyError if params has no '--out-dir' entry; this is checked
    before pplacer is run.
    """
    if params is None:
        params = {}

    # Fail fast: the output directory is needed below no matter what, so do
    # not run pplacer first only to hit the KeyError afterwards.
    out_dir = params["--out-dir"]

    # convert aln to phy since seq_names need fixed to run through pplacer
    new_aln = get_align_for_phylip(StringIO(aln))

    # convert aln to fasta in case it is not already a fasta file
    aln2 = Alignment(new_aln)
    seqs = aln2.toFasta()

    ih = '_input_as_multiline_string'

    pplacer_app = Pplacer(params=params,
                      InputHandler=ih,
                      WorkingDir=None,
                      SuppressStderr=False,
                      SuppressStdout=False)

    pplacer_result = pplacer_app(seqs)
    try:
        # write a log file
        if write_log:
            log_fp = join(out_dir, 'log_pplacer_' +
                          split(get_tmp_filename())[-1])
            with open(log_fp, 'w') as log_file:
                log_file.write(pplacer_result['StdOut'].read())

        # use guppy to convert json file into a placement tree
        guppy_params = {'tog': None}

        new_tree = build_tree_from_json_using_params(
            pplacer_result['json'].name,
            output_dir=out_dir,
            params=guppy_params)
    finally:
        # remove pplacer's result files even if logging or guppy failed
        pplacer_result.cleanUp()

    return new_tree
コード例 #13
0
ファイル: pplacer.py プロジェクト: cxhernandez/pycogent
def insert_sequences_into_tree(aln, moltype, params=None,
                                           write_log=True):
    """Returns a placement tree for aln, built with pplacer and guppy.

    aln: alignment text; it is wrapped in StringIO and handed to
    get_align_for_phylip.

    moltype: cogent.core.moltype.MolType object (accepted for interface
    compatibility; not used by this function).

    params: dict of parameters to pass in to the Pplacer app controller.
    It must contain '--out-dir', which is used for the log file and for the
    guppy output.

    write_log: when true, pplacer's StdOut is written to a 'log_pplacer_*'
    file in the '--out-dir' directory.

    The result is whatever build_tree_from_json_using_params returns for
    pplacer's json output.

    Raises KeyError if params has no '--out-dir' entry; this is checked
    before pplacer is run.
    """
    if params is None:
        params = {}

    # Fail fast: the output directory is needed below no matter what, so do
    # not run pplacer first only to hit the KeyError afterwards.
    out_dir = params["--out-dir"]

    # convert aln to phy since seq_names need fixed to run through pplacer
    new_aln = get_align_for_phylip(StringIO(aln))

    # convert aln to fasta in case it is not already a fasta file
    aln2 = Alignment(new_aln)
    seqs = aln2.toFasta()

    pplacer_app = Pplacer(params=params,
                      InputHandler='_input_as_multiline_string',
                      WorkingDir=None,
                      SuppressStderr=False,
                      SuppressStdout=False)

    pplacer_result = pplacer_app(seqs)
    try:
        # write a log file
        if write_log:
            log_name = 'log_pplacer_' + split(get_tmp_filename())[-1]
            log_fp = join(out_dir, log_name)
            with open(log_fp, 'w') as log_file:
                log_file.write(pplacer_result['StdOut'].read())

        # use guppy to convert json file into a placement tree
        guppy_params = {'tog': None}

        new_tree = build_tree_from_json_using_params(
            pplacer_result['json'].name,
            output_dir=out_dir,
            params=guppy_params)
    finally:
        # remove pplacer's result files even if logging or guppy failed
        pplacer_result.cleanUp()

    return new_tree
コード例 #14
0
ファイル: rnaalifold.py プロジェクト: mikerobeson/pycogent
def rnaalifold_from_alignment(aln, moltype=RNA, params=None):
    """Runs RNAalifold on an alignment and returns the parsed output.

    aln: an Alignment object, or data that can be used to build one.

    moltype: currently not used; the alignment is always built with
    MolType=RNA.

    params: dict of parameters to pass in to the RNAalifold app controller.

    Returns the result of MinimalRnaalifoldParser applied to the lines of
    RNAalifold's StdOut (seq, pairs, folding energy according to the
    original docstring).
    """
    # NOTE(review): moltype is accepted but ignored here -- confirm whether
    # it should replace the hard-coded RNA.
    #Create Alignment object.  Object will handle if seqs are unaligned.
    aln = Alignment(aln, MolType=RNA)
    # Only the relabelled sequences are needed; the key map is not used.
    int_map, _unused_int_keys = aln.getIntMap()

    app = RNAalifold(WorkingDir='/tmp',
        InputHandler='_input_as_multiline_string',params=params)
    res = app(clustal_from_alignment(int_map))

    try:
        pairs_list = MinimalRnaalifoldParser(res['StdOut'].readlines())
    finally:
        # remove the result files even if parsing failed
        res.cleanUp()
    return pairs_list
コード例 #15
0
 def setUp(self):
     """Setup for Fasta tests."""
     self.strings = ['AAAA', 'CCCC', 'gggg', 'uuuu']
     self.labels = ['1st', '2nd', '3rd', '4th']
     self.infos = ["Dog", "Cat", "Mouse", "Rat"]
     # Real lists rather than map() results: the fixtures are iterated in
     # the loop below and must still hold their sequences afterwards.
     self.sequences_with_labels = [Sequence(s) for s in self.strings]
     self.sequences_with_names = [Sequence(s) for s in self.strings]
     # One group gets its identifier as .Label, the other as .Name.
     for l, sl, sn in zip(self.labels, self.sequences_with_labels,
                          self.sequences_with_names):
         sl.Label = l
         sn.Name = l
     self.fasta_no_label = '>0\nAAAA\n>1\nCCCC\n>2\ngggg\n>3\nuuuu'
     self.fasta_with_label=\
             '>1st\nAAAA\n>2nd\nCCCC\n>3rd\nGGGG\n>4th\nUUUU'
     self.fasta_with_label_lw2=\
         '>1st\nAA\nAA\n>2nd\nCC\nCC\n>3rd\nGG\nGG\n>4th\nUU\nUU'
     self.alignment_dict = {
         '1st': 'AAAA',
         '2nd': 'CCCC',
         '3rd': 'GGGG',
         '4th': 'UUUU'
     }
     self.alignment_object = Alignment(self.alignment_dict)
     # Attach a species to every sequence in the alignment.
     for label, info in zip(self.labels, self.infos):
         self.alignment_object.NamedSeqs[label].Info = Info(species=info)
     self.fasta_with_label_species=\
           '>1st:Dog\nAAAA\n>2nd:Cat\nCCCC\n>3rd:Mouse\nGGGG\n>4th:Rat\nUUUU'
     self.alignment_object.RowOrder = ['1st', '2nd', '3rd', '4th']
コード例 #16
0
    def test_AlignmentToProfile_weighted(self):
        """AlignmentToProfile: should work when sequences are weighted
        """
        #Alignment: sequences are just strings and don't have an alphabet
        #Weights: a normal dictionary (could be a real Weights object as well)
        aln = Alignment({'seq1':'TCAG','seq2':'TAR-','seq3':'YAG-'},
                        Names=['seq1','seq2','seq3'])
        weights = {'seq1': 0.5, 'seq2': .25, 'seq3': .25}

        #Basic situation in which all letters in the sequences occur in the
        #CharOrder, None have to be ignored. In that case it doesn't matter
        #whether we set split_degenerates to True or False, because if it's
        #True it's overwritten by the fact that the char is in the CharOrder.
        expected = array([[0.75, 0, 0, 0, 0, .25, 0],
                          [0, 0.5, 0.5, 0, 0, 0, 0],
                          [0, 0.5, 0, 0.25, 0.25, 0, 0],
                          [0, 0, 0, 0.5, 0, 0, 0.5]])
        #first split_degenerates = False, then split_degenerates = True
        for split_flag in (False, True):
            profile = AlnToProfile(aln, DNA, char_order="TACGRY-",
                                   weights=weights,
                                   split_degenerates=split_flag)
            self.assertEqual(profile.Data.tolist(), expected.tolist())

        #Only non-degenerate symbols in the CharOrder. Degenerates are split.
        #Gaps are ignored
        expected = array([[0.875, 0, 0.125, 0], [0, 0.5, 0.5, 0],
                          [0, 0.625, 0, 0.375], [0, 0, 0, 1]])
        profile = AlnToProfile(aln, DNA, char_order="TACG",
                               weights=weights, split_degenerates=True)
        self.assertEqual(profile.Data.tolist(), expected.tolist())

        #An Error is raised if all chars in an alignment column are ignored
        #CharOrder=AT.
        #NOTE(review): the earlier comment said degenerates are not split,
        #but the call passes split_degenerates=True -- confirm the intent.
        self.assertRaises(ValueError, AlnToProfile, aln, DNA,
                          char_order="AT", weights=weights,
                          split_degenerates=True)
コード例 #17
0
ファイル: test_profile.py プロジェクト: miklou/pycogent
    def test_randomIndices(self):
        """randomIndices: 99% of new frequencies should be within 3*SD
        """
        r_num, c_num = 100,20
        num_elements = r_num*c_num
        r = random([r_num,c_num])
        p = Profile(r,"A"*c_num)
        p.normalizePositions()
        d = p.Data
        n = 1000

        #Test only works on normalized profile, b/c of 1-d below
        means = n*d
        three_stds = sqrt(d*(1-d)*n)*3
        result = [p.randomIndices() for x in range(n)]
        a = Alignment(transpose(result))

        def absoluteProfile(alignment,char_order):
            """Return the column counts of alignment as a 2D array.

            The array has one row per entry of alignment.columnFreqs() and
            len(char_order) columns; each key of a column's frequencies is
            placed at column ord(key).
            """
            #use the argument, not the enclosing test's variable
            f = alignment.columnFreqs()
            res = zeros([len(f),len(char_order)])
            for row, freq in enumerate(f):
                for i in freq:
                    res[row, ord(i)] = freq[i]
            return res

        ap = absoluteProfile(a,p.CharOrder)
        failure = abs(ap-means) > three_stds
        #float() forces true division; with integer division the ratio
        #would be floored to 0 and the check could practically never fail
        failed_fraction = float(sum(sum(failure)))/num_elements
        assert failed_fraction <= 0.01
コード例 #18
0
ファイル: test_profile.py プロジェクト: miklou/pycogent
    def test_randomSequence(self):
        """randomSequence: 99% of new frequencies should be within 3*SD"""
        r_num, c_num = 100,20
        num_elements = r_num*c_num
        alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        r = random([r_num,c_num])
        p = Profile(r,alpha[:c_num])
        p.normalizePositions()
        d = p.Data
        n = 1000

        #Test only works on normalized profile, b/c of 1-d below
        means = n*d
        three_stds = sqrt(d*(1-d)*n)*3

        a = Alignment([p.randomSequence() for x in range(n)])

        def absoluteProfile(alignment,char_order):
            """Return the column counts of alignment as a 2D array.

            The array has one row per entry of alignment.columnFreqs() and
            one column per character of char_order, in that order.
            """
            #use the argument, not the enclosing test's variable
            f = alignment.columnFreqs()
            res = zeros([len(f),len(char_order)])
            for row, freq in enumerate(f):
                for i in freq:
                    col = char_order.index(i)
                    res[row, col] = freq[i]
            return res

        ap = absoluteProfile(a,p.CharOrder)
        failure = abs(ap-means) > three_stds
        #float() forces true division; with integer division the ratio
        #would be floored to 0 and the check could practically never fail
        failed_fraction = float(sum(sum(failure)))/num_elements
        assert failed_fraction <= 0.01
コード例 #19
0
 def setUp(self):
     """Load the query alignment and write the reference files.

     REF_SEQS and REF_TREE are written to '.fasta' and '.tree' files that
     share the stem of a temp filename generated under /tmp.  Both paths
     are appended to self._paths_to_clean_up.
     """
     # create a list of files to cleanup
     self._paths_to_clean_up = []
     self._dirs_to_clean_up = []

     # load query seqs
     self.seqs = Alignment(MinimalFastaParser(QUERY_SEQS.split()))

     # generate temp filename
     tmp_dir='/tmp'
     self.outfile = get_tmp_filename(tmp_dir)

     # create and write out reference sequence file; the with-block closes
     # the handle even if the write raises
     self.outfasta=splitext(self.outfile)[0]+'.fasta'
     with open(self.outfasta,'w') as fastaout:
         fastaout.write(REF_SEQS)
     self._paths_to_clean_up.append(self.outfasta)

     # create and write out starting tree file
     self.outtree=splitext(self.outfile)[0]+'.tree'
     with open(self.outtree,'w') as treeout:
         treeout.write(REF_TREE)
     self._paths_to_clean_up.append(self.outtree)
コード例 #20
0
    def setUp(self):
        """Build the RNA sequence and alignment fixtures shared by the tests.

        The strings 'UCAGGG', 'YCU-RG' and 'CAA-NR' are turned into
        RnaSequence objects and into ModelSequence objects (using
        RNA.Alphabets.DegenGapped), once named 'rna1'..'rna3' and once
        without names.  The RnaSequence groups are wrapped in Alignment
        objects and the ModelSequence groups in DenseAlignment objects.
        """
        alphabet = RNA.Alphabets.DegenGapped

        # named sequences
        self.rna1 = RnaSequence('UCAGGG', Name='rna1')
        self.rna2 = RnaSequence('YCU-RG', Name='rna2')
        self.rna3 = RnaSequence('CAA-NR', Name='rna3')
        self.model1 = ModelSequence('UCAGGG', Name='rna1', Alphabet=alphabet)
        self.model2 = ModelSequence('YCU-RG', Name='rna2', Alphabet=alphabet)
        self.model3 = ModelSequence('CAA-NR', Name='rna3', Alphabet=alphabet)

        self.aln = Alignment([self.rna1, self.rna2, self.rna3], MolType=RNA)
        self.da = DenseAlignment(
            [self.model1, self.model2, self.model3],
            MolType=RNA, Alphabet=alphabet)

        # seqs no name
        self.nn_rna1 = RnaSequence('UCAGGG')
        self.nn_rna2 = RnaSequence('YCU-RG')
        self.nn_rna3 = RnaSequence('CAA-NR')

        self.nn_model1 = ModelSequence('UCAGGG', Alphabet=alphabet)
        self.nn_model2 = ModelSequence('YCU-RG', Alphabet=alphabet)
        self.nn_model3 = ModelSequence('CAA-NR', Alphabet=alphabet)

        self.nn_aln = Alignment(
            [self.nn_rna1, self.nn_rna2, self.nn_rna3], MolType=RNA)
        self.nn_da = DenseAlignment(
            [self.nn_model1, self.nn_model2, self.nn_model3],
            MolType=RNA, Alphabet=alphabet)
コード例 #21
0
    def test_subset_seqs_Alignment(self):
        """takeSeqs should return the requested sequences of an Alignment"""
        rna1 = RnaSequence('UCG', Name='rna1')
        rna2 = RnaSequence('YCG', Name='rna2')
        rna3 = RnaSequence('CAR', Name='rna3')

        sub_aln = Alignment([rna2, rna3], MolType=RNA)
        aln = Alignment([rna1, rna2, rna3], MolType=RNA)
        obs_sub_aln = aln.takeSeqs(['rna2','rna3'])

        self.assertEqual(obs_sub_aln, sub_aln)
        self.assertEqual(str(obs_sub_aln), str(sub_aln))

        # Selected sequences should be in specified order?
        # (these two calls use the fixture self.aln, not the local aln)
        obs_sub_aln_1 = self.aln.takeSeqs(['rna3','rna2'])
        obs_sub_aln_2 = self.aln.takeSeqs(['rna2','rna3'])
        # assertNotEqual replaces the deprecated failIfEqual alias
        self.assertNotEqual(str(obs_sub_aln_1), str(obs_sub_aln_2))
コード例 #22
0
def align_unaligned_seqs(seqs, moltype, params=None):
    """Returns an Alignment object from seqs.

    seqs: cogent.core.alignment.SequenceCollection object, or data that can be
    used to build one.

    moltype: a MolType object.  DNA, RNA, or PROTEIN.

    params: dict of parameters to pass in to the Clustal app controller.

    Result will be a cogent.core.alignment.Alignment object.  The Clustalw
    result is cleaned up even when parsing it or building the Alignment
    raises.
    """
    #create SequenceCollection object from seqs
    seq_collection = SequenceCollection(seqs, MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs
    int_map, int_keys = seq_collection.getIntMap()
    #Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map, MolType=moltype)
    #Create Clustalw app.
    app = Clustalw(InputHandler='_input_as_multiline_string', params=params)
    #Get results using int_map as input to app
    res = app(int_map.toFasta())
    try:
        #Get alignment as dict out of results
        alignment = dict(ClustalParser(res['Align'].readlines()))
        #Make new dict mapping original IDs
        new_alignment = dict((int_keys[k], v) for k, v in alignment.items())
        #Create an Alignment object from alignment dict
        return Alignment(new_alignment, MolType=moltype)
    finally:
        #Clean up, whether or not parsing succeeded
        res.cleanUp()
コード例 #23
0
ファイル: rnaalifold.py プロジェクト: cxhernandez/pycogent
def rnaalifold_from_alignment(aln,moltype=RNA,params=None):
    """Runs RNAalifold on an alignment and returns the parsed output.

    aln: an Alignment object, or data that can be used to build one.

    moltype: currently not used; the alignment is always built with
    MolType=RNA.

    params: dict of parameters to pass in to the RNAalifold app controller.

    Returns the result of MinimalRnaalifoldParser applied to the lines of
    RNAalifold's StdOut (seq, pairs, folding energy according to the
    original docstring).
    """
    # NOTE(review): moltype is accepted but ignored here -- confirm whether
    # it should replace the hard-coded RNA.
    #Create Alignment object.  Object will handle if seqs are unaligned.
    aln = Alignment(aln,MolType=RNA)
    # Only the relabelled sequences are needed; the key map is not used.
    int_map, _unused_int_keys = aln.getIntMap()

    app = RNAalifold(WorkingDir='/tmp',
        InputHandler='_input_as_multiline_string',params=params)
    res = app(clustal_from_alignment(int_map))

    try:
        pairs_list = MinimalRnaalifoldParser(res['StdOut'].readlines())
    finally:
        # remove the result files even if parsing failed
        res.cleanUp()
    return pairs_list
コード例 #24
0
    def test_subset_seqs_Alignment(self):
        """takeSeqs should return the requested sequences of an Alignment"""
        first = RnaSequence('UCG', Name='rna1')
        second = RnaSequence('YCG', Name='rna2')
        third = RnaSequence('CAR', Name='rna3')

        expected_subset = Alignment([second, third], MolType=RNA)
        full_aln = Alignment([first, second, third], MolType=RNA)
        observed_subset = full_aln.takeSeqs(['rna2', 'rna3'])

        # equal both as objects and in their string form
        self.assertEqual(observed_subset, expected_subset)
        self.assertEqual(str(observed_subset), str(expected_subset))

        # Selected sequences should be in specified order?
        # (taken from the fixture self.aln, not from the local alignment)
        reversed_order = self.aln.takeSeqs(['rna3', 'rna2'])
        forward_order = self.aln.takeSeqs(['rna2', 'rna3'])
        self.assertNotEqual(str(reversed_order), str(forward_order))
コード例 #25
0
ファイル: rfam.py プロジェクト: wangdi2014/for_qiime_scripts
def load_from_clustal(data, seq_constructor=Sequence, strict=True):
    """Load Clustal-formatted data into an Alignment or SequenceCollection.

    data: passed to ClustalParser together with strict.
    seq_constructor: callable applied to each parsed sequence.

    An Alignment is returned when at least one record was parsed and all
    sequences have the same length; otherwise a SequenceCollection is
    returned. Both are built with MolType=BYTES.
    """
    records = []
    for label, raw_seq in ClustalParser(data, strict):
        records.append((label, seq_constructor(raw_seq)))

    # exactly one distinct length <=> non-empty and max length == min length
    distinct_lengths = set([len(record[1]) for record in records])
    if len(distinct_lengths) == 1:
        return Alignment(records, MolType=BYTES)
    return SequenceCollection(records, MolType=BYTES)
コード例 #26
0
    def test_insert_sequences_into_tree(self):
        """Inserts sequences into Tree using params - test handles tree-insertion"""

        # write the starting tree to a freshly named temp file
        tree_path = splitext(get_tmp_filename('/tmp/'))[0] + '.tre'
        tree_file = open(tree_path, 'w')
        tree_file.write(REF_TREE)
        tree_file.close()

        # parameters for tree-insertion (the "-G" option is not set)
        params = {
            "-w": "/tmp/",
            "-n": get_tmp_filename().split("/")[-1],
            "-f": 'v',
            "-t": tree_path,
            "-m": 'GTRGAMMA',
        }

        ref_query = get_align_for_phylip(StringIO(PHYLIP_FILE_DNA_REF_QUERY))
        seqs, align_map = Alignment(ref_query).toPhylip()

        tree = insert_sequences_into_tree(
            seqs, DNA, params=params, write_log=False)

        # remove the QUERY___ marker and any ___<digits> run from each tip
        # name, then translate it back through align_map when it is known
        for tip in tree.tips():
            without_marker = re.sub('QUERY___', '', str(tip.Name))
            short_name = re.sub('___\d+', '', str(without_marker))
            if short_name in align_map:
                tip.Name = align_map[short_name]

        self.assertTrue(isinstance(tree, PhyloNode))
        self.assertEqual(tree.getNewick(with_distances=True), RESULT_TREE)
        self.assertEqual(len(tree.tips()), 7)
        self.assertRaises(NotImplementedError, build_tree_from_alignment,
                          self.align1, RNA, True)

        # the temp tree file is no longer needed
        remove(tree_path)
コード例 #27
0
def alignment_traceback(seqs, aligned_positions, word_length):
    """Build an Alignment from traced-back aligned positions.

    seqs: sequence of (name, seq) pairs.
    aligned_positions: handed to map_traceback, which supplies one start,
        end and map per sequence.
    word_length: factor applied to each map and to the start/end offsets
        used to slice each sequence.
    """
    starts, ends, maps = map_traceback(aligned_positions)
    named_aligned = [
        (name, Aligned(amap * word_length,
                       seq[start * word_length:end * word_length]))
        for start, end, amap, (name, seq) in zip(starts, ends, maps, seqs)
    ]
    return Alignment(MolType=None, data=named_aligned)
コード例 #28
0
ファイル: parsinsert.py プロジェクト: chungtseng/pycogent
def insert_sequences_into_tree(aln, moltype, params=None):
    """Returns a tree from placement of sequences

    aln: alignment text; it is wrapped in StringIO and parsed with
        get_align_for_phylip.
    moltype: accepted for interface compatibility; not used here.
    params: dict of parameters to pass in to the ParsInsert app controller.
        Defaults to an empty dict (created per call, so no dict is shared
        between calls).

    Returns the PhyloNode tree parsed from the application's 'Tree' output.
    """
    if params is None:
        params = {}

    # convert aln to phy since seq_names need fixed to run through parsinsert
    new_aln=get_align_for_phylip(StringIO(aln))

    # convert aln to fasta in case it is not already a fasta file
    aln2 = Alignment(new_aln)
    seqs = aln2.toFasta()

    parsinsert_app = ParsInsert(params=params)
    result = parsinsert_app(seqs)

    # parse tree; remove the result files even if parsing fails
    try:
        tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
    finally:
        result.cleanUp()

    return tree
コード例 #29
0
def insert_sequences_into_tree(aln, moltype, params=None):
    """Returns a tree from placement of sequences

    aln: alignment text; it is wrapped in StringIO and parsed with
        get_align_for_phylip.
    moltype: accepted for interface compatibility; not used here.
    params: dict of parameters to pass in to the ParsInsert app controller.
        A fresh empty dict is used when omitted, so nothing is shared
        between calls.

    Returns the PhyloNode tree parsed from the application's 'Tree' output.
    """
    if params is None:
        params = {}

    # convert aln to phy since seq_names need fixed to run through parsinsert
    new_aln = get_align_for_phylip(StringIO(aln))

    # convert aln to fasta in case it is not already a fasta file
    seqs = Alignment(new_aln).toFasta()

    result = ParsInsert(params=params)(seqs)

    try:
        # parse tree
        tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
    finally:
        # cleanup files, whether or not the tree could be parsed
        result.cleanUp()

    return tree
コード例 #30
0
    def getAlignment(self,
                     feature_types=None,
                     where_feature=None,
                     omit_redundant=True):
        """Return an Alignment of the aligned member sequences.

        Arguments:
            - feature_types: annotations to be applied to the returned
              sequences
            - where_feature: forwarded, together with feature_types, to each
              member's getAnnotatedAligned
            - omit_redundant: exclude redundant gap positions

        Returns None when no member provides an aligned sequence."""
        seqs = []

        for member in self.Members:
            if feature_types:
                seq = member.getAnnotatedAligned(feature_types, where_feature)
            else:
                seq = member.AlignedSeq
            if seq is None:
                continue
            name = seq.Name

            if self._rc:  # names should reflect change to strand
                loc = member.Location.copy()
                loc.Strand *= -1
                name = str(loc)

            seq.Name = seq.data.Name = name
            seqs.append((name, seq))

        # seqs is always a list, so test for emptiness rather than for None
        if not seqs:
            return None

        aln = Alignment(data=seqs, MolType=DNA)

        if self._rc:
            aln = aln.rc()

        if omit_redundant:
            # drop columns that consist solely of gap characters
            aln = aln.filtered(lambda x: set(x) != set('-'))

        return aln
コード例 #31
0
ファイル: test_raxml_v730.py プロジェクト: miklou/pycogent
    def test_insert_sequences_into_tree(self):
        """Inserts sequences into Tree using params - test handles tree-insertion"""

        # create the starting tree in a temp file named after a fresh
        # temp filename with its extension replaced by .tre
        base_name = splitext(get_tmp_filename('/tmp/'))[0]
        start_tree_path = base_name + '.tre'
        handle = open(start_tree_path, 'w')
        handle.write(REF_TREE)
        handle.close()

        # set params for tree-insertion (no "-G" value is supplied)
        run_name = get_tmp_filename().split("/")[-1]
        params = {"-w": "/tmp/", "-n": run_name, "-f": 'v',
                  "-t": start_tree_path, "-m": 'GTRGAMMA'}

        parsed = get_align_for_phylip(StringIO(PHYLIP_FILE_DNA_REF_QUERY))
        aln = Alignment(parsed)
        seqs, align_map = aln.toPhylip()

        tree = insert_sequences_into_tree(seqs, DNA, params=params,
                                          write_log=False)

        # undo the name decoration on the tips, then map names that appear
        # in align_map back to their original form
        for node in tree.tips():
            name = re.sub('QUERY___', '', str(node.Name))
            name = re.sub('___\d+', '', str(name))
            if name in align_map:
                node.Name = align_map[name]

        self.assertTrue(isinstance(tree, PhyloNode))
        self.assertEqual(tree.getNewick(with_distances=True), RESULT_TREE)
        self.assertEqual(len(tree.tips()), 7)
        self.assertRaises(NotImplementedError, build_tree_from_alignment,
                          self.align1, RNA, True)

        remove(start_tree_path)
コード例 #32
0
    def setUp(self):
        """Setup for Clustal tests."""
        self.unaligned_dict = {
            '1st': 'AAA',
            '2nd': 'CCCC',
            '3rd': 'GGGG',
            '4th': 'UUUU'
        }
        self.alignment_dict = {
            '1st': 'AAAA',
            '2nd': 'CCCC',
            '3rd': 'GGGG',
            '4th': 'UUUU'
        }
        #create alignment change order.
        self.alignment_object = Alignment(self.alignment_dict)
        self.alignment_order = ['2nd', '4th', '3rd', '1st']
        self.alignment_object.RowOrder = self.alignment_order

        self.clustal_with_label=\
"""CLUSTAL

1st    AAAA
2nd    CCCC
3rd    GGGG
4th    UUUU
"""
        self.clustal_with_label_lw2=\
"""CLUSTAL

1st    AA
2nd    CC
3rd    GG
4th    UU

1st    AA
2nd    CC
3rd    GG
4th    UU
"""

        self.clustal_with_label_reordered=\
"""CLUSTAL

2nd    CCCC
4th    UUUU
3rd    GGGG
1st    AAAA
"""

        self.clustal_with_label_lw2_reordered=\
"""CLUSTAL
コード例 #33
0
 def getAlignment(self, feature_types=None, where_feature=None,
                     omit_redundant=True):
     """Return an Alignment of the aligned member sequences.

     Arguments:
         - feature_types: annotations to be applied to the returned
           sequences
         - where_feature: forwarded, together with feature_types, to each
           member's getAnnotatedAligned
         - omit_redundant: exclude redundant gap positions

     Returns None when no member provides an aligned sequence."""
     seqs = []

     for member in self.Members:
         if feature_types:
             seq = member.getAnnotatedAligned(feature_types, where_feature)
         else:
             seq = member.AlignedSeq
         if seq is None:
             continue
         name = seq.Name

         if self._rc: # names should reflect change to strand
             loc = member.Location.copy()
             loc.Strand *= -1
             name = str(loc)

         seq.Name = seq.data.Name = name
         seqs.append((name, seq))

     # seqs is always a list, so check for "no sequences", not for None
     if not seqs:
         return None

     aln = Alignment(data=seqs, MolType=DNA)

     if self._rc:
         aln = aln.rc()

     if omit_redundant:
         # drop columns that consist solely of gap characters
         aln = aln.filtered(lambda x: set(x) != set('-'))

     return aln
コード例 #34
0
ファイル: test_pplacer.py プロジェクト: mikerobeson/pycogent
 def test_insert_sequences_into_tree(self):
     """Inserts sequences into Tree"""

     # options handed to the app: reference seqs, tree, stats file and
     # the output directory
     params = {
         "-r": self.refseq_fname,
         "-t": self.tree_fname,
         "-s": self.stats_fname,
         "--out-dir": "/tmp",
     }

     query_aln = Alignment(MinimalFastaParser(StringIO(QUERY_SEQS)))
     seqs, align_map = query_aln.toPhylip()
     tree = insert_sequences_into_tree(
         seqs, DNA, params=params, write_log=False)

     # rename tips back to query names
     for tip in tree.tips():
         if tip.Name in align_map:
             tip.Name = align_map[tip.Name]

     self.assertEqual(tree.getNewick(with_distances=True), RESULT_TREE)
コード例 #35
0
    def test_subset_positions_Alignment(self):
        # alignment expected from takePositions([0, 1, 5]) on self.aln
        expected = Alignment(
            [RnaSequence('UCG', Name='rna1'),
             RnaSequence('YCG', Name='rna2'),
             RnaSequence('CAR', Name='rna3')],
            MolType=RNA)

        observed = self.aln.takePositions([0, 1, 5])
        self.assertEqual(observed, expected)
        self.assertNotEqual(observed, self.aln)
        # string representations should be the same. This fails right
        # now, because sequence order is not maintained. See separate test.
        self.assertEqual(str(observed), str(expected))
コード例 #36
0
 def likelyAncestralSeqs(self, locus=None):
     """Returns the most likely reconstructed ancestral sequences as an
     alignment.

     For every edge returned by reconstructAncestralSeqs, the state with
     the highest probability is chosen at each position (ties fall to the
     larger state, since (probability, state) pairs are compared).

     Arguments:
         - locus: a named locus"""
     ancestral = []
     for edge, probs in self.reconstructAncestralSeqs(locus=locus).items():
         best_states = [
             max([(p, state) for state, p in row.items()])[1]
             for row in probs]
         sequence = self.model.MolType.makeSequence("".join(best_states))
         ancestral.append((edge, sequence))
     return Alignment(data=ancestral, MolType=self.model.MolType)
コード例 #37
0
def LoadSeqs(filename=None, format=None, data=None, moltype=None,
            name=None, aligned=True, label_to_name=None, parser_kw={},
            constructor_kw={}, **kw):
    """Create an alignment or a sequence collection from a file or from data.

    Arguments:
    - filename: name of the sequence file; mutually exclusive with data
    - format: format of the sequence file (only allowed with filename)
    - data: optional explicit provision of sequences
    - moltype: the MolType, eg DNA, PROTEIN
    - name: passed to the constructor as Name
    - aligned: if callable, it is used as the constructor (e.g. pass in
      DenseAlignment or CodonAlignment). Otherwise a true value gives an
      Alignment (sequences already aligned, same length) and a false value
      gives a SequenceCollection.
    - label_to_name: function for converting original name into another
      name. Default behavior is to preserve the original FASTA label and
      comment.
      To remove all FASTA label comments, and pass in only the label, pass in:
            label_to_name=lambda x: x.split()[0]
      To look up names in a dict, pass in:
            label_to_name = lambda x: d.get(x, default_name)
      ...where d is a dict that's in scope, and default_name is what you want
      to assign any sequence that isn't in the dict.
    - parser_kw: keyword arguments for FromFilenameParser
    - constructor_kw: extra keyword arguments for the chosen constructor

    If format is None, will attempt to infer format from the filename
    suffix. If label_to_name is None, will attempt to infer correct
    conversion from the format.
    """
    if filename is not None:
        # reading from file: explicit data must not be given as well
        assert data is None, (filename, data)
        data = list(FromFilenameParser(filename, format, **parser_kw))
    else:
        assert data is not None
        assert format is None
        assert not kw, kw

    # the following is a temp hack until we have the load API sorted out.
    if not aligned:
        # generic case: return SequenceCollection
        return SequenceCollection(data, MolType=moltype, Name=name,
            label_to_name=label_to_name, **constructor_kw)

    # a callable 'aligned' acts as the constructor; any other true value
    # selects Alignment
    if hasattr(aligned, '__call__'):
        constructor = aligned
    else:
        constructor = Alignment
    return constructor(data=data, MolType=moltype, Name=name,
        label_to_name=label_to_name, **constructor_kw)
コード例 #38
0
ファイル: dotur.py プロジェクト: yatisht/pycogent
def dotur_from_alignment(aln, moltype, distance_function, params=None):
    """Runs Dotur on a distance matrix computed from aln; returns OTU list.

        - aln: An Alignment object or something that behaves like one.
            Sequences must be aligned.
        - moltype: cogent.core.moltype object.
        - distance_function: function that can be passed to distanceMatrix()
            method of SequenceCollection.  Must be able to find distance
            between two sequences.
        - params: parameters passed to the Dotur app controller.

        - NOTE:  This function will only return the parsed *.list file, as
            it contains the OTU identities.
            Dotur generates 23 output files, so if this is not the one you
            are looking for, check out the documentation and add the others
            to the result path.
    """
    #construct Alignment object.  This will handle unaligned sequences.
    aln = Alignment(aln, MolType=moltype)

    #rename sequences via the int map; int_keys is used to map names back
    renamed, int_keys = aln.getIntMap()
    renamed_aln = Alignment(renamed, MolType=moltype)
    order = sorted(renamed_aln.Names)

    #Build distance matrix with rows and columns in sorted-name order.
    distances = renamed_aln.distanceMatrix(f=distance_function)
    distances.RowOrder = order
    distances.ColOrder = order

    #phylipMatrix needs every cell as a string
    string_rows = [[str(cell) for cell in row] for row in distances.toLists()]
    phylip_matrix_string = phylipMatrix(rows=string_rows, names=order)

    app = Dotur(InputHandler='_input_as_multiline_string',
        WorkingDir=get_tmp_filename(suffix=''), params=params)
    res = app(phylip_matrix_string)

    otu_list = OtuListParser(res['List'].readlines())

    #remap sequence names (third field of each OTU record)
    for otu in otu_list:
        otu[2] = remap_seq_names(otu[2], int_keys)

    #remove the app's working directory and everything in it
    shutil.rmtree(app.WorkingDir)

    return otu_list
コード例 #39
0
ファイル: test_util.py プロジェクト: cxhernandez/pycogent
 def test_AlignmentToProfile_ignore(self):
     """AlignmentToProfile: should raise an error if too many chars ignored
     """
     # Same setup as the basic test, except that the last column holds
     # only gaps, so normalization will fail at that position.
     seqs = {'a': RnaSequence('UCAGRYN-'), 'b': RnaSequence('ACUGAAA-')}
     aln = Alignment(seqs)
     # profile values kept from the original test; not asserted here
     exp = array([
         [.5, 0, .5, 0],
         [0, 1, 0, 0],
         [.5, 0, .5, 0],
         [0, 0, 0, 1],
         [0, 0, .75, .25],
         [.25, .25, .5, 0],
         [.125, .125, .625, .125],
         [0, 0, 1, 0],
     ])
     self.assertRaises(ValueError, AlnToProfile, aln, alphabet=RNA,
                       split_degenerates=True)
コード例 #40
0
def get_align_for_phylip(data, id_map=None):
    """
    Convenience function to return an alignment object from phylip data

    data: sequence of lines in phylip format (an open file, list, etc)
    id_map: optional id mapping from external ids to phylip labels; it is
        passed straight to MinimalPhylipParser

    returns Alignment object
    """
    # collect every record the parser yields before building the Alignment
    records = list(MinimalPhylipParser(data, id_map))
    return Alignment(records)
コード例 #41
0
def align_unaligned_seqs(seqs,moltype,params=None,accurate=False):
    """Aligns unaligned sequences with Mafft; returns an Alignment.

    seqs: SequenceCollection object, or data that can be used to build one.
    moltype: a MolType object; its upper-cased label selects the Mafft
        parameter to turn on via MOLTYPE_MAP.
    params: dict of parameters to pass in to the Mafft app controller.
    accurate: if True, turns on --globalpair and sets --maxiterate to 1000
        (more accurate alignment, sacrificing performance).
    """
    #create SequenceCollection object from seqs
    seq_collection = SequenceCollection(seqs,MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs
    int_map, int_keys = seq_collection.getIntMap()
    #Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map,MolType=moltype)
    #Create Mafft app.
    app = Mafft(InputHandler='_input_as_multiline_string',params=params)

    #Turn on correct moltype
    moltype_string = moltype.label.upper()
    app.Parameters[MOLTYPE_MAP[moltype_string]].on()

    #Do not report progress
    app.Parameters['--quiet'].on()

    #More accurate alignment, sacrificing performance.
    if accurate:
        app.Parameters['--globalpair'].on()
        app.Parameters['--maxiterate'].Value=1000

    #Get results using int_map as input to app
    res = app(int_map.toFasta())
    #Make sure the result files are cleaned up even if parsing fails
    try:
        #Get alignment as dict out of results
        alignment = dict(MinimalFastaParser(res['StdOut'].readlines()))
        #Make new dict mapping original IDs
        new_alignment = {}
        for k,v in alignment.items():
            new_alignment[int_keys[k]]=v
        #Create an Alignment object from alignment dict
        return Alignment(new_alignment,MolType=moltype)
    finally:
        res.cleanUp()
コード例 #42
0
ファイル: test_util.py プロジェクト: cxhernandez/pycogent
 def test_AlignmentToProfile_basic(self):
     """AlignmentToProfile: should work under basic conditions
     """
     # Conditions covered:
     #  - sequences in the alignment are unweighted
     #  - Alphabet is the alphabet of the sequences (RNA)
     #  - CharOrder is set explicitly
     #  - degenerate bases are split up
     #  - gaps are ignored
     #  - every column has at least one character in the CharOrder
     seqs = {'a': RnaSequence('UCAGRYN-'), 'b': RnaSequence('ACUGAAAA')}
     aln = Alignment(seqs)
     expected = array([
         [.5, 0, .5, 0],
         [0, 1, 0, 0],
         [.5, 0, .5, 0],
         [0, 0, 0, 1],
         [0, 0, .75, .25],
         [.25, .25, .5, 0],
         [.125, .125, .625, .125],
         [0, 0, 1, 0],
     ])
     profile = AlnToProfile(aln, alphabet=RNA, split_degenerates=True)
     self.assertEqual(profile.Data.tolist(), expected.tolist())
コード例 #43
0
def align_unaligned_seqs(seqs, moltype, params=None):
    """Returns an Alignment object from seqs.

    seqs: SequenceCollection object, or data that can be used to build one.

    moltype: a MolType object.  DNA, RNA, or PROTEIN.

    params: dict of parameters to pass in to the Muscle app controller.
        The dict is copied before the '-out' entry is added, so the
        caller's dict is not modified.

    Result will be an Alignment object.
    """
    #Work on a private copy: '-out' must not leak into the caller's dict
    params = dict(params) if params else {}
    #create SequenceCollection object from seqs
    seq_collection = SequenceCollection(seqs, MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs
    int_map, int_keys = seq_collection.getIntMap()
    #Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map, MolType=moltype)
    #get temporary filename
    params['-out'] = get_tmp_filename()
    #Create Muscle app.
    app = Muscle(InputHandler='_input_as_multiline_string',\
                 params=params)
    #Get results using int_map as input to app
    res = app(int_map.toFasta())
    #Clean up the result files even if parsing or remapping fails
    try:
        #Get alignment as dict out of results
        alignment = dict(MinimalFastaParser(res['MuscleOut'].readlines()))
        #Make new dict mapping original IDs
        new_alignment = {}
        for k, v in alignment.items():
            new_alignment[int_keys[k]] = v
        #Create an Alignment object from alignment dict
        return Alignment(new_alignment, MolType=moltype)
    finally:
        res.cleanUp()
コード例 #44
0
ファイル: test_methods.py プロジェクト: mikerobeson/pycogent
    def setUp(self):
        """Build the alignments and trees shared by the tests in this file"""

        # ---- alignments ----
        self.aln1 = Alignment(['ABC', 'BCC', 'BAC'])

        # alignment from Henikoff 1994
        henikoff = {'seq1': 'GYVGS', 'seq2': 'GFDGF', 'seq3': 'GYDGF',
                    'seq4': 'GYQGG'}
        self.aln2 = Alignment(henikoff,
                              Names=['seq1', 'seq2', 'seq3', 'seq4'])

        # alignments from Vingron & Sibbald 1993
        self.aln3 = Alignment(
            {'seq1': 'AA', 'seq2': 'AA', 'seq3': 'BB'},
            Names=['seq1', 'seq2', 'seq3'])
        self.aln4 = Alignment(
            {'seq1': 'AA', 'seq2': 'AA', 'seq3': 'BB', 'seq4': 'BB',
             'seq5': 'CC'},
            Names=['seq1', 'seq2', 'seq3', 'seq4', 'seq5'])

        self.aln5 = Alignment(['ABBA', 'ABCA', 'CBCB'])

        # alignment 5S rRNA seqs from Hein 1990; note this stores the
        # ClustalParser result itself rather than an Alignment
        self.aln6 = ClustalParser(FIVE_S_ALN.split('\n'))

        # alignment from Vingron & Sibbald 1993
        vingron = {'seq1': 'AGCTA', 'seq2': 'AGGTA', 'seq3': 'ACCTG',
                   'seq4': 'TGCAA'}
        self.aln7 = Alignment(vingron,
                              Names=['seq1', 'seq2', 'seq3', 'seq4'])

        # ---- trees (see bottom of file for description) ----
        self.tree1 = DndParser(TREE_1)
        self.tree2 = DndParser(TREE_2)
        self.tree3 = DndParser(TREE_3)
        self.tree4 = DndParser(TREE_4)
        self.tree5 = DndParser(TREE_5)
        self.tree6 = DndParser(TREE_6)
        self.tree7 = DndParser(TREE_7)
        self.tree8 = DndParser(TREE_8)
        self.tree9 = DndParser(TREE_9)
コード例 #45
0
# For every FASTA alignment: make sure each taxon in taxa_names is present
# (absent ones are padded with all-"N" sequences), restrict the alignment to
# those taxa, and print its variable positions.
# NOTE(review): list_of_genes is not referenced again in the visible part of
# this script -- confirm whether later code fills it.
list_of_genes=[]
for fasta in fasta_files[:]:
	print "_" * 50
	print "processing file:", fasta
	aln = LoadSeqs(fasta, moltype=DNA)
	list_sequences = aln.Names
	# Check if all taxa that are specified in the control file exist in the alignment
	list_check = set(taxa_names).issubset(set(list_sequences))
	# If there are some taxa not present in alignment the following code will simulate a sequence full of "N" of the correct length for those missing taxa and add it to the alignment
	if list_check == False:
		# Collect the requested taxa that have no sequence in this alignment
		missing_elements = []
		for element in taxa_names:
			if element not in list_sequences:
				missing_elements.append(element)
		print "These taxa are missing in alignment:", missing_elements, "\nSequences for missing taxa will be generated only containing \"N\"."
		seq = Alignment(aln)
		string_list = seq.todict().values()
		# After this loop length_alignment holds the length of the last
		# sequence visited; it stays "" if there are no sequences at all.
		length_alignment = ""
		for element in string_list:
			length_alignment = len(element)
		# Build one (name, "NNN...") pair per missing taxon
		simulated_seq = []
		for element in missing_elements:
			fake_aln = "N" * length_alignment
			simulated_seq.append((element,fake_aln))
		# Note: no moltype is passed here, unlike the LoadSeqs call above
		fake_seqs = LoadSeqs(data = simulated_seq)
		aln = aln.addSeqs(fake_seqs)
	# Apply filter of user-set taxa names to be used for snp-extraction
	edited_alignment = aln.takeSeqs(sorted(taxa_names))
	# Get the variable positions for each fasta file
	var_pos_list = variable_positions(edited_alignment)
	print var_pos_list
コード例 #46
0
ファイル: mafft.py プロジェクト: ElDeveloper/brokit
def add_seqs_to_alignment(seqs, aln, moltype, params=None, accurate=False):
    """Returns an Alignment object from seqs and existing Alignment.

    seqs: a cogent.core.sequence.Sequence object, or data that can be used
    to build one.

    aln: an cogent.core.alignment.Alignment object, or data that can be used
    to build one

    moltype: a MolType object; its upper-cased label selects the Mafft
    parameter to turn on via MOLTYPE_MAP.

    params: dict of parameters to pass in to the Mafft app controller.

    accurate: if True, turns on --globalpair and sets --maxiterate to 1000
    (more accurate alignment, sacrificing performance).

    The existing alignment is written to a temporary file and passed to
    Mafft as the --seed alignment; that file and the result files are
    removed before returning, also when an error occurs along the way.
    """
    #create SequenceCollection object from seqs
    seq_collection = SequenceCollection(seqs,MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs
    seq_int_map, seq_int_keys = seq_collection.getIntMap()
    #Create SequenceCollection from int_map.
    seq_int_map = SequenceCollection(seq_int_map,MolType=moltype)

    #create Alignment object from aln
    aln = Alignment(aln,MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs
    aln_int_map, aln_int_keys = aln.getIntMap(prefix='seqn_')
    #Create Alignment from int_map.
    aln_int_map = Alignment(aln_int_map,MolType=moltype)

    #Update seq_int_keys with aln_int_keys
    seq_int_keys.update(aln_int_keys)

    #Create Mafft app.
    app = Mafft(InputHandler='_input_as_multiline_string',\
        params=params,
        SuppressStderr=True)

    #Turn on correct moltype
    moltype_string = moltype.label.upper()
    app.Parameters[MOLTYPE_MAP[moltype_string]].on()

    #Do not report progress
    app.Parameters['--quiet'].on()

    #More accurate alignment, sacrificing performance.
    if accurate:
        app.Parameters['--globalpair'].on()
        app.Parameters['--maxiterate'].Value=1000

    #Add aln_int_map as seed alignment.  This creates a temp file, so it is
    #done last and everything after it runs under try/finally.
    app.Parameters['--seed'].on(\
        app._tempfile_as_multiline_string(aln_int_map.toFasta()))
    try:
        #Get results using int_map as input to app
        res = app(seq_int_map.toFasta())
        try:
            #Get alignment as dict out of results
            alignment = dict(parse_fasta(res['StdOut']))

            #Make new dict mapping original IDs; labels may carry a
            #'_seed_' marker, which is removed before the lookup
            new_alignment = {}
            for k,v in alignment.items():
                key = k.replace('_seed_','')
                new_alignment[seq_int_keys[key]]=v
            #Create an Alignment object from alignment dict
            return Alignment(new_alignment,MolType=moltype)
        finally:
            #Clean up result files
            res.cleanUp()
    finally:
        #Remove the temporary seed alignment file
        remove(app.Parameters['--seed'].Value)
コード例 #47
0
ファイル: clearcut.py プロジェクト: cxhernandez/pycogent
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None,\
    working_dir='/tmp'):
    """Returns a tree from Alignment object aln.

    aln: an cogent.core.alignment.Alignment object, or data that can be used
    to build one.
        -  Clearcut only accepts aligned sequences.  Alignment object used to
        handle unaligned sequences.

    moltype: a cogent.core.moltype object.
        - NOTE: If moltype = RNA, we must convert to DNA since Clearcut v1.0.8
        gives incorrect results if RNA is passed in.  'U' is treated as an
        incorrect character and is excluded from distance calculations.

    best_tree: if True (default:False), uses a slower but more accurate
    algorithm to build the tree.

    params: dict of parameters to pass in to the Clearcut app controller.
        The dict is copied before '--out' is added, so neither the caller's
        dict nor a shared default is modified.

    working_dir: directory used for the temporary output file and as the
    app controller's WorkingDir.

    The result will be an cogent.core.tree.PhyloNode object parsed from the
    application's 'Tree' output.
    """
    # Private copy: '--out' must not leak into the caller's dict
    params = dict(params) if params else {}
    params['--out'] = get_tmp_filename(working_dir)

    # Create instance of app controller, enable tree, disable alignment
    app = Clearcut(InputHandler='_input_as_multiline_string', params=params, \
                   WorkingDir=working_dir, SuppressStdout=True,\
                   SuppressStderr=True)
    #Input is an alignment
    app.Parameters['-a'].on()
    #Turn off input as distance matrix
    app.Parameters['-d'].off()

    #If moltype = RNA, we must convert to DNA.
    if moltype == RNA:
        moltype = DNA

    if best_tree:
        app.Parameters['-N'].on()

    #Turn on correct moltype
    moltype_string = moltype.label.upper()
    app.Parameters[MOLTYPE_MAP[moltype_string]].on()

    # Setup mapping. Clearcut clips identifiers. We will need to remap them.
    # Clearcut only accepts aligned sequences.  Let Alignment object handle
    # unaligned sequences.
    seq_aln = Alignment(aln,MolType=moltype)
    #get int mapping
    int_map, int_keys = seq_aln.getIntMap()
    #create new Alignment object with int_map
    int_map = Alignment(int_map)

    # Collect result
    result = app(int_map.toFasta())

    # Clean up the result files even if the tree cannot be parsed/remapped
    try:
        # Build tree and restore the original sequence names on the tips
        tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
        for node in tree.tips():
            node.Name = int_keys[node.Name]
    finally:
        result.cleanUp()

    return tree
コード例 #48
0
ファイル: align_seqs.py プロジェクト: EESI/FizzyQIIME
 def __call__(self, seq_path, result_path=None, log_path=None, \
     failure_path=None, cmbuild_params=None, cmalign_params=None):
     """Align the sequences in seq_path against the template alignment.

     seq_path: path to a FASTA file of candidate sequences.
     result_path: if given, the alignment is written there as FASTA and
         None is returned; otherwise the Alignment object is returned.
     log_path: if given, the cmbuild/cmalign parameters are written there.
     failure_path: accepted but not used by this method.
     cmbuild_params, cmalign_params: optional dicts of extra parameters.
         They are copied before the forced options are added, so the
         caller's dicts are left untouched.

     Raises ValueError if parsing the template raises RecordError.
     """
     log_params = []
     # load candidate sequences (close the file once parsed)
     with open(seq_path,'U') as seq_file:
         candidate_sequences = dict(MinimalFastaParser(seq_file))

     # load template sequences; only the first record is used
     try:
         with open(self.Params['template_filepath'],'U') as template_file:
             info, template_alignment, struct = list(MinimalRfamParser(\
                 template_file,\
                 seq_constructor=ChangedSequence))[0]
     except RecordError:
         raise ValueError("Template alignment must be in Stockholm format with corresponding secondary structure annotation when using InfernalAligner.")

     moltype = self.Params['moltype']

     #Need to make separate mapping for unaligned sequences
     unaligned = SequenceCollection(candidate_sequences,MolType=moltype)
     int_map, int_keys = unaligned.getIntMap(prefix='unaligned_')
     int_map = SequenceCollection(int_map,MolType=moltype)

     #Turn on --gapthresh option in cmbuild to force alignment to full model
     #(work on a copy so the caller's dict is not modified)
     if cmbuild_params is None:
         cmbuild_params = {}
     else:
         cmbuild_params = dict(cmbuild_params)
     cmbuild_params.update({'--gapthresh':1.0})

     #record cmbuild parameters
     log_params.append('cmbuild parameters:')
     log_params.append(str(cmbuild_params))

     #Turn on --sub option in Infernal, since we know the unaligned sequences
     # are fragments.
     #Also turn on --gapthresh to use same gapthresh as was used to build
     # model
     if cmalign_params is None:
         cmalign_params = {}
     else:
         cmalign_params = dict(cmalign_params)
     cmalign_params.update({'--sub':True,'--gapthresh':1.0})

     #record cmalign parameters
     log_params.append('cmalign parameters:')
     log_params.append(str(cmalign_params))

     #Align sequences to alignment including alignment gaps.
     aligned, struct_string = cmalign_from_alignment(aln=template_alignment,\
         structure_string=struct,\
         seqs=int_map,\
         moltype=moltype,\
         include_aln=True,\
         params=cmalign_params,\
         cmbuild_params=cmbuild_params)

     #Pull out original sequences from full alignment.
     infernal_aligned={}
     aligned_dict = aligned.NamedSeqs
     for key in int_map.Names:
         infernal_aligned[int_keys.get(key,key)]=aligned_dict[key]

     #Create an Alignment object from alignment dict
     infernal_aligned = Alignment(infernal_aligned,MolType=moltype)

     if log_path is not None:
         with open(log_path,'w') as log_file:
             log_file.write('\n'.join(log_params))

     if result_path is not None:
         with open(result_path,'w') as result_file:
             result_file.write(infernal_aligned.toFasta())
         return None

     #No result file requested: hand the alignment back to the caller
     return infernal_aligned
コード例 #49
0
class AllTests(TestCase):
    """Consistency tests between Alignment and DenseAlignment.

    The fixtures hold the same three RNA sequences twice: as RnaSequence
    objects inside an Alignment and as ModelSequence objects inside a
    DenseAlignment, once with explicit names and once without names.
    """

    def setUp(self):
        """setUp method for all tests"""
        # named sequences
        self.rna1 = RnaSequence('UCAGGG', Name='rna1')
        self.rna2 = RnaSequence('YCU-RG', Name='rna2')
        self.rna3 = RnaSequence('CAA-NR', Name='rna3')
        self.model1 = ModelSequence('UCAGGG', Name='rna1',
            Alphabet=RNA.Alphabets.DegenGapped)
        self.model2 = ModelSequence('YCU-RG', Name='rna2',
            Alphabet=RNA.Alphabets.DegenGapped)
        self.model3 = ModelSequence('CAA-NR', Name='rna3',
            Alphabet=RNA.Alphabets.DegenGapped)

        self.aln = Alignment([self.rna1, self.rna2, self.rna3], MolType=RNA)
        self.da = DenseAlignment([self.model1, self.model2, self.model3],
            MolType=RNA, Alphabet=RNA.Alphabets.DegenGapped)

        # seqs no name
        self.nn_rna1 = RnaSequence('UCAGGG')
        self.nn_rna2 = RnaSequence('YCU-RG')
        self.nn_rna3 = RnaSequence('CAA-NR')

        self.nn_model1 = ModelSequence('UCAGGG',
            Alphabet=RNA.Alphabets.DegenGapped)
        self.nn_model2 = ModelSequence('YCU-RG',
            Alphabet=RNA.Alphabets.DegenGapped)
        self.nn_model3 = ModelSequence('CAA-NR',
            Alphabet=RNA.Alphabets.DegenGapped)

        self.nn_aln = Alignment([self.nn_rna1, self.nn_rna2, self.nn_rna3],
            MolType=RNA)
        self.nn_da = DenseAlignment([self.nn_model1, self.nn_model2,
            self.nn_model3], MolType=RNA, Alphabet=RNA.Alphabets.DegenGapped)

    def test_printing_named_seqs(self):
        """Printing named seqs should work the same on Aln and DenseAln"""
        # Note: the newline trailing each sequence is intentional, because
        # we want each FASTA-format record to be separated.
        exp_lines_general = ['>rna1', 'UCAGGG', '>rna2', 'YCU-RG',
            '>rna3', 'CAA-NR']
        self.assertEqual(str(self.aln), '\n'.join(exp_lines_general) + '\n')
        self.assertEqual(str(self.da), '\n'.join(exp_lines_general) + '\n')

    def test_printing_unnamed_seqs(self):
        """Printing unnamed sequences should work the same on Aln and DenseAln
        """
        # Unnamed sequences are expected to be labelled seq_0, seq_1, ...
        # The trailing newline is appended the same way as in the
        # named-sequence test above.
        exp_lines_gen = ['>seq_0', 'UCAGGG', '>seq_1', 'YCU-RG',
            '>seq_2', 'CAA-NR']
        self.assertEqual(str(self.nn_aln), '\n'.join(exp_lines_gen) + '\n')
        self.assertEqual(str(self.nn_da), '\n'.join(exp_lines_gen) + '\n')

    def test_DenseAlignment_without_moltype(self):
        """Expect MolType to be picked up from the sequences."""
        m1 = ModelSequence('UCAG', Alphabet=RNA.Alphabets.DegenGapped,
            Name='rna1')
        m2 = ModelSequence('CCCR', Alphabet=RNA.Alphabets.DegenGapped,
            Name='rna2')
        da = DenseAlignment([m1, m2])
        exp_lines = ['>rna1', 'UCAG', '>rna2', 'CCCR']
        self.assertEqual(str(da), '\n'.join(exp_lines) + '\n')

    def test_names(self):
        """Names should be the same on Alignment and DenseAlignment"""
        self.assertEqual(self.aln.Names, ['rna1', 'rna2', 'rna3'])
        self.assertEqual(self.da.Names, ['rna1', 'rna2', 'rna3'])
        # On unnamed sequences the behavior is now the same.
        self.assertEqual(self.nn_aln.Names, ['seq_0', 'seq_1', 'seq_2'])
        self.assertEqual(self.nn_da.Names, ['seq_0', 'seq_1', 'seq_2'])

    def test_seqFreqs(self):
        """seqFreqs should work the same on Alignment and DenseAlignment"""
        # Used alphabet: ('U', 'C', 'A', 'G', '-', 'B', 'D', 'H',
        # 'K', 'M', 'N', 'S', 'R', 'W', 'V', 'Y')
        # NOTE(review): each expected row has 17 columns, one more than the
        # 16 symbols listed above; presumably the alphabet ends with an extra
        # missing-data symbol -- confirm against RNA.Alphabets.DegenGapped.
        exp = [[1, 1, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0],
            [0, 1, 2, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0]]
        # This works
        self.assertEqual(self.da.getSeqFreqs().Data, exp)
        # This used to raise an error, but now works
        self.assertEqual(self.aln.getSeqFreqs().Data, exp)

    def test_subset_positions_DenseAlignment(self):
        """takePositions/getSubAlignment should subset DenseAlignment columns
        """
        model1 = ModelSequence('UCG', Name='rna1',
            Alphabet=RNA.Alphabets.DegenGapped)
        model2 = ModelSequence('YCG', Name='rna2',
            Alphabet=RNA.Alphabets.DegenGapped)
        model3 = ModelSequence('CAR', Name='rna3',
            Alphabet=RNA.Alphabets.DegenGapped)
        sub_da = DenseAlignment([model1, model2, model3],
            MolType=RNA, Alphabet=RNA.Alphabets.DegenGapped)

        # Expected integer encodings: one row per sequence; sub_data holds
        # columns 0, 1 and 5 of full_data.
        full_data = array([[0, 1, 2, 3, 3, 3], [15, 1, 0, 4, 12, 3],
            [1, 2, 2, 4, 10, 12]])
        sub_data = array([[0, 1, 3], [15, 1, 3], [1, 2, 12]])

        # First check some data
        self.assertEqual(self.da.ArraySeqs, full_data)
        self.assertEqual(self.da.ArrayPositions, transpose(full_data))
        self.assertEqual(sub_da.ArraySeqs, sub_data)
        self.assertEqual(sub_da.ArrayPositions, transpose(sub_data))

        obs_sub_da_TP = self.da.takePositions([0, 1, 5])
        obs_sub_da_SA = self.da.getSubAlignment(pos=[0, 1, 5])

        # When using the getSubAlignment method the data is right
        self.assertEqual(obs_sub_da_SA, sub_da)
        self.assertNotEqual(obs_sub_da_SA, self.da)
        self.assertEqual(obs_sub_da_SA.ArraySeqs, sub_data)
        self.assertEqual(obs_sub_da_SA.ArrayPositions, transpose(sub_data))

        # The same checks for takePositions.
        # NOTE(review): the original author asked whether __eq__ compares
        # the underlying arrays at all, so the data is checked explicitly.
        self.assertEqual(obs_sub_da_TP, sub_da)
        self.assertNotEqual(obs_sub_da_TP, self.da)
        self.assertEqual(obs_sub_da_TP.ArraySeqs, sub_data)
        self.assertEqual(obs_sub_da_TP.ArrayPositions, transpose(sub_data))

    def test_subset_positions_Alignment(self):
        """takePositions should subset the columns of an Alignment"""
        rna1 = RnaSequence('UCG', Name='rna1')
        rna2 = RnaSequence('YCG', Name='rna2')
        rna3 = RnaSequence('CAR', Name='rna3')

        sub_aln = Alignment([rna1, rna2, rna3], MolType=RNA)

        obs_sub_aln = self.aln.takePositions([0, 1, 5])
        self.assertEqual(obs_sub_aln, sub_aln)
        self.assertNotEqual(obs_sub_aln, self.aln)
        # String representations should be the same. This was reported as
        # failing while sequence order was not maintained; see
        # test_takePositions_sequence_order.
        self.assertEqual(str(obs_sub_aln), str(sub_aln))

    def test_takePositions_sequence_order(self):
        """Alignment takePositions should maintain seq order"""
        # This works
        self.assertEqual(self.da.Names, ['rna1', 'rna2', 'rna3'])
        sub_da = self.da.getSubAlignment(pos=[0, 1, 5])
        self.assertEqual(sub_da.Names, ['rna1', 'rna2', 'rna3'])
        # Sequence order must be maintained in Alignment as well
        self.assertEqual(self.aln.Names, ['rna1', 'rna2', 'rna3'])
        sub_aln = self.aln.takePositions([0, 1, 5])
        self.assertEqual(sub_aln.Names, ['rna1', 'rna2', 'rna3'])

    def test_subset_seqs_Alignment(self):
        """takeSeqs should subset an Alignment, honouring the given order"""
        rna1 = RnaSequence('UCG', Name='rna1')
        rna2 = RnaSequence('YCG', Name='rna2')
        rna3 = RnaSequence('CAR', Name='rna3')

        sub_aln = Alignment([rna2, rna3], MolType=RNA)
        aln = Alignment([rna1, rna2, rna3], MolType=RNA)
        obs_sub_aln = aln.takeSeqs(['rna2', 'rna3'])

        self.assertEqual(obs_sub_aln, sub_aln)
        self.assertEqual(str(obs_sub_aln), str(sub_aln))

        # Selected sequences should come back in the requested order, so
        # the two string representations must differ.
        obs_sub_aln_1 = self.aln.takeSeqs(['rna3', 'rna2'])
        obs_sub_aln_2 = self.aln.takeSeqs(['rna2', 'rna3'])
        self.assertNotEqual(str(obs_sub_aln_1), str(obs_sub_aln_2))

    def test_subset_seqs_DenseAlignment(self):
        """takeSeqs by name should match getSubAlignment by seq index"""
        obs_sub_da_TS = self.da.takeSeqs(['rna1'])
        obs_sub_da_SA = self.da.getSubAlignment(seqs=[0])

        # These two are now the same. Fixed mapping of key to char array.
        self.assertEqual(obs_sub_da_TS, obs_sub_da_SA)
        self.assertEqual(str(obs_sub_da_TS), str(obs_sub_da_SA))

    def test_aln_equality(self):
        """DenseAlignment == should ignore seq order and MolType"""
        self.assertEqual(self.da == self.da, True)
        # one sequence less
        other_da1 = DenseAlignment([self.model1, self.model2],
            MolType=RNA, Alphabet=RNA.Alphabets.DegenGapped)
        self.assertEqual(self.da == other_da1, False)
        # seqs in different order -- doesn't matter
        other_da2 = DenseAlignment([self.model1, self.model3, self.model2],
            MolType=RNA, Alphabet=RNA.Alphabets.DegenGapped)
        self.assertEqual(self.da == other_da2, True)
        # built without explicit MolType/Alphabet -- doesn't matter either
        other_da3 = DenseAlignment([self.model1, self.model2, self.model3])
        # NOTE(review): open question from the original author -- should
        # this compare False because the MolType differs even though the
        # data is exactly the same?
        self.assertEqual(self.da == other_da3, True)
        # Element-wise check of the underlying arrays. A TestCase assertion
        # is used instead of a bare ``assert`` (stripped under -O), and
        # ``.all()`` instead of ``alltrue(map(...))``, which is vacuous
        # when map() returns a lazy iterator.
        self.assertTrue((self.da.ArraySeqs == other_da3.ArraySeqs).all())

    def test_seq_equality(self):
        """ModelSequences with the same data and name should compare equal"""
        model1 = ModelSequence('UCG', Name='rna1',
            Alphabet=RNA.Alphabets.DegenGapped)
        model2 = ModelSequence('UCG', Name='rna1',
            Alphabet=RNA.Alphabets.DegenGapped)
        self.assertEqual(model1, model2)
        # string comparison is True
        self.assertEqual(str(model1), str(model2))

    def test_seq_ungapping(self):
        """degap should strip gaps from RnaSequence and ModelSequence"""
        rna1 = RnaSequence('U-C-A-G-', Name='rna1')
        model1 = ModelSequence('U-C-A-G-', Name='rna1',
            Alphabet=RNA.Alphabets.DegenGapped)

        self.assertEqual(rna1, 'U-C-A-G-')
        self.assertEqual(rna1.degap(), 'UCAG')

        # check it produces the right string from the beginning
        self.assertEqual(str(model1), 'U-C-A-G-')
        # the gap character is encoded as 4 in the expected data
        self.assertEqual(model1._data, [0, 4, 1, 4, 2, 4, 3, 4])
        # ModelSequence should maybe have the same degap method as normal Seq
        self.assertEqual(str(model1.degap()), 'UCAG')

    def test_the_rest_of_ModelSequence(self):
        """The class ModelSequence has 14 methods, but only 2 unittests.
        You might want to add some tests there..."""
        # note: mostly these are tested in derived classes, for convenience.
        pass
コード例 #50
0
class ParsInsertTests(TestCase):
    def setUp(self):
        """Load the query seqs and write the reference FASTA and tree files"""
        # bookkeeping lists that tearDown walks to remove files/directories
        self._dirs_to_clean_up = []
        self._paths_to_clean_up = []

        # query sequences are parsed from the QUERY_SEQS text
        query_lines = QUERY_SEQS.split()
        self.seqs = Alignment(MinimalFastaParser(query_lines))

        # temp filename under /tmp; its stem names both reference files
        self.outfile = get_tmp_filename('/tmp')
        stem = splitext(self.outfile)[0]

        # reference sequence file
        self.outfasta = stem + '.fasta'
        ref_handle = open(self.outfasta, 'w')
        ref_handle.write(REF_SEQS)
        ref_handle.close()
        self._paths_to_clean_up.append(self.outfasta)

        # starting tree file
        self.outtree = stem + '.tree'
        tree_handle = open(self.outtree, 'w')
        tree_handle.write(REF_TREE)
        tree_handle.close()
        self._paths_to_clean_up.append(self.outtree)
    
    def tearDown(self): 
        """cleans up all files initially created"""
        # remove the tempdir and contents
        map(remove,self._paths_to_clean_up)
        map(rmdir,self._dirs_to_clean_up)
    
    def test_base_command(self):
        """BaseCommand should cd into the current dir, then call ParsInsert"""
        app = ParsInsert()
        expected_cmd = 'cd "' + getcwd() + '/"; ' + 'ParsInsert'
        self.assertEqual(app.BaseCommand, expected_cmd)
        
    def test_change_working_dir(self):
        """BaseCommand should cd into the WorkingDir given at construction"""
        app = ParsInsert(WorkingDir='/tmp/ParsInsertTest')
        try:
            self.assertEqual(app.BaseCommand,
                             ''.join(['cd "', '/tmp/ParsInsertTest',
                                      '/"; ', 'ParsInsert']))
        finally:
            # Always remove the working directory, even when the assertion
            # fails, so a failing run does not leave it behind in /tmp.
            rmtree('/tmp/ParsInsertTest')

    def test_insert_sequences_into_tree(self):
        """insert_sequences_into_tree should add the query seqs to the tree"""
        # log file path, registered so tearDown removes it
        log_fp = '/tmp/parsinsert.log'
        self._paths_to_clean_up.append(log_fp)

        # tax assignment values file path, registered for removal as well
        tax_assign_fp = '/tmp/tax_assignments.log'
        self._paths_to_clean_up.append(tax_assign_fp)

        # options: starting tree, reference alignment, log and output paths
        parsinsert_opts = {'-t': self.outtree,
                           '-s': self.outfasta,
                           '-l': log_fp,
                           '-o': tax_assign_fp}

        # toPhylip also returns the map used below to restore query names
        phylip_seqs, phylip_to_query = self.seqs.toPhylip()

        # insert sequences into tree
        result_tree = insert_sequences_into_tree(phylip_seqs, DNA,
                                                 params=parsinsert_opts)

        # rename tips back to query names; other tips are left untouched
        for tip in result_tree.tips():
            if tip.Name not in phylip_to_query:
                continue
            tip.Name = phylip_to_query[tip.Name]

        observed_newick = result_tree.getNewick(with_distances=True)
        self.assertEqual(observed_newick, exp_tree)