def setUp(self):
    """Build the RNA fixtures shared by all tests.

    Two parallel families are created: sequences carrying a Name
    (``rna*``, ``model*``, ``aln``, ``da``) and the same sequences built
    without a name (``nn_*``).
    """
    degen_gapped = RNA.Alphabets.DegenGapped

    # sequences that carry explicit names
    self.rna1 = RnaSequence('UCAGGG', Name='rna1')
    self.rna2 = RnaSequence('YCU-RG', Name='rna2')
    self.rna3 = RnaSequence('CAA-NR', Name='rna3')
    self.model1 = ModelSequence('UCAGGG', Name='rna1',
                                Alphabet=degen_gapped)
    self.model2 = ModelSequence('YCU-RG', Name='rna2',
                                Alphabet=degen_gapped)
    self.model3 = ModelSequence('CAA-NR', Name='rna3',
                                Alphabet=degen_gapped)
    named_rnas = [self.rna1, self.rna2, self.rna3]
    self.aln = Alignment(named_rnas, MolType=RNA)
    named_models = [self.model1, self.model2, self.model3]
    self.da = DenseAlignment(named_models, MolType=RNA,
                             Alphabet=degen_gapped)

    # the same sequences, built without names
    self.nn_rna1 = RnaSequence('UCAGGG')
    self.nn_rna2 = RnaSequence('YCU-RG')
    self.nn_rna3 = RnaSequence('CAA-NR')
    self.nn_model1 = ModelSequence('UCAGGG', Alphabet=degen_gapped)
    self.nn_model2 = ModelSequence('YCU-RG', Alphabet=degen_gapped)
    self.nn_model3 = ModelSequence('CAA-NR', Alphabet=degen_gapped)
    unnamed_rnas = [self.nn_rna1, self.nn_rna2, self.nn_rna3]
    self.nn_aln = Alignment(unnamed_rnas, MolType=RNA)
    unnamed_models = [self.nn_model1, self.nn_model2, self.nn_model3]
    self.nn_da = DenseAlignment(unnamed_models, MolType=RNA,
                                Alphabet=degen_gapped)
def setUp(self):
    """Load the query alignment and write the reference FASTA/tree files.

    Each file written here is appended to ``self._paths_to_clean_up``
    (presumably consumed by the test case's tearDown - not visible here).
    """
    # create a list of files to cleanup
    self._paths_to_clean_up = []
    self._dirs_to_clean_up = []

    # load query seqs
    self.seqs = Alignment(MinimalFastaParser(QUERY_SEQS.split()))

    # generate temp filename
    tmp_dir = '/tmp'
    self.outfile = get_tmp_filename(tmp_dir)
    base_path = splitext(self.outfile)[0]

    # create and write out reference sequence file; the context manager
    # closes the handle even if the write raises
    self.outfasta = base_path + '.fasta'
    with open(self.outfasta, 'w') as fastaout:
        fastaout.write(REF_SEQS)
    self._paths_to_clean_up.append(self.outfasta)

    # create and write out starting tree file
    self.outtree = base_path + '.tree'
    with open(self.outtree, 'w') as treeout:
        treeout.write(REF_TREE)
    self._paths_to_clean_up.append(self.outtree)
def add_seqs_to_alignment(seqs, aln, moltype, params=None):
    """Returns an Alignment object from seqs and existing Alignment.

    seqs: a cogent.core.alignment.SequenceCollection object, or data that
        can be used to build one.
    aln: a cogent.core.alignment.Alignment object, or data that can be
        used to build one.
    moltype: MolType passed to every collection/alignment built here.
    params: dict of parameters to pass in to the Clustal app controller.

    The temporary profile files and the app result are cleaned up even
    when the Clustalw run or the parsing of its output raises.
    """
    # create SequenceCollection object from seqs
    seq_collection = SequenceCollection(seqs, MolType=moltype)
    # Create mapping between abbreviated IDs and full IDs
    seq_int_map, seq_int_keys = seq_collection.getIntMap()
    # Create SequenceCollection from int_map.
    seq_int_map = SequenceCollection(seq_int_map, MolType=moltype)

    # create Alignment object from aln
    aln = Alignment(aln, MolType=moltype)
    # Create mapping between abbreviated IDs and full IDs; a distinct
    # prefix keeps these IDs from colliding with the ones made for seqs
    aln_int_map, aln_int_keys = aln.getIntMap(prefix="seqn_")
    # Create Alignment from int_map.
    aln_int_map = Alignment(aln_int_map, MolType=moltype)

    # Update seq_int_keys with aln_int_keys
    seq_int_keys.update(aln_int_keys)

    # Create Clustalw app.
    app = Clustalw(InputHandler="_input_as_multiline_string",
                   params=params, SuppressStderr=True)
    app.Parameters["-align"].off()
    app.Parameters["-infile"].off()
    app.Parameters["-sequences"].on()

    temp_paths = []
    try:
        # aln_int_map goes in as profile1, seq_int_map as profile2
        for flag, profile in (("-profile1", aln_int_map),
                              ("-profile2", seq_int_map)):
            path = app._tempfile_as_multiline_string(profile.toFasta())
            temp_paths.append(path)
            app.Parameters[flag].on(path)

        # Get results; the input is supplied through the profile files
        res = app()
        try:
            # Get alignment as dict out of results
            alignment = dict(ClustalParser(res["Align"].readlines()))
        finally:
            res.cleanUp()
    finally:
        # Clean up the temporary profile files whatever happened above
        for path in temp_paths:
            remove(path)

    # Make new dict mapping original IDs
    new_alignment = {}
    for k, v in alignment.items():
        new_alignment[seq_int_keys[k]] = v
    # Create an Alignment object from alignment dict
    return Alignment(new_alignment, MolType=moltype)
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Returns a tree from Alignment object aln.

    aln: an xxx.Alignment object, or data that can be used to build one.

    moltype: cogent.core.moltype.MolType object

    best_tree: best_tree support is currently not implemented; passing a
        true value raises NotImplementedError.

    params: dict of parameters to pass in to the RAxML app controller.
        The dict is copied, so neither the caller's dict nor a shared
        default is modified by the entries added here.

    The result is the tree parsed from the RAxML 'Bootstrap' output, built
    from PhyloNode objects, with tip names mapped back to the original
    sequence names.

    Raises ValueError when no '-m' model is supplied and moltype is not
    DNA, RNA or PROTEIN.
    """
    if best_tree:
        raise NotImplementedError

    # work on a private copy: the entries set below must not leak into
    # later calls or into the caller's dict
    params = dict(params) if params else {}

    if '-m' not in params:
        if moltype == DNA or moltype == RNA:
            # in version 7.2.3, GTRMIX is no longer supported but says GTRCAT
            # behaves like GTRMIX (http://www.phylo.org/tools/raxmlhpc2.html)
            params["-m"] = 'GTRGAMMA'
        elif moltype == PROTEIN:
            # NOTE(review): 'matrixName' looks like an unfilled placeholder
            # for a substitution matrix name - confirm against RAxML docs
            params["-m"] = 'PROTGAMMAmatrixName'
        else:
            raise ValueError("Moltype must be either DNA, RNA, or PROTEIN")

    if not hasattr(aln, 'toPhylip'):
        aln = Alignment(aln)
    seqs, align_map = aln.toPhylip()

    # generate temp filename for output
    params["-w"] = "/tmp/"
    params["-n"] = get_tmp_filename().split("/")[-1]
    params["-k"] = True
    # presumably random seeds for RAxML - TODO confirm flag semantics
    params["-p"] = randint(1, 100000)
    params["-x"] = randint(1, 100000)

    ih = '_input_as_multiline_string'

    raxml_app = Raxml(params=params,
                      InputHandler=ih,
                      WorkingDir=None,
                      SuppressStderr=True,
                      SuppressStdout=True)

    raxml_result = raxml_app(seqs)

    tree = DndParser(raxml_result['Bootstrap'], constructor=PhyloNode)

    # toPhylip renamed the sequences; restore the original names on tips
    for node in tree.tips():
        node.Name = align_map[node.Name]

    raxml_result.cleanUp()

    return tree
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Returns a tree from Alignment object aln.

    aln: an xxx.Alignment object, or data that can be used to build one.

    moltype: cogent.core.moltype.MolType object

    best_tree: best_tree support is currently not implemented; a true
        value raises NotImplementedError.

    params: dict of parameters to pass in to the RAxML app controller.
        A copy is taken before any entry is added, so the caller's dict
        (and the default) are left untouched.

    Returns the PhyloNode tree parsed from the RAxML 'Bootstrap' output,
    with tips renamed to the original sequence names.

    Raises ValueError if '-m' is not given and moltype is none of DNA,
    RNA, PROTEIN.
    """
    if best_tree:
        raise NotImplementedError

    # never mutate the argument: the values set below are per-call
    params = dict(params) if params else {}

    if '-m' not in params:
        if moltype == DNA or moltype == RNA:
            # in version 7.2.3, GTRMIX is no longer supported but says GTRCAT
            # behaves like GTRMIX (http://www.phylo.org/tools/raxmlhpc2.html)
            params["-m"] = 'GTRGAMMA'
        elif moltype == PROTEIN:
            # NOTE(review): 'matrixName' reads like a placeholder that was
            # never substituted - verify the intended model string
            params["-m"] = 'PROTGAMMAmatrixName'
        else:
            raise ValueError("Moltype must be either DNA, RNA, or PROTEIN")

    if not hasattr(aln, 'toPhylip'):
        aln = Alignment(aln)
    seqs, align_map = aln.toPhylip()

    # generate temp filename for output
    params["-w"] = "/tmp/"
    params["-n"] = get_tmp_filename().split("/")[-1]
    params["-k"] = True
    # presumably random seeds for RAxML - TODO confirm flag semantics
    params["-p"] = randint(1, 100000)
    params["-x"] = randint(1, 100000)

    raxml_app = Raxml(params=params,
                      InputHandler='_input_as_multiline_string',
                      WorkingDir=None,
                      SuppressStderr=True,
                      SuppressStdout=True)
    raxml_result = raxml_app(seqs)

    tree = DndParser(raxml_result['Bootstrap'], constructor=PhyloNode)

    # map the phylip-safe tip names back to the original sequence names
    for node in tree.tips():
        node.Name = align_map[node.Name]

    raxml_result.cleanUp()

    return tree
def test_distance_matrix(self):
    """distance_matrix should obey Names of alignment"""
    # alignment used with the name order it already has
    expected_default = array([[0, 2, 2],
                              [2, 0, 1],
                              [2, 1, 0]])
    self.assertEqual(distance_matrix(self.aln1), expected_default)

    # same sequences, explicit (rotated) name order
    reordered = Alignment(self.aln1.NamedSeqs)
    reordered.Names = ['seq_1', 'seq_2', 'seq_0']
    expected_reordered = array([[0, 1, 2],
                                [1, 0, 2],
                                [2, 2, 0]])
    self.assertEqual(distance_matrix(reordered), expected_reordered)
def test_distance_matrix(self):
    """distance_matrix should obey Names of alignment"""
    # Case 1: Names left as they are on self.aln1
    exp_original_order = array([[0, 2, 2], [2, 0, 1], [2, 1, 0]])
    obs_original_order = distance_matrix(self.aln1)
    self.assertEqual(obs_original_order, exp_original_order)

    # Case 2: Names reassigned, so rows/columns must follow the new order
    renamed = Alignment(self.aln1.NamedSeqs)
    renamed.Names = ['seq_1', 'seq_2', 'seq_0']
    exp_new_order = array([[0, 1, 2], [1, 0, 2], [2, 2, 0]])
    obs_new_order = distance_matrix(renamed)
    self.assertEqual(obs_new_order, exp_new_order)
def dotur_from_alignment(aln, moltype, distance_function, params=None):
    """Returns dotur results given an alignment and distance function.

    - aln: An Alignment object or something that behaves like one.
        Sequences must be aligned.
    - moltype: cogent.core.moltype object.
    - distance_function: function that can be passed to distanceMatrix()
        method of SequenceCollection. Must be able to find distance
        between two sequences.
    - params: parameters handed to the Dotur app controller.

    - NOTE: This function will only return the parsed *.list file, as
        it contains the OTU identities.
        Dotur generates 23 output files, so if this is not the one you
        are looking for, check out the documentation and add the others
        to the result path.
    """
    # Wrap the input in an Alignment and swap names for integer-style IDs.
    aln = Alignment(aln, MolType=moltype)
    int_map, int_keys = aln.getIntMap()
    # Re-wrap the ID-keyed data so Alignment methods are available on it.
    int_map = Alignment(int_map, MolType=moltype)
    order = sorted(int_map.Names)

    # Distance matrix with rows and columns forced into the sorted order.
    distances = int_map.distanceMatrix(f=distance_function)
    distances.RowOrder = order
    distances.ColOrder = order

    # phylipMatrix needs string cells, so convert every row in place.
    rows = distances.toLists()
    for row_index, row in enumerate(rows):
        rows[row_index] = map(str, row)
    phylip_matrix_string = phylipMatrix(rows=rows, names=order)

    # Run Dotur in a scratch working directory.
    working_dir = get_tmp_filename(suffix='')
    app = Dotur(InputHandler='_input_as_multiline_string',
                WorkingDir=working_dir, params=params)
    res = app(phylip_matrix_string)
    otu_list = OtuListParser(res['List'].readlines())

    # Translate the integer-style IDs back into the original names.
    for otu in otu_list:
        otu[2] = remap_seq_names(otu[2], int_keys)

    shutil.rmtree(app.WorkingDir)

    return otu_list
def setUp(self):
    """Set up for Voronoi tests"""
    # alignments built from bare sequence lists
    self.aln1 = Alignment(['ABC', 'BCC', 'BAC'])

    # alignments built from dicts with an explicit name order
    four_seqs = {'seq1': 'GYVGS', 'seq2': 'GFDGF', 'seq3': 'GYDGF',
                 'seq4': 'GYQGG'}
    self.aln2 = Alignment(four_seqs,
                          Names=['seq1', 'seq2', 'seq3', 'seq4'])

    three_seqs = {'seq1': 'AA', 'seq2': 'AA', 'seq3': 'BB'}
    self.aln3 = Alignment(three_seqs, Names=['seq1', 'seq2', 'seq3'])

    five_seqs = {'seq1': 'AA', 'seq2': 'AA', 'seq3': 'BB',
                 'seq4': 'BB', 'seq5': 'CC'}
    self.aln4 = Alignment(
        five_seqs, Names=['seq1', 'seq2', 'seq3', 'seq4', 'seq5'])

    self.aln5 = Alignment(['ABBA', 'ABCA', 'CBCB'])
def align_two_alignments(aln1, aln2, moltype, params=None):
    """Returns an Alignment object from two existing Alignments.

    aln1, aln2: cogent.core.alignment.Alignment objects, or data that
        can be used to build them.
        - Mafft profile alignment only works with aligned sequences.
          Alignment object used to handle unaligned sequences.
    moltype: MolType used for every Alignment built here.
    params: dict of parameters to pass in to the Mafft app controller.
    """
    # First alignment: wrap, abbreviate IDs, re-wrap the ID-keyed data.
    first = Alignment(aln1, MolType=moltype)
    first_int_map, id_lookup = first.getIntMap()
    first_int_map = Alignment(first_int_map, MolType=moltype)

    # Second alignment: same treatment, with its own ID prefix.
    second = Alignment(aln2, MolType=moltype)
    second_int_map, second_lookup = second.getIntMap(prefix='seqn_')
    second_int_map = Alignment(second_int_map, MolType=moltype)

    # One lookup table covering the abbreviated IDs of both inputs.
    id_lookup.update(second_lookup)

    # Mafft controller, switched to the profile-alignment command.
    app = Mafft(InputHandler='_input_as_paths',
                params=params,
                SuppressStderr=False)
    app._command = 'mafft-profile'

    first_path = app._tempfile_as_multiline_string(first_int_map.toFasta())
    second_path = app._tempfile_as_multiline_string(
        second_int_map.toFasta())

    res = app([first_path, second_path])

    # Parse the FASTA output and restore the original sequence IDs.
    parsed = dict(parse_fasta(res['StdOut']))
    restored = {}
    for label, seq in parsed.items():
        # drop the '_seed_' marker found in output labels before lookup
        restored[id_lookup[label.replace('_seed_', '')]] = seq
    new_alignment = Alignment(restored, MolType=moltype)

    # Clean up the result, the temp inputs, and the 'pre'/'trace' files
    # removed from the current directory.
    res.cleanUp()
    for leftover in (first_path, second_path, 'pre', 'trace'):
        remove(leftover)

    return new_alignment
def align_two_alignments(aln1, aln2, moltype, params=None):
    """Returns an Alignment object from two existing Alignments.

    aln1, aln2: cogent.core.alignment.Alignment objects, or data that
        can be used to build them.
    moltype: MolType used for every Alignment built here.
    params: dict of parameters to pass in to the Clustal app controller.

    The temporary profile files and the app result are cleaned up even
    when the Clustalw run or the parsing of its output raises.
    """
    # create Alignment object from aln1
    aln1 = Alignment(aln1, MolType=moltype)
    # Create mapping between abbreviated IDs and full IDs
    aln1_int_map, aln1_int_keys = aln1.getIntMap()
    # Create Alignment from int_map.
    aln1_int_map = Alignment(aln1_int_map, MolType=moltype)

    # create Alignment object from aln2
    aln2 = Alignment(aln2, MolType=moltype)
    # Create mapping between abbreviated IDs and full IDs; a distinct
    # prefix keeps these IDs from colliding with those of aln1
    aln2_int_map, aln2_int_keys = aln2.getIntMap(prefix='seqn_')
    # Create Alignment from int_map.
    aln2_int_map = Alignment(aln2_int_map, MolType=moltype)

    # Update aln1_int_keys with aln2_int_keys
    aln1_int_keys.update(aln2_int_keys)

    # Create Clustalw app.
    app = Clustalw(InputHandler='_input_as_multiline_string',
                   params=params,
                   SuppressStderr=True)
    app.Parameters['-align'].off()
    app.Parameters['-infile'].off()
    app.Parameters['-profile'].on()

    temp_paths = []
    try:
        # aln1 goes in as profile1, aln2 as profile2
        for flag, profile in (('-profile1', aln1_int_map),
                              ('-profile2', aln2_int_map)):
            path = app._tempfile_as_multiline_string(profile.toFasta())
            temp_paths.append(path)
            app.Parameters[flag].on(path)

        # Get results; the input is supplied through the profile files
        res = app()
        try:
            # Get alignment as dict out of results
            alignment = dict(ClustalParser(res['Align'].readlines()))
        finally:
            res.cleanUp()
    finally:
        # Clean up the temporary profile files whatever happened above
        for path in temp_paths:
            remove(path)

    # Make new dict mapping original IDs
    new_alignment = {}
    for k, v in alignment.items():
        new_alignment[aln1_int_keys[k]] = v
    # Create an Alignment object from alignment dict
    return Alignment(new_alignment, MolType=moltype)
def insert_sequences_into_tree(aln, moltype, params=None, write_log=True):
    """Returns a placement tree built by running pplacer on aln.

    aln: alignment text; it is wrapped in StringIO and re-read through
        get_align_for_phylip before being converted to FASTA.

    moltype: accepted for interface compatibility; not used here.

    params: dict of parameters to pass in to the Pplacer app controller.
        Must contain '--out-dir', which is where the log file and the
        guppy output are written.

    write_log: when true, pplacer's StdOut is saved to a
        'log_pplacer_*' file inside params['--out-dir'].

    The result is whatever build_tree_from_json_using_params returns for
    pplacer's json output.
    """
    if params is None:
        params = {}

    # convert aln to phy since seq_names need fixed to run through pplacer
    new_aln = get_align_for_phylip(StringIO(aln))

    # convert aln to fasta in case it is not already a fasta file
    aln2 = Alignment(new_aln)
    seqs = aln2.toFasta()

    ih = '_input_as_multiline_string'

    pplacer_app = Pplacer(params=params,
                          InputHandler=ih,
                          WorkingDir=None,
                          SuppressStderr=False,
                          SuppressStdout=False)

    pplacer_result = pplacer_app(seqs)

    try:
        # write a log file
        if write_log:
            log_name = 'log_pplacer_' + split(get_tmp_filename())[-1]
            log_fp = join(params["--out-dir"], log_name)
            # context manager closes the log even if the write fails
            with open(log_fp, 'w') as log_file:
                log_file.write(pplacer_result['StdOut'].read())

        # use guppy to convert json file into a placement tree
        guppy_params = {'tog': None}

        new_tree = build_tree_from_json_using_params(
            pplacer_result['json'].name,
            output_dir=params['--out-dir'],
            params=guppy_params)
    finally:
        # release pplacer's result files even when a step above raises
        pplacer_result.cleanUp()

    return new_tree
def insert_sequences_into_tree(aln, moltype, params=None, write_log=True):
    """Returns a placement tree built by running pplacer on aln.

    aln: alignment text; wrapped in StringIO and passed through
        get_align_for_phylip, then converted to FASTA for pplacer.

    moltype: kept in the signature for callers; unused by this function.

    params: dict of parameters to pass in to the Pplacer app controller.
        The '--out-dir' entry is required: it receives the optional log
        file and is handed to the guppy conversion step.

    write_log: if true, save pplacer's StdOut as a 'log_pplacer_*' file
        under params['--out-dir'].

    Returns the value of build_tree_from_json_using_params for pplacer's
    json output.
    """
    if params is None:
        params = {}

    # convert aln to phy since seq_names need fixed to run through pplacer
    new_aln = get_align_for_phylip(StringIO(aln))

    # convert aln to fasta in case it is not already a fasta file
    seqs = Alignment(new_aln).toFasta()

    pplacer_app = Pplacer(params=params,
                          InputHandler='_input_as_multiline_string',
                          WorkingDir=None,
                          SuppressStderr=False,
                          SuppressStdout=False)
    pplacer_result = pplacer_app(seqs)

    try:
        # write a log file
        if write_log:
            log_fp = join(params["--out-dir"],
                          'log_pplacer_' + split(get_tmp_filename())[-1])
            # "with" guarantees the handle is closed on any error
            with open(log_fp, 'w') as log_file:
                log_file.write(pplacer_result['StdOut'].read())

        # use guppy to convert json file into a placement tree
        guppy_params = {'tog': None}
        new_tree = build_tree_from_json_using_params(
            pplacer_result['json'].name,
            output_dir=params['--out-dir'],
            params=guppy_params)
    finally:
        # always release pplacer's result files
        pplacer_result.cleanUp()

    return new_tree
def rnaalifold_from_alignment(aln, moltype=RNA, params=None):
    """Returns the pairs list parsed from RNAalifold output for aln.

    aln: Alignment object or data that can build one.
    moltype: accepted but not consulted by the code below.
    params: parameters handed to the RNAalifold app controller.
    """
    # NOTE(review): MolType is hard-coded to RNA here, so the moltype
    # argument has no effect - confirm whether that is intended.
    rna_aln = Alignment(aln, MolType=RNA)
    # Only the ID-abbreviated data is used; the key lookup is not needed
    # because nothing downstream maps names back.
    int_map, int_keys = rna_aln.getIntMap()

    folder = RNAalifold(WorkingDir='/tmp',
                        InputHandler='_input_as_multiline_string',
                        params=params)
    result = folder(clustal_from_alignment(int_map))

    pairs_list = MinimalRnaalifoldParser(result['StdOut'].readlines())

    result.cleanUp()
    return pairs_list
def setUp(self):
    """Setup for Fasta tests."""
    self.strings = ['AAAA', 'CCCC', 'gggg', 'uuuu']
    self.labels = ['1st', '2nd', '3rd', '4th']
    self.infos = ["Dog", "Cat", "Mouse", "Rat"]

    # two independent sets of Sequence objects: one tagged through
    # .Label, the other through .Name
    self.sequences_with_labels = map(Sequence, self.strings)
    self.sequences_with_names = map(Sequence, self.strings)
    tagged = zip(self.labels, self.sequences_with_labels,
                 self.sequences_with_names)
    for tag, labelled_seq, named_seq in tagged:
        labelled_seq.Label = tag
        named_seq.Name = tag

    # expected FASTA renderings
    self.fasta_no_label = '>0\nAAAA\n>1\nCCCC\n>2\ngggg\n>3\nuuuu'
    self.fasta_with_label = \
        '>1st\nAAAA\n>2nd\nCCCC\n>3rd\nGGGG\n>4th\nUUUU'
    self.fasta_with_label_lw2 = \
        '>1st\nAA\nAA\n>2nd\nCC\nCC\n>3rd\nGG\nGG\n>4th\nUU\nUU'

    # alignment fixture, with a species Info attached to every sequence
    self.alignment_dict = {'1st': 'AAAA', '2nd': 'CCCC',
                           '3rd': 'GGGG', '4th': 'UUUU'}
    self.alignment_object = Alignment(self.alignment_dict)
    for seq_label, species in zip(self.labels, self.infos):
        named_seqs = self.alignment_object.NamedSeqs
        named_seqs[seq_label].Info = Info(species=species)
    self.fasta_with_label_species = \
        '>1st:Dog\nAAAA\n>2nd:Cat\nCCCC\n>3rd:Mouse\nGGGG\n>4th:Rat\nUUUU'
    self.alignment_object.RowOrder = ['1st', '2nd', '3rd', '4th']
def test_AlignmentToProfile_weighted(self):
    """AlignmentToProfile: should work when sequences are weighted
    """
    # Alignment: sequences are just strings and don't have an alphabet
    # Weights: a normal dictionary (could be a real Weights object as well)
    a = Alignment({'seq1': 'TCAG', 'seq2': 'TAR-', 'seq3': 'YAG-'},
                  Names=['seq1', 'seq2', 'seq3'])
    w = {'seq1': 0.5, 'seq2': .25, 'seq3': .25}

    # Every character of the alignment is present in the char order, so
    # nothing is ignored and the split_degenerates setting must not
    # change the outcome: both settings are checked against one profile.
    exp = array([[0.75, 0, 0, 0, 0, .25, 0],
                 [0, 0.5, 0.5, 0, 0, 0, 0],
                 [0, 0.5, 0, 0.25, 0.25, 0, 0],
                 [0, 0, 0, 0.5, 0, 0, 0.5]])
    for split_setting in (False, True):
        profile = AlnToProfile(a, DNA, char_order="TACGRY-",
                               weights=w,
                               split_degenerates=split_setting)
        self.assertEqual(profile.Data.tolist(), exp.tolist())

    # Only non-degenerate symbols in the CharOrder. Degenerates are split.
    # Gaps are ignored
    exp = array([[0.875, 0, 0.125, 0],
                 [0, 0.5, 0.5, 0],
                 [0, 0.625, 0, 0.375],
                 [0, 0, 0, 1]])
    profile = AlnToProfile(a, DNA, char_order="TACG",
                           weights=w, split_degenerates=True)
    self.assertEqual(profile.Data.tolist(), exp.tolist())

    # An Error is raised if all chars in an alignment column are ignored
    # NOTE(review): the original comment here said degenerates are not
    # split for CharOrder=AT, yet split_degenerates=True is what is
    # passed - confirm which is intended.
    self.assertRaises(ValueError, AlnToProfile, a, DNA,
                      char_order="AT", weights=w, split_degenerates=True)
def test_randomIndices(self):
    """randomIndices: 99% of new frequencies should be within 3*SD
    """
    r_num, c_num = 100, 20
    num_elements = r_num * c_num
    r = random([r_num, c_num])
    p = Profile(r, "A" * c_num)
    p.normalizePositions()
    d = p.Data
    n = 1000

    # Test only works on normalized profile, b/c of 1-d below
    means = n * d
    three_stds = sqrt(d * (1 - d) * n) * 3
    result = [p.randomIndices() for x in range(n)]
    a = Alignment(transpose(result))

    def absoluteProfile(alignment, char_order):
        """Return per-column counts from alignment as a 2D array."""
        # use the argument rather than silently closing over `a`
        f = alignment.columnFreqs()
        res = zeros([len(f), len(char_order)])
        for row, freq in enumerate(f):
            for i in freq:
                res[row, ord(i)] = freq[i]
        return res

    ap = absoluteProfile(a, p.CharOrder)
    failure = abs(ap - means) > three_stds
    # float() forces true division: with int/int under Python 2 the
    # ratio truncated to 0 and the check could never fail
    assert float(sum(sum(failure))) / num_elements <= 0.01
def test_randomSequence(self):
    """randomSequence: 99% of new frequencies should be within 3*SD"""
    r_num, c_num = 100, 20
    num_elements = r_num * c_num
    alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    r = random([r_num, c_num])
    p = Profile(r, alpha[:c_num])
    p.normalizePositions()
    d = p.Data
    n = 1000

    # Test only works on normalized profile, b/c of 1-d below
    means = n * d
    three_stds = sqrt(d * (1 - d) * n) * 3

    a = Alignment([p.randomSequence() for x in range(n)])

    def absoluteProfile(alignment, char_order):
        """Return per-column counts, columns ordered as char_order."""
        # use the argument rather than silently closing over `a`
        f = alignment.columnFreqs()
        res = zeros([len(f), len(char_order)])
        for row, freq in enumerate(f):
            for i in freq:
                col = char_order.index(i)
                res[row, col] = freq[i]
        return res

    ap = absoluteProfile(a, p.CharOrder)
    failure = abs(ap - means) > three_stds
    # float() forces true division: with int/int under Python 2 the
    # ratio truncated to 0 and the check could never fail
    assert float(sum(sum(failure))) / num_elements <= 0.01
def setUp(self):
    """Load the query alignment and write the reference FASTA/tree files.

    Each file written here is appended to ``self._paths_to_clean_up``
    (presumably consumed by the test case's tearDown - not visible here).
    """
    # create a list of files to cleanup
    self._paths_to_clean_up = []
    self._dirs_to_clean_up = []

    # load query seqs
    self.seqs = Alignment(MinimalFastaParser(QUERY_SEQS.split()))

    # generate temp filename
    tmp_dir = '/tmp'
    self.outfile = get_tmp_filename(tmp_dir)
    stem = splitext(self.outfile)[0]

    # create and write out reference sequence file; "with" closes the
    # handle even when the write raises
    self.outfasta = stem + '.fasta'
    with open(self.outfasta, 'w') as fastaout:
        fastaout.write(REF_SEQS)
    self._paths_to_clean_up.append(self.outfasta)

    # create and write out starting tree file
    self.outtree = stem + '.tree'
    with open(self.outtree, 'w') as treeout:
        treeout.write(REF_TREE)
    self._paths_to_clean_up.append(self.outtree)
def setUp(self):
    """Create named and unnamed RNA sequences plus their alignments."""
    alphabet = RNA.Alphabets.DegenGapped

    # --- named sequences -------------------------------------------
    self.rna1 = RnaSequence('UCAGGG', Name='rna1')
    self.rna2 = RnaSequence('YCU-RG', Name='rna2')
    self.rna3 = RnaSequence('CAA-NR', Name='rna3')
    self.model1 = ModelSequence('UCAGGG', Name='rna1', Alphabet=alphabet)
    self.model2 = ModelSequence('YCU-RG', Name='rna2', Alphabet=alphabet)
    self.model3 = ModelSequence('CAA-NR', Name='rna3', Alphabet=alphabet)
    self.aln = Alignment([self.rna1, self.rna2, self.rna3], MolType=RNA)
    self.da = DenseAlignment(
        [self.model1, self.model2, self.model3],
        MolType=RNA, Alphabet=alphabet)

    # --- sequences without a name ----------------------------------
    self.nn_rna1 = RnaSequence('UCAGGG')
    self.nn_rna2 = RnaSequence('YCU-RG')
    self.nn_rna3 = RnaSequence('CAA-NR')
    self.nn_model1 = ModelSequence('UCAGGG', Alphabet=alphabet)
    self.nn_model2 = ModelSequence('YCU-RG', Alphabet=alphabet)
    self.nn_model3 = ModelSequence('CAA-NR', Alphabet=alphabet)
    self.nn_aln = Alignment(
        [self.nn_rna1, self.nn_rna2, self.nn_rna3], MolType=RNA)
    self.nn_da = DenseAlignment(
        [self.nn_model1, self.nn_model2, self.nn_model3],
        MolType=RNA, Alphabet=alphabet)
def test_subset_seqs_Alignment(self):
    """takeSeqs should return the requested sequences as an Alignment"""
    rna1 = RnaSequence('UCG', Name='rna1')
    rna2 = RnaSequence('YCG', Name='rna2')
    rna3 = RnaSequence('CAR', Name='rna3')

    sub_aln = Alignment([rna2, rna3], MolType=RNA)
    aln = Alignment([rna1, rna2, rna3], MolType=RNA)
    obs_sub_aln = aln.takeSeqs(['rna2', 'rna3'])

    self.assertEqual(obs_sub_aln, sub_aln)
    self.assertEqual(str(obs_sub_aln), str(sub_aln))

    # Selected sequences should be in specified order?
    obs_sub_aln_1 = self.aln.takeSeqs(['rna3', 'rna2'])
    obs_sub_aln_2 = self.aln.takeSeqs(['rna2', 'rna3'])
    # assertNotEqual replaces the deprecated failIfEqual alias
    self.assertNotEqual(str(obs_sub_aln_1), str(obs_sub_aln_2))
def align_unaligned_seqs(seqs, moltype, params=None):
    """Returns an Alignment object from seqs.

    seqs: cogent.core.alignment.SequenceCollection object, or data that
        can be used to build one.
    moltype: a MolType object.  DNA, RNA, or PROTEIN.
    params: dict of parameters to pass in to the Clustal app controller.

    Result will be a cogent.core.alignment.Alignment object.
    """
    # Wrap the input, then swap the sequence names for abbreviated IDs;
    # id_lookup maps each abbreviated ID back to the original name.
    collection = SequenceCollection(seqs, MolType=moltype)
    abbreviated, id_lookup = collection.getIntMap()
    abbreviated = SequenceCollection(abbreviated, MolType=moltype)

    # Run Clustalw on the FASTA rendering of the abbreviated collection.
    clustalw = Clustalw(InputHandler='_input_as_multiline_string',
                        params=params)
    result = clustalw(abbreviated.toFasta())

    # Parse the aligned output and restore the original names.
    aligned = dict(ClustalParser(result['Align'].readlines()))
    restored = dict((id_lookup[short_id], seq)
                    for short_id, seq in aligned.items())
    new_alignment = Alignment(restored, MolType=moltype)

    # Clean up
    result.cleanUp()
    return new_alignment
def rnaalifold_from_alignment(aln, moltype=RNA, params=None):
    """Run RNAalifold on aln and return the parsed pairs list.

    aln: Alignment object or data that can build one.
    moltype: present in the signature but not read below.
    params: parameters forwarded to the RNAalifold app controller.
    """
    # NOTE(review): the Alignment is always built with MolType=RNA, so
    # the moltype argument is ignored - verify this is deliberate.
    alignment = Alignment(aln, MolType=RNA)
    int_map, int_keys = alignment.getIntMap()

    app = RNAalifold(
        WorkingDir='/tmp',
        InputHandler='_input_as_multiline_string',
        params=params,
    )
    # The app is fed the clustal rendering of the ID-abbreviated data.
    clustal_text = clustal_from_alignment(int_map)
    res = app(clustal_text)

    stdout_lines = res['StdOut'].readlines()
    pairs_list = MinimalRnaalifoldParser(stdout_lines)

    res.cleanUp()
    return pairs_list
def test_subset_seqs_Alignment(self):
    """takeSeqs should return the requested sequences as an Alignment"""
    first = RnaSequence('UCG', Name='rna1')
    second = RnaSequence('YCG', Name='rna2')
    third = RnaSequence('CAR', Name='rna3')

    expected_subset = Alignment([second, third], MolType=RNA)
    full_aln = Alignment([first, second, third], MolType=RNA)

    observed_subset = full_aln.takeSeqs(['rna2', 'rna3'])
    self.assertEqual(observed_subset, expected_subset)
    self.assertEqual(str(observed_subset), str(expected_subset))

    # Selected sequences should be in specified order?
    reversed_pick = self.aln.takeSeqs(['rna3', 'rna2'])
    forward_pick = self.aln.takeSeqs(['rna2', 'rna3'])
    self.assertNotEqual(str(reversed_pick), str(forward_pick))
def load_from_clustal(data, seq_constructor=Sequence, strict=True):
    """Parse clustal data into an Alignment or a SequenceCollection.

    data: input handed to ClustalParser.
    seq_constructor: callable applied to every parsed sequence.
    strict: forwarded to ClustalParser.

    An Alignment is returned when at least one record was parsed and all
    sequences have the same length; otherwise a SequenceCollection.
    Both are built with MolType=BYTES.
    """
    recs = []
    for name, seq in ClustalParser(data, strict):
        recs.append((name, seq_constructor(seq)))

    # exactly one distinct length <=> non-empty and all lengths equal
    distinct_lengths = set(len(built_seq) for _, built_seq in recs)
    if len(distinct_lengths) == 1:
        return Alignment(recs, MolType=BYTES)
    return SequenceCollection(recs, MolType=BYTES)
def test_insert_sequences_into_tree(self):
    """Inserts sequences into Tree using params - test handles tree-insertion"""
    # generate temp filename for output
    outfname = splitext(get_tmp_filename('/tmp/'))[0]

    # create starting tree; "with" closes the handle on any error
    outtreefname = outfname + '.tre'
    with open(outtreefname, 'w') as outtree:
        outtree.write(REF_TREE)

    try:
        # set params for tree-insertion
        params = {}
        params["-w"] = "/tmp/"
        params["-n"] = get_tmp_filename().split("/")[-1]
        params["-f"] = 'v'
        params["-t"] = outtreefname
        params["-m"] = 'GTRGAMMA'

        aln_ref_query = get_align_for_phylip(
            StringIO(PHYLIP_FILE_DNA_REF_QUERY))
        aln = Alignment(aln_ref_query)
        seqs, align_map = aln.toPhylip()

        tree = insert_sequences_into_tree(seqs, DNA, params=params,
                                          write_log=False)

        # strip the 'QUERY___' prefix and '___<digits>' suffix found on
        # tip names, then map the phylip names back to the originals
        for node in tree.tips():
            removed_query_str = re.sub('QUERY___', '', str(node.Name))
            new_node_name = re.sub(r'___\d+', '', removed_query_str)
            if new_node_name in align_map:
                node.Name = align_map[new_node_name]

        self.assertTrue(isinstance(tree, PhyloNode))
        self.assertEqual(tree.getNewick(with_distances=True), RESULT_TREE)
        self.assertEqual(len(tree.tips()), 7)
        self.assertRaises(NotImplementedError, build_tree_from_alignment,
                          self.align1, RNA, True)
    finally:
        # remove the starting tree even when an assertion above fails
        remove(outtreefname)
def alignment_traceback(seqs, aligned_positions, word_length):
    """Alignment object from state matrix and ending point.

    seqs: iterable of (name, seq) pairs, consumed in step with the
        per-sequence results of map_traceback(aligned_positions).
    word_length: factor applied to each map and to the start/end
        offsets before slicing the sequence.
    """
    starts, ends, maps = map_traceback(aligned_positions)
    aligneds = [
        (name,
         Aligned(amap * word_length,
                 seq[start * word_length:end * word_length]))
        for start, end, amap, (name, seq) in zip(starts, ends, maps, seqs)
    ]
    return Alignment(MolType=None, data=aligneds)
def insert_sequences_into_tree(aln, moltype, params={}):
    """Returns a tree from placement of sequences

    aln: alignment text, wrapped in StringIO for get_align_for_phylip.
    moltype: accepted but not used in this function.
    params: parameters handed to the ParsInsert app controller.
    """
    # Re-read the alignment via the phylip helper (sequence names need
    # fixing before ParsInsert can use them), then render it as FASTA in
    # case the input was not FASTA already.
    phylip_ready = get_align_for_phylip(StringIO(aln))
    fasta_text = Alignment(phylip_ready).toFasta()

    app = ParsInsert(params=params)
    result = app(fasta_text)

    # parse tree
    tree = DndParser(result['Tree'].read(), constructor=PhyloNode)

    # cleanup files
    result.cleanUp()

    return tree
def insert_sequences_into_tree(aln, moltype, params={}):
    """Returns a tree from placement of sequences

    aln: alignment text; read through StringIO by get_align_for_phylip.
    moltype: part of the signature only; not read below.
    params: forwarded to the ParsInsert app controller.
    """
    # seq names need fixing to run through parsinsert, hence the phylip
    # helper; the result is then converted to FASTA
    renamed_aln = Alignment(get_align_for_phylip(StringIO(aln)))
    seqs = renamed_aln.toFasta()

    parsinsert_result = ParsInsert(params=params)(seqs)

    # parse the tree text into PhyloNode objects
    newick_text = parsinsert_result['Tree'].read()
    tree = DndParser(newick_text, constructor=PhyloNode)

    # cleanup files
    parsinsert_result.cleanUp()

    return tree
def getAlignment(self, feature_types=None, where_feature=None,
                 omit_redundant=True):
    """Returns an Alignment of the members' aligned sequences, or None.

    Arguments:
        - feature_types: annotations to be applied to the returned
          sequences
        - where_feature: forwarded to each member's getAnnotatedAligned
          when feature_types is given
        - omit_redundant: exclude redundant gap positions

    Members whose aligned sequence is None are skipped. None is returned
    when no member contributes a sequence.
    """
    seqs = []
    for member in self.Members:
        if feature_types:
            seq = member.getAnnotatedAligned(feature_types, where_feature)
        else:
            seq = member.AlignedSeq
        if seq is None:
            continue
        name = seq.Name

        if self._rc:  # names should reflect change to strand
            loc = member.Location.copy()
            loc.Strand *= -1
            name = str(loc)

        seq.Name = seq.data.Name = name
        seqs.append((name, seq))

    # seqs is always a list, so the former "is None" test could never
    # fire; an empty list is the real "nothing to align" case
    if not seqs:
        return None

    aln = Alignment(data=seqs, MolType=DNA)

    if self._rc:
        aln = aln.rc()

    if omit_redundant:
        # drop columns that consist solely of gaps
        aln = aln.filtered(lambda x: set(x) != set('-'))

    return aln
def test_insert_sequences_into_tree(self):
    """Inserts sequences into Tree using params - test handles tree-insertion"""
    # generate temp filename for output
    outfname = splitext(get_tmp_filename('/tmp/'))[0]

    # create starting tree; the context manager closes the file for us
    outtreefname = outfname + '.tre'
    with open(outtreefname, 'w') as outtree:
        outtree.write(REF_TREE)

    try:
        # set params for tree-insertion
        params = {}
        params["-w"] = "/tmp/"
        params["-n"] = get_tmp_filename().split("/")[-1]
        params["-f"] = 'v'
        params["-t"] = outtreefname
        params["-m"] = 'GTRGAMMA'

        aln_ref_query = get_align_for_phylip(
            StringIO(PHYLIP_FILE_DNA_REF_QUERY))
        aln = Alignment(aln_ref_query)
        seqs, align_map = aln.toPhylip()

        tree = insert_sequences_into_tree(seqs, DNA, params=params,
                                          write_log=False)

        # tip names carry a 'QUERY___' prefix and a '___<digits>'
        # suffix; strip both before mapping back to original names
        for node in tree.tips():
            removed_query_str = re.sub('QUERY___', '', str(node.Name))
            new_node_name = re.sub(r'___\d+', '', removed_query_str)
            if new_node_name in align_map:
                node.Name = align_map[new_node_name]

        self.assertTrue(isinstance(tree, PhyloNode))
        self.assertEqual(tree.getNewick(with_distances=True), RESULT_TREE)
        self.assertEqual(len(tree.tips()), 7)
        self.assertRaises(NotImplementedError, build_tree_from_alignment,
                          self.align1, RNA, True)
    finally:
        # the starting tree must not outlive the test, pass or fail
        remove(outtreefname)
def setUp(self): """Setup for Clustal tests.""" self.unaligned_dict = { '1st': 'AAA', '2nd': 'CCCC', '3rd': 'GGGG', '4th': 'UUUU' } self.alignment_dict = { '1st': 'AAAA', '2nd': 'CCCC', '3rd': 'GGGG', '4th': 'UUUU' } #create alignment change order. self.alignment_object = Alignment(self.alignment_dict) self.alignment_order = ['2nd', '4th', '3rd', '1st'] self.alignment_object.RowOrder = self.alignment_order self.clustal_with_label=\ """CLUSTAL 1st AAAA 2nd CCCC 3rd GGGG 4th UUUU """ self.clustal_with_label_lw2=\ """CLUSTAL 1st AA 2nd CC 3rd GG 4th UU 1st AA 2nd CC 3rd GG 4th UU """ self.clustal_with_label_reordered=\ """CLUSTAL 2nd CCCC 4th UUUU 3rd GGGG 1st AAAA """ self.clustal_with_label_lw2_reordered=\ """CLUSTAL
def getAlignment(self, feature_types=None, where_feature=None,
                 omit_redundant=True):
    """Returns an Alignment of the members' aligned sequences, or None.

    Arguments:
        - feature_types: annotations to be applied to the returned
          sequences
        - where_feature: passed on to getAnnotatedAligned together with
          feature_types
        - omit_redundant: exclude redundant gap positions

    A member is skipped when its aligned sequence is None; if every
    member is skipped (or there are none) the result is None.
    """
    seqs = []
    for member in self.Members:
        if feature_types:
            seq = member.getAnnotatedAligned(feature_types, where_feature)
        else:
            seq = member.AlignedSeq
        if seq is None:
            continue

        name = seq.Name
        if self._rc:  # names should reflect change to strand
            loc = member.Location.copy()
            loc.Strand *= -1
            name = str(loc)

        seq.Name = seq.data.Name = name
        seqs.append((name, seq))

    # the previous "seqs is None" guard was dead code (seqs is a list);
    # test for emptiness so the documented None result is reachable
    if not seqs:
        return None

    aln = Alignment(data=seqs, MolType=DNA)

    if self._rc:
        aln = aln.rc()

    if omit_redundant:
        # keep only columns that contain something other than gaps
        aln = aln.filtered(lambda x: set(x) != set('-'))

    return aln
def test_insert_sequences_into_tree(self):
    """Inserts sequences into Tree"""
    # options handed to insert_sequences_into_tree
    params = {
        "-r": self.refseq_fname,
        "-t": self.tree_fname,
        "-s": self.stats_fname,
        "--out-dir": "/tmp",
    }

    # build the query alignment and its phylip form plus label mapping
    query_records = MinimalFastaParser(StringIO(QUERY_SEQS))
    query_aln = Alignment(query_records)
    seqs, align_map = query_aln.toPhylip()

    tree = insert_sequences_into_tree(seqs, DNA, params=params,
                                      write_log=False)

    # rename tips back to query names
    for tip in tree.tips():
        if tip.Name in align_map:
            tip.Name = align_map[tip.Name]

    self.assertEqual(tree.getNewick(with_distances=True), RESULT_TREE)
def test_subset_positions_Alignment(self):
    # takePositions([0, 1, 5]) on self.aln is compared against an alignment
    # built directly from the expected three-column sequences
    expected = Alignment(
        [
            RnaSequence('UCG', Name='rna1'),
            RnaSequence('YCG', Name='rna2'),
            RnaSequence('CAR', Name='rna3'),
        ],
        MolType=RNA)
    observed = self.aln.takePositions([0, 1, 5])

    self.assertEqual(observed, expected)
    self.assertNotEqual(observed, self.aln)
    # string representations should be the same. This fails right
    # now, because sequence order is not maintained. See separate test.
    self.assertEqual(str(observed), str(expected))
def likelyAncestralSeqs(self, locus=None):
    """Returns the most likely reconstructed ancestral sequences as an
    alignment.

    Arguments:
        - locus: a named locus"""
    reconstructed = []
    for edge, probs in self.reconstructAncestralSeqs(locus=locus).items():
        # per row, take the state from the largest (probability, state)
        # pair; equal probabilities are therefore decided by the state
        states = [max((p, state) for state, p in row.items())[1]
                  for row in probs]
        sequence = self.model.MolType.makeSequence("".join(states))
        reconstructed.append((edge, sequence))
    return Alignment(data=reconstructed, MolType=self.model.MolType)
def LoadSeqs(filename=None, format=None, data=None, moltype=None,
             name=None, aligned=True, label_to_name=None, parser_kw={},
             constructor_kw={}, **kw):
    """Initialize an alignment or collection of sequences.

    Arguments:
    - filename: name of the sequence file
    - format: format of the sequence file
    - data: optional explicit provision of sequences
    - moltype: the MolType, eg DNA, PROTEIN
    - name: handed to the constructor as Name
    - aligned: set True if sequences are already aligned and have the same
      length, results in an Alignment object. If False, a SequenceCollection
      instance is returned instead. If callable, will use as a constructor
      (e.g. can pass in DenseAlignment or CodonAlignment).
    - label_to_name: function for converting original name into another
      name. Default behavior is to preserve the original FASTA label and
      comment.
      To remove all FASTA label comments, and pass in only the label, pass in:
            label_to_name=lambda x: x.split()[0]
      To look up names in a dict, pass in:
            label_to_name = lambda x: d.get(x, default_name)
      ...where d is a dict that's in scope, and default_name is what you want
      to assign any sequence that isn't in the dict.
    - parser_kw: keyword arguments forwarded to FromFilenameParser when a
      filename is given
    - constructor_kw: extra keyword arguments forwarded to whichever
      constructor builds the result

    If format is None, will attempt to infer format from the filename suffix.
    If label_to_name is None, will attempt to infer correct conversion from
    the format.

    Exactly one of filename and data must be supplied; with data, neither
    format nor extra keyword arguments are accepted (checked with assert).
    """
    if filename is not None:
        assert data is None, (filename, data)
        data = list(FromFilenameParser(filename, format, **parser_kw))
    else:
        assert data is not None
        assert format is None
        assert not kw, kw

    # the following is a temp hack until we have the load API sorted out.
    if not aligned:
        # generic case: return SequenceCollection
        return SequenceCollection(data, MolType=moltype, Name=name,
                                  label_to_name=label_to_name,
                                  **constructor_kw)

    # a callable `aligned` is itself the constructor; any other truthy value
    # means a plain Alignment
    constructor = aligned if hasattr(aligned, '__call__') else Alignment
    return constructor(data=data, MolType=moltype, Name=name,
                       label_to_name=label_to_name, **constructor_kw)
def dotur_from_alignment(aln, moltype, distance_function, params=None):
    """Returns dotur results given an alignment and distance function.

        - aln: An Alignment object or something that behaves like one.
            Sequences must be aligned.
        - moltype: cogent.core.moltype object.
        - distance_function: function that can be passed to distanceMatrix()
            method of SequenceCollection.  Must be able to find distance
            between two sequences.
        - params: dict of parameters handed to the Dotur app controller.

        - NOTE:  This function will only return the parsed *.list file, as
            it contains the OTU identities.
            Dotur generates 23 output files, so if this is not the one you
            are looking for, check out the documentation and add the others
            to the result path.
    """
    #construct Alignment object.  This will handle unaligned sequences.
    aln = Alignment(aln, MolType=moltype)

    #need to make int map.
    int_map, int_keys = aln.getIntMap()
    #construct Alignment object from int map to use object functionality
    int_map = Alignment(int_map, MolType=moltype)
    order = sorted(int_map.Names)

    #Build distance matrix.
    d_matrix_dict = int_map.distanceMatrix(f=distance_function)
    d_matrix_dict.RowOrder = order
    d_matrix_dict.ColOrder = order

    #Get distance matrix in list form; cells must be strings to use
    #phylipMatrix. Real lists are built (not map objects) so the rows can be
    #indexed and re-read regardless of interpreter version.
    d_matrix_list = [[str(cell) for cell in row]
                     for row in d_matrix_dict.toLists()]

    #Get phylip formatted string.
    phylip_matrix_string = phylipMatrix(rows=d_matrix_list, names=order)

    working_dir = get_tmp_filename(suffix='')
    app = Dotur(InputHandler='_input_as_multiline_string',
                WorkingDir=working_dir, params=params)

    #Remove the working directory whether or not the run/parse succeeds.
    try:
        res = app(phylip_matrix_string)
        otu_list = OtuListParser(res['List'].readlines())

        #remap sequence names
        for i, otu in enumerate(otu_list):
            otu_list[i][2] = remap_seq_names(otu[2], int_keys)
    finally:
        shutil.rmtree(app.WorkingDir)

    return otu_list
def test_AlignmentToProfile_ignore(self):
    """AlignmentToProfile: should raise an error if too many chars ignored
    """
    #Same conditions as previous function, but in the last column
    #there are only gaps, so normalization will fail at that position
    a = Alignment({'a': RnaSequence('UCAGRYN-'),
                   'b': RnaSequence('ACUGAAA-')})
    #No expected profile is built here: the call itself must raise.
    self.assertRaises(ValueError, AlnToProfile, a, alphabet=RNA,
                      split_degenerates=True)
def get_align_for_phylip(data, id_map=None):
    """Convenience function to return alignment object from phylip data

    data: sequence of lines in phylip format (an open file, list, etc)
    id_map: optional id mapping from external ids to phylip labels - not sure
        if we're going to implement this

    returns Alignment object
    """
    # drain the parser into a list before handing it to Alignment
    records = list(MinimalPhylipParser(data, id_map))
    return Alignment(records)
def align_unaligned_seqs(seqs, moltype, params=None, accurate=False):
    """Aligns unaligned sequences with Mafft and returns an Alignment.

    seqs: either list of sequence objects or list of strings (anything a
        SequenceCollection can be built from)
    moltype: MolType object; its upper-cased label is looked up in
        MOLTYPE_MAP to switch on the matching Mafft option
    params: dict of parameters to pass in to the Mafft app controller
    accurate: if True, turns on '--globalpair' and sets '--maxiterate' to
        1000 (more accurate alignment, sacrificing performance)
    """
    #create SequenceCollection object from seqs
    seq_collection = SequenceCollection(seqs, MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs
    int_map, int_keys = seq_collection.getIntMap()
    #Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map, MolType=moltype)

    #Create Mafft app.
    app = Mafft(InputHandler='_input_as_multiline_string', params=params)

    #Turn on correct moltype
    moltype_string = moltype.label.upper()
    app.Parameters[MOLTYPE_MAP[moltype_string]].on()

    #Do not report progress
    app.Parameters['--quiet'].on()

    #More accurate alignment, sacrificing performance.
    if accurate:
        app.Parameters['--globalpair'].on()
        app.Parameters['--maxiterate'].Value = 1000

    #Get results using int_map as input to app
    res = app(int_map.toFasta())
    #Clean up the result files even if parsing or remapping fails
    try:
        #Get alignment as dict out of results
        alignment = dict(MinimalFastaParser(res['StdOut'].readlines()))
        #Make new dict mapping original IDs
        remapped = dict((int_keys[k], v) for k, v in alignment.items())
        #Create an Alignment object from alignment dict
        return Alignment(remapped, MolType=moltype)
    finally:
        res.cleanUp()
def test_AlignmentToProfile_basic(self):
    """AlignmentToProfile: should work under basic conditions
    """
    #sequences in the alignment are unweighted
    #Alphabet is the alphabet of the sequences (RNA)
    #CharOrder is set explicitly
    #Degenerate bases are split up
    #Gaps are ignored
    #In all of the columns at least one character is in the CharOrder
    seqs = {
        'a': RnaSequence('UCAGRYN-'),
        'b': RnaSequence('ACUGAAAA'),
    }
    aln = Alignment(seqs)
    expected = array([
        [.5, 0, .5, 0],
        [0, 1, 0, 0],
        [.5, 0, .5, 0],
        [0, 0, 0, 1],
        [0, 0, .75, .25],
        [.25, .25, .5, 0],
        [.125, .125, .625, .125],
        [0, 0, 1, 0],
    ])
    profile = AlnToProfile(aln, alphabet=RNA, split_degenerates=True)
    self.assertEqual(profile.Data.tolist(), expected.tolist())
def align_unaligned_seqs(seqs, moltype, params=None):
    """Returns an Alignment object from seqs.

    seqs: SequenceCollection object, or data that can be used to build one.

    moltype: a MolType object.  DNA, RNA, or PROTEIN.

    params: dict of parameters to pass in to the Muscle app controller.
        The dict is copied, so the caller's object is not modified when the
        '-out' entry is added.

    Result will be an Alignment object.
    """
    #Work on a private copy: '-out' is added below and must not leak into
    #the dict the caller handed in.
    params = dict(params) if params else {}

    #create SequenceCollection object from seqs
    seq_collection = SequenceCollection(seqs, MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs
    int_map, int_keys = seq_collection.getIntMap()
    #Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map, MolType=moltype)

    #get temporary filename
    params['-out'] = get_tmp_filename()

    #Create Muscle app.
    app = Muscle(InputHandler='_input_as_multiline_string', params=params)

    #Get results using int_map as input to app
    res = app(int_map.toFasta())
    #Clean up the result files even if parsing or remapping fails
    try:
        #Get alignment as dict out of results
        alignment = dict(MinimalFastaParser(res['MuscleOut'].readlines()))
        #Make new dict mapping original IDs
        remapped = dict((int_keys[k], v) for k, v in alignment.items())
        #Create an Alignment object from alignment dict
        return Alignment(remapped, MolType=moltype)
    finally:
        res.cleanUp()
def setUp(self):
    """General setUp method for all tests in this file"""
    #ALIGNMENTS
    self.aln1 = Alignment(['ABC', 'BCC', 'BAC'])

    #alignment from Henikoff 1994
    henikoff_seqs = {
        'seq1': 'GYVGS',
        'seq2': 'GFDGF',
        'seq3': 'GYDGF',
        'seq4': 'GYQGG',
    }
    self.aln2 = Alignment(henikoff_seqs,
                          Names=['seq1', 'seq2', 'seq3', 'seq4'])

    #alignment from Vingron & Sibbald 1993
    three_seqs = {'seq1': 'AA', 'seq2': 'AA', 'seq3': 'BB'}
    self.aln3 = Alignment(three_seqs, Names=['seq1', 'seq2', 'seq3'])

    #alignment from Vingron & Sibbald 1993
    five_seqs = {
        'seq1': 'AA',
        'seq2': 'AA',
        'seq3': 'BB',
        'seq4': 'BB',
        'seq5': 'CC',
    }
    self.aln4 = Alignment(
        five_seqs, Names=['seq1', 'seq2', 'seq3', 'seq4', 'seq5'])

    self.aln5 = Alignment(['ABBA', 'ABCA', 'CBCB'])

    #alignment 5S rRNA seqs from Hein 1990
    self.aln6 = ClustalParser(FIVE_S_ALN.split('\n'))

    #alignment from Vingron & Sibbald 1993
    dna_seqs = {
        'seq1': 'AGCTA',
        'seq2': 'AGGTA',
        'seq3': 'ACCTG',
        'seq4': 'TGCAA',
    }
    self.aln7 = Alignment(dna_seqs,
                          Names=['seq1', 'seq2', 'seq3', 'seq4'])

    #TREES (SEE BOTTOM OF FILE FOR DESCRIPTION)
    self.tree1 = DndParser(TREE_1)
    self.tree2 = DndParser(TREE_2)
    self.tree3 = DndParser(TREE_3)
    self.tree4 = DndParser(TREE_4)
    self.tree5 = DndParser(TREE_5)
    self.tree6 = DndParser(TREE_6)
    self.tree7 = DndParser(TREE_7)
    self.tree8 = DndParser(TREE_8)
    self.tree9 = DndParser(TREE_9)
# Per-file pass: load each FASTA alignment, pad it with all-"N" sequences for
# any requested taxon it lacks, restrict it to the requested taxa and print
# its variable positions.
list_of_genes=[]
for fasta in fasta_files[:]:
    print "_" * 50
    print "processing file:", fasta
    aln = LoadSeqs(fasta, moltype=DNA)
    list_sequences = aln.Names
    # Check if all taxa that are specified in the control file exist in the alignment
    list_check = set(taxa_names).issubset(set(list_sequences))
    # If there are some taxa not present in alignment the following code will simulate a sequence full of "N" of the correct length for those missing taxa and add it to the alignment
    if list_check == False:
        # collect the requested taxa that have no sequence in this alignment
        missing_elements = []
        for element in taxa_names:
            if element not in list_sequences:
                missing_elements.append(element)
        print "These taxa are missing in alignment:", missing_elements, "\nSequences for missing taxa will be generated only containing \"N\"."
        seq = Alignment(aln)
        string_list = seq.todict().values()
        # after this loop length_alignment holds the length of the last
        # sequence visited; it stays "" if string_list is empty
        length_alignment = ""
        for element in string_list:
            length_alignment = len(element)
        # one (name, "NNN...") pair per missing taxon
        simulated_seq = []
        for element in missing_elements:
            fake_aln = "N" * length_alignment
            simulated_seq.append((element,fake_aln))
        fake_seqs = LoadSeqs(data = simulated_seq)
        aln = aln.addSeqs(fake_seqs)
    # Apply filter of user-set taxa names to be used for snp-extraction
    edited_alignment = aln.takeSeqs(sorted(taxa_names))
    # Get the variable positions for each fasta file
    var_pos_list = variable_positions(edited_alignment)
    print var_pos_list
def add_seqs_to_alignment(seqs, aln, moltype, params=None, accurate=False):
    """Returns an Alignment object from seqs and existing Alignment.

    seqs: a cogent.core.sequence.Sequence object, or data that can be used
        to build one.

    aln: an cogent.core.alignment.Alignment object, or data that can be used
        to build one

    moltype: MolType object; its upper-cased label is looked up in
        MOLTYPE_MAP to switch on the matching Mafft option.

    params: dict of parameters to pass in to the Mafft app controller.

    accurate: if True, turns on '--globalpair' and sets '--maxiterate' to
        1000 (more accurate alignment, sacrificing performance).
    """
    #create SequenceCollection object from seqs
    seq_collection = SequenceCollection(seqs, MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs
    seq_int_map, seq_int_keys = seq_collection.getIntMap()
    #Create SequenceCollection from int_map.
    seq_int_map = SequenceCollection(seq_int_map, MolType=moltype)

    #create Alignment object from aln
    aln = Alignment(aln, MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs
    aln_int_map, aln_int_keys = aln.getIntMap(prefix='seqn_')
    #Create Alignment from int_map.
    aln_int_map = Alignment(aln_int_map, MolType=moltype)

    #Update seq_int_keys with aln_int_keys
    seq_int_keys.update(aln_int_keys)

    #Create Mafft app.
    app = Mafft(InputHandler='_input_as_multiline_string',
                params=params, SuppressStderr=True)

    #Turn on correct moltype
    moltype_string = moltype.label.upper()
    app.Parameters[MOLTYPE_MAP[moltype_string]].on()

    #Do not report progress
    app.Parameters['--quiet'].on()

    #Add aln_int_map as seed alignment
    app.Parameters['--seed'].on(
        app._tempfile_as_multiline_string(aln_int_map.toFasta()))

    #From here on the seed temp file exists; remove it on every exit path.
    try:
        #More accurate alignment, sacrificing performance.
        if accurate:
            app.Parameters['--globalpair'].on()
            app.Parameters['--maxiterate'].Value = 1000

        #Get results using int_map as input to app
        res = app(seq_int_map.toFasta())
        try:
            #Get alignment as dict out of results
            alignment = dict(parse_fasta(res['StdOut']))

            #Make new dict mapping original IDs; the '_seed_' marker is
            #stripped from labels before the lookup
            new_alignment = {}
            for k, v in alignment.items():
                key = k.replace('_seed_', '')
                new_alignment[seq_int_keys[key]] = v

            #Create an Alignment object from alignment dict
            new_alignment = Alignment(new_alignment, MolType=moltype)
        finally:
            res.cleanUp()
    finally:
        remove(app.Parameters['--seed'].Value)

    return new_alignment
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None,
                              working_dir='/tmp'):
    """Returns a tree from Alignment object aln.

    aln: an cogent.core.alignment.Alignment object, or data that can be used
    to build one.
        -  Clearcut only accepts aligned sequences.  Alignment object used to
        handle unaligned sequences.

    moltype: a cogent.core.moltype object.
        - NOTE: If moltype = RNA, we must convert to DNA since Clearcut v1.0.8
        gives incorrect results if RNA is passed in.  'U' is treated as an
        incorrect character and is excluded from distance calculations.

    best_tree: if True (default:False), uses a slower but more accurate
    algorithm to build the tree.

    params: dict of parameters to pass in to the Clearcut app controller.
        The dict is copied before '--out' is added, so neither the caller's
        dict nor a shared default is modified.

    working_dir: directory used for the app's working dir and for the
        temporary '--out' filename.

    The result will be an cogent.core.tree.PhyloNode object.
    """
    # Private copy: a mutable default (or the caller's dict) must not pick up
    # the per-call '--out' entry.
    params = dict(params) if params else {}
    params['--out'] = get_tmp_filename(working_dir)

    # Create instance of app controller, enable tree, disable alignment
    app = Clearcut(InputHandler='_input_as_multiline_string', params=params,
                   WorkingDir=working_dir, SuppressStdout=True,
                   SuppressStderr=True)
    #Input is an alignment
    app.Parameters['-a'].on()
    #Turn off input as distance matrix
    app.Parameters['-d'].off()

    #If moltype = RNA, we must convert to DNA.
    if moltype == RNA:
        moltype = DNA

    if best_tree:
        app.Parameters['-N'].on()

    #Turn on correct moltype
    moltype_string = moltype.label.upper()
    app.Parameters[MOLTYPE_MAP[moltype_string]].on()

    # Setup mapping. Clearcut clips identifiers. We will need to remap them.
    # Clearcut only accepts aligned sequences.  Let Alignment object handle
    # unaligned sequences.
    seq_aln = Alignment(aln, MolType=moltype)
    #get int mapping
    int_map, int_keys = seq_aln.getIntMap()
    #create new Alignment object with int_map
    int_map = Alignment(int_map)

    # Collect result
    result = app(int_map.toFasta())
    # Clean up the result files even if tree parsing or remapping fails
    try:
        # Build tree
        tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
        for node in tree.tips():
            node.Name = int_keys[node.Name]
    finally:
        result.cleanUp()

    return tree
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None, cmbuild_params=None, cmalign_params=None):
    """Aligns the sequences in seq_path against the template alignment.

    seq_path: path of the FASTA file holding the candidate sequences
    result_path: if given, the alignment is written there in FASTA format
        and None is returned; otherwise the Alignment object is returned
    log_path: if given, the cmbuild/cmalign parameters that were used are
        written there
    failure_path: accepted but not used by this method
    cmbuild_params: dict of cmbuild parameters; '--gapthresh' is always set
        to 1.0. The dict is copied, the caller's object is left untouched.
    cmalign_params: dict of cmalign parameters; '--sub' is always set to
        True and '--gapthresh' to 1.0. Copied like cmbuild_params.

    Raises ValueError when parsing the template file raises RecordError.
    """
    log_params = []

    # load candidate sequences
    with open(seq_path, 'U') as seq_file:
        candidate_sequences = dict(MinimalFastaParser(seq_file))

    # load template sequences; the parser is drained inside the with block
    # so the handle can be closed right away
    try:
        with open(self.Params['template_filepath'], 'U') as template_file:
            info, template_alignment, struct = list(MinimalRfamParser(
                template_file, seq_constructor=ChangedSequence))[0]
    except RecordError:
        raise ValueError("Template alignment must be in Stockholm format with corresponding secondary structure annotation when using InfernalAligner.")

    moltype = self.Params['moltype']

    #Need to make separate mapping for unaligned sequences
    unaligned = SequenceCollection(candidate_sequences, MolType=moltype)
    int_map, int_keys = unaligned.getIntMap(prefix='unaligned_')
    int_map = SequenceCollection(int_map, MolType=moltype)

    #Turn on --gapthresh option in cmbuild to force alignment to full model.
    #Work on a copy so the caller's dict is not modified.
    cmbuild_params = dict(cmbuild_params) if cmbuild_params else {}
    cmbuild_params.update({'--gapthresh': 1.0})

    #record cmbuild parameters
    log_params.append('cmbuild parameters:')
    log_params.append(str(cmbuild_params))

    #Turn on --sub option in Infernal, since we know the unaligned sequences
    # are fragments.
    #Also turn on --gapthresh to use same gapthresh as was used to build
    # model. Again a copy, for the same reason as above.
    cmalign_params = dict(cmalign_params) if cmalign_params else {}
    cmalign_params.update({'--sub': True, '--gapthresh': 1.0})

    #record cmalign parameters
    log_params.append('cmalign parameters:')
    log_params.append(str(cmalign_params))

    #Align sequences to alignment including alignment gaps.
    aligned, struct_string = cmalign_from_alignment(
        aln=template_alignment,
        structure_string=struct,
        seqs=int_map,
        moltype=moltype,
        include_aln=True,
        params=cmalign_params,
        cmbuild_params=cmbuild_params)

    #Pull out original sequences from full alignment.
    infernal_aligned = {}
    aligned_dict = aligned.NamedSeqs
    for key in int_map.Names:
        infernal_aligned[int_keys.get(key, key)] = aligned_dict[key]

    #Create an Alignment object from alignment dict
    infernal_aligned = Alignment(infernal_aligned, MolType=moltype)

    if log_path is not None:
        with open(log_path, 'w') as log_file:
            log_file.write('\n'.join(log_params))

    if result_path is not None:
        with open(result_path, 'w') as result_file:
            result_file.write(infernal_aligned.toFasta())
        return None

    # returning a bound name cannot raise, so no exception handling is
    # needed around it
    return infernal_aligned
class AllTests(TestCase):
    """Checks that Alignment and DenseAlignment (and the sequence classes they
    hold) behave the same way for printing, naming, frequencies, subsetting
    and equality."""

    def setUp(self):
        """setUp method for all tests"""
        # named sequences
        self.rna1 = RnaSequence('UCAGGG', Name='rna1')
        self.rna2 = RnaSequence('YCU-RG', Name='rna2')
        self.rna3 = RnaSequence('CAA-NR', Name='rna3')
        self.model1 = ModelSequence('UCAGGG', Name='rna1',
            Alphabet=RNA.Alphabets.DegenGapped)
        self.model2 = ModelSequence('YCU-RG', Name='rna2',
            Alphabet=RNA.Alphabets.DegenGapped)
        self.model3 = ModelSequence('CAA-NR', Name='rna3',
            Alphabet=RNA.Alphabets.DegenGapped)

        self.aln = Alignment([self.rna1, self.rna2, self.rna3], MolType=RNA)
        self.da = DenseAlignment([self.model1, self.model2, self.model3],
            MolType=RNA, Alphabet=RNA.Alphabets.DegenGapped)

        # seqs no name
        self.nn_rna1 = RnaSequence('UCAGGG')
        self.nn_rna2 = RnaSequence('YCU-RG')
        self.nn_rna3 = RnaSequence('CAA-NR')

        self.nn_model1 = ModelSequence('UCAGGG',
            Alphabet=RNA.Alphabets.DegenGapped)
        self.nn_model2 = ModelSequence('YCU-RG',
            Alphabet=RNA.Alphabets.DegenGapped)
        self.nn_model3 = ModelSequence('CAA-NR',
            Alphabet=RNA.Alphabets.DegenGapped)

        self.nn_aln = Alignment([self.nn_rna1, self.nn_rna2, self.nn_rna3],
            MolType=RNA)
        self.nn_da = DenseAlignment([self.nn_model1, self.nn_model2,
            self.nn_model3], MolType=RNA, Alphabet=RNA.Alphabets.DegenGapped)

    def test_printing_named_seqs(self):
        """Printing named seqs should work the same on Aln and DenseAln"""
        #Note: the newline trailing each sequence is intentional, because
        #we want each FASTA-format record to be separated.
        exp_lines_general = ['>rna1', 'UCAGGG', '>rna2', 'YCU-RG', '>rna3',
            'CAA-NR']
        self.assertEqual(str(self.aln), '\n'.join(exp_lines_general) + '\n')
        self.assertEqual(str(self.da), '\n'.join(exp_lines_general) + '\n')

    def test_printing_unnamed_seqs(self):
        """Printing unnamed sequences should work the same on Aln and DenseAln
        """
        # the trailing newline is carried by the last expected line
        exp_lines_gen = ['>seq_0', 'UCAGGG', '>seq_1', 'YCU-RG', '>seq_2',
            'CAA-NR\n']
        self.assertEqual(str(self.nn_aln), '\n'.join(exp_lines_gen))
        self.assertEqual(str(self.nn_da), '\n'.join(exp_lines_gen))

    def test_DenseAlignment_without_moltype(self):
        """Expect MolType to be picked up from the sequences."""
        m1 = ModelSequence('UCAG', Alphabet=RNA.Alphabets.DegenGapped,
            Name='rna1')
        m2 = ModelSequence('CCCR', Alphabet=RNA.Alphabets.DegenGapped,
            Name='rna2')
        da = DenseAlignment([m1, m2])
        exp_lines = ['>rna1', 'UCAG', '>rna2', 'CCCR']
        self.assertEqual(str(da), '\n'.join(exp_lines) + '\n')

    def test_names(self):
        # Should both alignments handle names the same way?
        self.assertEqual(self.aln.Names, ['rna1', 'rna2', 'rna3'])
        self.assertEqual(self.da.Names, ['rna1', 'rna2', 'rna3'])
        # On unnamed sequences the behavior is now the same.
        self.assertEqual(self.nn_aln.Names, ['seq_0', 'seq_1', 'seq_2'])
        self.assertEqual(self.nn_da.Names, ['seq_0', 'seq_1', 'seq_2'])

    def test_seqFreqs(self):
        """seqFreqs should work the same on Alignment and DenseAlignment"""
        # Used alphabet: ('U', 'C', 'A', 'G', '-', 'B', 'D', 'H',\
        # 'K', 'M', 'N', 'S', 'R', 'W', 'V', 'Y')
        exp = [[1,1,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0],
               [1,1,0,1,1,0,0,0,0,0,0,0,1,0,0,1,0],
               [0,1,2,0,1,0,0,0,0,0,1,0,1,0,0,0,0]]
        # This works
        self.assertEqual(self.da.getSeqFreqs().Data, exp)
        # This used to raise an error, but now works
        self.assertEqual(self.aln.getSeqFreqs().Data, exp)

    def test_subset_positions_DenseAlignment(self):
        # compares takePositions / getSubAlignment on columns 0, 1, 5 with a
        # DenseAlignment built directly from the expected sequences
        model1 = ModelSequence('UCG', Name='rna1',
            Alphabet=RNA.Alphabets.DegenGapped)
        model2 = ModelSequence('YCG', Name='rna2',
            Alphabet=RNA.Alphabets.DegenGapped)
        model3 = ModelSequence('CAR', Name='rna3',
            Alphabet=RNA.Alphabets.DegenGapped)
        sub_da = DenseAlignment([model1, model2, model3],
            MolType=RNA, Alphabet=RNA.Alphabets.DegenGapped)

        full_data = array([[0,1,2,3,3,3],[15,1,0,4,12,3],[1,2,2,4,10,12]])
        sub_data = array([[0,1,3],[15,1,3],[1,2,12]])

        # First check some data
        self.assertEqual(self.da.ArraySeqs, full_data)
        self.assertEqual(self.da.ArrayPositions, transpose(full_data))
        self.assertEqual(sub_da.ArraySeqs, sub_data)
        self.assertEqual(sub_da.ArrayPositions, transpose(sub_data))

        obs_sub_da_TP = self.da.takePositions([0,1,5])
        obs_sub_da_SA = self.da.getSubAlignment(pos=[0,1,5])

        # When using the getSubAlignment method the data is right
        self.assertEqual(obs_sub_da_SA, sub_da)
        self.assertNotEqual(obs_sub_da_SA, self.da)
        self.assertEqual(obs_sub_da_SA.ArraySeqs, sub_data)
        self.assertEqual(obs_sub_da_SA.ArrayPositions, transpose(sub_data))

        # For the takePositions method: Why does this work
        self.assertEqual(obs_sub_da_TP, sub_da)
        self.assertNotEqual(obs_sub_da_TP, self.da)
        # If the data doesn't match?
        self.assertEqual(obs_sub_da_TP.ArraySeqs, sub_data)
        self.assertEqual(obs_sub_da_TP.ArrayPositions, transpose(sub_data))
        # Shouldn't the __eq__ method check the data at least?

    def test_subset_positions_Alignment(self):
        # same check as above, for the plain Alignment class
        rna1 = RnaSequence('UCG', Name='rna1')
        rna2 = RnaSequence('YCG', Name='rna2')
        rna3 = RnaSequence('CAR', Name='rna3')

        sub_aln = Alignment([rna1, rna2, rna3], MolType=RNA)

        obs_sub_aln = self.aln.takePositions([0,1,5])
        self.assertEqual(obs_sub_aln, sub_aln)
        self.assertNotEqual(obs_sub_aln, self.aln)
        # string representations should be the same. This fails right
        # now, because sequence order is not maintained. See separate test.
        self.assertEqual(str(obs_sub_aln), str(sub_aln))

    def test_takePositions_sequence_order(self):
        """Alignment takePositions should maintain seq order"""
        #This works
        self.assertEqual(self.da.Names, ['rna1','rna2','rna3'])
        sub_da = self.da.getSubAlignment(pos=[0,1,5])
        self.assertEqual(sub_da.Names, ['rna1','rna2','rna3'])
        # seq order not maintained in Alignment
        self.assertEqual(self.aln.Names, ['rna1','rna2','rna3'])
        sub_aln = self.aln.takePositions([0,1,5])
        self.assertEqual(sub_aln.Names, ['rna1','rna2','rna3'])

    def test_subset_seqs_Alignment(self):
        # takeSeqs should select the named sequences
        rna1 = RnaSequence('UCG', Name='rna1')
        rna2 = RnaSequence('YCG', Name='rna2')
        rna3 = RnaSequence('CAR', Name='rna3')

        sub_aln = Alignment([rna2, rna3], MolType=RNA)
        aln = Alignment([rna1, rna2, rna3], MolType=RNA)
        obs_sub_aln = aln.takeSeqs(['rna2','rna3'])

        self.assertEqual(obs_sub_aln, sub_aln)
        self.assertEqual(str(obs_sub_aln), str(sub_aln))

        # Selected sequences should be in specified order?
        obs_sub_aln_1 = self.aln.takeSeqs(['rna3','rna2'])
        obs_sub_aln_2 = self.aln.takeSeqs(['rna2','rna3'])
        self.assertNotEqual(str(obs_sub_aln_1), str(obs_sub_aln_2))

    def test_subset_seqs_DenseAlignment(self):
        model1 = ModelSequence('UCG', Name='rna1',
            Alphabet=RNA.Alphabets.DegenGapped)
        model2 = ModelSequence('YCG', Name='rna2',
            Alphabet=RNA.Alphabets.DegenGapped)
        model3 = ModelSequence('CAR', Name='rna3',
            Alphabet=RNA.Alphabets.DegenGapped)
        sub_da = DenseAlignment([model1, model2, model3],
            MolType=RNA, Alphabet=RNA.Alphabets.DegenGapped)

        # takeSeqs by name should have the same effect as
        # getSubAlignment by seq idx?
        obs_sub_da_TS = self.da.takeSeqs(['rna1'])
        obs_sub_da_SA = self.da.getSubAlignment(seqs=[0])

        # These two are now the same. Fixed mapping of key to char array.
        self.assertEqual(obs_sub_da_TS, obs_sub_da_SA)
        self.assertEqual(str(obs_sub_da_TS), str(obs_sub_da_SA))

    def test_aln_equality(self):
        # When does something compare equal?
        self.assertEqual(self.da == self.da, True)
        # one sequence less
        other_da1 = DenseAlignment([self.model1, self.model2],
            MolType=RNA, Alphabet=RNA.Alphabets.DegenGapped)
        self.assertEqual(self.da == other_da1, False)
        # seqs in different order -- doesn't matter
        other_da2 = DenseAlignment([self.model1, self.model3, self.model2],
            MolType=RNA, Alphabet=RNA.Alphabets.DegenGapped)
        self.assertEqual(self.da == other_da2, True)
        # seqs in different encoding -- doesn't matter, only looks at data
        other_da3 = DenseAlignment([self.model1, self.model2, self.model3])
        # Should this compare False even though the data is exactly the same?
        # The MolType is different...
        self.assertEqual(self.da == other_da3, True)
        # Row-wise check of the raw arrays. A list is built explicitly (a
        # lazy map object would always count as true) and the result goes
        # through assertTrue so the check is not stripped under -O.
        rows_equal = [alltrue(row)
                      for row in self.da.ArraySeqs == other_da3.ArraySeqs]
        self.assertTrue(alltrue(rows_equal))

    def test_seq_equality(self):
        model1 = ModelSequence('UCG', Name='rna1',
            Alphabet=RNA.Alphabets.DegenGapped)
        model2 = ModelSequence('UCG', Name='rna1',
            Alphabet=RNA.Alphabets.DegenGapped)
        # Shouldn't the above two sequences be equal?
        self.assertEqual(model1, model2)
        # string comparison is True
        self.assertEqual(str(model1), str(model2))

    def test_seq_ungapping(self):
        rna1 = RnaSequence('U-C-A-G-', Name='rna1')
        model1 = ModelSequence('U-C-A-G-', Name='rna1',
            Alphabet=RNA.Alphabets.DegenGapped)

        self.assertEqual(rna1, 'U-C-A-G-')
        self.assertEqual(rna1.degap(), 'UCAG')

        # check is produces the right string from the beginning
        self.assertEqual(str(model1), 'U-C-A-G-')
        self.assertEqual(model1._data, [0,4,1,4,2,4,3,4])
        # ModelSequence should maybe have the same degap method as normal Seq
        self.assertEqual(str(model1.degap()), 'UCAG')

    def test_the_rest_of_ModelSequence(self):
        """The class ModelSequence has 14 methods, but only 2 unittests.
        You might want to add some tests there..."""
        #note: mostly these are tested in derived classes, for convenience.
        pass
class ParsInsertTests(TestCase):
    """Tests for the ParsInsert app controller and tree insertion."""

    def setUp(self):
        """Writes the reference sequence and starting tree files and records
        them for removal in tearDown."""
        # create a list of files to cleanup
        self._paths_to_clean_up = []
        self._dirs_to_clean_up = []

        # load query seqs
        self.seqs = Alignment(MinimalFastaParser(QUERY_SEQS.split()))

        # generate temp filename
        tmp_dir = '/tmp'
        self.outfile = get_tmp_filename(tmp_dir)

        # create and write out reference sequence file
        self.outfasta = splitext(self.outfile)[0] + '.fasta'
        with open(self.outfasta, 'w') as fastaout:
            fastaout.write(REF_SEQS)
        self._paths_to_clean_up.append(self.outfasta)

        # create and write out starting tree file
        self.outtree = splitext(self.outfile)[0] + '.tree'
        with open(self.outtree, 'w') as treeout:
            treeout.write(REF_TREE)
        self._paths_to_clean_up.append(self.outtree)

    def tearDown(self):
        """cleans up all files initially created"""
        # explicit loops: map() used only for its side effects does nothing
        # where map is lazy
        for path in self._paths_to_clean_up:
            remove(path)
        for dirname in self._dirs_to_clean_up:
            rmdir(dirname)

    def test_base_command(self):
        """Base command-calls"""
        app = ParsInsert()
        self.assertEqual(app.BaseCommand,
                         ''.join(['cd "', getcwd(), '/"; ', 'ParsInsert']))

    def test_change_working_dir(self):
        """Change working dir"""
        app = ParsInsert(WorkingDir='/tmp/ParsInsertTest')
        # remove the directory even when the assertion fails
        try:
            self.assertEqual(app.BaseCommand,
                             ''.join(['cd "', '/tmp/ParsInsertTest',
                                      '/"; ', 'ParsInsert']))
        finally:
            rmtree('/tmp/ParsInsertTest')

    def test_insert_sequences_into_tree(self):
        """Inserts sequences into Tree"""
        # define log fp
        log_fp = '/tmp/parsinsert.log'
        self._paths_to_clean_up.append(log_fp)

        # define tax assignment values fp
        tax_assign_fp = '/tmp/tax_assignments.log'
        self._paths_to_clean_up.append(tax_assign_fp)

        # set the reference alignment and starting tree
        param = {
            '-t': self.outtree,
            '-s': self.outfasta,
            '-l': log_fp,
            '-o': tax_assign_fp,
        }

        seqs, align_map = self.seqs.toPhylip()

        # insert sequences into tree
        tree = insert_sequences_into_tree(seqs, DNA, params=param)

        # rename tips back to query names
        for node in tree.tips():
            if node.Name in align_map:
                node.Name = align_map[node.Name]

        self.assertEqual(tree.getNewick(with_distances=True), exp_tree)