def clustal_align_protein(rec_1, rec_2, work_dir):
    """Align the two given proteins with clustalw.

    The two records are written to ``prot-start.fasta`` inside ``work_dir``,
    clustalw is run on that file as a PROTEIN alignment that keeps the input
    order, and the resulting ``prot.aln`` file is read back.

    Returns the alignment formatted as a FASTA string.
    """
    fasta_file = op.join(work_dir, "prot-start.fasta")
    align_file = op.join(work_dir, "prot.aln")
    # Close the handle explicitly so the FASTA input is completely written
    # before the external clustalw process opens it.
    fasta_handle = open(fasta_file, "w")
    try:
        SeqIO.write((rec_1, rec_2), fasta_handle, "fasta")
    finally:
        fasta_handle.close()
    clustal_cl = Clustalw.MultipleAlignCL(fasta_file, command=CLUSTALW_BIN)
    clustal_cl.set_output(align_file, output_order="INPUT")
    clustal_cl.set_type("PROTEIN")
    Clustalw.do_alignment(clustal_cl)
    # Re-read the clustalw output through AlignIO, releasing the file as soon
    # as it has been parsed.
    aln_handle = open(clustal_cl.output_file)
    try:
        alignment = AlignIO.read(aln_handle, "clustal")
    finally:
        aln_handle.close()
    print >>sys.stderr, "\tDoing clustalw alignment: %s" % clustal_cl
    return alignment.format("fasta")
def loadAlignment( self, path ):
    """Load the alignment stored at ``path`` (a file in .aln format).

    Stores the parsed sequences in ``self.allseq``, a ``SummaryInfo`` for
    the alignment in ``self.summary`` and the alignment length in
    ``self.l``, then calls ``self.insertLoadedBioAlignment()``.
    """
    parsed = Clustalw.parse_file( path )
    self.allseq = parsed.get_all_seqs()
    self.summary = AlignInfo.SummaryInfo(parsed)
    self.l = parsed.get_alignment_length()
    self.insertLoadedBioAlignment()
def Align_Results(OutputFileName):
    """Run clustalw on the consensus FASTA file named ``OutputFileName``.

    The input ``<OutputFileName>.FASTA`` and the output
    ``<OutputFileName>.ALN`` both live in the hard-coded Consensus_Results
    directory. Both paths are printed before the alignment is run.

    Returns the alignment object produced by ``Clustalw.do_alignment``.
    """
    import os
    from Bio import Clustalw
    from Bio.Clustalw import MultipleAlignCL

    results_dir = "/users/rwbarrettemac/bioinformatics/pythonfolders/FMDanalysisScript/FMDserotypingARRAY/Consensus_Results"
    FileIN_Name = "%s/%s.FASTA" % (results_dir, OutputFileName)
    FileOUT_ALN = "%s/%s.ALN" % (results_dir, OutputFileName)
    print(FileIN_Name)
    print(FileOUT_ALN)

    cline = MultipleAlignCL(os.path.join(os.curdir, FileIN_Name))
    cline.set_output(FileOUT_ALN)
    # The command-line object only describes how to invoke clustalw; it holds
    # no open resource, so there is nothing to close afterwards.
    alignment = Clustalw.do_alignment(cline)
    return alignment
def loadAlignment( self, alignmentFile ):
    """Fill this object with the alignment read from a CLUSTAL .aln file.

    For every sequence in the file, one record holding the gapped
    (aligned) sequence is appended to ``self.alignments`` and one record
    holding the same sequence with the "-" characters removed is appended
    to ``self.sequences``. ``self.alignmentLength`` is set to the length
    of the alignment.
    """
    # ***NOTE*** the CLUSTAL parser does not handle windows line breaks well...
    parsed = Clustalw.parse_file(alignmentFile)
    records = parsed.get_all_seqs()
    self.alignmentLength = parsed.get_alignment_length()
    for rec in records:
        title = rec.description
        gapped = rec.seq.tostring()

        ungapped_entry = fasta.Record()
        aligned_entry = fasta.Record()
        ungapped_entry.title = title
        aligned_entry.title = title
        aligned_entry.sequence = gapped
        ungapped_entry.sequence = gapped.replace("-","")

        self.alignments.append( aligned_entry )
        self.sequences.append( ungapped_entry )
def align(self):
    """Aligns the sequences using CLUSTAL, storing the results.

    Does nothing when ``self.sequences`` is empty. Otherwise the sequences
    are written to ``self.tmpFileName``, clustalw (``self.clustalPath``) is
    run on that file, and the aligned records are stored in
    ``self.alignments`` with their common length in ``self.alignmentLength``.
    The temporary file is removed afterwards, even if the alignment fails.
    """
    if len(self.sequences) == 0:
        return
    self.sequencesToFile( self.tmpFileName )
    try:
        commandLine = MultipleAlignCL(os.path.join(os.curdir, self.tmpFileName), self.clustalPath)
        alignment = Clustalw.do_alignment(commandLine)
        alignmentStrings = []
        for record in alignment.get_all_seqs():
            f = fasta.Record()
            f.title = record.description.strip()
            f.sequence = record.seq.tostring()
            alignmentStrings.append( f )
        self.alignments = alignmentStrings
        self.alignmentLength = alignment.get_alignment_length()
    finally:
        # Always clean up the temporary FASTA file, including when clustalw
        # or the parsing of its output raises.
        os.remove(self.tmpFileName)
#!/usr/bin/env python """Example of generating a substitution matrix from an alignment. """ # standard library import sys # Biopython from Bio import SubsMat from Bio import Clustalw from Bio.Alphabet import IUPAC from Bio.Align import AlignInfo # get an alignment object from a Clustalw alignment output c_align = Clustalw.parse_file('protein.aln', IUPAC.protein) summary_align = AlignInfo.SummaryInfo(c_align) # get a replacement dictionary and accepted replacement matrix # exclude all amino acids that aren't charged polar replace_info = summary_align.replacement_dictionary(["G", "A", "V", "L", "I", "M", "P", "F", "W", "S", "T", "N", "Q", "Y", "C"]) my_arm = SubsMat.SeqMat(replace_info) print replace_info my_lom = SubsMat.make_log_odds_matrix(my_arm) print 'log_odds_mat:', my_lom my_lom.print_mat()
assert alignment[::-1][2].id == "mixed" del alignment del letters print "testing reading and writing clustal format..." test_dir = os.path.join(os.getcwd(), 'Clustalw') test_names = ['opuntia.aln', 'cw02.aln'] test_files = [] for name in test_names: test_files.append(os.path.join(test_dir, name)) for test_file in test_files: # parse the alignment file and get an aligment object alignment = Clustalw.parse_file(test_file) # print the alignment back out print alignment alignment = Clustalw.parse_file(os.path.join(test_dir, test_names[0])) # test the base alignment stuff print 'all_seqs...' for seq_record in alignment: print 'description:', seq_record.description print 'seq:', repr(seq_record.seq) print 'length:', alignment.get_alignment_length() print 'Calculating summary information...' align_info = AlignInfo.SummaryInfo(alignment)
# biopython from Bio.Alphabet import IUPAC from Bio import Clustalw from Bio.Clustalw import MultipleAlignCL from Bio.Align import AlignInfo from Bio.SubsMat import FreqTable # create the command line to run clustalw # this assumes you've got clustalw somewhere on your path, otherwise # you need to pass a second argument to MultipleAlignCL with the complete # path to clustalw cline = MultipleAlignCL(os.path.join(os.curdir, 'opuntia.fasta')) cline.set_output('test.aln') # actually perform the alignment and get back an alignment object alignment = Clustalw.do_alignment(cline) # get the records in the alignment all_records = alignment.get_all_seqs() print 'description:', all_records[0].description print 'sequence:', all_records[0].seq # get the length of the alignment print 'length', alignment.get_alignment_length() print alignment # print out interesting information about the alignment summary_align = AlignInfo.SummaryInfo(alignment)
# Check Bio.AlignIO.read(...) alignment = AlignIO.read(handle=open(t_filename), format="clustal") assert isinstance(alignment, Alignment) assert compare(alignment, alignments[0]) print "Using Bio.AlignIO.read(...)" #print "~" * 75 #handle = StringIO() #AlignIO.write([alignment], handle, "clustal") #handle.seek(0) #print handle.read() #print "~" * 75 print "Using Bio.Clustalw.parse_file(...)" c_alignment = Clustalw.parse_file(t_filename) assert isinstance(c_alignment, Alignment) assert isinstance(c_alignment, Clustalw.ClustalAlignment) #print " Using Bio.Clustalw.parse_file(...)" #print "~" * 75 #print c_alignment #print "~" * 75 #print # Compare the two... assert compare(alignment, c_alignment) # Check Bio.AlignIO can read the Bio.Clustalw's string output n_alignment = AlignIO.read(StringIO(str(c_alignment)), "clustal") assert isinstance(alignment, Alignment)
assert alignment[::-1][2].id == "mixed" del alignment del letters print "testing reading and writing clustal format..." test_dir = os.path.join(os.getcwd(), 'Clustalw') test_names = ['opuntia.aln', 'cw02.aln'] test_files = [] for name in test_names: test_files.append(os.path.join(test_dir, name)) for test_file in test_files: # parse the alignment file and get an aligment object alignment = Clustalw.parse_file(test_file) # print the alignment back out print alignment alignment = Clustalw.parse_file(os.path.join(test_dir, test_names[0])) # test the base alignment stuff print 'all_seqs...' all_seqs = alignment.get_all_seqs() for seq_record in all_seqs: print 'description:', seq_record.description print 'seq:', repr(seq_record.seq) print 'length:', alignment.get_alignment_length() print 'Calculating summary information...'
if not clustalw_exe: raise MissingExternalDependencyError(\ "Install clustalw or clustalw2 if you want to use Bio.Clustalw.") ################################################################# print "Checking error conditions" print "=========================" print "Empty file" input_file = "does_not_exist.fasta" assert not os.path.isfile(input_file) cline = MultipleAlignCL(input_file, command=clustalw_exe) try: align = Clustalw.do_alignment(cline) assert False, "Should have failed, returned %s" % repr(align) except IOError, err: print "Failed (good)" #Python 2.3 on Windows gave (0, 'Error') #Python 2.5 on Windows gives [Errno 0] Error assert "Cannot open sequence file" in str(err) \ or "not produced" in str(err) \ or str(err) == "[Errno 0] Error" \ or str(err) == "(0, 'Error')", str(err) print print "Single sequence" input_file = "Fasta/f001" assert os.path.isfile(input_file) assert len(list(SeqIO.parse(input_file, "fasta"))) == 1
#!/usr/bin/env python """Example of generating a substitution matrix from an alignment. """ # standard library import sys # Biopython from Bio import SubsMat from Bio import Clustalw from Bio.Alphabet import IUPAC from Bio.Align import AlignInfo # get an alignment object from a Clustalw alignment output c_align = Clustalw.parse_file("protein.aln", IUPAC.protein) summary_align = AlignInfo.SummaryInfo(c_align) # get a replacement dictionary and accepted replacement matrix # exclude all amino acids that aren't charged polar replace_info = summary_align.replacement_dictionary( ["G", "A", "V", "L", "I", "M", "P", "F", "W", "S", "T", "N", "Q", "Y", "C"] ) my_arm = SubsMat.SeqMat(replace_info) print (replace_info) my_lom = SubsMat.make_log_odds_matrix(my_arm) print "log_odds_mat:", my_lom my_lom.print_mat()