def getFastaEntry(self, entryNum, resultFolder):
    """Write one entry of the alignment as an ungapped FASTA query file.

    Reads the first alignment found in ``self.pathToFasta``, takes the
    record at the 1-based position ``entryNum``, removes every "-" from its
    sequence and writes the result to ``<resultFolder>/Query_<entryNum>.faa``.

    :param entryNum: 1-based position of the record to extract.
    :param resultFolder: folder in which the query file is written.
    :return: path of the written file, or None when ``entryNum`` does not
        address a record of the alignment.
    """
    fastaFile = self.pathToFasta
    from Bio import AlignIO, SeqIO
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Alphabet import IUPAC, Gapped

    alignmentIterator = AlignIO.parse(
        fastaFile, "fasta",
        alphabet=Gapped(IUPAC.ExtendedIUPACProtein(), "-"))
    queryFasta = resultFolder + "/" + "Query_%d.faa" % (entryNum, )
    # The next() builtin works on Python 2.6+ and 3; iterator.next() exists
    # on Python 2 only.
    alignment = next(alignmentIterator)
    # Positions below 1 would turn into a negative index and silently pick a
    # record counted from the end, so they are rejected like too-large ones.
    if entryNum < 1 or entryNum > len(alignment):
        return None

    record = alignment[entryNum - 1]
    desiredSeqString = str(record.seq).replace("-", "")
    seqNoGaps = Seq(desiredSeqString, alphabet=IUPAC.ExtendedIUPACProtein())
    seqRecNoGaps = SeqRecord(seq=seqNoGaps, id=record.id)
    SeqIO.write(seqRecNoGaps, queryFasta, "fasta")
    return queryFasta
def writeNoGaps(self, outputfile):
    """Write every record of ``self.pathToFasta`` without gap characters.

    Each record is read from the FASTA file, all "-" characters are removed
    from its sequence, and the resulting records (keeping only the original
    id) are written together to ``outputfile`` in FASTA format.

    :param outputfile: path of the FASTA file to write.
    """
    fastaFile = self.pathToFasta
    from Bio import AlignIO, SeqIO
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Alphabet import IUPAC, Gapped

    def stripGaps(gappedRecord):
        # Rebuild the record around the sequence text with the gaps removed.
        ungappedText = str(gappedRecord.seq).replace("-", "")
        ungappedSeq = Seq(ungappedText,
                          alphabet=IUPAC.ExtendedIUPACProtein())
        return SeqRecord(seq=ungappedSeq, id=gappedRecord.id)

    gappedRecords = SeqIO.parse(
        fastaFile, "fasta",
        alphabet=Gapped(IUPAC.ExtendedIUPACProtein(), "-"))
    ungappedRecords = [stripGaps(entry) for entry in gappedRecords]
    SeqIO.write(ungappedRecords, outputfile, "fasta")
def setUp(self):
    """Build one codon alignment per test fixture and store them on self.

    Each fixture is indexed as ``fixture[0]`` (the file paths) and
    ``fixture[1]`` (the mode: 'parse', 'index' or 'id'), which selects how
    the nucleotide file is opened and how ``codonalign.build`` is called.
    The results are stored, in fixture order, in ``self.alns``.
    """
    self.aln_file = [TEST_ALIGN_FILE1,
                     TEST_ALIGN_FILE2,
                     TEST_ALIGN_FILE3,
                     TEST_ALIGN_FILE4,
                     TEST_ALIGN_FILE5,
                     TEST_ALIGN_FILE6]
    codon_alignments = []
    for fixture in self.aln_file:
        paths = fixture[0]
        mode = fixture[1]
        if mode == 'parse':
            nucl = SeqIO.parse(paths[0], 'fasta',
                               alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = codonalign.build(
                    prot, nucl,
                    alphabet=codonalign.default_codon_alphabet)
        elif mode == 'index':
            # Deliberately using a fancy protein alphabet for testing:
            nucl = SeqIO.index(paths[0], 'fasta',
                               alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(
                paths[1], 'clustal',
                alphabet=Gapped(IUPAC.ExtendedIUPACProtein()))
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = codonalign.build(
                    prot, nucl,
                    alphabet=codonalign.default_codon_alphabet,
                    max_score=20)
        elif mode == 'id':
            nucl = SeqIO.parse(paths[0], 'fasta',
                               alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
            # The third file holds two whitespace-separated columns per line;
            # the first column is mapped to the second.
            correspondence = {}
            with open(paths[2]) as handle:
                for line in handle:
                    columns = line.split()
                    correspondence[columns[0]] = columns[1]
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = codonalign.build(
                    prot, nucl, corr_dict=correspondence,
                    alphabet=codonalign.default_codon_alphabet)
        codon_alignments.append(caln)
        nucl.close()  # Close the indexed FASTA file
    self.alns = codon_alignments
def testLimit(self, list_seqs, start):
    """Flag, per amino-acid column of the window, whether it is conserved.

    The window starts at nucleotide index ``start``; the translated
    alignment matching its reading frame (``start % 3``) is sliced to the
    window and restricted to the sequences listed in ``list_seqs``.

    list_seqs is the list of sequence id in the alignment (not the id
    associated with the Bio.Seq object).
    start is the index of the start of the window.

    Returns a list with one boolean per amino-acid column: True when the
    count of the most common residue divided by ``len(list_seqs)`` is at
    least ``self.min_aa_ratio``.
    """
    frame = start % 3
    aa_window_length = int(self.window_length / 3)
    begin = int((start - frame) / 3)
    end = int(begin + aa_window_length)

    # Pick the translation that was made in the same frame as the window.
    t_align = (self.t_align0 if frame == 0
               else self.t_align1 if frame == 1
               else self.t_align2)

    sub_align = MulAlign([], Gapped(IUPAC.ExtendedIUPACProtein(), "N"))
    for seq_index in list_seqs:
        sub_align.append(t_align[seq_index][begin:end])

    conserved = []
    for column in range(aa_window_length):
        residue_counts = Counter(sub_align[:, column])
        # count the most common aa
        top_count = residue_counts.most_common(1)[0][1]
        # NOTE(review): this is a true division only under Python 3 (or with
        # "from __future__ import division") - confirm the target version.
        conserved.append(
            bool(top_count / len(list_seqs) >= self.min_aa_ratio))
    return conserved
def generateFastaWithOutEntry(self, entryNumToRemove, resultFolder):
    """Write a copy of the alignment that leaves one entry out.

    Reads the first alignment found in ``self.pathToFasta`` and writes all
    of its records except the one at the 1-based position
    ``entryNumToRemove`` to ``<resultFolder>/WithOutEntry_<n>.faa``.
    When the position does not address a record, every record is kept.

    :param entryNumToRemove: 1-based position of the record to drop
        (converted with ``int``).
    :param resultFolder: folder in which the new alignment is written.
    :return: path of the written file.
    """
    fastaFile = self.pathToFasta
    entryToTest = int(entryNumToRemove)
    from Bio import AlignIO
    from Bio.Alphabet import IUPAC, Gapped
    from Bio.Align import MultipleSeqAlignment

    alignmentIterator = AlignIO.parse(
        fastaFile, "fasta",
        alphabet=Gapped(IUPAC.ExtendedIUPACProtein(), "-"))
    # The next() builtin works on Python 2.6+ and 3; iterator.next() exists
    # on Python 2 only.
    alignment = next(alignmentIterator)
    pathToNewFile = resultFolder + "/" + "WithOutEntry_%d.faa" % (
        entryToTest, )

    # Here we remove the desired element (positions are 1-based for callers,
    # 0-based in the alignment).
    indexToSkip = entryToTest - 1
    keptRecords = [record for index, record in enumerate(alignment)
                   if index != indexToSkip]
    newAlignment = MultipleSeqAlignment(keptRecords)
    AlignIO.write(newAlignment, pathToNewFile, "fasta")
    return pathToNewFile
def writeFastas(self, pathToCompleteFastaAlignment, outputPath):
    """
    Writes individual clades as fasta alignments, where the files are stored
    in the output path and the name of each file is composed of
    <clade_name>.fas .

    Data is obtained from the complete fasta alignment provided
    (pathToCompleteFastaAlignment) and the mappings given to the constructor.
    The mappings are a dictionary of fasta ids to clade names. Elements in
    the alignment not present in the mappings won't end in any clade fasta
    file.

    :param pathToCompleteFastaAlignment: path of the FASTA alignment that
        holds the records of all clades.
    :param outputPath: folder in which one file per clade is created.
    """
    from Bio import SeqIO, AlignIO
    from Bio.Alphabet import IUPAC, Gapped

    try:
        # create file handles for each clade; several ids share a clade, so
        # the names are de-duplicated to open every file exactly once, and
        # the files are opened for writing because records are written below.
        for cladeName in set(self._mappings.values()):
            self._clade2fileHandle[cladeName] = open(
                outputPath + "/" + cladeName + ".fas", "w")

        # read complete alignment
        alignment = AlignIO.read(
            pathToCompleteFastaAlignment, "fasta",
            alphabet=Gapped(IUPAC.ExtendedIUPACProtein(), "-"))
        for record in alignment:
            # Records without a mapping belong to no clade and are skipped,
            # as the docstring promises.
            if record.id not in self._mappings:
                continue
            handle = self._clade2fileHandle[self._mappings[record.id]]
            SeqIO.write(record, handle, "fasta")
    finally:
        # Close whatever was opened, also when reading or writing failed.
        for handle in self._clade2fileHandle.values():
            handle.close()
def _guess_consensus_alphabet(self, ambiguous):
    """Pick an (ungapped) alphabet for an alignment consensus sequence (PRIVATE).

    Starts from the un-gapped alphabet of the alignment, checks that every
    record's un-gapped alphabet is an instance of the same class, and falls
    back to a more generic alphabet when the ``ambiguous`` character is not
    one of the alphabet's letters.

    Raises ValueError when a record has an incompatible alphabet.
    """
    # Start with the (un-gapped version of) the alignment alphabet
    base = Alphabet._get_base_alphabet(self.alignment._alphabet)

    # Now check its compatible with all the rest of the sequences
    for record in self.alignment:
        # Get the (un-gapped version of) the sequence's alphabet
        record_base = Alphabet._get_base_alphabet(record.seq.alphabet)
        if not isinstance(record_base, base.__class__):
            raise ValueError(
                "Alignment contains a sequence with an incompatible alphabet."
            )

    # Nothing to widen when the alphabet defines no letters, or when the
    # ambiguous character is already one of them.
    has_letters = hasattr(base, "letters") and base.letters is not None
    if not has_letters or ambiguous in base.letters:
        return base

    # We'll need to pick a more generic alphabet...
    # NOTE(review): the DNA and RNA branches test the ambiguous character
    # against the same unambiguous alphabet class they matched on - confirm
    # whether the ambiguous IUPAC alphabets were intended.
    if isinstance(base, IUPAC.IUPACUnambiguousDNA):
        candidate = IUPAC.IUPACUnambiguousDNA()
        if ambiguous in candidate.letters:
            return candidate
        return Alphabet.generic_dna
    if isinstance(base, IUPAC.IUPACUnambiguousRNA):
        candidate = IUPAC.IUPACUnambiguousRNA()
        if ambiguous in candidate.letters:
            return candidate
        return Alphabet.generic_rna
    if isinstance(base, IUPAC.IUPACProtein):
        candidate = IUPAC.ExtendedIUPACProtein()
        if ambiguous in candidate.letters:
            return candidate
        return Alphabet.generic_protein
    return Alphabet.single_letter_alphabet
def test_translate(self):
    """Test that a dna open reading frame is translated correctly."""
    orf = 'ATGTGGAGACGGAAACATCCGAGGACATCCGGAGGAACCCGGGGAGTTCTGAGTGGTAATTAG'
    expected_protein = Seq('MWRRKHPRTSGGTRGVLSGN*',
                           HasStopCodon(IUPAC.ExtendedIUPACProtein(), '*'))
    result_protein = translate(orf)
    self.assertEqual(result_protein, expected_protein)
    # 63 nucleotides give 21 codons, the last one being the stop codon.
    self.assertEqual(len(result_protein), 21)
    # assertIsInstance reports the actual type on failure, unlike comparing
    # the result of isinstance() with True.
    self.assertIsInstance(result_protein, Seq)
def translate(self, align, offset):
    """
    Translate the alignment according to the selected frame which is set
    according to 'offset' value.

    Every record is upper-cased, its "-" characters are replaced by "N",
    and the slice from ``offset`` up to the last complete codon is
    translated. The translated records keep the original name and id and
    are returned in a new alignment.
    """
    # Last index that still closes a whole codon when starting at offset.
    codon_count = (align.get_alignment_length() - offset) // 3
    end = codon_count * 3 + offset

    t_align = MulAlign([], Gapped(IUPAC.ExtendedIUPACProtein(), "N"))
    for rec in align:
        nucleotides = str(rec.seq).upper()
        nucleotides = nucleotides.replace("-", "N")
        # No lowercase "n" can remain after upper(); kept as in the original.
        nucleotides = nucleotides.replace("n", "N")
        dna = Seq(nucleotides, IUPAC.IUPACAmbiguousDNA())
        protein = dna[offset:end].translate()
        t_align.append(
            SeqRecord(protein, name=rec.name, id=rec.id, description=""))
    return t_align
def process_upload(sequences, format, request):
    """Classify the first uploaded FASTA sequence and remember it in the session.

    :param sequences: the upload; a file-like object when ``format`` is
        "file", or the FASTA text itself when ``format`` is "text".
    :param format: either "file" or "text".
    :param request: object whose ``session`` mapping receives the
        "uploaded_sequences" entry.
    :return: list of seven items: the sequence id, ``classifications[0][1]``,
        ``classifications[0][2]`` (None when it equals "Unknown"), the rows
        from ``upload_hmmer``, the first item returned by ``upload_blastp``,
        and the "id" and "variant" of that item's first element.
    :raises InvalidFASTA: for an unknown format, when nothing can be parsed,
        or when the sequence fails the alphabet check.
    """
    if format not in ["file", "text"]:
        raise InvalidFASTA(
            "Invalid format: {}. Must be either 'file' or 'text'.".format(
                format))

    if format == "text":
        # Wrap the raw text in an in-memory file so it parses like an upload.
        text_buffer = io.BytesIO()
        text_buffer.write(sequences)
        text_buffer.seek(0)
        sequences = text_buffer

    parsed = SeqIO.parse(sequences, "fasta", IUPAC.ExtendedIUPACProtein())

    # Only the first record of the upload is used.
    try:
        sequence = next(parsed)
    except StopIteration:
        raise InvalidFASTA("No sequences parsed.")

    if not Alphabet._verify_alphabet(sequence.seq):
        raise InvalidFASTA("Sequence {} is not a protein.".format(sequence.id))

    query_id = str(sequence.id)
    classifications, _, rows = upload_hmmer(sequence)
    top_classification = classifications[0]
    variant = top_classification[1]
    secondary_classification = top_classification[2]
    if secondary_classification == "Unknown":
        secondary_classification = None

    blast_hits = upload_blastp(sequence)[0]
    best_hit = blast_hits[0]

    result = [
        query_id,
        variant,
        secondary_classification,
        rows,
        blast_hits,
        best_hit["id"],
        best_hit["variant"],
    ]

    request.session["uploaded_sequences"] = [{
        "id": "QUERY",
        "variant": variant,
        "sequence": str(sequence.seq),
        "taxonomy": best_hit["taxonomy"]
    }]

    return result
#!/usr/bin/env python import argparse from Bio import SeqIO from Bio.Alphabet import IUPAC import os import subprocess parser = argparse.ArgumentParser( description="Zhiping's indexing solution for counting unique peptides") parser.parse_args() aa_letters = IUPAC.ExtendedIUPACProtein().letters for aa1 in aa_letters: for aa2 in aa_letters: # Make indexed directories with first and second amino acids if not os.path.exists(aa1 + aa2): os.makedirs(aa1 + aa2) os.chdir(aa1 + aa2) # Write script for cluster SRC = open(aa1 + aa2 + "_hash_count.py", 'w') SRC.write( "from Bio import SeqIO\n" + "import sys\n" + "import os\n" + "# Flush STOUT continuously\n" + "sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)\n" + "OUT = open('" + aa1 + aa2 + "_peptides.txt', 'w')\n" + "counter = 0\n" + "hash_count = {}\n" "# Loop through records\n" + "for seq_record in SeqIO.parse('../../nr.fasta', 'fasta'):\n" + " if counter%1000000 == 0:\n" + " print 'On seq ' + str(counter) + '\\n'\n" + " for i in range(len(seq_record.seq) - 14 + 1):\n" +
from Bio.Align import MultipleSeqAlignment
from Bio.Alphabet import IUPAC, Gapped
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Script fragment: os, sys, AlignIO and AlignInfo are used below but not
# imported in this span - presumably imported earlier in the file (TODO
# confirm).

# sys.argv[1] is the directory to work in; its entries are listed below.
os.chdir(sys.argv[1])
listing = os.listdir(".")
consensus = {}
genConsensus = ''
pssmGen = ''
# Threshold passed to gap_consensus() for the general alignment.
consensusThres = 0.7
#generalAlignment = AlignIO.parse(sys.argv[2],"fasta",alphabet=IUPAC.ExtendedIUPACProtein())
# sys.argv[2] is the path of the general alignment, read as gapped protein.
generalAlignment = AlignIO.parse(sys.argv[2], "fasta", alphabet=Gapped(
    IUPAC.ExtendedIUPACProtein(), "-"))
lengthGenAl = 0
# Each alignment in the file overwrites the consensus, the score matrix and
# the length, so the values of the last alignment are the ones kept.
for genAlignment in generalAlignment:
    sumGen = AlignInfo.SummaryInfo(genAlignment)
    genConsensus = sumGen.gap_consensus(consensusThres)
    #pssmGen = sumGen.pos_specific_score_matrix(genConsensus,chars_to_ignore = ['-'])
    pssmGen = sumGen.pos_specific_score_matrix(genConsensus)
    lengthGenAl = len(genAlignment)

# Only directory entries ending in ".fas" are parsed as alignments.
for item in listing:
    if item.endswith(".fas"):
        #alignments = AlignIO.parse(item,"fasta",alphabet=IUPAC.ExtendedIUPACProtein())
        alignments = AlignIO.parse(item, "fasta", alphabet=Gapped(
            IUPAC.ExtendedIUPACProtein(), "-"))
__author__ = "pmoreno"
__date__ = "$May 29, 2011 5:16:28 PM$"

if __name__ == "__main__":
    # Command line arguments, in order: the clade alignment without the
    # general signal, the clade alignment, the 1-based entry to test and the
    # folder that receives the result files.
    fastaFileCladeNoGeneralSignal = sys.argv[1]
    fastaFileClade = sys.argv[2]
    entryToTest = int(sys.argv[3])
    resultFolder = sys.argv[4]

    from Bio import AlignIO, SeqIO
    from Bio.Alphabet import IUPAC, Gapped
    from Bio.Align import MultipleSeqAlignment
    # Seq and SeqRecord are used below; importing them here keeps this block
    # independent of imports made elsewhere in the file.
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord

    alignmentNoGenSignalIterator = AlignIO.parse(
        fastaFileCladeNoGeneralSignal, "fasta",
        alphabet=Gapped(IUPAC.ExtendedIUPACProtein(), "-"))
    alignmentIterator = AlignIO.parse(
        fastaFileClade, "fasta",
        alphabet=Gapped(IUPAC.ExtendedIUPACProtein(), "-"))

    # The next() builtin works on Python 2.6+ and 3; iterator.next() exists
    # on Python 2 only.
    noGenSignalAlignment = next(alignmentNoGenSignalIterator)

    queryFasta = resultFolder+"/"+"Query_%d.faa" % (entryToTest,)
    ownCladeProfile = resultFolder+"/"+"ForOwnCladeProfile_%d.faa" % (entryToTest,)

    alignmentWithSignal = next(alignmentIterator)
    # The query is the tested entry of the alignment with its gaps removed.
    desiredSeqString = str(alignmentWithSignal[entryToTest-1].seq)
    desiredSeqString = desiredSeqString.replace("-", "")
    seqNoGaps = Seq(desiredSeqString, alphabet=IUPAC.ExtendedIUPACProtein())
    seqRecNoGaps = SeqRecord(seq=seqNoGaps, id=alignmentWithSignal[entryToTest-1].id)
'TTG': 'L', 'TAC': 'Y', 'TAT': 'Y', 'TAA': '_', 'TAG': 'O', 'TGC': 'C', 'TGT': 'C', 'TGA': 'U', 'TGG': 'W', } tt_flip = flip_trans_table(tt_11) matrix = matlist.blosum62 msa_alphabet = AlphabetEncoder(IUPAC.ExtendedIUPACProtein(), '-.') alignments = AlignIO.read(align_in, 'fasta', alphabet=msa_alphabet) # print(len(list(alignments))) # number of orthologs use in final msa uid_pattern = re.compile(r'uid=(\S+?);') tax_id_pattern = re.compile(r'tax_id=(\d+)') # name outfile with uid of gene of interest as has been convention; expects it to be the first gene in the msa uid = re.search(uid_pattern, alignments[0].id).group(1) bad_codons = ['...', '---', 'NNN'] # codons that will not receive scores # get disorder for each sequence in the msa # make this a function later disorder_strength = [ ] # list of numerical disorder strength score for each aa seq in msa (with gaps)
def run(self, consensusThreshold): from Bio import AlignIO, SeqIO from Bio.Align import AlignInfo # from Bio.Align import MultipleSeqAlignment from Bio.Alphabet import IUPAC, Gapped # from Bio.Seq import Seq # from Bio.SeqRecord import SeqRecord # Directory where files are # os.chdir(sys.argv[1]) # listing = os.listdir(".") listing = os.listdir(self.pathToCladesAlignments) consensus = {} genConsensus = '' pssmGen = '' # this value should be read from the arguments or else use a default consensusThres = consensusThreshold # sys.argv[2] holds the path to the general alignment generalAlignment = AlignIO.parse(self.generalAlignment, "fasta", alphabet=Gapped( IUPAC.ExtendedIUPACProtein(), "-")) lengthGenAl = 0 positionsToMask = [] for genAlignment in generalAlignment: sumGen = AlignInfo.SummaryInfo(genAlignment) genConsensus = sumGen.gap_consensus(consensusThres) for index, residue in enumerate(genConsensus): if genConsensus[index] == '-': continue if genConsensus[index] == 'X': continue positionsToMask.append(index) #pssmGen = sumGen.pos_specific_score_matrix(genConsensus,chars_to_ignore = ['-']) pssmGen = sumGen.pos_specific_score_matrix(genConsensus) lengthGenAl = len(genAlignment) print positionsToMask print listing resultAlignFiles = [] for item in listing: if item.endswith(".fas"): #alignments = AlignIO.parse(item,"fasta",alphabet=IUPAC.ExtendedIUPACProtein()) alignments = AlignIO.parse(self.pathToCladesAlignments + item, "fasta", alphabet=Gapped( IUPAC.ExtendedIUPACProtein(), "-")) for alignment in alignments: summ = AlignInfo.SummaryInfo(alignment) consensus[item] = summ.gap_consensus(consensusThres) for posToMask in positionsToMask: if consensus[item][posToMask] == '-': continue for alignElement in alignment: mutSeq = alignElement.seq.tomutable() mutSeq[posToMask] = 'X' alignElement.seq = mutSeq.toseq() SeqIO.write( alignment, self.outPutPath + item + "_noPKSsignal_Thres%d.faa" % (consensusThres * 100, ), "fasta") resultAlignFiles.append(self.outPutPath + item 
+ "_noPKSsignal_Thres%d.faa" % (consensusThres * 100, )) summ = AlignInfo.SummaryInfo(alignment) consensus[item] = summ.gap_consensus(consensusThres) print item, consensus[item] return resultAlignFiles