Exemple #1
0
    def getFastaEntry(self, entryNum, resultFolder):
        """Write one entry of the alignment, with gaps removed, to its own FASTA file.

        The first alignment found in ``self.pathToFasta`` is read and the
        record at the 1-based position ``entryNum`` is written, stripped of
        ``-`` characters, to ``<resultFolder>/Query_<entryNum>.faa``.

        :param entryNum: 1-based index of the record to extract.
        :param resultFolder: folder in which the query file is created.
        :return: path of the written file, or ``None`` when ``entryNum`` does
            not designate a record of the alignment (below 1, beyond the last
            record, or the input holds no alignment at all).
        """
        # Entries are numbered from 1.  Without this guard 0 and negative
        # values would silently pick records counted from the end of the
        # alignment (``alignment[-1]``, ...) instead of being rejected.
        if entryNum < 1:
            return None

        fastaFile = self.pathToFasta

        from Bio import AlignIO, SeqIO
        from Bio.Seq import Seq
        from Bio.SeqRecord import SeqRecord
        from Bio.Alphabet import IUPAC, Gapped

        alignmentIterator = AlignIO.parse(fastaFile,
                                          "fasta",
                                          alphabet=Gapped(
                                              IUPAC.ExtendedIUPACProtein(),
                                              "-"))
        # The ``next()`` builtin works on Python 2.6+ and Python 3, unlike
        # the Python-2-only ``iterator.next()`` method.  The default turns an
        # empty input into the same "no such entry" answer as a bad index.
        alignment = next(alignmentIterator, None)
        if alignment is None or entryNum > len(alignment):
            return None

        record = alignment[entryNum - 1]
        ungappedResidues = str(record.seq).replace("-", "")
        seqNoGaps = Seq(ungappedResidues,
                        alphabet=IUPAC.ExtendedIUPACProtein())
        seqRecNoGaps = SeqRecord(seq=seqNoGaps, id=record.id)

        queryFasta = resultFolder + "/" + "Query_%d.faa" % (entryNum, )
        SeqIO.write(seqRecNoGaps, queryFasta, "fasta")
        return queryFasta
Exemple #2
0
    def writeNoGaps(self, outputfile):
        """Copy every record of ``self.pathToFasta`` to ``outputfile`` without gaps.

        Each sequence is read as a gapped extended-IUPAC protein, its ``-``
        characters are dropped, and the resulting records (keeping only the
        original ids) are written together as FASTA.

        :param outputfile: destination handed to ``SeqIO.write``.
        """
        source = self.pathToFasta

        from Bio import AlignIO, SeqIO
        from Bio.Seq import Seq
        from Bio.SeqRecord import SeqRecord
        from Bio.Alphabet import IUPAC, Gapped

        def withoutGaps(gappedRecord):
            # Rebuild the record from the residue string minus gap symbols.
            residues = str(gappedRecord.seq).replace("-", "")
            return SeqRecord(
                seq=Seq(residues, alphabet=IUPAC.ExtendedIUPACProtein()),
                id=gappedRecord.id)

        gappedRecords = SeqIO.parse(
            source,
            "fasta",
            alphabet=Gapped(IUPAC.ExtendedIUPACProtein(), "-"))
        ungappedRecords = [withoutGaps(entry) for entry in gappedRecords]

        SeqIO.write(ungappedRecords, outputfile, "fasta")
Exemple #3
0
 def setUp(self):
     """Build a codon alignment for every fixture and store them in ``self.alns``.

     Each fixture is indexed as ``(paths, mode)``; ``mode`` selects how the
     nucleotide FASTA is loaded ('parse', 'index') or whether an explicit
     protein-id to nucleotide-id table is supplied ('id').
     """
     self.aln_file = [
         TEST_ALIGN_FILE1, TEST_ALIGN_FILE2, TEST_ALIGN_FILE3,
         TEST_ALIGN_FILE4, TEST_ALIGN_FILE5, TEST_ALIGN_FILE6,
     ]
     built = []
     for fixture in self.aln_file:
         paths, mode = fixture[0], fixture[1]
         if mode == 'parse':
             nucl = SeqIO.parse(paths[0], 'fasta',
                                alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(paths[1], 'clustal',
                                 alphabet=IUPAC.protein)
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
                 caln = codonalign.build(
                     prot, nucl,
                     alphabet=codonalign.default_codon_alphabet)
         elif mode == 'index':
             # The richer protein alphabet is used here on purpose, to
             # exercise it in the tests.
             nucl = SeqIO.index(paths[0], 'fasta',
                                alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(
                 paths[1], 'clustal',
                 alphabet=Gapped(IUPAC.ExtendedIUPACProtein()))
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
                 caln = codonalign.build(
                     prot, nucl,
                     alphabet=codonalign.default_codon_alphabet,
                     max_score=20)
         elif mode == 'id':
             nucl = SeqIO.parse(paths[0], 'fasta',
                                alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(paths[1], 'clustal',
                                 alphabet=IUPAC.protein)
             # First two whitespace-separated fields of each line form one
             # key/value pair of the correspondence table.
             with open(paths[2]) as handle:
                 corr = dict((fields[0], fields[1])
                             for fields in (line.split() for line in handle))
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
                 caln = codonalign.build(
                     prot, nucl, corr_dict=corr,
                     alphabet=codonalign.default_codon_alphabet)
         built.append(caln)
         nucl.close()  # Releases the indexed FASTA file (or the parser generator)
     self.alns = built
Exemple #4
0
    def testLimit(self, list_seqs, start):
        """Tell, for each amino-acid column of the window, whether it is conserved.

        The nucleotide position ``start`` selects the reading frame
        (``start % 3``) and therefore which pre-translated alignment
        (``self.t_align0`` / ``t_align1`` / ``t_align2``) is inspected.

        :param list_seqs: indices of the sequences in the alignment (not the
            ids associated with the Bio.Seq objects).
        :param start: index of the start of the window, in nucleotides.
        :return: list with one boolean per amino-acid column of the window;
            ``True`` when the share of the most common residue among the
            selected sequences is at least ``self.min_aa_ratio``.
        """
        frame = start % 3
        aa_window_length = int(self.window_length / 3)
        begin = int((start - frame) / 3)
        end = int(begin + aa_window_length)

        if frame == 0:
            t_align = self.t_align0
        elif frame == 1:
            t_align = self.t_align1
        else:
            t_align = self.t_align2

        sub_align = MulAlign([], Gapped(IUPAC.ExtendedIUPACProtein(), "N"))
        for idx in list_seqs:
            sub_align.append(t_align[idx][begin:end])

        # Force true division: with plain ``/`` Python 2 would truncate the
        # ratio of two ints to 0 or 1 and the threshold test would be wrong.
        nbr_seqs = float(len(list_seqs))

        result = []
        for column in range(aa_window_length):
            residue_counts = Counter(sub_align[:, column])
            # Occurrences of the most frequent amino acid in this column.
            nbr_most_common = residue_counts.most_common(1)[0][1]
            result.append(nbr_most_common / nbr_seqs >= self.min_aa_ratio)
        return result
Exemple #5
0
    def generateFastaWithOutEntry(self, entryNumToRemove, resultFolder):
        """Write a copy of the alignment that omits one entry.

        The first alignment of ``self.pathToFasta`` is read and every record
        except the one at the 1-based position ``entryNumToRemove`` is written
        to ``<resultFolder>/WithOutEntry_<entryNumToRemove>.faa``.  When the
        position matches no record, the alignment is copied unchanged.

        :param entryNumToRemove: 1-based index of the record to leave out
            (anything ``int()`` accepts).
        :param resultFolder: folder in which the new file is created.
        :return: path of the written file.
        """
        fastaFile = self.pathToFasta
        entryToTest = int(entryNumToRemove)

        from Bio import AlignIO
        from Bio.Alphabet import IUPAC, Gapped
        from Bio.Align import MultipleSeqAlignment

        alignmentIterator = AlignIO.parse(fastaFile,
                                          "fasta",
                                          alphabet=Gapped(
                                              IUPAC.ExtendedIUPACProtein(),
                                              "-"))

        # ``next()`` builtin instead of the Python-2-only ``.next()`` method.
        alignment = next(alignmentIterator)
        pathToNewFile = resultFolder + "/" + "WithOutEntry_%d.faa" % (
            entryToTest, )

        # Keep every record but the requested one (positions are 1-based).
        skippedIndex = entryToTest - 1
        keptRecords = [record
                       for index, record in enumerate(alignment)
                       if index != skippedIndex]

        newAlignment = MultipleSeqAlignment(keptRecords)

        AlignIO.write(newAlignment, pathToNewFile, "fasta")
        return pathToNewFile
Exemple #6
0
    def writeFastas(self, pathToCompleteFastaAlignment, outputPath):
        """
        Writes individual clades as fasta alignments, where the files are stored in the output path
        and the name of each file is composed of <clade_name>.fas . Data is obtained from the complete
        fasta alignment provided (pathToCompleteFastaAlignment) and the mappings given to the constructor.
        The mappings are a dictionary of fasta ids to clade names. Elements in the alignment not present
        in the mappings won't end in any clade fasta file.

        :param pathToCompleteFastaAlignment: path of the fasta alignment holding all the records.
        :param outputPath: folder where one <clade_name>.fas file per clade is created.
        """
        from Bio import SeqIO, AlignIO
        from Bio.Alphabet import IUPAC, Gapped

        # Read the complete alignment first, so that an unreadable input
        # fails before any output file has been created or truncated.
        alignment = AlignIO.read(pathToCompleteFastaAlignment,
                                 "fasta",
                                 alphabet=Gapped(IUPAC.ExtendedIUPACProtein(),
                                                 "-"))
        try:
            # One handle per distinct clade: several ids usually map to the
            # same clade, and reopening the file would leak the earlier handle.
            for cladeName in set(self._mappings.values()):
                # Explicit write mode; the default read mode cannot be written to.
                self._clade2fileHandle[cladeName] = open(
                    outputPath + "/" + cladeName + ".fas", "w")

            for record in alignment:
                # Records without a clade are skipped, as documented above.
                if record.id not in self._mappings:
                    continue
                handle = self._clade2fileHandle[self._mappings[record.id]]
                SeqIO.write(record, handle, "fasta")
        finally:
            # Always release the handles, even when writing fails midway.
            for handle in self._clade2fileHandle.values():
                handle.close()
Exemple #7
0
    def _guess_consensus_alphabet(self, ambiguous):
        """Choose an (ungapped) alphabet for the alignment consensus sequence (PRIVATE).

        Starts from the base alphabet of the alignment, verifies that the
        base alphabet of every record is an instance of the same class, and
        falls back to a more generic alphabet when the ``ambiguous``
        character is not among the letters of the starting alphabet.

        Raises ValueError when a record's alphabet is incompatible.
        """
        # Un-gapped version of the alignment's own alphabet.
        base = Alphabet._get_base_alphabet(self.alignment._alphabet)

        # Every sequence must agree with it.
        for record in self.alignment:
            record_base = Alphabet._get_base_alphabet(record.seq.alphabet)
            if isinstance(record_base, base.__class__):
                continue
            raise ValueError(
                "Alignment contains a sequence with an incompatible alphabet."
            )

        # Nothing to widen when the alphabet defines no letters, or when it
        # already contains the ambiguous character used in the consensus.
        letters = getattr(base, "letters", None)
        if letters is None or ambiguous in letters:
            return base

        # Otherwise pick a more generic alphabet of the same family.
        if isinstance(base, IUPAC.IUPACUnambiguousDNA):
            candidate = IUPAC.IUPACUnambiguousDNA()
            if ambiguous in candidate.letters:
                return candidate
            return Alphabet.generic_dna
        if isinstance(base, IUPAC.IUPACUnambiguousRNA):
            candidate = IUPAC.IUPACUnambiguousRNA()
            if ambiguous in candidate.letters:
                return candidate
            return Alphabet.generic_rna
        if isinstance(base, IUPAC.IUPACProtein):
            candidate = IUPAC.ExtendedIUPACProtein()
            if ambiguous in candidate.letters:
                return candidate
            return Alphabet.generic_protein
        return Alphabet.single_letter_alphabet
Exemple #8
0
    def test_translate(self):
        """Check that a DNA open reading frame translates to the expected protein."""
        reading_frame = (
            'ATGTGGAGACGGAAACATCCGAGGACATCCGGAGGAACCCGGGGAGTTCTGAGTGGTAATTAG'
        )
        stop_aware_protein = HasStopCodon(IUPAC.ExtendedIUPACProtein(), '*')
        wanted = Seq('MWRRKHPRTSGGTRGVLSGN*', stop_aware_protein)

        obtained = translate(reading_frame)

        # Same sequence, 21 symbols (stop included), and still a Seq object.
        self.assertEqual(obtained, wanted)
        self.assertEqual(len(obtained), 21)
        self.assertEqual(isinstance(obtained, Seq), True)
Exemple #9
0
 def translate(self, align, offset):
     """
     Translate every record of ``align`` in the reading frame that starts
         at nucleotide ``offset``.

     Gap characters are replaced by "N" before translation, and only the
     part of each sequence made of whole codons from ``offset`` on is used.
     Returns a new alignment holding the translated records.
     """
     # Last nucleotide index (exclusive) that still closes a full codon.
     usable_codons = (align.get_alignment_length() - offset) // 3
     stop = usable_codons * 3 + offset

     translated = MulAlign([], Gapped(IUPAC.ExtendedIUPACProtein(), "N"))
     for record in align:
         # Upper-casing already turns "n" into "N"; only gaps are left to map.
         nucleotides = str(record.seq).upper().replace("-", "N")
         dna = Seq(nucleotides, IUPAC.IUPACAmbiguousDNA())
         protein = dna[offset:stop].translate()
         translated.append(
             SeqRecord(protein,
                       name=record.name,
                       id=record.id,
                       description=""))
     return translated
Exemple #10
0
def process_upload(sequences, format, request):
    """Classify the first sequence of an uploaded FASTA and remember it in the session.

    ``format`` is either "file" (``sequences`` is passed to the parser as
    is) or "text" (``sequences`` is wrapped in an in-memory buffer first).
    Only the first parsed record is processed.

    Returns a list holding, in order: the record id, the top classification,
    the secondary classification (``None`` when it is "Unknown"), the rows
    returned by ``upload_hmmer``, the first element returned by
    ``upload_blastp``, and the "id" and "variant" of its first hit.

    Raises InvalidFASTA for an unknown format, when nothing can be parsed,
    or when the sequence does not fit the protein alphabet.
    """
    if format not in ["file", "text"]:
        raise InvalidFASTA(
            "Invalid format: {}. Must be either 'file' or 'text'.".format(
                format))

    if format == "text":
        # Present the pasted text to the parser as a file-like object.
        sequences = io.BytesIO(sequences)

    records = SeqIO.parse(sequences, "fasta", IUPAC.ExtendedIUPACProtein())

    try:
        sequence = next(records)
    except StopIteration:
        raise InvalidFASTA("No sequences parsed.")

    if not Alphabet._verify_alphabet(sequence.seq):
        raise InvalidFASTA("Sequence {} is not a protein.".format(sequence.id))

    classifications, ids, rows = upload_hmmer(sequence)
    variant = classifications[0][1]
    secondary_classification = classifications[0][2]
    if secondary_classification == "Unknown":
        secondary_classification = None

    blast_hits = upload_blastp(sequence)[0]
    best_hit = blast_hits[0]

    result = [
        str(sequence.id),
        variant,
        secondary_classification,
        rows,
        blast_hits,
        best_hit["id"],
        best_hit["variant"],
    ]

    request.session["uploaded_sequences"] = [{
        "id": "QUERY",  # deliberately not sequence.id
        "variant": classifications[0][1],
        "sequence": str(sequence.seq),
        "taxonomy": blast_hits[0]["taxonomy"],
    }]

    return result
Exemple #11
0
#!/usr/bin/env python
import argparse
from Bio import SeqIO
from Bio.Alphabet import IUPAC
import os
import subprocess

parser = argparse.ArgumentParser(
    description="Zhiping's indexing solution for counting unique peptides")
parser.parse_args()

aa_letters = IUPAC.ExtendedIUPACProtein().letters
for aa1 in aa_letters:
    for aa2 in aa_letters:
        # Make indexed directories with first and second amino acids
        if not os.path.exists(aa1 + aa2):
            os.makedirs(aa1 + aa2)
        os.chdir(aa1 + aa2)
        # Write script for cluster
        SRC = open(aa1 + aa2 + "_hash_count.py", 'w')
        SRC.write(
            "from Bio import SeqIO\n" + "import sys\n" + "import os\n" +
            "# Flush STOUT continuously\n" +
            "sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)\n" +
            "OUT = open('" + aa1 + aa2 + "_peptides.txt', 'w')\n" +
            "counter = 0\n" + "hash_count = {}\n"
            "# Loop through records\n" +
            "for seq_record in SeqIO.parse('../../nr.fasta', 'fasta'):\n" +
            "	if counter%1000000 == 0:\n" +
            "		print 'On seq ' + str(counter) + '\\n'\n" +
            "	for i in range(len(seq_record.seq) - 14 + 1):\n" +
    from Bio.Align import MultipleSeqAlignment
    from Bio.Alphabet import IUPAC, Gapped
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    os.chdir(sys.argv[1])
    listing = os.listdir(".")
    consensus = {}
    genConsensus = ''
    pssmGen = ''
    consensusThres = 0.7

    #generalAlignment = AlignIO.parse(sys.argv[2],"fasta",alphabet=IUPAC.ExtendedIUPACProtein())
    generalAlignment = AlignIO.parse(sys.argv[2],
                                     "fasta",
                                     alphabet=Gapped(
                                         IUPAC.ExtendedIUPACProtein(), "-"))
    lengthGenAl = 0
    for genAlignment in generalAlignment:
        sumGen = AlignInfo.SummaryInfo(genAlignment)
        genConsensus = sumGen.gap_consensus(consensusThres)
        #pssmGen = sumGen.pos_specific_score_matrix(genConsensus,chars_to_ignore = ['-'])
        pssmGen = sumGen.pos_specific_score_matrix(genConsensus)
        lengthGenAl = len(genAlignment)

    for item in listing:
        if item.endswith(".fas"):
            #alignments = AlignIO.parse(item,"fasta",alphabet=IUPAC.ExtendedIUPACProtein())
            alignments = AlignIO.parse(item,
                                       "fasta",
                                       alphabet=Gapped(
                                           IUPAC.ExtendedIUPACProtein(), "-"))
__author__="pmoreno"
__date__ ="$May 29, 2011 5:16:28 PM$"

# Script entry point: reads two clade alignments and extracts, without gaps,
# the record at the 1-based position given on the command line.
# NOTE(review): this fragment looks truncated -- queryFasta, ownCladeProfile
# and noGenSignalAlignment are computed but not used in the visible lines.
# NOTE(review): sys, Seq and SeqRecord are not imported in the visible lines;
# presumably they are imported elsewhere in the file -- confirm.
if __name__ == "__main__":
    # Command line: <alignment without general signal> <clade alignment>
    #               <1-based entry number> <result folder>
    #dirOfHMMModels = sys.argv[1]
    fastaFileCladeNoGeneralSignal = sys.argv[1]
    fastaFileClade = sys.argv[2]
    entryToTest = int(sys.argv[3])
    resultFolder = sys.argv[4]

    from Bio import AlignIO, SeqIO
    from Bio.Alphabet import IUPAC, Gapped
    from Bio.Align import MultipleSeqAlignment

    # Both inputs are parsed as gapped extended-IUPAC protein alignments.
    alignmentNoGenSignalIterator = AlignIO.parse(fastaFileCladeNoGeneralSignal,"fasta",alphabet=Gapped(IUPAC.ExtendedIUPACProtein(),"-"));
    alignmentIterator = AlignIO.parse(fastaFileClade,"fasta",alphabet=Gapped(IUPAC.ExtendedIUPACProtein(),"-"));

    # Only the first alignment of each file is used.
    # NOTE(review): ``.next()`` is the Python 2 iterator method; the
    # ``next(iterator)`` builtin would be needed on Python 3.
    noGenSignalAlignment = alignmentNoGenSignalIterator.next()
    # Output file names derived from the entry number.
    queryFasta = resultFolder+"/"+"Query_%d.faa" % (entryToTest,)
    ownCladeProfile = resultFolder+"/"+"ForOwnCladeProfile_%d.faa" % (entryToTest,)
    #print testAlignment[entryToTest].id
    #print testAlignment[entryToTest].seq

    alignmentWithSignal = alignmentIterator.next()
    # entryToTest is 1-based, hence the -1 when indexing the alignment.
    desiredSeqString = str(alignmentWithSignal[entryToTest-1].seq)
    # Drop the gap characters to recover the raw protein sequence.
    desiredSeqString = desiredSeqString.replace("-", "")
    #print desiredSeqString
    seqNoGaps = Seq(desiredSeqString, alphabet=IUPAC.ExtendedIUPACProtein())
    #print seqNoGaps
    # New record carrying the ungapped sequence under the original id.
    seqRecNoGaps = SeqRecord(seq=seqNoGaps, id=alignmentWithSignal[entryToTest-1].id)
Exemple #14
0
        'TTG': 'L',
        'TAC': 'Y',
        'TAT': 'Y',
        'TAA': '_',
        'TAG': 'O',
        'TGC': 'C',
        'TGT': 'C',
        'TGA': 'U',
        'TGG': 'W',
    }

    tt_flip = flip_trans_table(tt_11)

    matrix = matlist.blosum62

    msa_alphabet = AlphabetEncoder(IUPAC.ExtendedIUPACProtein(), '-.')
    alignments = AlignIO.read(align_in, 'fasta', alphabet=msa_alphabet)
    # print(len(list(alignments)))   # number of orthologs use in final msa

    uid_pattern = re.compile(r'uid=(\S+?);')
    tax_id_pattern = re.compile(r'tax_id=(\d+)')

    # name outfile with uid of gene of interest as has been convention; expects it to be the first gene in the msa
    uid = re.search(uid_pattern, alignments[0].id).group(1)

    bad_codons = ['...', '---', 'NNN']  # codons that will not receive scores

    # get disorder for each sequence in the msa
    # make this a function later
    disorder_strength = [
    ]  # list of numerical disorder strength score for each aa seq in msa (with gaps)
Exemple #15
0
    def run(self, consensusThreshold):
        """Mask in every clade alignment the columns conserved in the general alignment.

        The gap consensus of ``self.generalAlignment`` is computed with
        ``consensusThreshold``; every column whose consensus is neither ``-``
        nor ``X`` becomes a position to mask.  Then, for each ``*.fas`` file
        found in ``self.pathToCladesAlignments``, those positions are replaced
        by ``X`` in all records, unless the clade's own consensus is a gap at
        that position.  The masked alignment is written to
        ``self.outPutPath + <file name> + "_noPKSsignal_Thres<NN>.faa"``.

        The masked positions, the directory listing and the post-masking
        consensus of each clade are printed to standard output.

        :param consensusThreshold: threshold passed to ``gap_consensus``; it
            is also multiplied by 100 to build the output file names.
        :return: list of the paths of the written files.
        """
        from Bio import AlignIO, SeqIO
        from Bio.Align import AlignInfo
        from Bio.Alphabet import IUPAC, Gapped

        listing = os.listdir(self.pathToCladesAlignments)
        consensus = {}
        consensusThres = consensusThreshold

        generalAlignment = AlignIO.parse(self.generalAlignment,
                                         "fasta",
                                         alphabet=Gapped(
                                             IUPAC.ExtendedIUPACProtein(),
                                             "-"))
        positionsToMask = []
        for genAlignment in generalAlignment:
            sumGen = AlignInfo.SummaryInfo(genAlignment)
            genConsensus = sumGen.gap_consensus(consensusThres)
            # Columns with a real consensus residue (neither gap nor the
            # ambiguous 'X') carry the general signal to be masked.
            positionsToMask.extend(
                index for index, residue in enumerate(genConsensus)
                if residue != '-' and residue != 'X')

        # Single-argument print calls behave the same on Python 2 and 3.
        print(positionsToMask)
        print(listing)

        resultAlignFiles = []
        for item in listing:
            if not item.endswith(".fas"):
                continue
            # os.path.join copes with a directory given with or without a
            # trailing separator (plain concatenation needs the separator).
            alignments = AlignIO.parse(
                os.path.join(self.pathToCladesAlignments, item),
                "fasta",
                alphabet=Gapped(IUPAC.ExtendedIUPACProtein(), "-"))
            for alignment in alignments:
                summ = AlignInfo.SummaryInfo(alignment)
                consensus[item] = summ.gap_consensus(consensusThres)
                # Positions where this clade's consensus is a gap stay as is.
                cladePositions = [posToMask
                                  for posToMask in positionsToMask
                                  if consensus[item][posToMask] != '-']
                if cladePositions:
                    # One mutable round trip per record instead of one per
                    # (position, record) pair.
                    for alignElement in alignment:
                        mutSeq = alignElement.seq.tomutable()
                        for posToMask in cladePositions:
                            mutSeq[posToMask] = 'X'
                        alignElement.seq = mutSeq.toseq()

                outFile = (self.outPutPath + item +
                           "_noPKSsignal_Thres%d.faa" %
                           (consensusThres * 100, ))
                SeqIO.write(alignment, outFile, "fasta")
                resultAlignFiles.append(outFile)

                # Recompute the consensus on the masked alignment for the log.
                summ = AlignInfo.SummaryInfo(alignment)
                consensus[item] = summ.gap_consensus(consensusThres)
                print("%s %s" % (item, consensus[item]))
        return resultAlignFiles