Esempio n. 1
0
    def writeNoGaps(self, outputfile):
        """Write an ungapped copy of the FASTA file at ``self.pathToFasta``.

        Each record is read from the input, every "-" character is removed
        from its sequence, and a new record carrying only the original id is
        written to ``outputfile`` in FASTA format.

        :param outputfile: destination handed unchanged to ``SeqIO.write``.
        """
        sourcePath = self.pathToFasta

        from Bio import AlignIO, SeqIO
        from Bio.Seq import Seq
        from Bio.SeqRecord import SeqRecord
        from Bio.Alphabet import IUPAC, Gapped

        def withoutGaps(entry):
            # Rebuild the record from scratch so that only the id survives.
            letters = str(entry.seq).replace("-", "")
            return SeqRecord(seq=Seq(letters,
                                     alphabet=IUPAC.ExtendedIUPACProtein()),
                             id=entry.id)

        gappedEntries = SeqIO.parse(sourcePath,
                                    "fasta",
                                    alphabet=Gapped(
                                        IUPAC.ExtendedIUPACProtein(), "-"))
        ungappedRecords = [withoutGaps(entry) for entry in gappedEntries]

        SeqIO.write(ungappedRecords, outputfile, "fasta")
Esempio n. 2
0
    def getFastaEntry(self, entryNum, resultFolder):
        """Write one ungapped entry of the alignment to its own FASTA file.

        The first alignment found in ``self.pathToFasta`` is read, the entry
        with 1-based index ``entryNum`` is selected, its "-" characters are
        removed and the result is written to
        ``<resultFolder>/Query_<entryNum>.faa``.

        :param entryNum: 1-based index of the entry inside the alignment.
        :param resultFolder: folder in which the query file is created.
        :returns: path of the written file, or ``None`` when ``entryNum`` is
            outside ``1..len(alignment)``.
        """
        fastaFile = self.pathToFasta

        from Bio import AlignIO, SeqIO
        from Bio.Seq import Seq
        from Bio.SeqRecord import SeqRecord
        from Bio.Alphabet import IUPAC, Gapped

        alignmentIterator = AlignIO.parse(fastaFile,
                                          "fasta",
                                          alphabet=Gapped(
                                              IUPAC.ExtendedIUPACProtein(),
                                              "-"))
        queryFasta = resultFolder + "/" + "Query_%d.faa" % (entryNum, )
        # next() works on Python 2.6+ and Python 3; the .next() method is
        # Python 2 only.
        alignment = next(alignmentIterator)
        # Reject indexes below 1 as well: entryNum - 1 would otherwise become
        # a negative index and silently pick an entry from the end.
        if entryNum < 1 or entryNum > len(alignment):
            return None

        entry = alignment[entryNum - 1]
        desiredSeqString = str(entry.seq).replace("-", "")
        seqNoGaps = Seq(desiredSeqString,
                        alphabet=IUPAC.ExtendedIUPACProtein())

        seqRecNoGaps = SeqRecord(seq=seqNoGaps, id=entry.id)
        SeqIO.write(seqRecNoGaps, queryFasta, "fasta")
        return queryFasta
Esempio n. 3
0
def main():
    (opts, args) = getoptions()

    # Load PWMs
    pssms = load_motifs(opts.pwm_dir, opts.pseudocount)

    if opts.testseq is not None:
        if opts.seqtype == 'RNA':
            seq = Seq(opts.testseq,
                      IUPAC.IUPACUnambiguousRNA()).back_transcribe()
            seq.alphabet = IUPAC.IUPACUnambiguousDNA()
        else:
            seq = Seq(opts.testseq, IUPAC.IUPACUnambiguousDNA())
        final = scan_all(pssms, seq, opts)
        print final.to_csv(sep="\t", index=False)
    else:
        # Scan in sequence
        print >> sys.stderr, "Scanning sequences ",
        tic = time.time()
        for seqrecord in SeqIO.parse(open(args[0]), "fasta"):

            seq = seqrecord.seq
            if opts.seqtype == "RNA":
                seq = seq.back_transcribe()
            seq.alphabet = IUPAC.IUPACUnambiguousDNA()

            final = scan_all(pssms, seq, opts)
            print final.to_csv(sep="\t", index=False)

        toc = time.time()
        print >> sys.stderr, "done in %0.2f seconds!" % (float(toc - tic))
Esempio n. 4
0
 def setUp(self):
     """Build one codon alignment per fixture and store them in ``self.alns``.

     Each fixture is indexed as ``fixture[0]`` (file paths) and ``fixture[1]``
     (a mode string: "parse", "index" or "id") that selects how the
     nucleotide file is loaded and which options go to ``codonalign.build``.
     """
     self.aln_file = [TEST_ALIGN_FILE1,
                      TEST_ALIGN_FILE2,
                      TEST_ALIGN_FILE3,
                      TEST_ALIGN_FILE4,
                      TEST_ALIGN_FILE5,
                      TEST_ALIGN_FILE6]
     built = []
     for fixture in self.aln_file:
         mode = fixture[1]
         if mode == "parse":
             paths = fixture[0]
             nucl = SeqIO.parse(paths[0], "fasta", alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(paths[1], "clustal", alphabet=IUPAC.protein)
             with warnings.catch_warnings():
                 warnings.simplefilter("ignore")
                 caln = codonalign.build(prot, nucl, alphabet=codonalign.default_codon_alphabet)
         elif mode == "index":
             paths = fixture[0]
             # Deliberately using a fancy protein alphabet for testing:
             nucl = SeqIO.index(paths[0], "fasta", alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(paths[1], "clustal", alphabet=generic_protein)
             with warnings.catch_warnings():
                 warnings.simplefilter("ignore")
                 caln = codonalign.build(prot, nucl, alphabet=codonalign.default_codon_alphabet, max_score=20)
             nucl.close()  # Close the indexed FASTA file
         elif mode == "id":
             paths = fixture[0]
             nucl = SeqIO.parse(paths[0], "fasta", alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(paths[1], "clustal", alphabet=IUPAC.protein)
             # Third file: whitespace-separated pairs, first two fields used.
             id_corr = {}
             with open(paths[2]) as handle:
                 for line in handle:
                     id_corr[line.split()[0]] = line.split()[1]
             with warnings.catch_warnings():
                 warnings.simplefilter("ignore")
                 caln = codonalign.build(prot, nucl, corr_dict=id_corr, alphabet=codonalign.default_codon_alphabet)
         built.append(caln)
     self.alns = built
Esempio n. 5
0
def get_seq_record(record, start, stop, description):
    """Return a SeqRecord for the query between start and stop.

    Given a sam record, find the query positions aligned to the reference
    positions ``start`` and ``stop`` and create a SeqRecord with the sequence
    in the reference orientation.

    :param record: aligned read offering ``get_aligned_pairs``,
        ``get_forward_sequence``, ``query_name``, ``query_length`` and
        ``is_reverse``.
    :param start: reference position of the first base of the interval.
    :param stop: reference position that ends the interval.
    :param description: text placed in front of the direction tag in the
        record description.
    :returns: a SeqRecord whose description is ``"<description>|f"`` for
        forward reads or ``"<description>|rc"`` for reverse reads, or ``None``
        when either endpoint is not aligned in this read.
    """
    # get the query positions of the bases mapped to start, stop
    # currently only extracts from reads where both endpoints are mapped
    positions = record.get_aligned_pairs(matches_only=True)
    first_position = [item[0] for item in positions if item[1] == start]
    last_position = [item[0] for item in positions if item[1] == stop]
    if not (first_position and last_position):
        return None

    # fetch and reorient sequence
    name = record.query_name
    if not record.is_reverse:
        seq = Seq(
            record.get_forward_sequence()
            [first_position[0]:last_position[0]],
            IUPAC.IUPACUnambiguousDNA())
        direction = 'f'
    else:
        length = record.query_length
        seq = Seq(
            record.get_forward_sequence()[length -
                                          last_position[0]:length -
                                          first_position[0]],
            IUPAC.IUPACUnambiguousDNA()).reverse_complement()
        direction = 'rc'
    # Use the tag itself: indexing the string with the boolean is_reverse
    # turned 'rc' into 'c'.
    return SeqRecord(seq,
                     id=name,
                     description='|'.join([description, direction]))
Esempio n. 6
0
    def chgAlpha(self, newAlpha):
        """Replace the alphabet of ``self.seq`` and re-validate the sequence.

        :param newAlpha: 'DNA', 'RNA', 'protein', or a ``Bio.Alphabet``
            alphabet object.
        :raises NameError: when ``newAlpha`` is none of the above.

        The chosen alphabet is stored in ``self.typ``, ``self.seq`` is
        rebuilt with it, and ``self.checkAlpha()`` is called afterwards.
        """

        from Bio.Seq import Seq
        from Bio.Alphabet import Alphabet, IUPAC

        if newAlpha == "DNA":
            alpha = IUPAC.IUPACUnambiguousDNA()
        elif newAlpha == "RNA":
            # Previously this branch also produced the DNA alphabet.
            alpha = IUPAC.IUPACUnambiguousRNA()
        elif newAlpha == "protein":
            alpha = IUPAC.IUPACProtein()
        elif isinstance(newAlpha, Alphabet):
            # The docstring always promised alphabet objects, but the old
            # else-branch raised before they could be used.
            alpha = newAlpha
        else:
            raise NameError("type not 'DNA', 'RNA', or 'protein'")

        self.typ = alpha
        # str() replaces the deprecated Seq.tostring().
        self.seq = Seq(str(self.seq), alpha)

        self.checkAlpha()
Esempio n. 7
0
 def setUp(self):
     """Build one codon alignment per fixture and store them in ``self.alns``.

     ``fixture[1]`` is a mode string ('parse', 'index' or 'id') that selects
     how the nucleotide file is opened and which options are handed to
     ``codonalign.build``; ``fixture[0]`` holds the file paths.
     """
     self.aln_file = [TEST_ALIGN_FILE1,
                      TEST_ALIGN_FILE2,
                      TEST_ALIGN_FILE3,
                      TEST_ALIGN_FILE4,
                      TEST_ALIGN_FILE5,
                      TEST_ALIGN_FILE6]
     built = []
     for fixture in self.aln_file:
         mode = fixture[1]
         if mode == 'parse':
             paths = fixture[0]
             nucl = SeqIO.parse(paths[0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
                 caln = codonalign.build(prot, nucl, alphabet=codonalign.default_codon_alphabet)
         elif mode == 'index':
             paths = fixture[0]
             # Deliberately using a fancy protein alphabet for testing:
             nucl = SeqIO.index(paths[0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(paths[1], 'clustal', alphabet=Gapped(IUPAC.ExtendedIUPACProtein()))
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
                 caln = codonalign.build(prot, nucl, alphabet=codonalign.default_codon_alphabet, max_score=20)
         elif mode == 'id':
             paths = fixture[0]
             nucl = SeqIO.parse(paths[0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
             # Third file: whitespace-separated pairs, first two fields used.
             id_corr = {}
             with open(paths[2]) as handle:
                 for line in handle:
                     id_corr[line.split()[0]] = line.split()[1]
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
                 caln = codonalign.build(prot, nucl, corr_dict=id_corr, alphabet=codonalign.default_codon_alphabet)
         built.append(caln)
         # Runs for every mode, so it closes whichever nucleotide source the
         # branch above created.
         nucl.close()
     self.alns = built
Esempio n. 8
0
def load_csv_file(file, delimiter=";"):
    """
    Load a "Primer" CSV file.

    The first line must contain exactly the seven expected column names, each
    once and in any order.  Every following line with the right number of
    fields becomes a PrimerPair; the reverse primer is stored
    reverse-complemented.  Lines with a wrong field count, and pairs rejected
    by ``check_primer_pair_integrity``, are skipped with a logged warning.

    @param file: path of the CSV file.
    @param delimiter: field separator, ";" by default.
    @returns: dict mapping each primer pair id to its PrimerPair instance
    @raises ValueError: when the header line is missing, has the wrong number
        of columns, or contains an unknown or repeated column name.
    """
    expected_headers = (
        "id",
        "forwardPrimer",
        "reversePrimer",
        "fPDNA",
        "rPDNA",
        "ampliconMinLength",
        "ampliconMaxLength",
    )
    header_len = len(expected_headers)
    primer_dict = {}
    with open(file, newline='') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=delimiter)
        # An empty file has no header line; report it like any other bad
        # header instead of leaking StopIteration.
        headers = next(csvreader, None)
        if headers is None or len(headers) != header_len:
            raise ValueError("Wrong header")

        pos = {}
        for column, header in enumerate(headers):
            if header not in expected_headers:
                raise ValueError("Unknown header " + header)
            # A repeated name would leave another column unmapped while the
            # length check still passes.
            if header in pos:
                raise ValueError("Duplicate header " + header)
            pos[header] = column

        # The header is line 1, so the first data row is line 2.
        for line_number, row in enumerate(csvreader, start=2):
            if len(row) != header_len:
                logging.warning("Wrong primer pair in line " +
                                str(line_number))
                continue

            fprimer = SeqRecord(
                Seq(row[pos["fPDNA"]], IUPAC.IUPACAmbiguousDNA()))
            fprimer.id = row[pos["forwardPrimer"]]

            rprimer = SeqRecord(
                Seq(row[pos["rPDNA"]], IUPAC.IUPACAmbiguousDNA()))
            # TODO: the reverse complement is currently applied
            # unconditionally.
            rprimer = rprimer.reverse_complement()
            rprimer.id = row[pos["reversePrimer"]]

            pair_id = row[pos["id"]]
            primer_pair = PrimerPair(pair_id, fprimer, rprimer,
                                     int(row[pos["ampliconMinLength"]]),
                                     int(row[pos["ampliconMaxLength"]]))
            if check_primer_pair_integrity(primer_pair):
                primer_dict[pair_id] = primer_pair
            else:
                logging.warning("Skipping primer pair " + primer_pair.id +
                                ", bad sequence")

    return primer_dict
Esempio n. 9
0
 def setUp(self):
     """Create two protein alignments together with their DNA records.

     Test set 1 is stored as ``self.aln1`` / ``self.seqlist1`` and test set 2
     as ``self.aln2`` / ``self.seqlist2``.
     """
     def dna_record(letters, rec_id):
         # Nucleotide record with the unambiguous DNA alphabet.
         return SeqRecord(Seq(letters, alphabet=IUPAC.IUPACUnambiguousDNA()),
                          id=rec_id)

     def protein_record(letters, rec_id):
         # Protein record with the IUPAC protein alphabet.
         return SeqRecord(Seq(letters, alphabet=IUPAC.protein), id=rec_id)

     # Test set 1
     self.seqlist1 = [
         dna_record(
             'TCAGGGACTGCGAGAACCAAGCTACTGCTGCTGCTGGCTGCGCTCTGCGCCGCAGGTGGGGCGCTGGAG',
             'pro1'),
         dna_record(
             'TCAGGGACTTCGAGAACCAAGCGCTCCTGCTGCTGGCTGCGCTCGGCGCCGCAGGTGGAGCACTGGAG',
             'pro2'),
     ]
     self.aln1 = MultipleSeqAlignment([
         protein_record('SGTARTKLLLLLAALCAAGGALE', 'pro1'),
         protein_record('SGTSRTKRLLLLAALGAAGGALE', 'pro2'),
     ])

     # Test set 2
     #                      M  K  K  H  E L(F)L  C  Q  G  T  S  N  K  L  T  Q(L)L  G  T  F  E  D  H  F  L  S  L  Q  R  M  F  N  N  C  E  V  V
     # The second record appears to be the first one with two single bases
     # removed (after GAGTT and after CAACAA); the third appears to be the
     # first one with "AG" removed after CACCC.
     self.seqlist2 = [
         dna_record(
             'ATGAAAAAGCACGAGTTACTTTGCCAAGGGACAAGTAACAAGCTCACCCAGTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAACTGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC',
             'pro1'),
         dna_record(
             'ATGAAAAAGCACGAGTTCTTTGCCAAGGGACAAGTAACAAGCTCACCCAGTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAATGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC',
             'pro2'),
         dna_record(
             'ATGAAAAAGCACGAGTTACTTTGCCAAGGGACAAGTAACAAGCTCACCCTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAACTGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC',
             'pro3'),
     ]
     self.aln2 = MultipleSeqAlignment([
         protein_record(
             'MKKHELLCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYMQSSYNLSFLKTIQEVAGYVLIAL',
             'pro1'),
         protein_record(
             'MKKHEFLCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYMQSSYNLSFLKTIQEVAGYVLIAL',
             'pro2'),
         protein_record(
             'MKKHELLCQGTSNKLTLLGTFEDHFLSLQRMFNNCEVVLGNLEITYMQSSYNLSFLKTIQEVAGYVLIAL',
             'pro3'),
     ])
Esempio n. 10
0
def sequence(ungapped, position, length):
    """
    Return the text of ``ungapped.seq[position:position + length]``.

    For a non-negative ``position`` the slice text is returned directly; for
    a negative ``position`` it is first passed through ``getNegative``.
    """
    # NOTE(review): with a negative position and position + length >= 0 the
    # slice end is not negative any more (e.g. [-3:0]), which may yield an
    # empty string - confirm this is intended.
    window = Seq(str(ungapped.seq),
                 IUPAC.IUPACUnambiguousDNA())[position:position + length]
    text = str(window)
    if position < 0:
        return getNegative(text)
    return text
Esempio n. 11
0
    def writeFastas(self, pathToCompleteFastaAlignment, outputPath):
        """
        Writes individual clades as fasta alignments, where the files are stored in the output path
        and the name of each file is composed of <clade_name>.fas . Data is obtained from the complete
        fasta alignment provided (pathToCompleteFastaAlignment) and the mappings given to the constructor.
        The mappings are a dictionary of fasta ids to clade names. Elements in the alignment not present
        in the mappings won't end in any clade fasta file.

        :param pathToCompleteFastaAlignment: path of the complete FASTA alignment to split.
        :param outputPath: folder in which the per-clade files are created.
        """
        from Bio import SeqIO, AlignIO
        from Bio.Alphabet import IUPAC, Gapped

        try:
            # One handle per distinct clade, opened for writing (the default
            # read mode cannot be written to and fails for missing files).
            for cladeName in set(self._mappings.values()):
                self._clade2fileHandle[cladeName] = open(
                    outputPath + "/" + cladeName + ".fas", "w")

            # read complete alignment
            alignment = AlignIO.read(pathToCompleteFastaAlignment,
                                     "fasta",
                                     alphabet=Gapped(
                                         IUPAC.ExtendedIUPACProtein(), "-"))
            for record in alignment:
                # Records without a mapping are left out, as documented.
                if record.id not in self._mappings:
                    continue
                handle = self._clade2fileHandle[self._mappings[record.id]]
                SeqIO.write(record, handle, "fasta")
        finally:
            # Close every handle even if reading or writing failed.
            for handle in self._clade2fileHandle.values():
                handle.close()
Esempio n. 12
0
 def gb(self):
     """Return this object formatted as GenBank text.

     A SeqRecord is built from ``self.sequence()``, the first eight
     characters of ``self.name`` (used as id and name) and
     ``self.description``.  One SeqFeature is added per item of
     ``self.features()``; when ``self.shape == 'c'`` and a feature ends past
     the sequence length, its location is split into two parts.
     """
     total_length = self.length()
     record = SeqRecord(
         Seq(self.sequence(), IUPAC.IUPACUnambiguousDNA()),
         id=self.name[0:8],
         name=self.name[0:8],
         description=self.description
     )
     record.features = []
     for feat in self.features():
         strand = 1 if feat.direction == 'f' else -1
         if self.shape == 'c' and feat.end > total_length:
             # Part up to the end of the sequence, and part from position 0.
             tail = FeatureLocation(ExactPosition(feat.start), ExactPosition(total_length), strand)
             head = FeatureLocation(ExactPosition(0), ExactPosition(feat.end - total_length), strand)
             parts = [tail, head] if strand == 1 else [head, tail]
             location = CompoundLocation(parts)
         else:
             location = FeatureLocation(ExactPosition(feat.start), ExactPosition(feat.end), strand)
         qualifiers = {}
         for qual in feat.qualifiers.all():
             qualifiers[qual.name] = qual.data
         record.features.append(SeqFeature(location, feat.type, qualifiers=qualifiers))
     return record.format('genbank')
Esempio n. 13
0
def genome_to_seqrecord(phage_genome):
    """Creates a SeqRecord object from a pdm_utils Genome object.

    The record wraps ``phage_genome.seq`` (whose alphabet is set to ambiguous
    DNA), takes its name from the genome, uses the accession as id when it is
    not empty, and gets features, description and annotations from the
    ``get_seqrecord_*`` helpers.

    :param phage_genome: A pdm_utils Genome object.
    :type phage_genome: Genome
    :returns: A BioPython SeqRecord object
    :rtype: SeqRecord
    :raises AssertionError: when ``phage_genome`` is None.
    :raises AttributeError: re-raised, after printing a hint, when the
        SeqRecord cannot be built from the genome's ``seq`` attribute.
    """

    # Identity test: does not depend on how the Genome class defines equality.
    assert phage_genome is not None,\
    "Genome object passed is None and not initialized"
    try:
        record = SeqRecord(phage_genome.seq)
        record.seq.alphabet = IUPAC.IUPACAmbiguousDNA()
    except AttributeError:
        print("Genome object failed to be converted to SeqRecord.",
              "Genome valid attribute 'seq' is required to",
              "convert to SeqRecord object.")
        raise
    record.name = phage_genome.name
    if phage_genome.accession != "":
        record.id = phage_genome.accession
    record.features = get_seqrecord_features(phage_genome)
    record.description = get_seqrecord_description(phage_genome)
    record.annotations = get_seqrecord_annotations(phage_genome)

    return record
Esempio n. 14
0
    def testLimit(self, list_seqs, start):
        """
        Flag, per amino-acid column of the window, whether the most common
        residue is frequent enough.

        list_seqs is the list of sequence ids in the alignment (not the id
            associated with the Bio.Seq object).
        start is the index of the start of the window.

        Returns one boolean per column: True when the count of the most
        common residue divided by len(list_seqs) is at least
        self.min_aa_ratio.
        """
        frame = start % 3
        aa_window_length = int(self.window_length / 3)
        first_col = int((start - frame) / 3)
        last_col = int(first_col + aa_window_length)

        # Pick the translated alignment that matches the reading frame.
        if frame == 0:
            translated = self.t_align0
        elif frame == 1:
            translated = self.t_align1
        else:
            translated = self.t_align2

        window = MulAlign([], Gapped(IUPAC.ExtendedIUPACProtein(), "N"))
        for seq_idx in list_seqs:
            window.append(translated[seq_idx][first_col:last_col])

        flags = []
        for col in range(aa_window_length):
            residue_counts = Counter(window[:, col])
            #  count the most common aa
            top_count = residue_counts.most_common(1)[0][1]
            flags.append(bool(top_count / len(list_seqs) >= self.min_aa_ratio))
        return flags
def stage_one_trimming(alignment, window_size, proportion, threshold, min_len):
    """
    ---------------------------------------------------------------------
    MODIFIED FUNCTION FROM PHYLUCE: generic_align.py
    ---------------------------------------------------------------------
    First stage alignment trimming to find and trim edges of a given
    alignment.  Calls running_average to determine the start and end
    trimming positions for the entire alignment block.

    Returns a new alignment holding every sequence cut to [start:end], or
    None as soon as the trimming positions are unusable or one trimmed
    sequence is only gaps ('-'), only missing data ('?') or shorter than
    min_len.
    """
    start, end = running_average(alignment, window_size, proportion, threshold)
    s1_trimmed = MultipleSeqAlignment([], Gapped(IUPAC.ambiguous_dna, "-?"))
    only_gaps = set(['-'])
    only_missing = set(['?'])
    for sequence in alignment:
        sequence.seq.alphabet = IUPAC.IUPACAmbiguousDNA()
        if not (start >= 0 and end):
            s1_trimmed = None
            break
        trim = sequence[start:end]
        symbols = set(trim)
        # Both comparisons must be set-to-set: the former comparison against
        # the list ['?'] was always true, so all-'?' rows were never rejected.
        if (symbols != only_gaps and symbols != only_missing
                and len(trim) >= min_len):
            s1_trimmed.append(trim)
        else:
            s1_trimmed = None
            break

    return s1_trimmed
Esempio n. 16
0
def find_gapped_columns(align, cfg):
    """Drop the columns whose gap proportion exceeds the cfg setting.

    A column's gap proportion is the number of "N", "n" and "-" characters in
    it divided by the number of sequences; columns above
    cfg["max_gap_proportion"] are removed.

    Returns ``align`` itself when no column is removed, otherwise a list of
    new SeqRecord objects (same name and id, empty description) holding only
    the kept columns.
    """
    max_gap_proportion = cfg["max_gap_proportion"]
    nbr_sequences = len(align)
    len_align = align.get_alignment_length()

    # Collect the kept columns directly instead of filtering afterwards with
    # a quadratic "not in list" test.
    kept_columns = []
    for index in range(len_align):
        column = str(align[:, index]).replace("n", "N")
        gap_freq = (column.count("N") + column.count("-")) / nbr_sequences
        if gap_freq > max_gap_proportion:
            continue
        kept_columns.append(index)

    if len(kept_columns) == len_align:
        return align

    trimmed_records = []
    for rec in align:
        letters = str(rec.seq)
        new_seq = "".join([letters[i] for i in kept_columns])
        trimmed_records.append(
            SeqRecord(
                Seq(new_seq, IUPAC.IUPACAmbiguousDNA()),
                name=rec.name,
                id=rec.id,
                description="",
            ))
    return trimmed_records
Esempio n. 17
0
 def setUp(self):
     """Build the codon alignment for TEST_ALIGN_FILE6 into ``self.aln``.

     The third fixture file supplies whitespace-separated pairs; the first
     two fields of each line form the ``corr_dict`` given to
     ``CodonAlign.build``.
     """
     paths = TEST_ALIGN_FILE6[0]
     nucl = SeqIO.parse(paths[0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
     prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
     id_corr = {}
     with open(paths[2]) as handle:
         for line in handle:
             id_corr[line.split()[0]] = line.split()[1]
     self.aln = CodonAlign.build(prot, nucl, corr_dict=id_corr, alphabet=CodonAlign.default_codon_alphabet)
Esempio n. 18
0
    def generateFastaWithOutEntry(self, entryNumToRemove, resultFolder):
        """Write a copy of the alignment with one entry left out.

        The first alignment in ``self.pathToFasta`` is read and every entry
        except the one with 1-based index ``entryNumToRemove`` is written to
        ``<resultFolder>/WithOutEntry_<entryNumToRemove>.faa``.

        :param entryNumToRemove: 1-based index of the entry to drop
            (converted with ``int``).
        :param resultFolder: folder in which the new file is created.
        :returns: path of the written file.
        """
        fastaFile = self.pathToFasta
        entryToTest = int(entryNumToRemove)

        from Bio import AlignIO
        from Bio.Alphabet import IUPAC, Gapped
        from Bio.Align import MultipleSeqAlignment

        alignmentIterator = AlignIO.parse(fastaFile,
                                          "fasta",
                                          alphabet=Gapped(
                                              IUPAC.ExtendedIUPACProtein(),
                                              "-"))

        # next() works on Python 2.6+ and Python 3; the .next() method is
        # Python 2 only.
        alignment = next(alignmentIterator)
        pathToNewFile = resultFolder + "/" + "WithOutEntry_%d.faa" % (
            entryToTest, )

        # Here we remove the desired element
        keptRecords = [
            record for index, record in enumerate(alignment)
            if index != entryToTest - 1
        ]
        newAlignment = MultipleSeqAlignment(keptRecords)

        AlignIO.write(newAlignment, pathToNewFile, "fasta")
        return pathToNewFile
Esempio n. 19
0
def main():
    """Generate mutated sequence fragments from a FASTA file.

    Command-line options give the input FASTA, the number of fragments, the
    mean and standard deviation of the fragment size and of the mutation
    rate, and the output file.  Fragments are produced with
    ``subselect_sequence`` and ``mutate`` and written to the output in FASTA
    format.
    """
    logging.basicConfig()
    parser = argparse.ArgumentParser()
    parser.add_argument('--fasta', dest='fasta_file', metavar='STRING', required=True, type=str)
    parser.add_argument('--num_fragments', dest='num_fragments', metavar='int', required=True, type=int)
    parser.add_argument('--mean_frag_size', dest='frag_size_mu', metavar='int', required=True, type=int)
    parser.add_argument('--frag_size_std', dest='frag_size_sigma', metavar='int', required=True, type=int)
    parser.add_argument('--mean_mutation_rate', dest='mutation_rate_mu', metavar='float', required=True, type=float)
    parser.add_argument('--mutation_rate_std', dest='mutation_rate_sigma', metavar='float', required=True, type=float)
    parser.add_argument('--output', dest='output_file', metavar='string', required=True, type=str)
    args = parser.parse_args()

    # The context manager closes the output file even if generation or
    # writing fails.
    with open(args.output_file, 'w') as outhandle:
        generated_seqs = []
        for record in SeqIO.parse(args.fasta_file, 'fasta'):
            base_id = record.id
            base_seq = str(record.seq)
            # NOTE(review): the fragment count is shared across records, so
            # only the first record contributes fragments - confirm intent.
            # NOTE(review): a step that fails every time makes this loop spin
            # forever, printing the error on each attempt.
            while len(generated_seqs) < args.num_fragments:
                try:
                    mutation_rate = rnd.gauss(args.mutation_rate_mu, args.mutation_rate_sigma)
                    subsequence = subselect_sequence(base_seq, args.frag_size_mu, args.frag_size_sigma)
                    mutated_subsequence = mutate(subsequence, mutation_rate)
                    new_id = '%s__mut_%.2f__len_%i' % (base_id, mutation_rate, len(subsequence))
                    generated_seqs.append(SeqRecord(Seq(mutated_subsequence, IUPAC.IUPACAmbiguousDNA()),
                                                    id=new_id, name=new_id, description=''))
                except Exception as e:
                    print(e)

        SeqIO.write(generated_seqs, outhandle, 'fasta')
Esempio n. 20
0
def get_sines(sine_fname):
    """Yield each FASTA sequence as text, followed by its reverse complement.

    After both strings of a record have been yielded, the pair is also
    printed together with a separator line.
    """
    for sine_record in SeqIO.parse(sine_fname, "fasta"):
        forward = Seq(str(sine_record.seq), IUPAC.IUPACAmbiguousDNA())
        yield str(forward)
        backward = forward.reverse_complement()
        yield str(backward)
        print(forward, backward, '''\n ======================''')
Esempio n. 21
0
 def translate(self, align, offset):
     """
     Translate every sequence of the alignment in the reading frame that
         starts at 'offset'.

     Sequences are upper-cased and "-" is replaced by "N" before
         translation; only whole codons between 'offset' and the alignment
         end are translated.  Returns a new alignment of the translated
         records (same name and id, empty description).
     """
     # Last position that still closes a complete codon in this frame.
     frame_end = ((align.get_alignment_length() - offset) // 3) * 3 + offset
     translated = MulAlign([], Gapped(IUPAC.ExtendedIUPACProtein(), "N"))
     for rec in align:
         # Upper-casing already turns every "n" into "N".
         nucleotides = str(rec.seq).upper().replace("-", "N")
         in_frame = Seq(nucleotides, IUPAC.IUPACAmbiguousDNA())[offset:frame_end]
         translated.append(
             SeqRecord(in_frame.translate(),
                       name=rec.name,
                       id=rec.id,
                       description=""))
     return translated
Esempio n. 22
0
def get_sine_forward(sine_fname):
    """Return the single sequence of the FASTA file, in the file's direction.

    The unpacking fails with ValueError unless the file holds exactly one
    record.
    """
    parsed = SeqIO.parse(sine_fname,
                         "fasta",
                         alphabet=IUPAC.IUPACAmbiguousDNA())
    (only_record,) = parsed
    # TODO: If we return it as dumb string, why did we bother about the alphabet?
    # TODO: The reference SINEs do contain a couple ambiguous chars - N, Y.
    return str(only_record.seq)
Esempio n. 23
0
 def setUp(self):
     """Build the codon alignment for TEST_ALIGN_FILE6 into ``self.aln``.

     The third fixture file supplies whitespace-separated pairs; the first
     two fields of each line form the ``corr_dict`` given to
     ``codonalign.build``.  BiopythonWarning is silenced during the build.
     """
     paths = TEST_ALIGN_FILE6[0]
     nucl = SeqIO.parse(paths[0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
     prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
     id_corr = {}
     with open(paths[2]) as handle:
         for line in handle:
             id_corr[line.split()[0]] = line.split()[1]
     with warnings.catch_warnings():
         warnings.simplefilter('ignore', BiopythonWarning)
         self.aln = codonalign.build(prot, nucl, corr_dict=id_corr, alphabet=codonalign.default_codon_alphabet)
Esempio n. 24
0
 def test_reverse_complements(self):
     """Test double reverse complement preserves the sequence."""
     rna_letters = "".join(sorted(ambiguous_rna_values))
     dna_letters = "".join(sorted(ambiguous_dna_values))
     # Note no U or T in the last sequence
     candidates = [
         Seq.Seq(rna_letters),
         Seq.Seq(dna_letters),
         Seq.Seq(rna_letters, Alphabet.generic_rna),
         Seq.Seq(dna_letters, Alphabet.generic_dna),
         Seq.Seq(rna_letters.replace("X", ""), IUPAC.IUPACAmbiguousRNA()),
         Seq.Seq(dna_letters.replace("X", ""), IUPAC.IUPACAmbiguousDNA()),
         Seq.Seq("AWGAARCKG"),
     ]
     for original in candidates:
         once = original.reverse_complement()
         twice = once.reverse_complement()
         self.assertEqual(str(original), str(twice))
Esempio n. 25
0
    def set_primer_seqs(self, fwd_sequence, rev_sequence):
        """Store the forward and reverse primers as a pair of Seq objects.

        Both strings are wrapped in ``Seq`` with the ambiguous DNA alphabet,
        logged at info level and saved as ``self._primer_pair``.

        Parameters
        ----------
        fwd_sequence : string
            forward primer sequence - ambiguities allowed.
        rev_sequence : string
            reverse primer sequence - ambiguities allowed.

        """
        primers = (
            Seq(fwd_sequence, IUPAC.IUPACAmbiguousDNA()),
            Seq(rev_sequence, IUPAC.IUPACAmbiguousDNA()),
        )
        log = self.logger
        log.info("Setting foward primer to " + fwd_sequence)
        log.info("Setting reverse primer to " + rev_sequence)
        self._primer_pair = primers
Esempio n. 26
0
 def setUp(self):
     """Build one codon alignment per fixture and store them in ``self.alns``.

     ``i[1]`` is a mode string ('parse', 'index' or 'id') that selects how
     the nucleotide file is opened and which options are handed to
     ``CodonAlign.build``; ``i[0]`` holds the file paths.
     """
     self.aln_file = [
         TEST_ALIGN_FILE1, TEST_ALIGN_FILE2, TEST_ALIGN_FILE3,
         TEST_ALIGN_FILE4, TEST_ALIGN_FILE5, TEST_ALIGN_FILE6
     ]
     alns = []
     for i in self.aln_file:
         if i[1] == 'parse':
             nucl = SeqIO.parse(i[0][0],
                                'fasta',
                                alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(i[0][1], 'clustal', alphabet=IUPAC.protein)
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
                 caln = CodonAlign.build(
                     prot, nucl, alphabet=CodonAlign.default_codon_alphabet)
         elif i[1] == 'index':
             nucl = SeqIO.index(i[0][0],
                                'fasta',
                                alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(i[0][1], 'clustal', alphabet=IUPAC.protein)
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
                 caln = CodonAlign.build(
                     prot,
                     nucl,
                     alphabet=CodonAlign.default_codon_alphabet,
                     max_score=20)
         elif i[1] == 'id':
             nucl = SeqIO.parse(i[0][0],
                                'fasta',
                                alphabet=IUPAC.IUPACUnambiguousDNA())
             prot = AlignIO.read(i[0][1], 'clustal', alphabet=IUPAC.protein)
             # Read the id pairs inside a context manager so the file is
             # closed; the former open(...).readlines() leaked the handle.
             with open(i[0][2]) as handle:
                 id_corr = dict((line.split()[0], line.split()[1])
                                for line in handle)
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
                 caln = CodonAlign.build(
                     prot,
                     nucl,
                     corr_dict=id_corr,
                     alphabet=CodonAlign.default_codon_alphabet)
         alns.append(caln)
     self.alns = alns
Esempio n. 27
0
def gapCdsToProteins(proteinAlignment, extraDnaSeqs=None):
    """Build a codon alignment by threading DNA through a protein alignment.

    Replaces proteinToCodonAlignment().  The DNA of every protein id is
    fetched with patric_api.getSequenceOfFeatures and can be overridden per
    id through extraDnaSeqs.  For each sequence, every gap of the aligned
    protein becomes '---' and every other position consumes the next three
    DNA letters.

    :param proteinAlignment: protein alignment whose records carry the ids.
    :param extraDnaSeqs: optional mapping of id to a DNA record that replaces
        (or supplies) the fetched one.
    :returns: the DNA alignment read back with AlignIO from the generated
        FASTA text.
    :raises Exception: when the DNA ids and the protein ids differ.
    """
    protSeqDict = {}
    for seqRecord in proteinAlignment:
        protSeqDict[seqRecord.id] = seqRecord
    dnaFasta = patric_api.getSequenceOfFeatures(protSeqDict.keys(), 'dna')

    dnaSeqDict = SeqIO.to_dict(
        SeqIO.parse(StringIO(dnaFasta),
                    "fasta",
                    alphabet=IUPAC.IUPACAmbiguousDNA()))
    for seqId in protSeqDict:
        if extraDnaSeqs and seqId in extraDnaSeqs:
            dnaSeqDict[seqId] = extraDnaSeqs[seqId]
            if Debug:
                LOG.write("appending extra DNA seq %s\n" % seqId)
    if set(dnaSeqDict.keys()) != set(protSeqDict.keys()):
        raise Exception(
            "Protein and DNA sets differ:\nProteins: %s\nDNA: %s\n" %
            (", ".join(sorted(protSeqDict)), ", ".join(sorted(dnaSeqDict))))
    dnaAlignFasta = StringIO()
    prot_align_len = proteinAlignment.get_alignment_length()
    for seqId in dnaSeqDict:
        dnaSeq = dnaSeqDict[seqId].seq
        if len(dnaSeq) < 3 * prot_align_len:
            # this is to handle cases where protein exists but DNA does not
            # NOTE(review): the repeat count mixes codon and letter units and
            # can be negative (adding nothing) - confirm the intended padding.
            dnaSeq += '---' * (prot_align_len - len(dnaSeq))
        protSeq = protSeqDict[seqId].seq
        dnaAlignFasta.write(">" + seqId + "\n")
        dnaSeqPos = 0
        for residue in protSeq:
            if residue == '-':
                codon = '---'
            else:
                #  TODO: in future use a codon table to check correct matching
                codon = str(dnaSeq[dnaSeqPos:dnaSeqPos + 3])
                dnaSeqPos += 3
            dnaAlignFasta.write(codon)
        # Number of protein positions written.  Taking the length directly
        # also works for an empty protein sequence, where the former
        # loop-variable trick left protPos undefined.
        protPos = len(protSeq)  # should now be equal to prot_align_len
        if Debug:
            LOG.write(
                seqId +
                " protPos={0}, dnaSeqPos={1}, orig_DNA_len={2}, orig_prot_len={3}\n"
                .format(protPos, dnaSeqPos, len(dnaSeq), len(protSeq)))
        if protPos < prot_align_len:
            dnaAlignFasta.write("---" * (prot_align_len - protPos))
            LOG.write(
                "padding short seq {0}, of {1} pos out to {2}, orig_DNA_len={3}, orig_prot_len={4}\n"
                .format(seqId, protPos, prot_align_len, len(dnaSeq),
                        len(protSeq)))
        dnaAlignFasta.write("\n")
    dnaAlignFasta_text = dnaAlignFasta.getvalue()
    retval = AlignIO.read(StringIO(dnaAlignFasta_text), 'fasta')
    return retval
Esempio n. 28
0
    def test_translate(self):
        """Check that translate() turns a DNA open reading frame into the
        expected protein, stop codon included."""
        reading_frame = 'ATGTGGAGACGGAAACATCCGAGGACATCCGGAGGAACCCGGGGAGTTCTGAGTGGTAATTAG'
        stop_aware_alphabet = HasStopCodon(IUPAC.ExtendedIUPACProtein(), '*')
        expected_protein = Seq('MWRRKHPRTSGGTRGVLSGN*', stop_aware_alphabet)

        translated = translate(reading_frame)

        self.assertEqual(translated, expected_protein)
        self.assertEqual(len(translated), 21)
        self.assertEqual(isinstance(translated, Seq), True)
Esempio n. 29
0
def cast_sequence(ungapped_sequence):
    """
    ungapped_sequence: an iterable of records, each with a ``seq`` attribute.

    Returns a list with one Seq per record, rebuilt from the record's
    sequence text with the unambiguous DNA alphabet.
    """
    return [
        Seq(str(record.seq), IUPAC.IUPACUnambiguousDNA())
        for record in ungapped_sequence
    ]
Esempio n. 30
0
    def test_compute_background_1(self):
        """Compare the RNA background of ``self.fastas`` with the expected
        per-letter values, to three decimal places."""
        expected = {'A': 0.1944,
                    'C': 0.1388,
                    'U': 0.5277,
                    'G': 0.1388}
        target = ms.compute_background(self.fastas,
                                       IUPAC.IUPACUnambiguousRNA(),
                                       verbose=False)

        for letter in expected:
            self.assertAlmostEqual(target[letter], expected[letter], 3)