Example #1
0
def read_single_with_titles(filename, alphabet):
    global title_to_ids
    iterator = FastaIterator(open(filename), alphabet, title_to_ids)
    record = iterator.next()
    try:
        second = iterator.next()
    except StopIteration:
        second = None
    assert record is not None and second is None
    return record
def read_single_with_titles(filename, alphabet):
    global title_to_ids
    iterator = FastaIterator(open(filename), alphabet, title_to_ids)
    record = iterator.next()
    try:
        second = iterator.next()
    except StopIteration:
        second = None
    assert record is not None and second is None
    return record
Example #3
0
def PairedFastaQualIterator(fasta_handle, qual_handle, alphabet = single_letter_alphabet, title2ids = None) :
    """Iterate over matched FASTA and QUAL files as SeqRecord objects.

    For example, consider this short QUAL file::

        >EAS54_6_R1_2_1_413_324
        26 26 18 26 26 26 26 26 26 26 26 26 26 26 26 22 26 26 26 26
        26 26 26 23 23
        >EAS54_6_R1_2_1_540_792
        26 26 26 26 26 26 26 26 26 26 26 22 26 26 26 26 26 12 26 26
        26 18 26 23 18
        >EAS54_6_R1_2_1_443_348
        26 26 26 26 26 26 26 26 26 26 26 24 26 22 26 26 13 22 26 18
        24 18 18 18 18
    
    And a matching FASTA file::

        >EAS54_6_R1_2_1_413_324
        CCCTTCTTGTCTTCAGCGTTTCTCC
        >EAS54_6_R1_2_1_540_792
        TTGGCAGGCCAAGGCCGATGGATCA
        >EAS54_6_R1_2_1_443_348
        GTTGCTTCTGGCGTGGGTGGGGGGG

    You can parse these separately using Bio.SeqIO with the "qual" and
    "fasta" formats, but then you'll get a group of SeqRecord objects with
    no sequence, and a matching group with the sequence but not the
    qualities.  Because it only deals with one input file handle, Bio.SeqIO
    can't be used to read the two files together - but this function can!
    For example,
    
    >>> rec_iter = PairedFastaQualIterator(open("Quality/example.fasta", "rU"),
    ...                                    open("Quality/example.qual", "rU"))
    >>> for record in rec_iter :
    ...     print record.id, record.seq
    EAS54_6_R1_2_1_413_324 CCCTTCTTGTCTTCAGCGTTTCTCC
    EAS54_6_R1_2_1_540_792 TTGGCAGGCCAAGGCCGATGGATCA
    EAS54_6_R1_2_1_443_348 GTTGCTTCTGGCGTGGGTGGGGGGG

    As with the FASTQ or QUAL parsers, if you want to look at the qualities,
    they are in each record's per-letter-annotation dictionary as a simple
    list of integers:

    >>> print record.letter_annotations["phred_quality"]
    [26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 24, 26, 22, 26, 26, 13, 22, 26, 18, 24, 18, 18, 18, 18]

    If you have access to data as a FASTQ format file, using that directly
    would be simpler and more straight forward.  Note that you can easily use
    this function to convert paired FASTA and QUAL files into FASTQ files:

    >>> from Bio import SeqIO
    >>> rec_iter = PairedFastaQualIterator(open("Quality/example.fasta", "rU"),
    ...                                    open("Quality/example.qual", "rU"))
    >>> out_handle = open("Quality/temp.fastq", "w")
    >>> SeqIO.write(rec_iter, out_handle, "fastq")
    3
    >>> out_handle.close()

    And don't forget to clean up the temp file if you don't need it anymore:

    >>> import os
    >>> os.remove("Quality/temp.fastq")    
    """
    from Bio.SeqIO.FastaIO import FastaIterator    
    fasta_iter = FastaIterator(fasta_handle, alphabet=alphabet, \
                               title2ids=title2ids)
    qual_iter = QualPhredIterator(qual_handle, alphabet=alphabet, \
                                  title2ids=title2ids)

    #Using zip(...) would create a list loading everything into memory!
    #It would also not catch any extra records found in only one file.
    while True :
        try :
            f_rec = fasta_iter.next()
        except StopIteration :
            f_rec = None
        try :
            q_rec = qual_iter.next()
        except StopIteration :
            q_rec = None
        if f_rec is None and q_rec is None :
            #End of both files
            break
        if f_rec is None :
            raise ValueError("FASTA file has more entries than the QUAL file.")
        if q_rec is None :
            raise ValueError("QUAL file has more entries than the FASTA file.")
        if f_rec.id != q_rec.id :
            raise ValueError("FASTA and QUAL entries do not match (%s vs %s)." \
                             % (f_rec.id, q_rec.id))
        if len(f_rec) != len(q_rec.letter_annotations["phred_quality"]) :
            raise ValueError("Sequence length and number of quality scores disagree for %s" \
                             % f_rec.id)
        #Merge the data....
        f_rec.letter_annotations["phred_quality"] = q_rec.letter_annotations["phred_quality"]
        yield f_rec
Example #4
0
def PairedFastaQualIterator(fasta_handle,
                            qual_handle,
                            alphabet=single_letter_alphabet,
                            title2ids=None):
    """Iterate over matched FASTA and QUAL files as SeqRecord objects.

    For example, consider this short QUAL file::

        >EAS54_6_R1_2_1_413_324
        26 26 18 26 26 26 26 26 26 26 26 26 26 26 26 22 26 26 26 26
        26 26 26 23 23
        >EAS54_6_R1_2_1_540_792
        26 26 26 26 26 26 26 26 26 26 26 22 26 26 26 26 26 12 26 26
        26 18 26 23 18
        >EAS54_6_R1_2_1_443_348
        26 26 26 26 26 26 26 26 26 26 26 24 26 22 26 26 13 22 26 18
        24 18 18 18 18
    
    And a matching FASTA file::

        >EAS54_6_R1_2_1_413_324
        CCCTTCTTGTCTTCAGCGTTTCTCC
        >EAS54_6_R1_2_1_540_792
        TTGGCAGGCCAAGGCCGATGGATCA
        >EAS54_6_R1_2_1_443_348
        GTTGCTTCTGGCGTGGGTGGGGGGG

    You can parse these separately using Bio.SeqIO with the "qual" and
    "fasta" formats, but then you'll get a group of SeqRecord objects with
    no sequence, and a matching group with the sequence but not the
    qualities.  Because it only deals with one input file handle, Bio.SeqIO
    can't be used to read the two files together - but this function can!
    For example,
    
    >>> rec_iter = PairedFastaQualIterator(open("Quality/example.fasta", "rU"),
    ...                                    open("Quality/example.qual", "rU"))
    >>> for record in rec_iter :
    ...     print record.id, record.seq
    EAS54_6_R1_2_1_413_324 CCCTTCTTGTCTTCAGCGTTTCTCC
    EAS54_6_R1_2_1_540_792 TTGGCAGGCCAAGGCCGATGGATCA
    EAS54_6_R1_2_1_443_348 GTTGCTTCTGGCGTGGGTGGGGGGG

    As with the FASTQ or QUAL parsers, if you want to look at the qualities,
    they are in each record's per-letter-annotation dictionary as a simple
    list of integers:

    >>> print record.letter_annotations["phred_quality"]
    [26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 24, 26, 22, 26, 26, 13, 22, 26, 18, 24, 18, 18, 18, 18]

    If you have access to data as a FASTQ format file, using that directly
    would be simpler and more straight forward.  Note that you can easily use
    this function to convert paired FASTA and QUAL files into FASTQ files:

    >>> from Bio import SeqIO
    >>> rec_iter = PairedFastaQualIterator(open("Quality/example.fasta", "rU"),
    ...                                    open("Quality/example.qual", "rU"))
    >>> out_handle = open("Quality/temp.fastq", "w")
    >>> SeqIO.write(rec_iter, out_handle, "fastq")
    3
    >>> out_handle.close()

    And don't forget to clean up the temp file if you don't need it anymore:

    >>> import os
    >>> os.remove("Quality/temp.fastq")    
    """
    from Bio.SeqIO.FastaIO import FastaIterator
    fasta_iter = FastaIterator(fasta_handle, alphabet=alphabet, \
                               title2ids=title2ids)
    qual_iter = QualPhredIterator(qual_handle, alphabet=alphabet, \
                                  title2ids=title2ids)

    #Using zip(...) would create a list loading everything into memory!
    #It would also not catch any extra records found in only one file.
    while True:
        try:
            f_rec = fasta_iter.next()
        except StopIteration:
            f_rec = None
        try:
            q_rec = qual_iter.next()
        except StopIteration:
            q_rec = None
        if f_rec is None and q_rec is None:
            #End of both files
            break
        if f_rec is None:
            raise ValueError("FASTA file has more entries than the QUAL file.")
        if q_rec is None:
            raise ValueError("QUAL file has more entries than the FASTA file.")
        if f_rec.id != q_rec.id:
            raise ValueError("FASTA and QUAL entries do not match (%s vs %s)." \
                             % (f_rec.id, q_rec.id))
        if len(f_rec) != len(q_rec.letter_annotations["phred_quality"]):
            raise ValueError("Sequence length and number of quality scores disagree for %s" \
                             % f_rec.id)
        #Merge the data....
        f_rec.letter_annotations["phred_quality"] = q_rec.letter_annotations[
            "phred_quality"]
        yield f_rec