Beispiel #1
0
def _parse_fasta_raw(fh, data_parser, format_label):
    """Raw parser for FASTA or QUAL files.

    Yields raw values (seq/qual, id, description). It is the responsibility of
    the caller to construct the correct in-memory object to hold the data.

    Parameters
    ----------
    fh : iterator of str
        Source of lines. It is advanced once with ``next`` and then consumed
        by a ``for`` loop, so it must be an iterator (not merely iterable).
    data_parser : callable
        Called with the list of stripped, non-empty data lines collected for
        one record (the list is empty if a header is not followed by any data
        line). Its return value becomes the first item of the yielded tuple;
        any exception it raises propagates to the caller.
    format_label : str
        Name of the format, interpolated into error messages.

    Yields
    ------
    tuple
        ``(data_parser(chunks), id, description)`` for each record, where id
        and description come from ``_parse_fasta_like_header``.

    Raises
    ------
    FASTAFormatError
        If the first line does not start with ``'>'``, or if a blank or
        whitespace-only line is found after the first header.

    """
    # An exhausted handle simply contains no records. A bare ``next(fh)``
    # would raise StopIteration inside this generator, which Python 3.7+
    # (PEP 479) converts into a RuntimeError, so use a default instead.
    line = next(fh, None)
    if line is None:
        return

    # header check inlined here and below for performance
    if line.startswith('>'):
        id_, desc = _parse_fasta_like_header(line)
    else:
        raise FASTAFormatError(
            "Found line without a header in %s-formatted file:\n%s" %
            (format_label, line))

    data_chunks = []
    for line in fh:
        if line.startswith('>'):
            # new header, so yield current record and reset state
            yield data_parser(data_chunks), id_, desc
            data_chunks = []
            id_, desc = _parse_fasta_like_header(line)
        else:
            line = line.strip()
            if line:
                data_chunks.append(line)
            else:
                raise FASTAFormatError(
                    "Found blank or whitespace-only line in %s-formatted "
                    "file." % format_label)
    # yield last record in file
    yield data_parser(data_chunks), id_, desc
Beispiel #2
0
def _fasta_to_generator(fh, qual=FileSentinel, constructor=BiologicalSequence):
    """Yield one ``constructor`` object per FASTA record.

    When ``qual`` is ``None`` only ``fh`` is parsed and each object is built
    from the sequence, id and description. Otherwise ``fh`` and ``qual`` are
    parsed in lockstep, their ids and descriptions must agree record by
    record, and the quality scores are passed along as ``quality``.

    Raises ``FASTAFormatError`` if the two inputs differ in record count, or
    if a pair of records disagrees on id or description.

    """
    seq_records = _parse_fasta_raw(fh, _parse_sequence_data, 'FASTA')

    # FASTA only: nothing to pair up, so emit records as they come
    if qual is None:
        for sequence, seq_id, seq_desc in seq_records:
            yield constructor(sequence, id=seq_id, description=seq_desc)
        return

    score_records = _parse_fasta_raw(qual, _parse_quality_scores, 'QUAL')
    paired = zip_longest(seq_records, score_records, fillvalue=None)

    for seq_rec, score_rec in paired:
        # a None filler means that side ran out of records first
        if seq_rec is None:
            raise FASTAFormatError(
                "QUAL file has more records than FASTA file.")
        if score_rec is None:
            raise FASTAFormatError(
                "FASTA file has more records than QUAL file.")

        sequence, seq_id, seq_desc = seq_rec
        scores, score_id, score_desc = score_rec

        if seq_id != score_id:
            raise FASTAFormatError(
                "IDs do not match between FASTA and QUAL records: %r != %r"
                % (seq_id, score_id))
        if seq_desc != score_desc:
            raise FASTAFormatError(
                "Descriptions do not match between FASTA and QUAL "
                "records: %r != %r" % (seq_desc, score_desc))

        # lengths of sequence vs. scores are not compared here; that check
        # is left to the constructor
        yield constructor(sequence, id=seq_id, description=seq_desc,
                          quality=scores)
Beispiel #3
0
def _parse_quality_scores(chunks):
    """Convert the data lines of one QUAL record into an integer array.

    The lines are joined with single spaces, split on whitespace, and the
    resulting tokens are converted with ``np.asarray(..., dtype=int)``.

    Raises ``FASTAFormatError`` if ``chunks`` is empty or if the tokens cannot
    be converted to integers.

    """
    if chunks:
        joined = ' '.join(chunks)
        try:
            scores = np.asarray(joined.split(), dtype=int)
        except ValueError:
            raise FASTAFormatError(
                "Could not convert quality scores to integers:\n%s" % joined)
        return scores

    raise FASTAFormatError("Found QUAL header without quality scores.")
Beispiel #4
0
def _fasta_to_generator(fh, qual=FileSentinel, constructor=Sequence, **kwargs):
    """Yield one ``constructor`` object per FASTA record.

    Each object receives the record's id and description as a ``metadata``
    dict, plus any extra ``kwargs``. When ``qual`` is not ``None``, ``fh`` and
    ``qual`` are parsed in lockstep, their ids and descriptions must agree
    record by record, and the quality scores are passed as
    ``positional_metadata={'quality': ...}``.

    Raises ``FASTAFormatError`` if the two inputs differ in record count, or
    if a pair of records disagrees on id or description.

    """
    seq_records = _parse_fasta_raw(fh, _parse_sequence_data, FASTAFormatError)

    # FASTA only: nothing to pair up, so emit records as they come
    if qual is None:
        for sequence, seq_id, seq_desc in seq_records:
            record_md = {'id': seq_id, 'description': seq_desc}
            yield constructor(sequence, metadata=record_md, **kwargs)
        return

    score_records = _parse_fasta_raw(qual, _parse_quality_scores,
                                     QUALFormatError)
    paired = zip_longest(seq_records, score_records, fillvalue=None)

    for seq_rec, score_rec in paired:
        # a None filler means that side ran out of records first
        if seq_rec is None:
            raise FASTAFormatError(
                "QUAL file has more records than FASTA file.")
        if score_rec is None:
            raise FASTAFormatError(
                "FASTA file has more records than QUAL file.")

        sequence, seq_id, seq_desc = seq_rec
        scores, score_id, score_desc = score_rec

        if seq_id != score_id:
            raise FASTAFormatError(
                "IDs do not match between FASTA and QUAL records: %r != %r"
                % (seq_id, score_id))
        if seq_desc != score_desc:
            raise FASTAFormatError(
                "Descriptions do not match between FASTA and QUAL "
                "records: %r != %r" % (seq_desc, score_desc))

        # lengths of sequence vs. scores are not compared here; that check
        # is left to the constructor
        record_md = {'id': seq_id, 'description': seq_desc}
        per_position = {'quality': scores}
        yield constructor(sequence, metadata=record_md,
                          positional_metadata=per_position, **kwargs)
Beispiel #5
0
def _sniffer_data_parser(chunks):
    """Parse ``chunks`` as sequence data, rejecting data that parse as QUAL.

    Returns the result of ``_parse_sequence_data(chunks)`` when
    ``_parse_quality_scores(chunks)`` fails with ``QUALFormatError``.
    If the chunks parse successfully as quality scores, raises
    ``FASTAFormatError`` instead.

    """
    sequence = _parse_sequence_data(chunks)

    parses_as_qual = True
    try:
        _parse_quality_scores(chunks)
    except QUALFormatError:
        parses_as_qual = False

    if parses_as_qual:
        # flow control for the sniffer only; this message is not meant to
        # reach the user
        raise FASTAFormatError('Data appear to be quality scores.')
    return sequence
Beispiel #6
0
def _parse_sequence_data(chunks):
    """Concatenate the data lines of one record into a single string.

    Raises ``FASTAFormatError`` if ``chunks`` is empty.

    """
    if chunks:
        return ''.join(chunks)
    raise FASTAFormatError("Found header without sequence data.")