Beispiel #1
0
def _parse_quality_scores(fh, seq_len, variant, phred_offset, prev):
    """Read and decode the quality lines of a single FASTQ record.

    Parameters
    ----------
    fh : iterable
        Source handed to ``_line_generator(fh, skip_blanks=False)``.
    seq_len : int
        Number of sequence characters in the record; exactly this many
        quality characters are expected.
    variant, phred_offset
        Forwarded unchanged to ``_decode_qual_to_phred``.
    prev : str
        The line read immediately before the first quality line. A falsy
        value means the preceding line was blank.

    Returns
    -------
    tuple
        ``(np.hstack(decoded_chunks), next_header)``. ``next_header`` is the
        ``@`` line that ended this record, or ``None`` if the input ran out.

    Raises
    ------
    FASTQFormatError
        If more quality characters than ``seq_len`` are found, if the input
        ends before ``seq_len`` quality characters were read, or (through
        ``_blank_error``) if a blank line precedes a quality line.
    """
    decoded = []
    n_qual = 0
    for line in _line_generator(fh, skip_blanks=False):
        if not line:
            # A blank line is only remembered here; it becomes an error when
            # (and only when) another quality line follows it.
            prev = line
            continue

        # '@' is also a legal quality character, so a leading '@' only marks
        # the next record once all expected quality characters were consumed.
        if line.startswith('@') and n_qual == seq_len:
            return np.hstack(decoded), line

        if not prev:
            _blank_error("after '+' or within quality scores")

        n_qual += len(line)
        surplus = n_qual - seq_len
        if surplus > 0:
            raise FASTQFormatError(
                "Found more quality score characters than sequence "
                "characters. Extra quality score characters: %r" %
                line[-surplus:])

        decoded.append(_decode_qual_to_phred(line,
                                             variant=variant,
                                             phred_offset=phred_offset))
        prev = line

    if n_qual == seq_len:
        return np.hstack(decoded), None
    raise FASTQFormatError(
        "Found incomplete/truncated FASTQ record at end of file.")
Beispiel #2
0
def _parse_sequence_data(fh, prev):
    """Collect the sequence lines of one FASTQ record up to its '+' line.

    Parameters
    ----------
    fh : iterable
        Source handed to ``_line_generator(fh, skip_blanks=False)``.
    prev : str
        The line read immediately before the first sequence line (the
        record's ``@`` header). A falsy value means that line was blank.

    Returns
    -------
    tuple of (str, str)
        The concatenated sequence and the ``+`` quality header line.

    Raises
    ------
    FASTQFormatError
        If the record has no sequence data, if a new ``@`` header appears
        before any ``+`` line, if a sequence line contains whitespace, if
        the input ends before a ``+`` line, or (through ``_blank_error``)
        if a blank line precedes a sequence or ``+`` line.
    """
    pieces = []
    for line in _line_generator(fh, skip_blanks=False):
        if line.startswith('+'):
            # End of the sequence block: validate, then hand back the result.
            if not prev:
                _blank_error("before '+'")
            if pieces:
                return ''.join(pieces), line
            raise FASTQFormatError(
                "Found FASTQ record without sequence data.")

        if line.startswith('@'):
            raise FASTQFormatError(
                "Found FASTQ record that is missing a quality (+) header "
                "line after sequence data.")

        # Ordinary sequence line (possibly blank; a blank one is detected on
        # the following iteration through ``prev``).
        if not prev:
            _blank_error("after header or within sequence")
        if _whitespace_regex.search(line):
            raise FASTQFormatError(
                "Found whitespace in sequence data: %r" % str(line))
        pieces.append(line)
        prev = line

    raise FASTQFormatError(
        "Found incomplete/truncated FASTQ record at end of file.")
Beispiel #3
0
def _fastq_to_generator(fh,
                        variant=None,
                        phred_offset=None,
                        constructor=Sequence,
                        **kwargs):
    """Yield one ``constructor`` object per FASTQ record read from ``fh``.

    Parameters
    ----------
    fh : iterable
        Source of FASTQ lines, consumed through ``_line_generator``.
    variant, phred_offset
        Forwarded unchanged to ``_parse_quality_scores``.
    constructor : callable
        Called as ``constructor(seq, metadata={'id': ..., 'description':
        ...}, positional_metadata={'quality': ...}, **kwargs)`` for every
        record.
    **kwargs
        Extra keyword arguments passed through to ``constructor``.

    Yields
    ------
    object
        Whatever ``constructor`` returns for each record.

    Raises
    ------
    FASTQFormatError
        If the first non-blank line is not an ``@`` header, or if a
        non-empty ``+`` header does not repeat the ``@`` header text.
        Errors raised by the parsing helpers propagate unchanged.

    Notes
    -----
    Input with no non-blank lines yields nothing.
    """
    # Skip any blank or whitespace-only lines at beginning of file
    try:
        seq_header = next(_line_generator(fh, skip_blanks=True))
    except StopIteration:
        # Nothing to parse. Letting StopIteration escape a generator body is
        # converted to RuntimeError by PEP 479, so end the generator
        # explicitly instead.
        return

    if not seq_header.startswith('@'):
        raise FASTQFormatError(
            "Expected sequence (@) header line at start of file: %r" %
            str(seq_header))

    # ``_parse_quality_scores`` returns the next record's header, or None
    # once the input is exhausted, which terminates the loop.
    while seq_header is not None:
        id_, desc = _parse_fasta_like_header(seq_header)
        seq, qual_header = _parse_sequence_data(fh, seq_header)

        # A bare '+' is always accepted; otherwise the text after '+' must
        # equal the text after '@'.
        if qual_header != '+' and qual_header[1:] != seq_header[1:]:
            raise FASTQFormatError(
                "Sequence (@) and quality (+) header lines do not match: "
                "%r != %r" % (str(seq_header[1:]), str(qual_header[1:])))

        phred_scores, seq_header = _parse_quality_scores(
            fh, len(seq), variant, phred_offset, qual_header)
        yield constructor(seq,
                          metadata={
                              'id': id_,
                              'description': desc
                          },
                          positional_metadata={'quality': phred_scores},
                          **kwargs)
Beispiel #4
0
def _parse_sequence_data(fh):
    """Collect the sequence lines of one FASTQ record up to its '+' line.

    Parameters
    ----------
    fh : iterable
        Source handed to ``_line_generator``.

    Returns
    -------
    tuple of (str, str)
        The concatenated sequence and the ``+`` quality header line.

    Raises
    ------
    FASTQFormatError
        If the record has no sequence data, if a new ``@`` header appears
        before any ``+`` line, if a sequence line contains whitespace, or
        if the input ends before a ``+`` line is seen.
    """
    pieces = []
    for line in _line_generator(fh):
        if line.startswith('+'):
            # The '+' line closes the sequence block.
            if pieces:
                return ''.join(pieces), line
            raise FASTQFormatError(
                "Found FASTQ record without sequence data.")

        if line.startswith('@'):
            raise FASTQFormatError(
                "Found FASTQ record that is missing a quality (+) header "
                "line after sequence data.")

        if _whitespace_regex.search(line):
            raise FASTQFormatError(
                "Found whitespace in sequence data: %r" % line)
        pieces.append(line)

    raise FASTQFormatError(
        "Found incomplete/truncated FASTQ record at end of file.")
Beispiel #5
0
def _fastq_to_generator(fh,
                        variant=None,
                        phred_offset=None,
                        constructor=BiologicalSequence):
    """Yield one ``constructor`` object per FASTQ record read from ``fh``.

    Parameters
    ----------
    fh : iterable
        Source of FASTQ lines, consumed through ``_line_generator``.
    variant, phred_offset
        Forwarded unchanged to ``_parse_quality_scores``.
    constructor : callable
        Called as ``constructor(seq, id=..., description=..., quality=...)``
        for every record.

    Yields
    ------
    object
        Whatever ``constructor`` returns for each record.

    Raises
    ------
    FASTQFormatError
        If the first line is not an ``@`` header, or if a non-empty ``+``
        header does not repeat the ``@`` header text. Errors raised by the
        parsing helpers propagate unchanged.

    Notes
    -----
    Input with no lines at all yields nothing.
    """
    try:
        seq_header = next(_line_generator(fh))
    except StopIteration:
        # Empty input. Letting StopIteration escape a generator body is
        # converted to RuntimeError by PEP 479, so end the generator
        # explicitly instead.
        return

    if not seq_header.startswith('@'):
        raise FASTQFormatError(
            "Expected sequence (@) header line at start of file: %r" %
            seq_header)

    # ``_parse_quality_scores`` returns the next record's header, or None
    # once the input is exhausted, which terminates the loop.
    while seq_header is not None:
        id_, desc = _parse_fasta_like_header(seq_header)
        seq, qual_header = _parse_sequence_data(fh)

        # A bare '+' is always accepted; otherwise the text after '+' must
        # equal the text after '@'.
        if qual_header != '+' and qual_header[1:] != seq_header[1:]:
            raise FASTQFormatError(
                "Sequence (@) and quality (+) header lines do not match: "
                "%r != %r" % (seq_header[1:], qual_header[1:]))

        phred_scores, seq_header = _parse_quality_scores(
            fh, len(seq), variant, phred_offset)
        yield constructor(seq, id=id_, description=desc, quality=phred_scores)
Beispiel #6
0
def _parse_quality_scores(fh, seq_len, variant, phred_offset):
    """Read and decode the quality lines of a single FASTQ record.

    Parameters
    ----------
    fh : iterable
        Source handed to ``_line_generator``.
    seq_len : int
        Number of sequence characters in the record; exactly this many
        quality characters are expected.
    variant, phred_offset
        Forwarded unchanged to ``_decode_qual_to_phred``.

    Returns
    -------
    tuple
        ``(scores, next_header)`` where ``scores`` is a list extended with
        the decoded values of every quality line, and ``next_header`` is
        the ``@`` line that ended this record, or ``None`` if the input ran
        out.

    Raises
    ------
    FASTQFormatError
        If more quality characters than ``seq_len`` are found, or if the
        input ends before ``seq_len`` quality characters were read.
    """
    scores = []
    n_qual = 0
    for line in _line_generator(fh):
        # '@' is also a legal quality character, so a leading '@' only marks
        # the next record once all expected quality characters were consumed.
        if line.startswith('@') and n_qual == seq_len:
            return scores, line

        n_qual += len(line)
        surplus = n_qual - seq_len
        if surplus > 0:
            raise FASTQFormatError(
                "Found more quality score characters than sequence "
                "characters. Extra quality score characters: %r" %
                line[-surplus:])

        scores.extend(_decode_qual_to_phred(line,
                                            variant=variant,
                                            phred_offset=phred_offset))

    if n_qual == seq_len:
        return scores, None
    raise FASTQFormatError(
        "Found incomplete/truncated FASTQ record at end of file.")
Beispiel #7
0
def _line_generator(fh):
    for line in fh:
        line = line.rstrip('\n')
        if not line:
            raise FASTQFormatError("Found blank line in FASTQ-formatted file.")
        yield line
Beispiel #8
0
def _blank_error(unique_text):
    """Raise ``FASTQFormatError`` reporting a blank or whitespace-only line.

    Parameters
    ----------
    unique_text : str
        Description of where the blank line was found; it is substituted
        into the error message.

    Raises
    ------
    FASTQFormatError
        Always.
    """
    template = "Found blank or whitespace-only line {} in FASTQ file"
    raise FASTQFormatError(template.format(unique_text))