Beispiel #1
0
def _get_some_qual_and_lengths(fhand, force_file_as_non_seek):
    '''It peeks at the first fastq records and collects their quality lengths.

    Up to SEQS_TO_GUESS_FASTQ_VERSION + 1 records are parsed, either directly
    from fhand or, when fhand is not seekable or force_file_as_non_seek is
    true, from an in-memory copy of its first CHUNK_TO_GUESS_FASTQ_VERSION
    characters.

    It returns a (lengths, is_sanger, chunk) tuple:
        - (None, True, chunk) as soon as one quality character with a code
          below 64 is found.
        - (lengths, None, chunk) otherwise, where lengths is an array('I')
          with the quality string length of every parsed record and None
          means that the encoding is still undecided.
    chunk is the leading text of the file that was read for the guess.

    It raises UndecidedFastqVersionError if the fastq parser fails with a
    ValueError.
    '''
    seqs_to_peek = get_setting('SEQS_TO_GUESS_FASTQ_VERSION')
    chunk_size = get_setting('CHUNK_TO_GUESS_FASTQ_VERSION')

    # Asked only once: the answer selects the reading strategy and also
    # tells us whether the handle may be rewound when we are done.
    can_rewind = fhand_is_seekable(fhand)

    lengths = array('I')
    seqs_analyzed = 0
    if can_rewind and not force_file_as_non_seek:
        fmt_fhand = fhand
        chunk = fmt_fhand.read(chunk_size)
        fhand.seek(0)
    else:
        # The records are parsed from a copy of the first chunk, so the
        # parser never touches the real handle.
        chunk = peek_chunk_from_file(fhand, chunk_size)
        fmt_fhand = cStringIO.StringIO(chunk)

    try:
        for seq in FastqGeneralIterator(fmt_fhand):
            qual = seq[2]
            # Codes below 64 are taken as proof of the sanger encoding, so
            # there is no need to keep on reading.
            if any(ord(char) < 64 for char in qual):
                return None, True, chunk  # no lengths, is_sanger, chunk
            lengths.append(len(qual))
            seqs_analyzed += 1
            if seqs_analyzed > seqs_to_peek:
                break
    except ValueError:
        msg = 'The file is Fastq, but the version is difficult to guess'
        raise UndecidedFastqVersionError(msg)
    finally:
        # Every exit path leaves the handle at the start, but seeking a
        # non-seekable handle would fail, so it is only done when possible.
        if can_rewind:
            fhand.seek(0)
    return lengths, None, chunk  # don't know if it's sanger
Beispiel #2
0
def _guess_format(fhand, force_file_as_non_seek):
    '''It guesses the format of the sequence file.

    This function is just for testing forcing the fhand as non-seekable.
    It does ignore the solexa fastq version.

    The guess is based on how the first 2048 characters of the file start:
        - '>': 'qual' if the first item of the second line is made only of
          digits, 'fasta' otherwise.
        - '@': whatever _guess_fastq_version returns.
        - 'LOCUS': 'genbank'.
        - 'ID': 'embl'.

    It raises FileIsEmptyError if nothing could be read and
    UnknownFormatError if the start matches none of the formats or if the
    line after a fasta header is missing, blank or another header.
    '''
    chunk_size = 2048
    chunk = peek_chunk_from_file(fhand, chunk_size)
    if not chunk:
        raise FileIsEmptyError('The file is empty')
    lines = chunk.splitlines()
    if chunk.startswith('>'):
        # A chunk with just the header line or with a blank second line has
        # nothing to tell fasta and qual apart. Indexing blindly would fail
        # with an IndexError, so those cases are reported as malformed.
        second_line_items = lines[1].split() if len(lines) > 1 else []
        if not second_line_items or lines[1].startswith('>'):
            raise UnknownFormatError('Malformed fasta')
        # qual files have numbers where fasta files have residues
        if second_line_items[0].isdigit():
            return 'qual'
        return 'fasta'
    elif chunk.startswith('@'):
        return _guess_fastq_version(fhand, force_file_as_non_seek)
    elif chunk.startswith('LOCUS'):
        return 'genbank'
    elif chunk.startswith('ID'):
        return 'embl'
    raise UnknownFormatError('Sequence file of unknown format.')
Beispiel #3
0
def _guess_format(fhand, force_file_as_non_seek):
    """It guesses the format of the sequence file.

    This function is just for testing forcing the fhand as non-seekable.
    It does ignore the solexa fastq version.

    The guess is based on how the first 1024 characters of the file start:
        - ">": "qual" if the first item of the second line is made only of
          digits, "fasta" otherwise.
        - "@": whatever _guess_fastq_version returns.
        - "LOCUS": "genbank".
        - "ID": "embl".

    It raises UnknownFormatError if nothing could be read, if the start
    matches none of the formats or if the line after a fasta header is
    missing, blank or another header.
    """
    chunk_size = 1024
    chunk = peek_chunk_from_file(fhand, chunk_size)
    if not chunk:
        raise UnknownFormatError("The file is empty")
    lines = chunk.splitlines()
    if chunk.startswith(">"):
        # A chunk with just the header line or with a blank second line has
        # nothing to tell fasta and qual apart. Indexing blindly would fail
        # with an IndexError, so those cases are reported as malformed.
        second_line_items = lines[1].split() if len(lines) > 1 else []
        if not second_line_items or lines[1].startswith(">"):
            raise UnknownFormatError("Malformed fasta")
        # qual files have numbers where fasta files have residues
        if second_line_items[0].isdigit():
            return "qual"
        return "fasta"
    elif chunk.startswith("@"):
        return _guess_fastq_version(fhand, force_file_as_non_seek)
    elif chunk.startswith("LOCUS"):
        return "genbank"
    elif chunk.startswith("ID"):
        return "embl"
    raise UnknownFormatError("Sequence file of unknown format.")
Beispiel #4
0
def _get_some_qual_and_lengths(fhand, force_file_as_non_seek):
    """It peeks at the first fastq records and collects their quality lengths.

    Up to SEQS_TO_GUESS_FASTQ_VERSION + 1 records are parsed, either directly
    from fhand or, when fhand is not seekable or force_file_as_non_seek is
    true, from an in-memory copy of its first CHUNK_TO_GUESS_FASTQ_VERSION
    characters.

    It returns a (lengths, is_sanger) tuple:
        - (None, True) as soon as one quality character with a code below 64
          is found.
        - (lengths, None) otherwise, where lengths is an array("I") with the
          quality string length of every parsed record and None means that
          the encoding is still undecided.

    It raises UnknownFormatError if the fastq parser fails with a ValueError.
    """
    seqs_to_peek = SEQS_TO_GUESS_FASTQ_VERSION
    chunk_size = CHUNK_TO_GUESS_FASTQ_VERSION

    # Asked only once: the answer selects the reading strategy and also
    # tells us whether the handle may be rewound when we are done.
    can_rewind = fhand_is_seekable(fhand)

    lengths = array("I")
    seqs_analyzed = 0
    if can_rewind and not force_file_as_non_seek:
        fmt_fhand = fhand
    else:
        # The records are parsed from a copy of the first chunk, so the
        # parser never touches the real handle.
        chunk = peek_chunk_from_file(fhand, chunk_size)
        fmt_fhand = cStringIO.StringIO(chunk)

    try:
        for seq in FastqGeneralIterator(fmt_fhand):
            qual = seq[2]
            # Codes below 64 are taken as proof of the sanger encoding, so
            # there is no need to keep on reading.
            if any(ord(char) < 64 for char in qual):
                return None, True  # no lengths, is_sanger
            lengths.append(len(qual))
            seqs_analyzed += 1
            if seqs_analyzed > seqs_to_peek:
                break
    except ValueError:
        raise UnknownFormatError("Malformed fastq")
    finally:
        # Every exit path leaves the handle at the start, but seeking a
        # non-seekable handle would fail, so it is only done when possible.
        if can_rewind:
            fhand.seek(0)
    return lengths, None  # lengths, don't know if it's sanger
Beispiel #5
0
def _get_some_qual_and_lengths(fhand, force_file_as_non_seek):
    '''It peeks at the first fastq records and collects their quality lengths.

    Up to SEQS_TO_GUESS_FASTQ_VERSION + 1 records are parsed, either directly
    from fhand or, when fhand is not seekable or force_file_as_non_seek is
    true, from an in-memory copy of its first CHUNK_TO_GUESS_FASTQ_VERSION
    characters.

    It returns a (lengths, is_sanger, chunk) tuple:
        - (None, True, chunk) as soon as one quality character with a code
          below 64 is found.
        - (lengths, None, chunk) otherwise, where lengths is an array('I')
          with the quality string length of every parsed record and None
          means that the encoding is still undecided.
    chunk is the leading text of the file that was read for the guess.

    It raises UndecidedFastqVersionError if the fastq parser fails with a
    ValueError.
    '''
    seqs_to_peek = get_setting('SEQS_TO_GUESS_FASTQ_VERSION')
    chunk_size = get_setting('CHUNK_TO_GUESS_FASTQ_VERSION')

    # Asked only once: the answer selects the reading strategy and also
    # tells us whether the handle may be rewound when we are done.
    can_rewind = fhand_is_seekable(fhand)

    lengths = array('I')
    seqs_analyzed = 0
    if can_rewind and not force_file_as_non_seek:
        fmt_fhand = fhand
        chunk = fmt_fhand.read(chunk_size)
        fhand.seek(0)
    else:
        # The records are parsed from a copy of the first chunk, so the
        # parser never touches the real handle.
        chunk = peek_chunk_from_file(fhand, chunk_size)
        fmt_fhand = cStringIO.StringIO(chunk)

    try:
        for seq in FastqGeneralIterator(fmt_fhand):
            qual = seq[2]
            # Codes below 64 are taken as proof of the sanger encoding, so
            # there is no need to keep on reading.
            if any(ord(char) < 64 for char in qual):
                return None, True, chunk  # no lengths, is_sanger, chunk
            lengths.append(len(qual))
            seqs_analyzed += 1
            if seqs_analyzed > seqs_to_peek:
                break
    except ValueError:
        msg = 'The file is Fastq, but the version is difficult to guess'
        raise UndecidedFastqVersionError(msg)
    finally:
        # Every exit path leaves the handle at the start, but seeking a
        # non-seekable handle would fail, so it is only done when possible.
        if can_rewind:
            fhand.seek(0)
    return lengths, None, chunk  # don't know if it's sanger
Beispiel #6
0
def _guess_format(fhand, force_file_as_non_seek):
    '''It guesses the format of the sequence file.

    This function is just for testing forcing the fhand as non-seekable.
    It does ignore the solexa fastq version.

    The guess is based on how the first 2048 characters of the file start:
        - '>': 'qual' if the first item of the second line is made only of
          digits, 'fasta' otherwise.
        - '@': whatever _guess_fastq_version returns.
        - 'LOCUS': 'genbank'.
        - 'ID': 'embl'.

    It raises FileIsEmptyError if nothing could be read and
    UnknownFormatError if the start matches none of the formats or if the
    line after a fasta header is missing, blank or another header.
    '''
    chunk_size = 2048
    chunk = peek_chunk_from_file(fhand, chunk_size)
    if not chunk:
        raise FileIsEmptyError('The file is empty')
    lines = chunk.splitlines()
    if chunk.startswith('>'):
        # A chunk with just the header line or with a blank second line has
        # nothing to tell fasta and qual apart. Indexing blindly would fail
        # with an IndexError, so those cases are reported as malformed.
        second_line_items = lines[1].split() if len(lines) > 1 else []
        if not second_line_items or lines[1].startswith('>'):
            raise UnknownFormatError('Malformed fasta')
        # qual files have numbers where fasta files have residues
        if second_line_items[0].isdigit():
            return 'qual'
        return 'fasta'
    elif chunk.startswith('@'):
        return _guess_fastq_version(fhand, force_file_as_non_seek)
    elif chunk.startswith('LOCUS'):
        return 'genbank'
    elif chunk.startswith('ID'):
        return 'embl'
    raise UnknownFormatError('Sequence file of unknown format.')