Exemple #1
0
def read_chunks(f: RawIOBase,
                buffer_size: int = 4 * 1024**2) -> Iterator[memoryview]:
    """
    Read chunks of complete FASTA or FASTQ records from a file.

    The format is chosen from the first byte of the file: "@" selects the
    FASTQ splitter, ">" or "#" (a comment character) selects the FASTA
    splitter. Nothing is yielded for an empty file.

    The yielded memoryview objects become invalid on the next iteration
    because the underlying buffer is re-used.

    Arguments:
        f: File to read from; needs to be opened in binary mode
        buffer_size: Size of the internal buffer and thereby the largest
            possible chunk size; must be at least 1

    Yields:
        memoryview over the leading part of the internal buffer. Any data
        still left over when the end of the file is reached is yielded as
        the final chunk.

    Raises:
        ValueError: buffer_size is smaller than 1
        OverflowError: The leftover data fills the whole buffer, that is,
            a record does not fit into buffer_size bytes
        UnknownFileFormat: The first byte is none of "@", ">" or "#"
    """
    # With a zero-sized buffer, the one-byte probe below would read nothing
    # and a non-empty file would be mistaken for an empty one.
    if buffer_size < 1:
        raise ValueError("Buffer size too small")

    # This buffer is re-used in each iteration.
    buf = bytearray(buffer_size)

    # Read one byte to determine file format.
    # If there is a comment char, we assume FASTA!
    start = f.readinto(memoryview(buf)[0:1])
    if start == 1 and buf[0:1] == b'@':
        head = _fastq_head
    elif start == 1 and (buf[0:1] == b'#' or buf[0:1] == b'>'):
        head = _fasta_head
    elif start == 0:
        # Empty file
        return
    else:
        raise UnknownFileFormat('Input file format unknown')

    # Layout of buf
    #
    # |-- complete records --|
    # +---+------------------+---------+-------+
    # |   |                  |         |       |
    # +---+------------------+---------+-------+
    # ^   ^                   ^         ^       ^
    # 0   start               end       bufend  len(buf)
    #
    # buf[0:start] is the 'leftover' data that could not be processed
    # in the previous iteration because it contained an incomplete
    # FASTA or FASTQ record.

    while True:
        if start == len(buf):
            # The leftover alone fills the buffer; no room to read more.
            raise OverflowError('FASTA/FASTQ record does not fit into buffer')
        bufend = f.readinto(memoryview(buf)[start:]) + start  # type: ignore
        if start == bufend:
            # End of file
            break
        end = head(buf, bufend)
        assert end <= bufend
        if end > 0:
            yield memoryview(buf)[0:end]
        # Move the unprocessed tail to the front of the buffer.
        start = bufend - end
        assert start >= 0
        buf[0:start] = buf[end:bufend]

    if start > 0:
        # Whatever remains at end of file is passed on as the last chunk.
        yield memoryview(buf)[0:start]
Exemple #2
0
def read_paired_chunks(
    f: RawIOBase,
    f2: RawIOBase,
    buffer_size: int = 4 * 1024**2,
) -> Iterator[Tuple[memoryview, memoryview]]:
    """
    Read chunks from two FASTQ files in lockstep.

    Each file gets its own buffer of buffer_size bytes. On every iteration
    both buffers are topped up, _paired_fastq_heads decides how much of each
    buffer is handed out, and the remainder is moved to the front of its
    buffer for the next round.

    The yielded memoryviews point into the internal buffers, which are
    re-used, so they are only valid until the next iteration.

    Args:
        f: First input file; must provide a binary readinto()
        f2: Second input file; must provide a binary readinto()
        buffer_size: Size of each of the two internal buffers

    Yields:
        Pairs of memoryview objects, one per input file.

    Raises:
        ValueError: buffer_size is smaller than 1, or the leftover data of
            one file fills its entire buffer.
        FileFormatError: The first byte of a non-empty input is not "@".
    """
    if buffer_size < 1:
        raise ValueError("Buffer size too small")

    r1_buf = bytearray(buffer_size)
    r2_buf = bytearray(buffer_size)

    # Peek at the first byte of each file to verify that it is FASTQ.
    r1_pending = f.readinto(memoryview(r1_buf)[0:1])  # type: ignore
    r2_pending = f2.readinto(memoryview(r2_buf)[0:1])  # type: ignore
    r1_not_fastq = r1_pending == 1 and r1_buf[0:1] != b'@'
    r2_not_fastq = r2_pending == 1 and r2_buf[0:1] != b'@'
    if r1_not_fastq or r2_not_fastq:
        raise FileFormatError(
            "Paired-end data must be in FASTQ format when using multiple cores",
            line=None)

    # r?_pending: number of bytes carried over at the front of the buffer.
    # r?_filled:  number of valid bytes in the buffer after reading.
    while True:
        if r1_pending == len(r1_buf) or r2_pending == len(r2_buf):
            raise ValueError("FASTQ record does not fit into buffer")

        r1_got = f.readinto(memoryview(r1_buf)[r1_pending:])  # type: ignore
        r1_filled = r1_got + r1_pending
        r2_got = f2.readinto(memoryview(r2_buf)[r2_pending:])  # type: ignore
        r2_filled = r2_got + r2_pending

        if r1_filled == r1_pending and r2_filled == r2_pending:
            # Neither file delivered any new data.
            break

        r1_end, r2_end = _paired_fastq_heads(
            r1_buf, r2_buf, r1_filled, r2_filled)
        assert r1_end <= r1_filled
        assert r2_end <= r2_filled

        if r1_end > 0 or r2_end > 0:
            yield (memoryview(r1_buf)[0:r1_end], memoryview(r2_buf)[0:r2_end])

        # Shift the part that was not handed out to the front of each buffer.
        r1_pending = r1_filled - r1_end
        assert r1_pending >= 0
        r1_buf[0:r1_pending] = r1_buf[r1_end:r1_filled]

        r2_pending = r2_filled - r2_end
        assert r2_pending >= 0
        r2_buf[0:r2_pending] = r2_buf[r2_end:r2_filled]

    if r1_pending > 0 or r2_pending > 0:
        # Hand out whatever is left once both inputs are exhausted.
        yield (memoryview(r1_buf)[0:r1_pending],
               memoryview(r2_buf)[0:r2_pending])
Exemple #3
0
def read_intf_packets(fd: io.RawIOBase, inq: MQueue, outq: MQueue):
    """
    Loop forever, filling buffers taken from inq with data read from fd.

    Each round pops one item from inq and reads into its `start` buffer.
    After a successful read the item's `end` is set to the part of `start`
    that follows the bytes just read, and the item is pushed to outq.
    After a read that returns 0 or less, an error is logged and the item is
    pushed back onto inq.

    NOTE(review): the meaning of the boolean passed as second argument to
    push() is not visible here -- confirm against MQueue.

    This function never returns.
    """
    logger.info("read: start reading from interface")
    while True:
        item = inq.pop()
        nread = fd.readinto(item.start)

        if nread <= 0:
            # Nothing usable was read: recycle the buffer.
            logger.error("read: bad read %d on interface, dropping", nread)
            inq.push(item, True)
            continue

        if DEBUG:
            logger.debug("read: %d bytes on interface", nread)
        item.end = item.start[nread:]
        outq.push(item, False)
Exemple #4
0
def cache_segment_data(input_file: io.RawIOBase, segments: List[Any], segment_id: int, base_file_offset: int=0) -> None:
    """
    Read the data of one segment from input_file and cache it on the segment.

    The result is stored in segments[segment_id][SI_CACHED_DATA]. It is a
    memoryview over the bytes read, or None when the segment has no data
    (its data file offset is -1) or when the read came up short.

    input_file: seekable binary file the segment data is read from.
    segments: list of segment entries, indexed by segment id.
    segment_id: index of the segment whose data should be cached.
    base_file_offset: when the input file is located within a containing file.
    """
    data = None
    file_offset = get_segment_data_file_offset(segments, segment_id)
    # No data for segments that have no data..
    if file_offset != -1:
        file_length = get_segment_data_length(segments, segment_id)

        input_file.seek(base_file_offset + file_offset, os.SEEK_SET)
        file_data = bytearray(file_length)
        # Keep the count returned by readinto(): the bytearray always keeps
        # its pre-allocated length, so len(file_data) says nothing about how
        # many bytes were actually read.
        bytes_read = input_file.readinto(file_data)
        if bytes_read == file_length:
            # NOTE(rmtew): Python 2, type(data[0]) is str. Python 3, type(data[0]) is int
            data = memoryview(file_data)
        else:
            # readinto() may return None on a non-blocking raw stream; report
            # that as 0 bytes so the log call itself cannot fail.
            logger.error("Unable to cache segment %d data, got %d bytes, wanted %d", segment_id, bytes_read or 0, file_length)
    segments[segment_id][SI_CACHED_DATA] = data
Exemple #5
0
def read_file(
    f: io.RawIOBase,
    executor: futures.Executor,
    q: _result_queue,
    stop_reading: threading.Event,
):
    """
    Read f chunk by chunk and queue one compute_hash job per chunk.

    For every chunk a new buffer of _CHUNK_SIZE bytes is allocated and
    filled by calling readinto() repeatedly until a call delivers nothing
    more. The future returned by executor.submit() is put on q.

    Reading stops when stop_reading is set or when a chunk stays empty
    (end of file). In every case, including an exception, None is put on q
    last to mark the end of the queue.
    """
    try:
        while not stop_reading.is_set():
            # Allocate a buffer and try very hard to fill it completely.
            chunk = memoryview(bytearray(_CHUNK_SIZE))
            filled: int = 0
            while True:
                # As of 2020-06-01: typeshed wrongly claims we can't
                # `readinto(memoryview)`, so we disable type checking.
                got: int = f.readinto(chunk[filled:]) or 0  # type: ignore
                filled += got
                if got <= 0:
                    break
            if filled == 0:
                # Not a single byte could be read: end of file.
                return
            q.put(executor.submit(compute_hash, chunk[:filled]))
    finally:
        # Signal end of queue to the printer.
        q.put(None)
Exemple #6
0
def read_chunks(f: RawIOBase,
                buffer_size: int = 4 * 1024**2) -> Iterator[memoryview]:
    """
    Read chunks of complete FASTA or FASTQ records from a file.
    If the format is detected to be FASTQ, all chunks except possibly the last contain
    an even number of records such that interleaved paired-end reads remain in sync.
    The yielded memoryview objects are only valid for one iteration because the internal
    buffer is re-used in the next iteration.

    Arguments:
        f: File with FASTA or FASTQ reads; must have been opened in binary mode
        buffer_size: Largest allowed chunk size; must be at least 1

    Yields:
        memoryview representing the chunk. This becomes invalid on the next iteration.

    Raises:
         ValueError: *buffer_size* is smaller than 1.
         OverflowError: A record was encountered that is larger than *buffer_size*.
         UnknownFileFormat: The file format could not be detected
           (the first byte must be "@", ">" or "#")
    """
    # With a zero-sized buffer, the one-byte probe below would read nothing
    # and a non-empty file would be mistaken for an empty one.
    if buffer_size < 1:
        raise ValueError("Buffer size too small")

    # This buffer is re-used in each iteration.
    buf = bytearray(buffer_size)

    # Read one byte to determine file format.
    # If there is a comment char, we assume FASTA!
    start = f.readinto(memoryview(buf)[0:1])
    if start == 0:
        # Empty file
        return
    assert start == 1
    if buf[0:1] == b'@':
        head = _fastq_head
    elif buf[0:1] == b'#' or buf[0:1] == b'>':
        head = _fasta_head
    else:
        raise UnknownFileFormat(
            f"Cannnot determine input file format: First character expected to be '>' or '@', "
            f"but found {repr(chr(buf[0]))}")

    # Layout of buf
    #
    # |-- complete records --|
    # +---+------------------+---------+-------+
    # |   |                  |         |       |
    # +---+------------------+---------+-------+
    # ^   ^                   ^         ^       ^
    # 0   start               end       bufend  len(buf)
    #
    # buf[0:start] is the 'leftover' data that could not be processed
    # in the previous iteration because it contained an incomplete
    # FASTA or FASTQ record.

    while True:
        if start == len(buf):
            # The leftover alone fills the buffer; no room to read more.
            raise OverflowError('FASTA/FASTQ record does not fit into buffer')
        bufend = f.readinto(memoryview(buf)[start:]) + start  # type: ignore
        if start == bufend:
            # End of file
            break
        end = head(buf, bufend)
        assert end <= bufend
        if end > 0:
            yield memoryview(buf)[0:end]
        # Move the unprocessed tail to the front of the buffer.
        start = bufend - end
        assert start >= 0
        buf[0:start] = buf[end:bufend]

    if start > 0:
        # Whatever remains at end of file is passed on as the last chunk.
        yield memoryview(buf)[0:start]
Exemple #7
0
def read_paired_chunks(
    f: RawIOBase,
    f2: RawIOBase,
    buffer_size: int = 4 * 1024**2,
) -> Iterator[Tuple[memoryview, memoryview]]:
    """
    Read chunks of paired-end FASTQ reads from two files.

    Every iteration yields a pair of chunks (memoryview objects), and both
    chunks are guaranteed to contain the same number of sequences, so the
    paired-end reads stay in sync.

    The memoryviews point into internal buffers that are re-used, so they
    are only valid until the next iteration.

    This is the paired-end counterpart of `read_chunks`. Unlike
    `read_chunks`, it only works for FASTQ input.

    Args:
        f: File with R1 reads; must have been opened in binary mode
        f2: File with R2 reads; must have been opened in binary mode
        buffer_size: Largest allowed chunk size

    Yields:
        Pairs of memoryview objects.

    Raises:
         ValueError: *buffer_size* is smaller than 1, or a FASTQ record was
           encountered that is larger than *buffer_size*.
         FileFormatError: The first byte of a non-empty input is not "@".
    """
    if buffer_size < 1:
        raise ValueError("Buffer size too small")

    r1_buf = bytearray(buffer_size)
    r2_buf = bytearray(buffer_size)

    # Peek at the first byte of each file to verify that it is FASTQ.
    r1_pending = f.readinto(memoryview(r1_buf)[0:1])  # type: ignore
    r2_pending = f2.readinto(memoryview(r2_buf)[0:1])  # type: ignore
    r1_not_fastq = r1_pending == 1 and r1_buf[0:1] != b'@'
    r2_not_fastq = r2_pending == 1 and r2_buf[0:1] != b'@'
    if r1_not_fastq or r2_not_fastq:
        raise FileFormatError(
            "Paired-end data must be in FASTQ format when using multiple cores",
            line=None)

    # r?_pending: number of bytes carried over at the front of the buffer.
    # r?_filled:  number of valid bytes in the buffer after reading.
    while True:
        if r1_pending == len(r1_buf) or r2_pending == len(r2_buf):
            raise ValueError("FASTQ record does not fit into buffer")

        r1_got = f.readinto(memoryview(r1_buf)[r1_pending:])  # type: ignore
        r1_filled = r1_got + r1_pending
        r2_got = f2.readinto(memoryview(r2_buf)[r2_pending:])  # type: ignore
        r2_filled = r2_got + r2_pending

        if r1_filled == r1_pending and r2_filled == r2_pending:
            # Neither file delivered any new data.
            break

        r1_end, r2_end = _paired_fastq_heads(
            r1_buf, r2_buf, r1_filled, r2_filled)
        assert r1_end <= r1_filled
        assert r2_end <= r2_filled

        if r1_end > 0 or r2_end > 0:
            yield (memoryview(r1_buf)[0:r1_end], memoryview(r2_buf)[0:r2_end])

        # Shift the part that was not handed out to the front of each buffer.
        r1_pending = r1_filled - r1_end
        assert r1_pending >= 0
        r1_buf[0:r1_pending] = r1_buf[r1_end:r1_filled]

        r2_pending = r2_filled - r2_end
        assert r2_pending >= 0
        r2_buf[0:r2_pending] = r2_buf[r2_end:r2_filled]

    if r1_pending > 0 or r2_pending > 0:
        # Hand out whatever is left once both inputs are exhausted.
        yield (memoryview(r1_buf)[0:r1_pending],
               memoryview(r2_buf)[0:r2_pending])