def test(self):
  if False:
    # to help with debugging:
    # print the first 16 sync points - some _may_ be in the audio data
    bfr = CornuCopyBuffer.from_filename(TESTFILE)
    count = 16
    while not bfr.at_eof() and count > 0:
      bs = b''.join(MP3AudioFrame.scan_for_sync(bfr))
      X("AUDIO at %d after %d bytes", bfr.offset, len(bs))
      bfr.take(1)
      count -= 1
  S = os.stat(TESTFILE)
  mp3_size = S.st_size
  bfr = CornuCopyBuffer.from_filename(TESTFILE)
  for offset, frame, post_offset in MP3Frame.scan_with_offsets(bfr):
    frame_size = post_offset - offset
    frame_bs = bytes(frame)
    ##frame2 = MP3Frame.from_bytes(frame_bs)
    ##self.assertIs(type(frame), type(frame2))
    # There used to be a round trip size check, but we repair
    # some input data and write it out correctly, so the size can
    # change. Example: a UCS-2 text field missing its BOM.
  self.assertEqual(
      bfr.offset, mp3_size,
      "file size = %d, buffer offset = %d" % (mp3_size, bfr.offset))
  self.assertTrue(bfr.at_eof())
  bfr.close()
def __iter__(self):
  _, payload = self.S.do(ArchiveListRequest(self.archive_name))
  bfr = CornuCopyBuffer([payload])
  while not bfr.at_eof():
    when = BSString.parse_value(bfr)
    when = float(when)
    E = BSString.parse_value(bfr)
    E = parse(E)
    if not isinstance(E, _Dirent):
      raise ValueError("not a _Dirent: %r" % (E,))
    yield when, E
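def example_iterate_archive(archive):
  ''' Usage sketch (not part of the original code): walk an archive's
      history via the `__iter__` above. `archive` is assumed to be an
      instance of the containing class; each item is a `(when, E)` pair
      of a float timestamp and a `_Dirent`.
  '''
  for when, E in archive:
    print(when, E)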
def pushto_queue(self, Q, offset=0, runstate=None, progress=None):
  ''' Push the `Block`s from this `DataFile` to the Queue `Q`.

      Note that if the target store is a DataDirStore it is faster
      and simpler to move/copy the `.vtd` file into its `data`
      subdirectory directly. Of course, that may introduce redundant
      block copies.

      Parameters:
      * `Q`: queue on which to put blocks
      * `offset`: starting offset, default `0`.
      * `runstate`: optional `RunState` used to cancel the operation.
      * `progress`: optional progress tracker, updated as data are queued.
  '''
  if progress:
    progress.total += len(self) - offset
  with open(self.pathname, 'rb') as f:
    f.seek(offset)
    bfr = CornuCopyBuffer(datafrom(f, offset), offset=offset)
    for DR in DataRecord.parse_buffer(bfr):
      if runstate and runstate.cancelled:
        return False
      data = DR.data
      Q.put(Block(data=data))
      if progress:
        progress += len(data)
  return True
def upload_bytes(
    self,
    bs,
    *,
    bucket_name: str,
    path: str,
    file_info=None,
    content_type=None,
    upload_progress=None,
):
  ''' Upload bytes from `bs` to `path` within `bucket_name`.
      The default implementation calls `self.upload_buffer()`.

      Parameters:
      * `bs`: the source `bytes`-like object
      * `bucket_name`: the bucket name
      * `path`: the subpath within the bucket
      * `file_info`: an optional mapping of extra information about the file
      * `content_type`: an optional MIME content type value
      * `upload_progress`: an optional `cs.progress.Progress` instance
        to which to report upload data
  '''
  return self.upload_buffer(
      CornuCopyBuffer([bs]),
      bucket_name=bucket_name,
      path=path,
      file_info=file_info,
      content_type=content_type,
      upload_progress=upload_progress,
  )
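def example_upload_bytes(store):
  ''' Usage sketch (not part of the original code): push a small bytes
      object through `upload_bytes` above. `store` is assumed to be an
      instance of the containing uploader class; the bucket and path are
      made up for illustration.
  '''
  return store.upload_bytes(
      b'hello world',
      bucket_name='example-bucket',
      path='greetings/hello.txt',
  )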
def test_shuffled_randomblocks(self):
  ''' Save RUN_SIZE random blocks, close, retrieve in random order.
  '''
  # save random blocks to a file
  blocks = {}
  with open(self.pathname, 'wb') as f:
    for n in range(RUN_SIZE):
      with self.subTest(put_block_n=n):
        data = make_randblock(rand0(MAX_BLOCK_SIZE + 1))
        dr = DataRecord(data)
        offset = f.tell()
        blocks[offset] = data
        f.write(bytes(dr))
  # shuffle the block offsets
  offsets = list(blocks.keys())
  random.shuffle(offsets)
  # retrieve the blocks in random order, check for correct content
  with open(self.pathname, 'rb') as f:
    for n, offset in enumerate(offsets):
      with self.subTest(shuffled_offsets_n=n, offset=offset):
        f.seek(offset)
        bfr = CornuCopyBuffer.from_file(f)
        dr = DataRecord.parse(bfr)
        data = dr.data
        self.assertEqual(data, blocks[offset])
def upload_file(
    self,
    f,
    *,
    bucket_name: str,
    path: str,
    file_info=None,
    content_type=None,
    upload_progress=None,
):
  ''' Upload the data from the file `f` to `path` within `bucket_name`.
      Return a `dict` containing the upload result.
      The default implementation calls `self.upload_buffer()`.

      Parameters:
      * `f`: the file
      * `bucket_name`: the bucket name
      * `path`: the subpath within the bucket
      * `file_info`: an optional mapping of extra information about the file
      * `content_type`: an optional MIME content type value
      * `upload_progress`: an optional `cs.progress.Progress` instance
        to which to report upload data
  '''
  return self.upload_buffer(
      CornuCopyBuffer.from_file(f),
      bucket_name=bucket_name,
      path=path,
      file_info=file_info,
      content_type=content_type,
      upload_progress=upload_progress,
  )
def hashcodes(
    self,
    start_hashcode=None,
    after: bool = False,
    length: Optional[int] = None,
):
  ''' Fetch hashcodes from the Store via a `HashCodesRequest`,
      returning a list of hashcodes of type `self.hashclass`.
  '''
  hashclass = self.hashclass
  if length is not None and length < 1:
    raise ValueError("length should be None or >=1, got: %r" % (length,))
  if after and start_hashcode is None:
    raise ValueError(
        "after=%s but start_hashcode=%s" % (after, start_hashcode))
  flags, payload = self.do(
      HashCodesRequest(
          start_hashcode=start_hashcode,
          hashclass=hashclass,
          after=after,
          length=length))
  if flags:
    raise StoreError("unexpected flags: 0x%02x" % (flags,))
  bfr = CornuCopyBuffer([payload])
  hashary = list(HashCodeField.scan_values(bfr))
  # verify hashcode types
  mismatches = set(
      type(hashcode).__name__
      for hashcode in hashary
      if not isinstance(hashcode, hashclass))
  if mismatches:
    raise StoreError(
        "expected hashcodes of type %s, got %d mismatches of type %s" %
        (hashclass.__name__, len(mismatches), sorted(mismatches)))
  return hashary
def download_buffer(
    self,
    *,
    bucket_name: str,
    path: str,
    download_progress=None,  # pylint: disable=unused-argument
) -> (CornuCopyBuffer, dict):
  ''' Download from `path` within `bucket_name`,
      returning `(buffer,file_info)`
      being a `CornuCopyBuffer` presenting the data bytes
      and the file info uploaded with the file.

      Parameters:
      * `bucket_name`: the bucket name
      * `path`: the subpath within the bucket
      * `download_progress`: an optional `cs.progress.Progress` instance
        to which to report download data
  '''
  filename = os.sep + joinpath(bucket_name, path)
  with Pfx("open(%r)", filename):
    with open(filename, 'rb') as f:
      bfr = CornuCopyBuffer.from_fd(f.fileno(), progress=download_progress)
  with FSTags() as fstags:
    file_info = fstags[filename].as_dict()
  return bfr, file_info
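def example_download(backend):
  ''' Usage sketch (not part of the original code): fetch an object via
      `download_buffer` above and drain the returned buffer. `backend` is
      assumed to be an instance of the containing class; joining a
      `CornuCopyBuffer` yields the data as a single `bytes`, as done
      elsewhere in this code.
  '''
  bfr, file_info = backend.download_buffer(
      bucket_name='example-bucket', path='a/b.dat')
  data = b''.join(bfr)
  return data, file_info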
def from_pathname(cls, pathname, readsize=None, **kw):
  ''' Compute hashcode from the contents of the file `pathname`.
  '''
  if readsize is None:
    readsize = DEFAULT_READSIZE
  return cls.from_buffer(
      CornuCopyBuffer.from_filename(pathname, readsize=readsize, **kw)
  )
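def example_hash_file(hashclass, pathname):
  ''' Usage sketch (not part of the original code): compute a file's
      hashcode via `from_pathname` above. `hashclass` stands in for a
      concrete hashcode class providing `from_buffer`/`from_pathname`;
      no particular class name or read size is assumed here.
  '''
  return hashclass.from_pathname(pathname, readsize=1024 * 1024)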
def prev_dirent(self):
  ''' Return the previous Dirent.

      If this is not `None` then during encoding or transcription,
      when `self != prev_dirent`,
      the previous Dirent is included in the encoding or transcription.

      TODO: parse out multiple blockrefs.
  '''
  prev_blockref = self._prev_dirent_blockref
  if prev_blockref is None:
    return None
  bfr = CornuCopyBuffer(prev_blockref)
  E = _Dirent.from_buffer(bfr)
  if not bfr.at_eof():
    warning(
        "prev_dirent: _prev_dirent_blockref=%s:"
        " unparsed bytes after dirent at offset %d",
        prev_blockref, bfr.offset)
  return E
def parse(cls, bfr):
  ''' Parse a tag data frame from `bfr`.
  '''
  self = cls()  # pylint: disable=attribute-defined-outside-init
  self.tag_id = bfr.take(4)
  with Pfx(self.tag_id):
    size = UInt32BE.parse_value(bfr)
    self.flags = UInt16BE.parse_value(bfr)
    if size < 1:
      warning("size < 1")
    else:
      data_bs = bfr.take(size)
      data_type = self.tag_id_class(self.tag_id)
      if data_type is None:
        self.dataframe_body = data_bs
      else:
        databfr = CornuCopyBuffer([data_bs])
        self.dataframe_body = data_type.parse(databfr)
        if not databfr.at_eof():
          warning("unparsed data: %r" % (databfr.take(...),))
  return self
def _test_chunks(data_spec):
  ''' Return an iterable of chunks from a data spec
      (a filename or a list of `bytes`).
  '''
  # obtain the test data
  if data_spec is None:
    chunks = None
  elif isinstance(data_spec, str):
    chunks = CornuCopyBuffer.from_filename(data_spec)
  elif isinstance(data_spec, (list, tuple)):
    chunks = data_spec
  else:
    raise RuntimeError(
        "unexpected data_spec of type %s" % (type(data_spec),))
  return chunks
def upload_file(
    self,
    f,
    *,
    bucket_name: str,
    path: str,
    file_info=None,
    content_type=None,
    upload_progress=None,
):
  ''' Upload the data from the file `f` to `path` within `bucket_name`.
      Return a `dict` containing the B2 `FileVersion` attribute values.

      Note that the b2api expects to be able to seek when given a file,
      so this tries to `mmap.mmap` the file and use the bytes upload
      interface, falling back to copying to a scratch file.

      Parameters:
      * `f`: the file, preferably seekable
      * `bucket_name`: the bucket name
      * `path`: the subpath within the bucket
      * `file_info`: an optional mapping of extra information about the file
      * `content_type`: an optional MIME content type value
      * `upload_progress`: an optional `cs.progress.Progress` instance
        to which to report upload data
  '''
  try:
    fd = f.fileno()
    mm = mmap(fd, 0, prot=PROT_READ)
  except (AttributeError, OSError) as e:
    # no .fileno, not mmapable
    warning("f=%s: %s", f, e)
    # upload via a scratch file
    bfr = f if isinstance(f, CornuCopyBuffer) else CornuCopyBuffer.from_file(f)
    return self.upload_buffer(
        bfr,
        bucket_name=bucket_name,
        path=path,
        file_info=file_info,
        content_type=content_type,
        upload_progress=upload_progress,
    )
  else:
    file_version = self._b2_upload_bytes(
        mm,
        bucket_name=bucket_name,
        path=path,
        upload_progress=upload_progress,
    )
    return file_version.as_dict()
def xattrs_from_bytes(bs, offset=0):
  ''' Decode an XAttrs from some bytes, return the xattrs dictionary.
  '''
  bfr = CornuCopyBuffer.from_bytes(bs)
  if offset > 0:
    bfr.skip(offset)
  xattrs = {}
  while not bfr.at_eof():
    name = BSString.parse_value(bfr)
    data = BSData.parse_value(bfr)
    if name in xattrs:
      warning("repeated name, ignored: %r", name)
    else:
      xattrs[name] = data
  return xattrs
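def example_decode_xattrs(xattr_bs):
  ''' Usage sketch (not part of the original code): decode a serialised
      xattr blob with `xattrs_from_bytes` above. `xattr_bs` is assumed to
      be bytes previously written as alternating `BSString` name /
      `BSData` value records; the encoding side is not shown here.
  '''
  for name, data in xattrs_from_bytes(xattr_bs).items():
    print(name, len(data))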
def parse(cls, bfr):
  ''' Parse a tag data frame from `bfr`.
  '''
  self = cls()  # pylint: disable=attribute-defined-outside-init
  self.tag_id = bfr.take(3)
  with Pfx(self.tag_id):
    sz0, sz1, sz2 = bfr.take(3)
    size = sz0 << 16 | sz1 << 8 | sz2
    if size < 1:
      warning("size < 1")
    else:
      data_bs = bfr.take(size)
      if not data_bs or data_bs[0] == 0:
        # forbidden empty data or data zeroed out
        data_type = None
      else:
        data_type = self.tag_id_class(self.tag_id)
      if data_type is None:
        self.value = data_bs
      else:
        databfr = CornuCopyBuffer([data_bs])
        self.value = data_type.parse(databfr)
        if not databfr.at_eof():
          warning("unparsed data: %r" % (databfr.take(...),))
  return self
def last(self):
  ''' The last Archive entry `(when,E)`,
      or `ArchiveEntry(None,None)` if there is none.
  '''
  with Pfx("%s.last", self):
    try:
      flags, payload = self.S.do(ArchiveLastRequest(self.archive_name))
    except StoreError as e:
      warning("%s, returning (None, None)", e)
      return ArchiveEntry(None, None)
    found = flags & 0x01
    if not found:
      return ArchiveEntry(None, None)
    bfr = CornuCopyBuffer.from_bytes(payload)
    entry = ArchiveEntry.from_buffer(bfr)
    return entry
def run_parser():
  ''' Thread body to run the supplied scanner against the input data.
  '''
  bfr = CornuCopyBuffer(chunk_iter)
  # pylint: disable=broad-except
  try:
    for offset in scanner(bfr):
      # the scanner should yield only offsets, not chunks and offsets
      if not isinstance(offset, int):
        warning("discarding non-int from scanner %s: %s", scanner, offset)
      else:
        parseQ.put(offset)
  except Exception as e:
    exception("exception from scanner %s: %s", scanner, e)
  # Consume the remainder of chunk_iter; the tee() will copy it to parseQ.
  for _ in chunk_iter:
    pass
  # end of offsets and chunks
  parseQ.close()
def parse(cls, bfr):
  ''' Parse an ID3v2 frame from `bfr`.
  '''
  self = cls()  # pylint: disable=attribute-defined-outside-init
  if bfr.peek(3, short_ok=True) != b'ID3':
    raise ValueError("expected b'ID3'")
  bfr.take(3)
  # the 2.0 part of ID3.2.0
  self.v1, self.v2 = bfr.take(2)
  self.flags = bfr.byte0()
  size = ID3V2Size.parse_value(bfr)
  data_bs = bfr.take(size)
  data_bfr = CornuCopyBuffer([data_bs])
  dataframe_class = {
      2: ID3V22TagDataFrame,
      3: ID3V23TagDataFrame,
  }[self.v1]
  self.tag_frames = list(dataframe_class.scan(data_bfr))
  return self
def parse(cls, bfr):
  ''' Parse a packet from a buffer.
  '''
  raw_payload = BSData.parse_value(bfr)
  payload_bfr = CornuCopyBuffer([raw_payload])
  self = cls()  # pylint: disable=attribute-defined-outside-init
  self.tag = BSUInt.parse_value(payload_bfr)
  flags = BSUInt.parse_value(payload_bfr)
  has_channel = (flags & 0x01) != 0
  self.is_request = (flags & 0x02) != 0
  flags >>= 2
  self.flags = flags
  if has_channel:
    self.channel = BSUInt.parse_value(payload_bfr)
  else:
    self.channel = 0
  if self.is_request:
    self.rq_type = BSUInt.parse_value(payload_bfr)
  self.payload = b''.join(payload_bfr)
  return self
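# Wire format read by parse() above, for reference:
#   BSData   raw_payload   length-prefixed packet body, containing:
#     BSUInt tag
#     BSUInt flags         bit 0: has_channel, bit 1: is_request;
#                          remaining bits kept as self.flags
#     BSUInt channel       only present if has_channel
#     BSUInt rq_type       only present if is_request
#     bytes  payload       the rest of the body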
def selftest():
  ''' Run some self tests.
  '''
  # pylint: disable=import-outside-toplevel
  from cs.buffer import CornuCopyBuffer
  for n in (0, 1, 2, 3, 16, 17, 127, 128, 129, 32767, 32768, 32769, 65535,
            65536, 65537):
    bs = transcribe_length_encoded_value(n)
    bfr = CornuCopyBuffer.from_bytes(bs)
    n2 = get_length_encoded_value(bfr)
    assert n == n2, "n:%s != n2:%s" % (n, n2)
    assert bfr.offset == len(bs), (
        "bfr.offset:%s != len(bs):%s" % (bfr.offset, len(bs)))
    assert bfr.at_eof(), "bfr not at EOF"
    ds, offset = DataSize.from_bytes(bs)
    assert ds.value == n
    assert offset == len(bs)
    bs2 = bytes(ds)
    assert bs == bs2
    ds2 = DataSize(n)
    bs3 = bytes(ds2)
    assert bs == bs3
def cmd_tags(argv):
  ''' Usage: {cmd} mp3filenames...
        Print the tags from the named files.
  '''
  xit = 0
  first_print = True
  for mp3path in argv:
    with Pfx(mp3path):
      try:
        bfr = CornuCopyBuffer.from_filename(mp3path)
      except Exception as e:  # pylint: disable=broad-except
        error(e)
        xit = 1
        continue
      tags = tags_of(bfr)
      if not first_print:
        print()
      first_print = False
      print(mp3path)
      for tag in tags:
        print(' ', tag)
  return xit
def test01scanners(self):
  ''' Test some domain specific data parsers.
  '''
  for parser in PARSERS:
    with self.subTest(parser.__name__):
      f = None
      testfilename = scanner_testfile(parser)
      if testfilename is None:
        input_chunks = self.random_data
      else:
        self.assertIsNotNone(testfilename)
        f = open(testfilename, 'rb')
        input_chunks = read_from(f)
      last_offset = 0
      for offset in parser(CornuCopyBuffer(input_chunks)):
        self.assertTrue(
            last_offset <= offset,
            "offset %d < last_offset %d" % (offset, last_offset))
        last_offset = offset
      if f is not None:
        f.close()
        f = None
def bufferfrom(self):
  ''' Return a CornuCopyBuffer presenting data from the file.
  '''
  return CornuCopyBuffer(self.datafrom())
def parse_value(bfr):
  ''' Decode a Block reference from a buffer.

      Format is a `BSData` holding this encoded data:

          BS(flags)
            0x01 indirect blockref
            0x02 typed: type follows, otherwise BT_HASHCODE
            0x04 type flags: per type flags follow type
          BS(span)
          [BS(type)]
          [BS(type_flags)]
          union {
            type BT_HASHCODE: hash
            type BT_RLE: octet-value (repeat span times to get data)
            type BT_LITERAL: raw-data (span bytes)
            type BT_SUBBLOCK: suboffset, super block
          }

      Even though this is all decodable without the leading length,
      we use a leading length so that future encodings do not prevent
      parsing any following data.
  '''
  raw_encoding = BSData.parse_value(bfr)
  blockref_bfr = CornuCopyBuffer.from_bytes(raw_encoding)
  flags = BSUInt.parse_value(blockref_bfr)
  is_indirect = bool(flags & F_BLOCK_INDIRECT)
  is_typed = bool(flags & F_BLOCK_TYPED)
  has_type_flags = bool(flags & F_BLOCK_TYPE_FLAGS)
  unknown_flags = flags & ~(
      F_BLOCK_INDIRECT | F_BLOCK_TYPED | F_BLOCK_TYPE_FLAGS)
  if unknown_flags:
    raise ValueError(
        "unexpected flags value (0x%02x) with unsupported flags=0x%02x" %
        (flags, unknown_flags))
  span = BSUInt.parse_value(blockref_bfr)
  if is_indirect:
    # With indirect blocks, the span is of the implied data, not
    # the referenced block's data. Therefore we build the referenced
    # block with a span of None and store the span in the indirect
    # block.
    ispan = span
    span = None
  # block type, default BT_HASHCODE
  if is_typed:
    block_type = BlockType(BSUInt.parse_value(blockref_bfr))
  else:
    block_type = BlockType.BT_HASHCODE
  if has_type_flags:
    type_flags = BSUInt.parse_value(blockref_bfr)
    if type_flags:
      warning("nonzero type_flags: 0x%02x", type_flags)
  else:
    type_flags = 0x00
  # instantiate type specific block ref
  if block_type == BlockType.BT_HASHCODE:
    hashcode = HashCode.from_buffer(blockref_bfr)
    B = HashCodeBlock(hashcode=hashcode, span=span)
  elif block_type == BlockType.BT_RLE:
    octet = blockref_bfr.take(1)
    B = RLEBlock(span, octet)
  elif block_type == BlockType.BT_LITERAL:
    data = blockref_bfr.take(span)
    B = LiteralBlock(data)
  elif block_type == BlockType.BT_SUBBLOCK:
    suboffset = BSUInt.parse_value(blockref_bfr)
    superB = BlockRecord.parse_value(blockref_bfr)
    # wrap inner Block in subspan
    B = SubBlock(superB, suboffset, span)
  else:
    raise ValueError("unsupported Block type 0x%02x" % (block_type,))
  if is_indirect:
    B = IndirectBlock(B, span=ispan)
  if not blockref_bfr.at_eof():
    warning(
        "unparsed data (%d bytes) follow Block %s",
        len(raw_encoding) - blockref_bfr.offset, B)
  assert isinstance(B, _Block)
  return B
def bufferfrom(self, offset=0, **kw):
  ''' Return a CornuCopyBuffer presenting data from the Block.
  '''
  return CornuCopyBuffer(self.datafrom(start=offset, **kw), offset=offset)
def __init__(self):
  self.Q = IterableQueue(1024)
  self.bfr = CornuCopyBuffer(self.Q)
def scanfrom(filepath, offset=0):
  ''' Scan the specified `filepath` from `offset`,
      yielding `DataRecord`s.
  '''
  bfr = CornuCopyBuffer.from_filename(filepath, offset=offset)
  yield from DataRecord.scan_with_offsets(bfr)
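def example_scan_datafile(filepath):
  ''' Usage sketch (not part of the original code): walk the records in a
      `.vtd` data file via `scanfrom` above. This assumes
      `DataRecord.scan_with_offsets` yields `(offset, record, post_offset)`
      triples in the same style as `MP3Frame.scan_with_offsets` earlier in
      this code.
  '''
  for offset, record, post_offset in scanfrom(filepath):
    print(offset, post_offset - offset, len(record.data))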
def file_fromchunks(self, name, chunks):
  ''' Create a new file named `name` from the data in `chunks`.
  '''
  return self.file_frombuffer(name, CornuCopyBuffer(chunks))
def __init__(
    self,
    recv,
    send,
    request_handler=None,
    name=None,
    packet_grace=None,
    tick=None,
):
  ''' Initialise the PacketConnection.

      Parameters:
      * `recv`: inbound binary stream.
        If this is an `int` it is taken to be an OS file descriptor,
        otherwise it should be a `cs.buffer.CornuCopyBuffer`
        or a file like object with a `read1` or `read` method.
      * `send`: outbound binary stream.
        If this is an `int` it is taken to be an OS file descriptor,
        otherwise it should be a file like object with `.write(bytes)`
        and `.flush()` methods.
        For a file descriptor, sending is done via an `os.dup()` of the
        supplied descriptor, so the caller remains responsible for
        closing the original descriptor.
      * `packet_grace`: default pause in the packet sending worker
        to allow another packet to be queued before flushing the
        output stream.
        Default: `DEFAULT_PACKET_GRACE`s.
        A value of `0` will flush immediately if the queue is empty.
      * `request_handler`: an optional callable accepting
        `(rq_type, flags, payload)`.
        The request_handler may return one of 5 values on success:
        * `None`: response will be 0 flags and an empty payload.
        * `int`: flags only. Response will be the flags and an empty payload.
        * `bytes`: payload only. Response will be 0 flags and the payload.
        * `str`: payload only. Response will be 0 flags and the str
          encoded as bytes using UTF-8.
        * `(int, bytes)`: specify flags and payload for the response.
        An unsuccessful request should raise an exception,
        which will cause a failure response packet.
      * `tick`: optional tick parameter, default `None`.
        If `None`, do nothing.
        If a Boolean, call `tick_fd_2` if true, otherwise do nothing.
        Otherwise `tick` should be a callable accepting a byteslike value.
  '''
  if name is None:
    name = str(seq())
  self.name = name
  if isinstance(recv, int):
    self._recv = CornuCopyBuffer.from_fd(recv)
  elif isinstance(recv, CornuCopyBuffer):
    self._recv = recv
  else:
    self._recv = CornuCopyBuffer.from_file(recv)
  if isinstance(send, int):
    self._send = os.fdopen(os.dup(send), 'wb')
  else:
    self._send = send
  if packet_grace is None:
    packet_grace = DEFAULT_PACKET_GRACE
  if tick is None:
    tick = lambda bs: None
  elif isinstance(tick, bool):
    if tick:
      tick = tick_fd_2
    else:
      tick = lambda bs: None
  self.packet_grace = packet_grace
  self.request_handler = request_handler
  self.tick = tick
  # tags of requests in play against the local system
  self._channel_request_tags = {0: set()}
  self.notify_recv_eof = set()
  self.notify_send_eof = set()
  # LateFunctions for the requests we are performing for the remote system
  self._running = set()
  # requests we have outstanding against the remote system
  self._pending = {0: {}}
  # sequence of tag numbers
  # TODO: later, reuse old tags to prevent monotonic growth of tag field
  self._tag_seq = Seq(1)
  # work queue for local requests
  self._later = Later(4, name="%s:Later" % (self,))
  self._later.open()
  # dispatch queue of Packets to send
  self._sendQ = IterableQueue(16)
  self._lock = Lock()
  self.closed = False
  # debugging: check for reuse of (channel,tag) etc
  self.__sent = set()
  self.__send_queued = set()
  # dispatch Thread to process received packets
  self._recv_thread = bg_thread(
      self._receive_loop, name="%s[_receive_loop]" % (self.name,))
  # dispatch Thread to send data
  # primary purpose is to bundle output by deferring flushes
  self._send_thread = bg_thread(
      self._send_loop, name="%s[_send]" % (self.name,))
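def example_packet_connection(rd_fd, wr_fd):
  ''' Construction sketch (not part of the original code): build a
      `PacketConnection` over a pair of file descriptors, typically the
      read and write ends of a connection to a peer process. The echo
      handler returns `(flags, payload)`, one of the response forms listed
      in the docstring above. How requests are submitted is not shown here.
  '''

  def echo_handler(rq_type, flags, payload):
    return 0, payload

  return PacketConnection(
      rd_fd, wr_fd, request_handler=echo_handler, name="example")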
def blocked_chunks_of2(
    chunks,
    *,
    scanner=None,
    min_block=None,
    max_block=None,
):
  ''' Generator which connects to a scanner of a chunk stream in order
      to emit low level edge aligned data chunks.

      Parameters:
      * `chunks`: a source iterable of data chunks, handed to `scanner`
      * `scanner`: optional callable accepting a `CornuCopyBuffer` and
        returning an iterable of `int`s, such as a generator. `scanner`
        may be `None`, in which case only the rolling hash is used
        to locate boundaries.
      * `min_block`: the smallest amount of data that will be used
        to create a Block, default from `MIN_BLOCKSIZE` (`{MIN_BLOCKSIZE}`)
      * `max_block`: the largest amount of data that will be used to
        create a Block, default from `MAX_BLOCKSIZE` (`{MAX_BLOCKSIZE}`)

      The iterable returned from `scanner(chunks)` yields `int`s which are
      considered desirable block boundaries.
  '''
  if min_block is None:
    min_block = MIN_BLOCKSIZE
  elif min_block < 8:
    raise ValueError("rejecting min_block < 8: %s" % (min_block,))
  if max_block is None:
    max_block = MAX_BLOCKSIZE
  elif max_block >= 1024 * 1024:
    raise ValueError("rejecting max_block >= 1024*1024: %s" % (max_block,))
  if min_block >= max_block:
    raise ValueError(
        "rejecting min_block:%d >= max_block:%d" % (min_block, max_block))
  # source data for aligned chunk construction
  dataQ = IterableQueue()
  # queue of offsets from the parser
  offsetQ = IterableQueue()
  # copy chunks to the parser and also to the post-parser chunk assembler
  tee_chunks = tee(chunks, dataQ)
  parse_bfr = CornuCopyBuffer(tee_chunks)
  runstate = defaults.runstate

  def run_parser(runstate, bfr, min_block, max_block, offsetQ):
    ''' Thread body to scan the chunk stream for boundary offsets.

        The source chunks are teed to `dataQ`; this thread puts the
        boundary offsets onto `offsetQ`.
        If there is a scanner we scan the input data with it first.
        When it terminates (including from some exception),
        we scan the remaining data with `scan()`.

        The main function processes `offsetQ` and `dataQ` to assemble
        aligned chunks of data.
    '''
    try:
      offset = 0
      if scanner:
        # Consume the chunks and offsets via a queue.
        # The scanner puts offsets onto the queue.
        # When the scanner fetches from the chunks, those chunks are
        # copied to the queue.
        # Accordingly, chunks _should_ arrive before offsets within them.
        # pylint: disable=broad-except
        try:
          for offset in scanner(bfr):
            if runstate.cancelled:
              break
            # the scanner should yield only offsets, not chunks and offsets
            if not isinstance(offset, int):
              warning(
                  "discarding non-int from scanner %s: %s", scanner, offset)
            else:
              offsetQ.put(offset)
        except Exception as e:
          warning("exception from scanner %s: %s", scanner, e)
      # Consume the remainder of the chunks; the tee() will copy them
      # to dataQ. This is important to ensure that no chunk is missed.
      # We run these chunks through scan() to find offsets.
      cso = bfr.offset  # offset after all the chunks so far
      assert offset <= cso
      sofar = cso - offset
      if sofar >= max_block:
        offsetQ.put(cso)
        sofar = 0
      for offset in scan(bfr, sofar=sofar, min_block=min_block,
                         max_block=max_block):
        if runstate.cancelled:
          break
        offsetQ.put(cso + offset)
    finally:
      # end of offsets and chunks
      offsetQ.close()
      dataQ.close()

  # dispatch the parser
  bg_thread(
      run_parser,
      args=(runstate, parse_bfr, min_block, max_block, offsetQ),
      daemon=True)
  # data source for assembling aligned chunks
  data_bfr = CornuCopyBuffer(dataQ)
  sofar = 0
  offset = None
  for offset in offsetQ:
    assert offset >= sofar
    block_size = offset - sofar
    assert block_size >= 0, (
        "block_size:%d < 0 -- sofar=%d, offset=%d" %
        (block_size, sofar, offset))
    if block_size < min_block:
      # skip over small edges
      assert scanner is not None, (
          "scanner=None but still got an overly near offset"
          " (sofar=%d, offset=%d => block_size=%d < min_block:%d)" %
          (sofar, offset, block_size, min_block))
      continue
    subchunks = data_bfr.takev(block_size)
    assert sum(map(len, subchunks)) == block_size
    if block_size > max_block:
      # break up overly long blocks without a parser
      assert scanner is not None, (
          "scanner=None but still got an overly distant offset"
          " (sofar=%d, offset=%d => block_size=%d > max_block:%d)" %
          (sofar, offset, block_size, max_block))
      yield from blocked_chunks_of2(
          subchunks, min_block=min_block, max_block=max_block)
    else:
      yield b''.join(subchunks)
    sofar += block_size
  bs = b''.join(data_bfr)
  if bs:
    assert len(bs) <= max_block
    yield bs
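def example_blocked_chunks(chunks):
  ''' Usage sketch (not part of the original code): carve a chunk stream
      into edge aligned blocks using only the rolling hash
      (`scanner=None`). The block size bounds are made up for illustration,
      and this assumes the module's `defaults.runstate` is available in the
      calling context, as `blocked_chunks_of2` consults it.
  '''
  for block in blocked_chunks_of2(chunks, min_block=512, max_block=16384):
    print(len(block))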