def test_size_in_bytes(self):
    """Check pass-through, parsing and rejection in size_in_bytes()."""
    # None and plain numbers come back as they went in
    self.assertIsNone(size_in_bytes(None))
    self.assertEqual(1234, size_in_bytes(1234))

    # strings are parsed, with or without a unit suffix
    parsed_cases = (
        (1234, '1234'),
        (4096, '4 kb'),
        (0x4000, '16 KB'),
        (0x300000, '3 MB'),
        (0x80000000, '2gB'),
    )
    for expected, text in parsed_cases:
        self.assertEqual(expected, size_in_bytes(text))

    # strings that cannot be parsed are rejected
    for bad_text in ('2 TB', 'KB 2'):
        with self.assertRaises(ValueError):
            size_in_bytes(bad_text)
def read_undefined_length_value(fp, is_little_endian, delimiter_tag,
                                defer_size=None, read_size=1024 * 8):
    """Read until the delimiter tag is found and return the value.

    The delimiter itself is not part of the returned value. On completion,
    the file is positioned at the first byte after the delimiter and the
    four bytes that follow it.

    Parameters
    ----------
    fp : a file-like object
        Must support ``read()``, ``seek()`` and ``tell()``.
    is_little_endian : boolean
        True if file transfer syntax is little endian, else False.
    delimiter_tag : BaseTag
        Tag used as end marker for reading; its ``group`` and ``elem``
        attributes are packed into the 4-byte pattern searched for.
    defer_size : int, None, optional
        Size to avoid loading large elements in memory. See
        ``filereader.dcmread`` for more parameter info.
    read_size : int
        Number of bytes to read at one time. Must be greater than 3,
        because each unsuccessful pass rewinds 3 bytes before the next
        read.

    Returns
    -------
    bytes or None
        The bytes found before the delimiter, or None if `defer_size` is
        given and at least that many bytes precede the delimiter.

    Raises
    ------
    EOFError
        If EOF is reached before delimiter found. The file is repositioned
        to where reading started before the exception is raised.
    ValueError
        If `read_size` is too small for the search to make progress.
    """
    # A 4-byte delimiter can straddle two chunks by at most 3 bytes
    search_rewind = 3
    if read_size <= search_rewind:
        # each pass would rewind as far as (or further than) it had read
        raise ValueError(
            "read_size must be greater than {0}".format(search_rewind))

    data_start = fp.tell()

    if is_little_endian:
        bytes_format = b"<HH"
    else:
        bytes_format = b">HH"
    bytes_to_find = pack(bytes_format, delimiter_tag.group,
                         delimiter_tag.elem)

    found = False
    eof = False
    value_chunks = []
    defer_size = size_in_bytes(defer_size)
    byte_count = 0  # for defer_size checks
    while not found:
        chunk_start = fp.tell()
        bytes_read = fp.read(read_size)
        if len(bytes_read) < read_size:
            # try again - if still don't get required amount,
            # this is the last block
            new_bytes = fp.read(read_size - len(bytes_read))
            bytes_read += new_bytes
            if len(bytes_read) < read_size:
                eof = True  # but will still check whatever we did get
        index = bytes_read.find(bytes_to_find)
        if index != -1:
            found = True
            new_bytes = bytes_read[:index]
            byte_count += len(new_bytes)
            if defer_size is None or byte_count < defer_size:
                value_chunks.append(new_bytes)
            fp.seek(chunk_start + index + 4)  # rewind to end of delimiter
            length = fp.read(4)
            if length != b"\0\0\0\0":
                # only reported, reading continues regardless
                msg = ("Expected 4 zero bytes after undefined length delimiter"
                       " at pos {0:04x}")
                logger.error(msg.format(fp.tell() - 4))
        elif eof:
            fp.seek(data_start)
            raise EOFError(
                "End of file reached before delimiter {0!r} found".format(
                    delimiter_tag))
        else:
            # rewind a bit in case delimiter crossed read_size boundary
            fp.seek(fp.tell() - search_rewind)
            # accumulate the bytes read (not including the rewind)
            new_bytes = bytes_read[:-search_rewind]
            byte_count += len(new_bytes)
            if defer_size is None or byte_count < defer_size:
                value_chunks.append(new_bytes)
    # if get here then have found the byte string
    if defer_size is not None and byte_count >= defer_size:
        return None
    else:
        return b"".join(value_chunks)
def data_element_generator(fp, is_implicit_VR, is_little_endian,
                           stop_when=None, defer_size=None):
    """Yield the raw data elements read from `fp`, one per iteration.

    Parameters
    ----------
    fp : file-like
        Must support ``read()``, ``seek()`` and ``tell()``.
    is_implicit_VR : bool
        True if the data is encoded as implicit VR.
    is_little_endian : bool
        True if the data is encoded as little endian.
    stop_when : None or callable, optional
        Called as ``stop_when(group, elem)`` once an element header has
        been read. If it returns a true value the file is rewound to the
        start of that element and the generator ends.
    defer_size : int, str or None, optional
        Passed through ``size_in_bytes()``; values of defined length that
        are longer than the result are skipped and yielded as None.

    Yields
    ------
    tuple
        ``((group, elem), VR, length, value, value_tell)``. `VR` is the
        two VR bytes for explicit VR data and None for implicit VR data
        of defined length. `value` is None for deferred values and for
        undefined length sequences.

    Raises
    ------
    NotImplementedError
        If an element with undefined length is not identified as SQ.
    """
    if is_little_endian:
        endian_chr = "<"
    else:
        endian_chr = ">"
    if is_implicit_VR:
        element_struct = Struct(endian_chr + "HHL")
    else:  # Explicit VR
        # tag, VR, 2-byte length (or 0 if special VRs)
        element_struct = Struct(endian_chr + "HH2sH")
        extra_length_struct = Struct(endian_chr + "L")  # for special VRs
        extra_length_unpack = extra_length_struct.unpack  # for lookup speed

    # Make local variables so have faster lookup
    fp_read = fp.read
    fp_tell = fp.tell
    element_struct_unpack = element_struct.unpack
    defer_size = size_in_bytes(defer_size)

    while True:
        # Read tag, VR, length, get ready to read value
        bytes_read = fp_read(8)
        if len(bytes_read) < 8:
            # at end of file; a plain return ends the generator (raising
            # StopIteration here would surface as RuntimeError, PEP 479)
            return
        if is_implicit_VR:
            # must reset VR each time; could have set last iteration (e.g. SQ)
            VR = None
            group, elem, length = element_struct_unpack(bytes_read)
        else:  # explicit VR
            group, elem, VR, length = element_struct_unpack(bytes_read)
            if VR in extra_length_VRs_b:
                bytes_read = fp_read(4)
                length = extra_length_unpack(bytes_read)[0]

        # Positioned to read the value, but may not want to -- check stop_when
        value_tell = fp_tell()
        if stop_when is not None:
            if stop_when(group, elem):
                rewind_length = 8
                if not is_implicit_VR and VR in extra_length_VRs_b:
                    rewind_length += 4
                fp.seek(value_tell - rewind_length)
                return

        # Reading the value
        # First case (most common): reading a value with a defined length
        if length != 0xFFFFFFFF:
            if defer_size is not None and length > defer_size:
                # Flag as deferred by setting value to None, and skip bytes
                value = None
                fp.seek(fp_tell() + length)
            else:
                value = fp_read(length)
            yield ((group, elem), VR, length, value, value_tell)

        # Second case: undefined length - must seek to delimiter,
        # unless is SQ type, in which case is easier to parse it, because
        # undefined length SQs and items of undefined lengths can be nested
        # and it would be error-prone to read to the correct outer delimiter
        else:
            # Try to look up type to see if is a SQ
            # if private tag, won't be able to look it up in dictionary,
            #   in which case just ignore it and read the bytes unless it is
            #   identified as a Sequence
            if VR is None:
                try:
                    # VRs are handled as bytes in this function, so the
                    # dictionary result is encoded before comparing
                    VR = dictionary_VR((group, elem)).encode('ascii')
                except KeyError:
                    # Look ahead to see if it consists of items and
                    # is thus a SQ
                    next_tag = TupleTag(unpack(endian_chr + "HH", fp_read(4)))
                    # Rewind the file
                    fp.seek(fp_tell() - 4)
                    if next_tag == ItemTag:
                        VR = b'SQ'

            if VR == b'SQ':
                # the sequence content is left unread for the caller
                yield ((group, elem), VR, length, None, value_tell)
            else:
                raise NotImplementedError("This reader does not handle "
                                          "undefined length except for SQ")
def dcmread(
    fp: Union[PathType, BinaryIO],
    defer_size: Optional[Union[str, int, float]] = None,
    stop_before_pixels: bool = False,
    force: bool = False,
    specific_tags: Optional[TagListType] = None
) -> Union[FileDataset, DicomDir]:
    """Read and parse a DICOM dataset stored in the DICOM File Format.

    Read a DICOM dataset stored in accordance with the :dcm:`DICOM File
    Format <part10/chapter_7.html>`. If the dataset is not stored in
    accordance with the File Format (i.e. the preamble and prefix are missing,
    there are missing required Type 1 *File Meta Information Group* elements
    or the entire *File Meta Information* is missing) then you will have to
    set `force` to ``True``.

    .. deprecated:: 2.2

        Returning a :class:`~pydicom.dicomdir.DicomDir` is deprecated and
        will be removed in v3.0. Use :class:`~pydicom.fileset.FileSet`
        instead.


    Examples
    --------
    Read and return a dataset stored in accordance with the DICOM File Format:

    >>> ds = pydicom.dcmread("CT_small.dcm")
    >>> ds.PatientName

    Read and return a dataset not in accordance with the DICOM File Format:

    >>> ds = pydicom.dcmread("rtplan.dcm", force=True)
    >>> ds.PatientName

    Use within a context manager:

    >>> with pydicom.dcmread("rtplan.dcm") as ds:
    ...     ds.PatientName

    Parameters
    ----------
    fp : str or PathLike or file-like
        Either a file-like object, a string containing the file name or the
        path to the file. The file-like object must have ``seek()``,
        ``read()`` and ``tell()`` methods and the caller is responsible for
        closing it (if required).
    defer_size : int, str or float, optional
        If not used then all elements are read into memory. If specified,
        then if a data element's stored value is larger than `defer_size`, the
        value is not read into memory until it is accessed in code. Should be
        the number of bytes to be read as :class:`int` or as a :class:`str`
        with units, e.g. ``'512 KB'``, ``'2 MB'``.
    stop_before_pixels : bool, optional
        If ``False`` (default), the full file will be read and parsed. Set
        ``True`` to stop before reading (7FE0,0010) *Pixel Data* (and all
        subsequent elements).
    force : bool, optional
        If ``False`` (default), raises an
        :class:`~pydicom.errors.InvalidDicomError` if the file is
        missing the *File Meta Information* header. Set to ``True`` to force
        reading even if no *File Meta Information* header is found.
    specific_tags : list of (int or str or 2-tuple of int), optional
        If used then only the supplied tags will be returned. The supplied
        elements can be tags or keywords. Note that the element (0008,0005)
        *Specific Character Set* is always returned if present - this ensures
        correct decoding of returned text values.

    Returns
    -------
    FileDataset or DicomDir
        An instance of :class:`~pydicom.dataset.FileDataset` that represents
        a parsed DICOM file, unless the dataset is a *Media Storage Directory*
        instance in which case it will be a
        :class:`~pydicom.dicomdir.DicomDir`.

    Raises
    ------
    InvalidDicomError
        If `force` is ``False`` and the file is not a valid DICOM file.
    TypeError
        If `fp` is ``None`` or of an unsupported type.

    See Also
    --------
    pydicom.dataset.FileDataset
        Data class that is returned.
    pydicom.filereader.read_partial
        Only read part of a DICOM file, stopping on given conditions.
    """
    # Open file if not already a file object
    caller_owns_file = True
    fp = path_from_pathlike(fp)
    if isinstance(fp, str):
        # caller provided a file name; we own the file handle
        caller_owns_file = False
        logger.debug("Reading file '{0}'".format(fp))
        fp = open(fp, 'rb')
    elif fp is None or not hasattr(fp, "read") or not hasattr(fp, "seek"):
        raise TypeError("dcmread: Expected a file path or a file-like, "
                        "but got " + type(fp).__name__)

    if config.debugging:
        logger.debug("\n" + "-" * 80)
        logger.debug("Call to dcmread()")
        msg = ("filename:'%s', defer_size='%s', "
               "stop_before_pixels=%s, force=%s, specific_tags=%s")
        # not every accepted file-like has a ``name`` (e.g. io.BytesIO),
        # so fall back to a placeholder rather than failing while logging
        logger.debug(msg % (getattr(fp, "name", "<none>"), defer_size,
                            stop_before_pixels, force, specific_tags))
        if caller_owns_file:
            logger.debug("Caller passed file object")
        else:
            logger.debug("Caller passed file name")
        logger.debug("-" * 80)

    if specific_tags:
        specific_tags = [Tag(t) for t in specific_tags]

    specific_tags = cast(Optional[List[BaseTag]], specific_tags)

    # Iterate through all items and store them --include file meta if present
    stop_when = None
    if stop_before_pixels:
        stop_when = _at_pixel_data
    try:
        dataset = read_partial(
            fp,
            stop_when,
            defer_size=size_in_bytes(defer_size),
            force=force,
            specific_tags=specific_tags,
        )
    finally:
        # only close handles that were opened here
        if not caller_owns_file:
            fp.close()
    # XXX need to store transfer syntax etc.
    return dataset
def data_element_generator(
    fp: BinaryIO,
    is_implicit_VR: bool,
    is_little_endian: bool,
    stop_when: Optional[Callable[[BaseTag, Optional[str], int], bool]] = None,
    defer_size: Optional[Union[int, str, float]] = None,
    encoding: Union[str, MutableSequence[str]] = default_encoding,
    specific_tags: Optional[List[BaseTag]] = None
) -> Iterator[Union[RawDataElement, DataElement]]:
    """Create a generator to efficiently return the raw data elements.

    .. note::

        This function is used internally - usually there is no need to call it
        from user code. To read data from a DICOM file, :func:`dcmread`
        shall be used instead.

    Parameters
    ----------
    fp : file-like
        The file-like to read from. Its ``read()``, ``seek()`` and ``tell()``
        methods are used.
    is_implicit_VR : bool
        ``True`` if the data is encoded as implicit VR, ``False`` otherwise.
    is_little_endian : bool
        ``True`` if the data is encoded as little endian, ``False`` otherwise.
    stop_when : None, callable, optional
        If ``None`` (default), then the whole file is read. A callable which
        takes tag, VR, length, and returns ``True`` or ``False``. If it
        returns ``True``, the file is rewound to the start of the current
        data element and the generator ends. The VR passed may be ``None``.
    defer_size : int, str or float, optional
        See :func:`dcmread` for parameter info.
    encoding : Union[str, MutableSequence[str]]
        Encoding scheme passed on when reading undefined length sequences;
        replaced by the value of (0008,0005) *Specific Character Set* once
        that element has been read.
    specific_tags : list or None
        See :func:`dcmread` for parameter info.

    Yields
    -------
    RawDataElement or DataElement
        Yields DataElement for undefined length UN or SQ, RawDataElement
        otherwise.
    """
    # Summary of DICOM standard PS3.5-2008 chapter 7:
    # If Implicit VR, data element is:
    #    tag, 4-byte length, value.
    #        The 4-byte length can be FFFFFFFF (undefined length)*
    #
    # If Explicit VR:
    #    if OB, OW, OF, SQ, UN, or UT:
    #       tag, VR, 2-bytes reserved (both zero), 4-byte length, value
    #           For all but UT, the length can be FFFFFFFF (undefined length)*
    #   else: (any other VR)
    #       tag, VR, (2 byte length), value
    # * for undefined length, a Sequence Delimitation Item marks the end
    #        of the Value Field.
    # Note, except for the special_VRs, both impl and expl VR use 8 bytes;
    #    the special VRs follow the 8 bytes with a 4-byte length

    # With a generator, state is stored, so we can break down
    #    into the individual cases, and not have to check them again for each
    #    data element
    from pydicom.values import convert_string

    if is_little_endian:
        endian_chr = "<"
    else:
        endian_chr = ">"

    # assign implicit VR struct to variable as use later if VR assumed missing
    implicit_VR_struct = Struct(endian_chr + "HHL")
    if is_implicit_VR:
        element_struct = implicit_VR_struct
    else:  # Explicit VR
        # tag, VR, 2-byte length (or 0 if special VRs)
        element_struct = Struct(endian_chr + "HH2sH")
        extra_length_struct = Struct(endian_chr + "L")  # for special VRs
        extra_length_unpack = extra_length_struct.unpack  # for lookup speed

    # Make local variables so have faster lookup
    fp_read = fp.read
    fp_tell = fp.tell
    logger_debug = logger.debug
    debugging = config.debugging
    element_struct_unpack = element_struct.unpack
    defer_size = size_in_bytes(defer_size)

    # An empty or missing `specific_tags` means "no filtering"
    tag_set = {Tag(tag) for tag in specific_tags} if specific_tags else set()
    has_tag_set = bool(tag_set)
    if has_tag_set:
        tag_set.add(Tag(0x00080005))  # Specific Character Set

    while True:
        # VR: Optional[str]
        # Read tag, VR, length, get ready to read value
        bytes_read = fp_read(8)
        if len(bytes_read) < 8:
            return  # at end of file

        if debugging:
            debug_msg = f"{fp.tell() - 8:08x}: {bytes2hex(bytes_read)}"

        if is_implicit_VR:
            # must reset VR each time; could have set last iteration (e.g. SQ)
            VR = None
            group, elem, length = element_struct_unpack(bytes_read)
        else:  # explicit VR
            group, elem, VR, length = element_struct_unpack(bytes_read)
            # defend against switching to implicit VR, some writer do in SQ's
            # issue 1067, issue 1035

            if not (b'AA' <= VR <= b'ZZ') and config.assume_implicit_vr_switch:
                # invalid VR, must be 2 cap chrs, assume implicit and continue
                # (the same 8 bytes are re-interpreted as tag + 4-byte length)
                VR = None
                group, elem, length = implicit_VR_struct.unpack(bytes_read)
            else:
                VR = VR.decode(default_encoding)
                if VR in extra_length_VRs:
                    # the real length follows in the next 4 bytes
                    bytes_read = fp_read(4)
                    length = extra_length_unpack(bytes_read)[0]
                    if debugging:
                        debug_msg += " " + bytes2hex(bytes_read)

        if debugging:
            debug_msg = "%-47s (%04x, %04x)" % (debug_msg, group, elem)
            if not is_implicit_VR:
                debug_msg += " %s " % VR
            if length != 0xFFFFFFFF:
                debug_msg += "Length: %d" % length
            else:
                debug_msg += "Length: Undefined length (FFFFFFFF)"
            logger_debug(debug_msg)

        # Positioned to read the value, but may not want to -- check stop_when
        value_tell = fp_tell()
        tag = TupleTag((group, elem))
        if stop_when is not None:
            # XXX VR may be None here!! Should stop_when just take tag?
            if stop_when(tag, VR, length):
                if debugging:
                    logger_debug("Reading ended by stop_when callback. "
                                 "Rewinding to start of data element.")
                # header is 8 bytes, plus 4 if an extra length was read
                rewind_length = 8
                if not is_implicit_VR and VR in extra_length_VRs:
                    rewind_length += 4
                fp.seek(value_tell - rewind_length)
                return

        # Reading the value
        # First case (most common): reading a value with a defined length
        if length != 0xFFFFFFFF:
            # don't defer loading of Specific Character Set value as it is
            # needed immediately to get the character encoding for other tags
            if has_tag_set and tag not in tag_set:
                # skip the tag if not in specific tags
                fp.seek(fp_tell() + length)
                continue

            if (defer_size is not None and length > defer_size and
                    tag != BaseTag(0x00080005)):
                # Flag as deferred by setting value to None, and skip bytes
                value = None
                # NOTE(review): logged even when `debugging` is off
                logger_debug("Defer size exceeded. "
                             "Skipping forward to next data element.")
                fp.seek(fp_tell() + length)
            else:
                # zero length: nothing is read, a VR-specific empty value
                # is substituted instead
                value = (
                    fp_read(length) if length > 0
                    else cast(
                        Optional[bytes], empty_value_for_VR(VR, raw=True)
                    )
                )
                if debugging:
                    dotdot = "..." if length > 20 else " "
                    displayed_value = value[:20] if value else b''
                    logger_debug("%08x: %-34s %s %r %s" %
                                 (value_tell, bytes2hex(displayed_value),
                                  dotdot, displayed_value, dotdot))

            # If the tag is (0008,0005) Specific Character Set, then store it
            if tag == BaseTag(0x00080005):
                # *Specific Character String* is b'' for empty value
                encoding = convert_string(
                    cast(bytes, value) or b'', is_little_endian
                )
                # Store the encoding value in the generator
                # for use with future elements (SQs)
                encoding = convert_encodings(encoding)

            yield RawDataElement(tag, VR, length, value, value_tell,
                                 is_implicit_VR, is_little_endian)

        # Second case: undefined length - must seek to delimiter,
        # unless is SQ type, in which case is easier to parse it, because
        # undefined length SQs and items of undefined lengths can be nested
        # and it would be error-prone to read to the correct outer delimiter
        else:
            # VR UN with undefined length shall be handled as SQ
            # see PS 3.5, section 6.2.2
            if VR == 'UN':
                VR = 'SQ'
            # Try to look up type to see if is a SQ
            # if private tag, won't be able to look it up in dictionary,
            #   in which case just ignore it and read the bytes unless it is
            #   identified as a Sequence
            # NOTE(review): VR can no longer be 'UN' here (replaced just
            # above), so only the `VR is None` part can be true
            if VR is None or VR == 'UN' and config.replace_un_with_known_vr:
                try:
                    VR = dictionary_VR(tag)
                except KeyError:
                    # Look ahead to see if it consists of items
                    # and is thus a SQ
                    next_tag = _unpack_tag(fp_read(4), endian_chr)
                    # Rewind the file
                    fp.seek(fp_tell() - 4)
                    if next_tag == ItemTag:
                        VR = 'SQ'

            if VR == 'SQ':
                if debugging:
                    logger_debug(
                        f"{fp_tell():08X}: Reading/parsing undefined length "
                        "sequence"
                    )

                # the sequence is always read, even if filtered out below,
                # so that the file ends up positioned after it
                seq = read_sequence(fp, is_implicit_VR,
                                    is_little_endian, length, encoding)
                if has_tag_set and tag not in tag_set:
                    continue

                yield DataElement(tag, VR, seq, value_tell,
                                  is_undefined_length=True)
            else:
                delimiter = SequenceDelimiterTag
                if debugging:
                    logger_debug("Reading undefined length data element")
                value = read_undefined_length_value(
                    fp, is_little_endian, delimiter, defer_size
                )

                # tags with undefined length are skipped after read
                if has_tag_set and tag not in tag_set:
                    continue

                yield RawDataElement(tag, VR, length, value, value_tell,
                                     is_implicit_VR, is_little_endian)
def dcmread(fp, defer_size=None, stop_before_pixels=False,
            force=False, specific_tags=None):
    """Read and parse a DICOM dataset stored in the DICOM File Format.

    Read a DICOM dataset stored in accordance with the DICOM File Format
    (DICOM Standard Part 10 Section 7). If the dataset is not stored in
    accordance with the File Format (i.e. the preamble and prefix are missing,
    there are missing required Type 1 File Meta Information Group elements
    or the entire File Meta Information is missing) then you will have to
    set `force` to True.

    Parameters
    ----------
    fp : str or file-like
        Either a file-like object, or a string containing the file name. If a
        file-like object, the caller is responsible for closing it.
    defer_size : int or str or None
        If None (default), all elements read into memory. If specified, then
        if a data element's stored value is larger than `defer_size`, the
        value is not read into memory until it is accessed in code. Specify an
        integer (bytes), or a string value with units, e.g. "512 KB", "2 MB".
    stop_before_pixels : bool
        If False (default), the full file will be read and parsed. Set True to
        stop before reading (7FE0,0010) 'Pixel Data' (and all subsequent
        elements).
    force : bool
        If False (default), raises an InvalidDicomError if the file is missing
        the File Meta Information header. Set to True to force reading even if
        no File Meta Information header is found.
    specific_tags : list or None
        If not None, only the tags in the list are returned. The list
        elements can be tags or tag names. Note that the tag Specific
        Character Set is always returned if present - this ensures correct
        decoding of returned text values.

    Returns
    -------
    FileDataset
        An instance of FileDataset that represents a parsed DICOM file.

    Raises
    ------
    InvalidDicomError
        If `force` is False and the file is not a valid DICOM file.

    See Also
    --------
    pydicom.dataset.FileDataset
        Data class that is returned.
    pydicom.filereader.read_partial
        Only read part of a DICOM file, stopping on given conditions.

    Examples
    --------
    Read and return a dataset stored in accordance with the DICOM File Format:

    >>> ds = pydicom.dcmread("rtplan.dcm")
    >>> ds.PatientName

    Read and return a dataset not in accordance with the DICOM File Format:

    >>> ds = pydicom.dcmread("rtplan.dcm", force=True)
    >>> ds.PatientName

    Use within a context manager:

    >>> with pydicom.dcmread("rtplan.dcm") as ds:
    ...     ds.PatientName
    """
    # Open file if not already a file object
    caller_owns_file = True
    if isinstance(fp, compat.string_types):
        # caller provided a file name; we own the file handle
        caller_owns_file = False
        try:
            logger.debug(u"Reading file '{0}'".format(fp))
        except Exception:
            # best effort: retry the message with a non-unicode template
            logger.debug("Reading file '{0}'".format(fp))
        fp = open(fp, 'rb')

    if config.debugging:
        logger.debug("\n" + "-" * 80)
        logger.debug("Call to dcmread()")
        msg = ("filename:'%s', defer_size='%s', "
               "stop_before_pixels=%s, force=%s, specific_tags=%s")
        # file-likes such as BytesIO have no ``name`` attribute, so fall
        # back to a placeholder rather than failing while logging
        logger.debug(msg % (getattr(fp, "name", "<none>"), defer_size,
                            stop_before_pixels, force, specific_tags))
        if caller_owns_file:
            logger.debug("Caller passed file object")
        else:
            logger.debug("Caller passed file name")
        logger.debug("-" * 80)

    # Convert size to defer reading into bytes
    defer_size = size_in_bytes(defer_size)

    # Iterate through all items and store them --include file meta if present
    stop_when = None
    if stop_before_pixels:
        stop_when = _at_pixel_data
    try:
        dataset = read_partial(fp, stop_when, defer_size=defer_size,
                               force=force, specific_tags=specific_tags)
    finally:
        # only close handles that were opened here
        if not caller_owns_file:
            fp.close()
    # XXX need to store transfer syntax etc.
    return dataset
def data_element_generator(fp,
                           is_implicit_VR,
                           is_little_endian,
                           stop_when=None,
                           defer_size=None,
                           encoding=default_encoding,
                           specific_tags=None):
    """Create a generator to efficiently return the raw data elements.

    Parameters
    ----------
    fp : file-like object
        Its ``read()``, ``seek()`` and ``tell()`` methods are used.
    is_implicit_VR : boolean
        True if the data is encoded as implicit VR.
    is_little_endian : boolean
        True if the data is encoded as little endian.
    stop_when : None, callable, optional
        If None (default), then the whole file is read.
        A callable which takes tag, VR, length,
        and returns True or False. If it returns True, the file is rewound
        to the start of the current data element and the generator ends.
        The VR passed may be None.
    defer_size : int, str, None, optional
        See ``dcmread`` for parameter info.
    encoding :
        Encoding scheme passed on when reading undefined length sequences;
        replaced by the value of (0008,0005) Specific Character Set once
        that element has been read.
    specific_tags : list or None
        See ``dcmread`` for parameter info.

    Yields
    ------
    RawDataElement or DataElement
        A DataElement for sequences of undefined length, otherwise a
        RawDataElement built from
        ``(tag, VR, length, value, value_tell, is_implicit_VR,
        is_little_endian)``. ``VR`` is None for implicit VR data of defined
        length, ``length`` may be the DICOM "undefined length" 0xFFFFFFFF
        and ``value`` holds the raw bytes (not parsed into python types),
        or None if reading was deferred.
    """
    # Summary of DICOM standard PS3.5-2008 chapter 7:
    # If Implicit VR, data element is:
    #    tag, 4-byte length, value.
    #        The 4-byte length can be FFFFFFFF (undefined length)*
    #
    # If Explicit VR:
    #    if OB, OW, OF, SQ, UN, or UT:
    #       tag, VR, 2-bytes reserved (both zero), 4-byte length, value
    #           For all but UT, the length can be FFFFFFFF (undefined length)*
    #   else: (any other VR)
    #       tag, VR, (2 byte length), value
    # * for undefined length, a Sequence Delimitation Item marks the end
    #        of the Value Field.
    # Note, except for the special_VRs, both impl and expl VR use 8 bytes;
    #    the special VRs follow the 8 bytes with a 4-byte length

    # With a generator, state is stored, so we can break down
    #    into the individual cases, and not have to check them again for each
    #    data element

    if is_little_endian:
        endian_chr = "<"
    else:
        endian_chr = ">"
    if is_implicit_VR:
        element_struct = Struct(endian_chr + "HHL")
    else:  # Explicit VR
        # tag, VR, 2-byte length (or 0 if special VRs)
        element_struct = Struct(endian_chr + "HH2sH")
        extra_length_struct = Struct(endian_chr + "L")  # for special VRs
        extra_length_unpack = extra_length_struct.unpack  # for lookup speed

    # Make local variables so have faster lookup
    fp_read = fp.read
    fp_tell = fp.tell
    logger_debug = logger.debug
    debugging = config.debugging
    element_struct_unpack = element_struct.unpack
    defer_size = size_in_bytes(defer_size)

    # Build the set of wanted tags; keywords are converted to tags first,
    # entries that are neither keyword strings nor BaseTag are ignored
    tag_set = set()
    if specific_tags is not None:
        for tag in specific_tags:
            if isinstance(tag, (str, compat.text_type)):
                tag = Tag(tag_for_keyword(tag))
            if isinstance(tag, BaseTag):
                tag_set.add(tag)
        # Specific Character Set is always kept when filtering
        tag_set.add(Tag(0x08, 0x05))
    has_tag_set = len(tag_set) > 0

    while True:
        # Read tag, VR, length, get ready to read value
        bytes_read = fp_read(8)
        if len(bytes_read) < 8:
            return  # at end of file
        if debugging:
            debug_msg = "{0:08x}: {1}".format(fp.tell() - 8,
                                              bytes2hex(bytes_read))

        if is_implicit_VR:
            # must reset VR each time; could have set last iteration (e.g. SQ)
            VR = None
            group, elem, length = element_struct_unpack(bytes_read)
        else:  # explicit VR
            group, elem, VR, length = element_struct_unpack(bytes_read)
            if not in_py2:
                VR = VR.decode(default_encoding)
            if VR in extra_length_VRs:
                # the real length follows in the next 4 bytes
                bytes_read = fp_read(4)
                length = extra_length_unpack(bytes_read)[0]
                if debugging:
                    debug_msg += " " + bytes2hex(bytes_read)
        if debugging:
            debug_msg = "%-47s (%04x, %04x)" % (debug_msg, group, elem)
            if not is_implicit_VR:
                debug_msg += " %s " % VR
            if length != 0xFFFFFFFF:
                debug_msg += "Length: %d" % length
            else:
                debug_msg += "Length: Undefined length (FFFFFFFF)"
            logger_debug(debug_msg)

        # Positioned to read the value, but may not want to -- check stop_when
        value_tell = fp_tell()
        tag = TupleTag((group, elem))
        if stop_when is not None:
            # XXX VR may be None here!! Should stop_when just take tag?
            if stop_when(tag, VR, length):
                if debugging:
                    logger_debug("Reading ended by stop_when callback. "
                                 "Rewinding to start of data element.")
                # header is 8 bytes, plus 4 if an extra length was read
                rewind_length = 8
                if not is_implicit_VR and VR in extra_length_VRs:
                    rewind_length += 4
                fp.seek(value_tell - rewind_length)
                return

        # Reading the value
        # First case (most common): reading a value with a defined length
        if length != 0xFFFFFFFF:
            # don't defer loading of Specific Character Set value as it is
            # needed immediately to get the character encoding for other tags
            if has_tag_set and tag not in tag_set:
                # skip the tag if not in specific tags
                fp.seek(fp_tell() + length)
                continue

            if (defer_size is not None and length > defer_size and
                    tag != BaseTag(0x00080005)):
                # Flag as deferred by setting value to None, and skip bytes
                value = None
                # NOTE(review): logged even when `debugging` is off
                logger_debug("Defer size exceeded. "
                             "Skipping forward to next data element.")
                fp.seek(fp_tell() + length)
            else:
                value = fp_read(length)
                if debugging:
                    dotdot = " "
                    if length > 12:
                        dotdot = "..."
                    logger_debug("%08x: %-34s %s %r %s" % (value_tell,
                                                           bytes2hex(
                                                               value[:12]),
                                                           dotdot,
                                                           value[:12], dotdot))

            # If the tag is (0008,0005) Specific Character Set, then store it
            if tag == BaseTag(0x00080005):
                from pydicom.values import convert_string
                encoding = convert_string(value, is_little_endian)
                # Store the encoding value in the generator
                # for use with future elements (SQs)
                encoding = convert_encodings(encoding)

            yield RawDataElement(tag, VR, length, value, value_tell,
                                 is_implicit_VR, is_little_endian)

        # Second case: undefined length - must seek to delimiter,
        # unless is SQ type, in which case is easier to parse it, because
        # undefined length SQs and items of undefined lengths can be nested
        # and it would be error-prone to read to the correct outer delimiter
        else:
            # Try to look up type to see if is a SQ
            # if private tag, won't be able to look it up in dictionary,
            #   in which case just ignore it and read the bytes unless it is
            #   identified as a Sequence
            if VR is None:
                try:
                    VR = dictionary_VR(tag)
                except KeyError:
                    # Look ahead to see if it consists of items
                    # and is thus a SQ
                    next_tag = TupleTag(unpack(endian_chr + "HH", fp_read(4)))
                    # Rewind the file
                    fp.seek(fp_tell() - 4)
                    if next_tag == ItemTag:
                        VR = 'SQ'

            if VR == 'SQ':
                if debugging:
                    msg = "{0:08x}: Reading/parsing undefined length sequence"
                    logger_debug(msg.format(fp_tell()))
                # the sequence is always read, even if filtered out below,
                # so that the file ends up positioned after it
                seq = read_sequence(fp, is_implicit_VR,
                                    is_little_endian, length, encoding)
                if has_tag_set and tag not in tag_set:
                    continue
                yield DataElement(tag, VR, seq, value_tell,
                                  is_undefined_length=True)
            else:
                delimiter = SequenceDelimiterTag
                if debugging:
                    logger_debug("Reading undefined length data element")
                value = read_undefined_length_value(fp, is_little_endian,
                                                    delimiter, defer_size)

                # If the tag is (0008,0005) Specific Character Set,
                # then store it
                # NOTE(review): `value` looks like it can be None here when
                # `defer_size` is exceeded - confirm convert_string copes
                if tag == (0x08, 0x05):
                    from pydicom.values import convert_string
                    encoding = convert_string(value, is_little_endian)
                    # Store the encoding value in the generator for use
                    # with future elements (SQs)
                    encoding = convert_encodings(encoding)

                # tags with undefined length are skipped after read
                if has_tag_set and tag not in tag_set:
                    continue

                yield RawDataElement(tag, VR, length, value, value_tell,
                                     is_implicit_VR, is_little_endian)
def data_element_generator(
    fp: BinaryIO,
    is_implicit_VR: bool,
    is_little_endian: bool,
    stop_when: Optional[Callable[[int, int], bool]] = None,
    defer_size: Optional[Union[str, int, float]] = None,
) -> Iterator[_ElementType]:
    """Yield ``((group, elem), vr, length, value, value_tell)`` per element.

    Elements are read from `fp` one after another until fewer than eight
    header bytes remain or `stop_when(group, elem)` returns a true value;
    in the latter case the file is first rewound to the start of that
    element.

    `vr` holds the two VR bytes for explicit VR data and is None for
    implicit VR data of defined length. `value` is None when the defined
    length exceeds `defer_size` (the bytes are skipped) and for undefined
    length sequences (the content is left unread).

    Raises NotImplementedError for an undefined length element that is not
    identified as SQ.
    """
    byte_order = "<" if is_little_endian else ">"
    explicit_vr = not is_implicit_VR

    if explicit_vr:
        # tag, VR, 2-byte length (zero for the VRs with an extra length)
        unpack_header = Struct(byte_order + "HH2sH").unpack
        unpack_long_length = Struct(byte_order + "L").unpack
    else:
        # tag, 4-byte length
        unpack_header = Struct(byte_order + "HHL").unpack

    # bound methods kept in locals for quicker lookup inside the loop
    read = fp.read
    tell = fp.tell
    size_limit = size_in_bytes(defer_size)

    while True:
        header = read(8)
        if len(header) < 8:
            return  # nothing (complete) left to read

        has_long_length = False
        if explicit_vr:
            group, elem, vr, length = unpack_header(header)
            has_long_length = vr in extra_length_VRs_b
            if has_long_length:
                length = unpack_long_length(read(4))[0]
        else:
            # reset on every pass; a previous pass may have set it (e.g. SQ)
            vr = None
            group, elem, length = unpack_header(header)

        # The file now sits at the value; the caller may want to stop here
        value_tell = tell()
        if stop_when is not None and stop_when(group, elem):
            header_size = 12 if has_long_length else 8
            fp.seek(value_tell - header_size)
            return

        # Most common case: the value has a defined length
        if length != 0xFFFFFFFF:
            too_large = size_limit is not None and length > size_limit
            if too_large:
                # deferred: skip the bytes and report the value as None
                value = None
                fp.seek(tell() + length)
            else:
                value = read(length)
            yield ((group, elem), vr, length, value, value_tell)
            continue

        # Undefined length: only sequences are supported. Nested undefined
        # length sequences and items make searching for the matching outer
        # delimiter error-prone, so the content is left for the caller.
        if vr is None:
            try:
                vr = dictionary_VR((group, elem)).encode('ascii')
            except KeyError:
                # Not in the dictionary (e.g. private tag): peek at the next
                # tag - an item tag means this must be a sequence
                peeked = read(4)
                next_tag = TupleTag(
                    cast(Tuple[int, int], unpack(byte_order + "HH", peeked))
                )
                fp.seek(tell() - 4)  # undo the peek
                if next_tag == ItemTag:
                    vr = b'SQ'

        if vr != b'SQ':
            raise NotImplementedError(
                "This reader does not handle undefined length except "
                "for SQ")
        yield ((group, elem), vr, length, None, value_tell)
def test_size_in_bytes(self):
    """Check pass-through, parsing and rejection in size_in_bytes()."""
    # None and infinity both give None, other numbers come back unchanged
    assert size_in_bytes(None) is None
    assert size_in_bytes(float('inf')) is None
    assert size_in_bytes(1234) == 1234

    # strings are parsed, with or without a unit suffix
    parsed_cases = (
        ('1234', 1234),
        ('4 kb', 4096),
        ('16 KB', 0x4000),
        ('3 MB', 0x300000),
        ('2gB', 0x80000000),
    )
    for text, expected in parsed_cases:
        assert size_in_bytes(text) == expected

    # strings that cannot be parsed are rejected
    for bad_text in ('2 TB', 'KB 2'):
        with pytest.raises(ValueError):
            size_in_bytes(bad_text)
def dcmread(fp, defer_size=None, stop_before_pixels=False,
            force=False, specific_tags=None):
    """Read and parse a DICOM dataset stored in the DICOM File Format.

    Read a DICOM dataset stored in accordance with the :dcm:`DICOM File
    Format <part10/chapter_7.html>`. If the dataset is not stored in
    accordance with the File Format (i.e. the preamble and prefix are missing,
    there are missing required Type 1 *File Meta Information Group* elements
    or the entire *File Meta Information* is missing) then you will have to
    set `force` to ``True``.

    Parameters
    ----------
    fp : str or PathLike or file-like
        Either a file-like object, or a string containing the file name. If a
        file-like object, the caller is responsible for closing it.
    defer_size : int or str or None, optional
        If ``None`` (default), all elements are read into memory. If specified,
        then if a data element's stored value is larger than `defer_size`, the
        value is not read into memory until it is accessed in code. Specify an
        integer (bytes), or a string value with units, e.g. "512 KB", "2 MB".
    stop_before_pixels : bool, optional
        If ``False`` (default), the full file will be read and parsed. Set
        ``True`` to stop before reading (7FE0,0010) *Pixel Data* (and all
        subsequent elements).
    force : bool, optional
        If ``False`` (default), raises an
        :class:`~pydicom.errors.InvalidDicomError` if the file is
        missing the *File Meta Information* header. Set to ``True`` to force
        reading even if no *File Meta Information* header is found.
    specific_tags : list or None, optional
        If not ``None``, only the tags in the list are returned. The list
        elements can be tags or tag names. Note that the element (0008,0005)
        *Specific Character Set* is always returned if present - this ensures
        correct decoding of returned text values.

    Returns
    -------
    FileDataset
        An instance of :class:`~pydicom.dataset.FileDataset` that represents
        a parsed DICOM file.

    Raises
    ------
    InvalidDicomError
        If `force` is ``False`` and the file is not a valid DICOM file.
    TypeError
        If `fp` is ``None`` or of an unsupported type.
    ValueError
        If `defer_size` is a string that cannot be converted to a size.

    See Also
    --------
    pydicom.dataset.FileDataset
        Data class that is returned.
    pydicom.filereader.read_partial
        Only read part of a DICOM file, stopping on given conditions.

    Examples
    --------
    Read and return a dataset stored in accordance with the DICOM File Format:

    >>> ds = pydicom.dcmread("rtplan.dcm")
    >>> ds.PatientName

    Read and return a dataset not in accordance with the DICOM File Format:

    >>> ds = pydicom.dcmread("rtplan.dcm", force=True)
    >>> ds.PatientName

    Use within a context manager:

    >>> with pydicom.dcmread("rtplan.dcm") as ds:
    ...     ds.PatientName
    """
    # Open file if not already a file object
    caller_owns_file = True
    fp = path_from_pathlike(fp)
    if isinstance(fp, str):
        # caller provided a file name; we own the file handle
        caller_owns_file = False
        logger.debug("Reading file '{0}'".format(fp))
        fp = open(fp, 'rb')
    elif fp is None or not hasattr(fp, "read") or not hasattr(fp, "seek"):
        raise TypeError("dcmread: Expected a file path or a file-like, "
                        "but got " + type(fp).__name__)

    # Everything after a successful open() runs under the try/finally so a
    # failure (e.g. an unparsable `defer_size`) cannot leak our file handle.
    try:
        if config.debugging:
            logger.debug("\n" + "-" * 80)
            logger.debug("Call to dcmread()")
            msg = ("filename:'%s', defer_size='%s', "
                   "stop_before_pixels=%s, force=%s, specific_tags=%s")
            # In-memory file-likes such as io.BytesIO have no `name`
            filename = getattr(fp, "name", "<no filename>")
            logger.debug(
                msg % (filename, defer_size, stop_before_pixels, force,
                       specific_tags))
            if caller_owns_file:
                logger.debug("Caller passed file object")
            else:
                logger.debug("Caller passed file name")
            logger.debug("-" * 80)

        # Convert size to defer reading into bytes
        defer_size = size_in_bytes(defer_size)

        # Iterate through all items and store them --include file meta if
        # present
        stop_when = None
        if stop_before_pixels:
            stop_when = _at_pixel_data
        dataset = read_partial(fp, stop_when, defer_size=defer_size,
                               force=force, specific_tags=specific_tags)
    finally:
        if not caller_owns_file:
            fp.close()
    # XXX need to store transfer syntax etc.
    return dataset
def read_undefined_length_value(fp: BinaryIO,
                                is_little_endian: bool,
                                delimiter_tag: BaseTag,
                                defer_size: Optional[Union[int, float]] = None,
                                read_size: int = 1024 * 8) -> Optional[bytes]:
    """Return the bytes of an undefined length value, up to `delimiter_tag`.

    On success the file is left positioned just past the delimiter tag and
    the four bytes that follow it.

    Parameters
    ----------
    fp : file-like
        The file-like to read from, positioned at the start of the value.
    is_little_endian : bool
        ``True`` if the delimiter tag is encoded little endian, ``False``
        for big endian.
    delimiter_tag : BaseTag
        The tag that terminates the value.
    defer_size : int or None, optional
        Passed through ``size_in_bytes()``. Once the number of value bytes
        reaches this size no further bytes are kept and ``None`` is
        returned.
    read_size : int, optional
        Number of bytes requested from `fp` per scan step.

    Returns
    -------
    bytes or None
        The value bytes preceding the delimiter, or ``None`` when the value
        was at least `defer_size` bytes long. When the encapsulated pixel
        data reader succeeds, its result is returned unchanged.

    Raises
    ------
    EOFError
        If the end of the file is reached without finding the delimiter;
        the file is first rewound to where reading started.
    """
    start_pos = fp.tell()
    defer_size = size_in_bytes(defer_size)

    # Undefined length values are very often encapsulated pixel data
    # (PS3.5 section A.4). Parsing them as such is preferred because it
    #   1. is not fooled by delimiter bytes that happen to occur in the data
    #   2. never buffers data for an element big enough to be deferred
    #   3. avoids holding the data twice (as chunks and as the joined value)
    # Not every writer follows the standard though, so when that parser gives
    # up we fall back to the historical byte scan below.
    if delimiter_tag == SequenceDelimiterTag:
        was_value_found, value = _try_read_encapsulated_pixel_data(
            fp, is_little_endian, defer_size)
        if was_value_found:
            return value

    # A delimiter split across two reads has at most 3 bytes in the first
    overlap = 3
    tag_format = b"<HH" if is_little_endian else b">HH"
    needle = pack(tag_format, delimiter_tag.group, delimiter_tag.elem)

    hit_eof = False
    kept = []
    total = 0  # value bytes seen so far, for defer_size checks
    while True:
        block_pos = fp.tell()
        block = fp.read(read_size)
        if len(block) < read_size:
            # A short read is retried once; still short means last block
            block += fp.read(read_size - len(block))
            if len(block) < read_size:
                hit_eof = True  # whatever we did get is still searched

        at = block.find(needle)
        if at == -1:
            if hit_eof:
                fp.seek(start_pos)
                raise EOFError(
                    "End of file reached before delimiter {0!r} found".format(
                        delimiter_tag))
            # Step back so a delimiter straddling the boundary is seen by
            # the next read, and keep everything except those tail bytes
            fp.seek(fp.tell() - overlap)
            piece = block[:-overlap]
            total += len(piece)
            if defer_size is None or total < defer_size:
                kept.append(piece)
            continue

        piece = block[:at]
        total += len(piece)
        if defer_size is None or total < defer_size:
            kept.append(piece)
        fp.seek(block_pos + at + 4)  # move to just after the delimiter tag
        trailer = fp.read(4)
        if trailer != b"\0\0\0\0":
            msg = ("Expected 4 zero bytes after undefined length delimiter"
                   " at pos {0:04x}")
            logger.error(msg.format(fp.tell() - 4))
        break

    # Reaching here means the delimiter was located
    if defer_size is not None and total >= defer_size:
        return None
    return b"".join(kept)
def data_element_generator(fp,
                           is_implicit_VR,
                           is_little_endian,
                           stop_when=None,
                           defer_size=None,
                           encoding=default_encoding,
                           specific_tags=None):
    """Create a generator that reads data elements from `fp` one at a time.

    Parameters
    ----------
    fp : file-like object
        Must provide ``read``, ``tell`` and ``seek``.
    is_implicit_VR : boolean
        ``True`` if elements are encoded as tag + 4-byte length, ``False`` if
        an explicit 2-character VR follows the tag.
    is_little_endian : boolean
        ``True`` to unpack tag and length fields as little endian.
    stop_when : None, callable, optional
        If None (default), elements are read until fewer than 8 bytes
        remain. Otherwise a callable invoked as ``stop_when(tag, VR, length)``
        before each value is read; when it returns a true value the file is
        rewound to the start of that data element and the generator ends.
    defer_size : int, str, None, optional
        See ``dcmread`` for parameter info. Converted with
        ``size_in_bytes()``.
    encoding :
        Encoding passed on when reading undefined length sequences. It is
        replaced whenever a (0008,0005) Specific Character Set element is
        read.
    specific_tags : list or None
        See ``dcmread`` for parameter info. Entries may be keywords or tags;
        (0008,0005) is always added to the set so the encoding can be
        tracked, but it is only yielded if it was requested.

    Yields
    ------
    RawDataElement or DataElement
        ``RawDataElement(tag, VR, length, value, value_tell, is_implicit_VR,
        is_little_endian)`` for elements read as raw bytes, where ``value`` is
        ``None`` if reading was deferred. Undefined length sequences are
        parsed with ``read_sequence`` and yielded as a ``DataElement``.
    """
    # Summary of DICOM standard PS3.5-2008 chapter 7:
    # If Implicit VR, data element is:
    #    tag, 4-byte length, value.
    #        The 4-byte length can be FFFFFFFF (undefined length)*
    #
    # If Explicit VR:
    #    if OB, OW, OF, SQ, UN, or UT:
    #       tag, VR, 2-bytes reserved (both zero), 4-byte length, value
    #           For all but UT, the length can be FFFFFFFF (undefined length)*
    #   else: (any other VR)
    #       tag, VR, (2 byte length), value
    # * for undefined length, a Sequence Delimitation Item marks the end
    #        of the Value Field.
    # Note, except for the special_VRs, both impl and expl VR use 8 bytes;
    #    the special VRs follow the 8 bytes with a 4-byte length

    # With a generator, state is stored, so we can break down
    #    into the individual cases, and not have to check them again for each
    #    data element

    if is_little_endian:
        endian_chr = "<"
    else:
        endian_chr = ">"
    if is_implicit_VR:
        element_struct = Struct(endian_chr + "HHL")
    else:  # Explicit VR
        # tag, VR, 2-byte length (or 0 if special VRs)
        element_struct = Struct(endian_chr + "HH2sH")
        extra_length_struct = Struct(endian_chr + "L")  # for special VRs
        extra_length_unpack = extra_length_struct.unpack  # for lookup speed

    # Make local variables so have faster lookup
    fp_read = fp.read
    fp_tell = fp.tell
    logger_debug = logger.debug
    debugging = config.debugging
    element_struct_unpack = element_struct.unpack
    defer_size = size_in_bytes(defer_size)

    # Build the set of requested tags. Specific Character Set is always
    # added so it gets read; `has_specific_char_set` remembers whether the
    # caller actually asked for it and therefore whether it is yielded.
    tag_set = set()
    has_specific_char_set = True
    if specific_tags is not None:
        for tag in specific_tags:
            if isinstance(tag, (str, compat.text_type)):
                tag = Tag(tag_for_keyword(tag))
            if isinstance(tag, BaseTag):
                tag_set.add(tag)
        has_specific_char_set = Tag(0x08, 0x05) in tag_set
        tag_set.add(Tag(0x08, 0x05))
    has_tag_set = len(tag_set) > 0

    while True:
        # Read tag, VR, length, get ready to read value
        bytes_read = fp_read(8)
        if len(bytes_read) < 8:
            return  # at end of file
        if debugging:
            debug_msg = "{0:08x}: {1}".format(fp.tell() - 8,
                                              bytes2hex(bytes_read))

        if is_implicit_VR:
            # must reset VR each time; could have set last iteration (e.g. SQ)
            VR = None
            group, elem, length = element_struct_unpack(bytes_read)
        else:  # explicit VR
            group, elem, VR, length = element_struct_unpack(bytes_read)
            if not in_py2:
                VR = VR.decode(default_encoding)
            if VR in extra_length_VRs:
                # The 2-byte field just read is not the length for these
                # VRs; the real length is in the next 4 bytes
                bytes_read = fp_read(4)
                length = extra_length_unpack(bytes_read)[0]
                if debugging:
                    debug_msg += " " + bytes2hex(bytes_read)
        if debugging:
            debug_msg = "%-47s (%04x, %04x)" % (debug_msg, group, elem)
            if not is_implicit_VR:
                debug_msg += " %s " % VR
            if length != 0xFFFFFFFF:
                debug_msg += "Length: %d" % length
            else:
                debug_msg += "Length: Undefined length (FFFFFFFF)"
            logger_debug(debug_msg)

        # Positioned to read the value, but may not want to -- check stop_when
        value_tell = fp_tell()
        tag = TupleTag((group, elem))
        if stop_when is not None:
            # XXX VR may be None here!! Should stop_when just take tag?
            if stop_when(tag, VR, length):
                if debugging:
                    logger_debug("Reading ended by stop_when callback. "
                                 "Rewinding to start of data element.")
                # Header is 8 bytes, plus 4 when the extra length was read
                rewind_length = 8
                if not is_implicit_VR and VR in extra_length_VRs:
                    rewind_length += 4
                fp.seek(value_tell - rewind_length)
                return

        # Reading the value
        # First case (most common): reading a value with a defined length
        if length != 0xFFFFFFFF:
            # don't defer loading of Specific Character Set value as it is
            # needed immediately to get the character encoding for other tags
            if has_tag_set and tag not in tag_set:
                # skip the tag if not in specific tags
                fp.seek(fp_tell() + length)
                continue

            if (defer_size is not None and length > defer_size and
                    tag != BaseTag(0x00080005)):
                # Flag as deferred by setting value to None, and skip bytes
                value = None
                logger_debug("Defer size exceeded. "
                             "Skipping forward to next data element.")
                fp.seek(fp_tell() + length)
            else:
                value = fp_read(length)
                if debugging:
                    dotdot = " "
                    if length > 12:
                        dotdot = "..."
                    logger_debug("%08x: %-34s %s %r %s" % (value_tell,
                                                           bytes2hex(
                                                               value[:12]),
                                                           dotdot,
                                                           value[:12], dotdot))

            # If the tag is (0008,0005) Specific Character Set, then store it
            if tag == BaseTag(0x00080005):
                from pydicom.values import convert_string
                encoding = convert_string(value, is_little_endian,
                                          encoding=default_encoding)
                # Store the encoding value in the generator
                # for use with future elements (SQs)
                encoding = convert_encodings(encoding)
                if not has_specific_char_set:
                    # Read only to track the encoding; caller did not ask
                    # for this element
                    continue

            yield RawDataElement(tag, VR, length, value, value_tell,
                                 is_implicit_VR, is_little_endian)

        # Second case: undefined length - must seek to delimiter,
        # unless is SQ type, in which case is easier to parse it, because
        # undefined length SQs and items of undefined lengths can be nested
        # and it would be error-prone to read to the correct outer delimiter
        else:
            # Try to look up type to see if is a SQ
            # if private tag, won't be able to look it up in dictionary,
            #   in which case just ignore it and read the bytes unless it is
            #   identified as a Sequence
            if VR is None:
                try:
                    VR = dictionary_VR(tag)
                except KeyError:
                    # Look ahead to see if it consists of items
                    # and is thus a SQ
                    next_tag = TupleTag(unpack(endian_chr + "HH", fp_read(4)))
                    # Rewind the file
                    fp.seek(fp_tell() - 4)
                    if next_tag == ItemTag:
                        VR = 'SQ'

            if VR == 'SQ':
                if debugging:
                    msg = "{0:08x}: Reading/parsing undefined length sequence"
                    logger_debug(msg.format(fp_tell()))
                # The sequence is always parsed, even if it is then skipped,
                # so the file ends up positioned after it
                seq = read_sequence(fp, is_implicit_VR,
                                    is_little_endian, length, encoding)
                if has_tag_set and tag not in tag_set:
                    continue
                yield DataElement(tag, VR, seq, value_tell,
                                  is_undefined_length=True)
            else:
                delimiter = SequenceDelimiterTag
                if debugging:
                    logger_debug("Reading undefined length data element")
                value = read_undefined_length_value(fp, is_little_endian,
                                                    delimiter, defer_size)

                # If the tag is (0008,0005) Specific Character Set,
                # then store it
                if tag == (0x08, 0x05):
                    from pydicom.values import convert_string
                    encoding = convert_string(value, is_little_endian,
                                              encoding=default_encoding)
                    # Store the encoding value in the generator for use
                    # with future elements (SQs)
                    encoding = convert_encodings(encoding)
                    if not has_specific_char_set:
                        continue

                # tags with undefined length are skipped after read
                if has_tag_set and tag not in tag_set:
                    continue
                yield RawDataElement(tag, VR, length, value, value_tell,
                                     is_implicit_VR, is_little_endian)
def dcmread(fp, defer_size=None, stop_before_pixels=False,
            force=False, specific_tags=None):
    """Read and parse a DICOM dataset stored in the DICOM File Format.

    Read a DICOM dataset stored in accordance with the DICOM File Format
    (DICOM Standard Part 10 Section 7). If the dataset is not stored in
    accordance with the File Format (i.e. the preamble and prefix are missing,
    there are missing required Type 1 File Meta Information Group elements
    or the entire File Meta Information is missing) then you will have to
    set `force` to True.

    Parameters
    ----------
    fp : str or file-like
        Either a file-like object, or a string containing the file name. If a
        file-like object, the caller is responsible for closing it.
    defer_size : int or str or None
        If None (default), all elements read into memory. If specified, then
        if a data element's stored value is larger than `defer_size`, the
        value is not read into memory until it is accessed in code. Specify
        an integer (bytes), or a string value with units, e.g. "512 KB",
        "2 MB".
    stop_before_pixels : bool
        If False (default), the full file will be read and parsed. Set True to
        stop before reading (7FE0,0010) 'Pixel Data' (and all subsequent
        elements).
    force : bool
        If False (default), raises an InvalidDicomError if the file is missing
        the File Meta Information header. Set to True to force reading even if
        no File Meta Information header is found.
    specific_tags : list or None
        If not None, only the tags in the list are returned. The list
        elements can be tags or tag names.

    Returns
    -------
    FileDataset
        An instance of FileDataset that represents a parsed DICOM file.

    Raises
    ------
    InvalidDicomError
        If `force` is False and the file is not a valid DICOM file.
    ValueError
        If `defer_size` is a string that cannot be converted to a size.

    See Also
    --------
    pydicom.dataset.FileDataset
        Data class that is returned.
    pydicom.filereader.read_partial
        Only read part of a DICOM file, stopping on given conditions.

    Examples
    --------
    Read and return a dataset stored in accordance with the DICOM File Format:

    >>> ds = pydicom.dcmread("rtplan.dcm")
    >>> ds.PatientName

    Read and return a dataset not in accordance with the DICOM File Format:

    >>> ds = pydicom.dcmread("rtplan.dcm", force=True)
    >>> ds.PatientName

    Use within a context manager:

    >>> with pydicom.dcmread("rtplan.dcm") as ds:
    ...     ds.PatientName
    """
    # Open file if not already a file object
    caller_owns_file = True
    if isinstance(fp, compat.string_types):
        # caller provided a file name; we own the file handle
        caller_owns_file = False
        # NOTE(review): the fallback presumably covers file names that
        # cannot be formatted into a unicode string -- confirm
        try:
            logger.debug(u"Reading file '{0}'".format(fp))
        except Exception:
            logger.debug("Reading file '{0}'".format(fp))
        fp = open(fp, 'rb')

    # Everything after a successful open() runs under the try/finally so a
    # failure (e.g. an unparsable `defer_size`) cannot leak our file handle.
    try:
        if config.debugging:
            logger.debug("\n" + "-" * 80)
            logger.debug("Call to dcmread()")
            msg = ("filename:'%s', defer_size='%s', "
                   "stop_before_pixels=%s, force=%s, specific_tags=%s")
            # In-memory file-likes such as io.BytesIO have no `name`
            filename = getattr(fp, "name", "<no filename>")
            logger.debug(msg % (filename, defer_size, stop_before_pixels,
                                force, specific_tags))
            if caller_owns_file:
                logger.debug("Caller passed file object")
            else:
                logger.debug("Caller passed file name")
            logger.debug("-" * 80)

        # Convert size to defer reading into bytes
        defer_size = size_in_bytes(defer_size)

        # Iterate through all items and store them --include file meta if
        # present
        stop_when = None
        if stop_before_pixels:
            stop_when = _at_pixel_data
        dataset = read_partial(fp, stop_when, defer_size=defer_size,
                               force=force, specific_tags=specific_tags)
    finally:
        if not caller_owns_file:
            fp.close()
    # XXX need to store transfer syntax etc.
    return dataset
def read_undefined_length_value(fp, is_little_endian, delimiter_tag,
                                defer_size=None, read_size=1024*8):
    """Scan forward to the delimiter tag and return the bytes before it.

    The delimiter itself is not part of the returned value. On success the
    file is left at the first byte after the delimiter and the four bytes
    that follow it.

    Parameters
    ----------
    fp : a file-like object
        Positioned at the start of the value to read.
    is_little_endian : boolean
        True if the delimiter tag is encoded little endian, else False.
    delimiter_tag : BaseTag
        Tag used as the end marker for reading.
    defer_size : int, None, optional
        Passed through ``size_in_bytes()``. Once the number of value bytes
        reaches this size no further bytes are kept and None is returned.
    read_size : int
        Number of bytes requested from `fp` per scan step.

    Returns
    -------
    value : bytes, None
        The bytes read up to the delimiter, or None if the value was at
        least `defer_size` bytes long.

    Raises
    ------
    EOFError
        If EOF is reached before the delimiter is found; the file is first
        rewound to where reading started.
    """
    origin = fp.tell()
    # A delimiter split across two reads has at most 3 bytes in the first
    tail_len = 3

    tag_layout = b"<HH" if is_little_endian else b">HH"
    marker = pack(tag_layout, delimiter_tag.group, delimiter_tag.elem)

    at_eof = False
    pieces = []
    defer_size = size_in_bytes(defer_size)
    seen = 0  # value bytes so far, for defer_size checks

    located = False
    while not located:
        window_start = fp.tell()
        window = fp.read(read_size)
        if len(window) < read_size:
            # Ask once more for the remainder; if the window is still short
            # this is the final one
            window += fp.read(read_size - len(window))
            if len(window) < read_size:
                at_eof = True  # the partial window is still searched

        pos = window.find(marker)
        located = pos != -1
        if located:
            head = window[:pos]
            seen += len(head)
            if defer_size is None or seen < defer_size:
                pieces.append(head)
            fp.seek(window_start + pos + 4)  # rewind to end of delimiter
            after = fp.read(4)
            if after != b"\0\0\0\0":
                msg = ("Expected 4 zero bytes after undefined length delimiter"
                       " at pos {0:04x}")
                logger.error(msg.format(fp.tell() - 4))
        elif at_eof:
            fp.seek(origin)
            raise EOFError("End of file reached before delimiter {0!r} found".
                           format(delimiter_tag))
        else:
            # Back up a little in case the delimiter straddles the boundary,
            # and keep everything except the bytes we backed up over
            fp.seek(fp.tell() - tail_len)
            head = window[:-tail_len]
            seen += len(head)
            if defer_size is None or seen < defer_size:
                pieces.append(head)

    # Only reachable once the delimiter has been found
    if defer_size is not None and seen >= defer_size:
        return None
    return b"".join(pieces)