def read_preamble(fp: BinaryIO, force: bool) -> Optional[bytes]:
    """Return the 128-byte DICOM preamble in `fp` if present.

    `fp` should be positioned at the start of the file-like. 128 bytes of
    preamble followed by 4 bytes of prefix are read. If the prefix is
    ``b"DICM"`` then `fp` is left positioned immediately after the prefix.
    If the prefix is anything else and `force` is ``True`` then `fp` is
    rewound to the start of the file-like.

    Parameters
    ----------
    fp : file-like object
        The file-like to read the preamble from.
    force : bool
        Flag to force reading of a file even if no header is found.

    Returns
    -------
    preamble : bytes or None
        The 128 bytes read before the prefix if the 'DICM' prefix is found.
        ``None`` if the prefix is not found and `force` is ``True``.

    Raises
    ------
    InvalidDicomError
        If the 'DICM' prefix is not found and `force` is ``False``.
    """
    logger.debug("Reading File Meta Information preamble...")
    preamble = fp.read(128)
    if config.debugging:
        # Only the first and last 8 bytes of the preamble are logged
        sample = bytes2hex(preamble[:8]) + "..." + bytes2hex(preamble[-8:])
        logger.debug(f"{fp.tell() - 128:08x}: {sample}")

    logger.debug("Reading File Meta Information prefix...")
    prefix = fp.read(4)

    if prefix == b"DICM":
        logger.debug(f"{fp.tell() - 4:08x}: 'DICM' prefix found")
        return preamble

    if not force:
        raise InvalidDicomError(
            "File is missing DICOM File Meta Information header or the 'DICM' "
            "prefix is missing from the header. Use force=True to force "
            "reading.")

    # No prefix but the caller wants to carry on: rewind so that the
    # dataset is read from the very beginning of the file-like.
    logger.info(
        "File is not conformant with the DICOM File Format: 'DICM' "
        "prefix is missing from the File Meta Information header "
        "or the header itself is missing. Assuming no header and "
        "continuing.")
    fp.seek(0)
    return None
def _read_file_meta_info(fp):
    """Return the file meta information.

    `fp` must be positioned just after the 128 byte preamble and the
    'DICM' marker.

    The first data element is peeked at to obtain the (0002,0000) group
    length, if present. `fp` is then rewound and the whole group is read
    with ``read_dataset`` (explicit VR, little endian), stopping at the
    first element for which ``not_group2`` is true.
    """
    # File meta info always LittleEndian, Explicit VR. After will change these
    # to the transfer syntax values set in the meta info
    start_pos = fp.tell()  # kept so the first element can be re-read
    debugging = config.debugging
    if debugging:
        logger.debug("Try to read group length info...")

    # Get group length data element, whose value is the length of the
    # meta_info
    header = fp.read(8)
    group, elem, VR, length = unpack("<HH2sH", header)
    if debugging:
        debug_msg = "{0:08x}: {1}".format(fp.tell() - 8, bytes2hex(header))
    if not in_py2:
        VR = VR.decode(default_encoding)
    if VR in extra_length_VRs:
        # These VRs carry their real length in a following 4-byte field
        long_length = fp.read(4)
        length = unpack("<L", long_length)[0]
        if debugging:
            debug_msg += " " + bytes2hex(long_length)
    if debugging:
        logger.debug(
            "{0:<47s} ({1:04x}, {2:04x}) {3:2s} Length: {4:d}".format(
                debug_msg, group, elem, VR, length))

    # Store meta group length if it exists, then read until not group 2
    expected_ds_start = None
    if group == 2 and elem == 0:
        raw_value = fp.read(length)
        if debugging:
            logger.debug("{0:08x}: {1}".format(fp.tell() - length,
                                               bytes2hex(raw_value)))
        group_length = unpack("<L", raw_value)[0]
        expected_ds_start = fp.tell() + group_length
        if debugging:
            msg = "value (group length) = {0:d}".format(group_length)
            msg += " regular dataset should start at {0:08x}".format(
                expected_ds_start)
            logger.debug(" " * 10 + msg)
    elif debugging:
        logger.debug(" " * 10 + "(0002,0000) Group length not found.")

    # Changed in pydicom 0.9.7 -- don't trust the group length, just read
    # until no longer group 2 data elements. But check the length and
    # give a warning if group 2 ends at different location.
    # Rewind to read the first data element as part of the file_meta dataset
    if debugging:
        logger.debug("Rewinding and reading whole dataset "
                     "including this first data element")
    fp.seek(start_pos)
    file_meta = read_dataset(fp, is_implicit_VR=False, is_little_endian=True,
                             stop_when=not_group2)

    end_pos = fp.tell()
    if expected_ds_start and end_pos != expected_ds_start:
        logger.info("*** Group length for file meta dataset "
                    "did not match end of group 2 data ***")
    elif debugging:
        logger.debug("--- End of file meta data found "
                     "as expected ---------")
    return file_meta
def read_preamble(fp, force):
    """Return the 128-byte DICOM preamble in `fp` if present.

    `fp` should be positioned at the start of the file-like. 128 bytes of
    preamble followed by 4 bytes of prefix are read. If the prefix is
    ``b"DICM"`` then `fp` is left positioned immediately after the prefix.
    If the prefix is anything else and `force` is True then `fp` is
    rewound to the start of the file-like.

    Parameters
    ----------
    fp : file-like object
        The file-like to read the preamble from.
    force : bool
        Flag to force reading of a file even if no header is found.

    Returns
    -------
    preamble : str/bytes or None
        The 128 bytes read before the prefix if the 'DICM' prefix is found.
        None if the prefix is not found and `force` is True.

    Raises
    ------
    InvalidDicomError
        If the 'DICM' prefix is not found and `force` is False.
    """
    logger.debug("Reading File Meta Information preamble...")
    preamble = fp.read(128)
    if config.debugging:
        head_hex = bytes2hex(preamble[:8])
        tail_hex = bytes2hex(preamble[-8:])
        sample = head_hex + "..." + tail_hex
        logger.debug("{0:08x}: {1}".format(fp.tell() - 128, sample))

    logger.debug("Reading File Meta Information prefix...")
    has_prefix = fp.read(4) == b"DICM"

    if has_prefix:
        logger.debug("{0:08x}: 'DICM' prefix found".format(fp.tell() - 4))
    elif force:
        logger.info(
            "File is not conformant with the DICOM File Format: 'DICM' "
            "prefix is missing from the File Meta Information header "
            "or the header itself is missing. Assuming no header and "
            "continuing.")
        # Discard what was read and start again from the beginning
        preamble = None
        fp.seek(0)
    else:
        raise InvalidDicomError("File is missing DICOM File Meta Information "
                                "header or the 'DICM' prefix is missing from "
                                "the header. Use force=True to force reading.")

    return preamble
def test_bytes_to_hex(self):
    """Test utils.hexutil.bytes2hex"""
    # (input bytes, expected space-separated lowercase hex string)
    cases = (
        (
            b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09'
            b'\x0A\x0B\x0C\x0D\x0E\x0F',
            "00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f",
        ),
        (
            b'\x00\x10\x20\x30\x40\x50\x60\x70\x80\x90'
            b'\xA0\xB0\xC0\xD0\xE0\xF0',
            "00 10 20 30 40 50 60 70 80 90 a0 b0 c0 d0 e0 f0",
        ),
    )
    for bytestring, hexstring in cases:
        assert bytes2hex(bytestring) == hexstring
def test_empty_AT(self):
    """Write empty AT correctly.........."""
    # Was issue 74
    data_elem = DataElement(0x00280009, "AT", [])
    expected = hex2bytes((
        " 28 00 09 00"   # (0028,0009) Frame Increment Pointer
        " 00 00 00 00"   # length 0
    ))
    write_data_element(self.f1, data_elem)
    got = self.f1.parent.getvalue()
    # A single failure message: the original built three messages in a row
    # and only the last, least descriptive one was ever used.
    msg = ("Did not write zero-length AT value correctly. "
           "Expected %r, got %r") % (bytes2hex(expected), bytes2hex(got))
    self.assertEqual(expected, got, msg)
def read_preamble(fp, force):
    """Return the 128-byte DICOM preamble in `fp` if present.

    Reads 128 bytes of preamble and then the 4-byte prefix. When the prefix
    is found `fp` is left positioned after it; when it is not found and
    `force` is True `fp` is rewound to the beginning.

    Parameters
    ----------
    fp : file-like object
        The file-like to read the preamble from.
    force : bool
        Flag to force reading of a file even if no header is found.

    Returns
    -------
    preamble : str/bytes or None
        The 128 bytes read before the prefix if the 'DICM' prefix is found.
        None if the prefix is not found and `force` is True.

    Raises
    ------
    InvalidDicomError
        If the 'DICM' prefix is not found and `force` is False.
    """
    preamble_length = 0x80

    logger.debug("Reading preamble...")
    preamble = fp.read(preamble_length)
    if config.debugging:
        sample = bytes2hex(preamble[:8]) + "..." + bytes2hex(preamble[-8:])
        logger.debug(
            "{0:08x}: {1}".format(fp.tell() - preamble_length, sample))

    prefix = fp.read(4)
    if prefix == b"DICM":
        logger.debug("{0:08x}: 'DICM' prefix found".format(fp.tell() - 4))
        return preamble

    if not force:
        raise InvalidDicomError("File is missing DICOM header or 'DICM' "
                                "prefix is missing from the header. Use "
                                "force=True to force reading.")

    logger.info(
        "File is not a conformant DICOM file; 'DICM' prefix is "
        "missing from the file header or the header is "
        "missing. Assuming no header and continuing.")
    # Rewind so the caller reads the data from the start of the file
    fp.seek(0)
    return None
def read_preamble(fp, force):
    """Read and return the DICOM preamble.

    Reads 128 bytes of preamble and then the 4-byte 'DICM' marker. When the
    marker is found `fp` is left positioned after it; when it is not found
    and `force` is true `fp` is rewound to the beginning.

    Parameters
    ----------
    fp : file-like object
        The file-like to read from.
    force : boolean
        Flag to force reading of a file even if no header is found.

    Returns
    -------
    preamble : DICOM preamble, None
        The 128 bytes read before the marker if the 'DICM' marker is found.
        None if the marker is not found and `force` is true.

    Raises
    ------
    InvalidDicomError
        If the 'DICM' marker is not found and `force` is false.
    """
    logger.debug("Reading preamble...")
    preamble = fp.read(0x80)
    if config.debugging:
        first_bytes = bytes2hex(preamble[:8])
        last_bytes = bytes2hex(preamble[-8:])
        logger.debug("{0:08x}: {1}".format(
            fp.tell() - 0x80, first_bytes + "..." + last_bytes))

    marker_found = fp.read(4) == b"DICM"

    if not marker_found and not force:
        raise InvalidDicomError("File is missing 'DICM' marker. "
                                "Use force=True to force reading")

    if marker_found:
        logger.debug("{0:08x}: 'DICM' marker found".format(fp.tell() - 4))
    else:
        logger.info("File is not a standard DICOM file; 'DICM' header is "
                    "missing. Assuming no header and continuing")
        # Nothing usable was read: report no preamble and start over
        preamble = None
        fp.seek(0)

    return preamble
def read_sequence_item(fp, is_implicit_VR, is_little_endian, encoding,
                       offset=0):
    """Read and return a single :class:`~pydicom.sequence.Sequence` item,
    i.e. a :class:`~pydicom.dataset.Dataset`.

    Parameters
    ----------
    fp : file-like object
        Positioned at the start of the item's 8-byte tag/length header.
    is_implicit_VR : bool
        Passed through to ``read_dataset`` for the item's content.
    is_little_endian : bool
        Selects the byte order used to unpack the header; also passed
        through to ``read_dataset``.
    encoding
        Passed to ``read_dataset`` as ``parent_encoding``.
    offset : int, optional
        Added to ``fp.tell()`` wherever a file position is recorded or
        reported.

    Returns
    -------
    Dataset or None
        ``None`` if the tag read is the sequence delimiter, otherwise the
        dataset returned by ``read_dataset`` with
        ``is_undefined_length_sequence_item`` and ``seq_item_tell`` set.

    Raises
    ------
    IOError
        If the 8-byte header cannot be read or unpacked.
    """
    seq_item_tell = fp.tell() + offset
    if is_little_endian:
        tag_length_format = "<HHL"
    else:
        tag_length_format = ">HHL"
    try:
        bytes_read = fp.read(8)
        group, element, length = unpack(tag_length_format, bytes_read)
    except Exception:
        # Deliberately not BaseException: KeyboardInterrupt, SystemExit and
        # GeneratorExit must propagate rather than be reported as IOError.
        raise IOError("No tag to read at file position "
                      "{0:05x}".format(fp.tell() + offset))

    tag = (group, element)
    if tag == SequenceDelimiterTag:  # No more items, time to stop reading
        logger.debug("{0:08x}: {1}".format(fp.tell() - 8 + offset,
                                           "End of Sequence"))
        if length != 0:
            logger.warning("Expected 0x00000000 after delimiter, found 0x%x, "
                           "at position 0x%x" % (length,
                                                 fp.tell() - 4 + offset))
        return None

    if tag != ItemTag:
        # Only warn; the item content is still read below
        logger.warning("Expected sequence item with tag %s at file position "
                       "0x%x" % (ItemTag, fp.tell() - 4 + offset))
    else:
        logger.debug("{0:08x}: {1} Found Item tag (start of item)".format(
            fp.tell() - 4 + offset, bytes2hex(bytes_read)))

    if length == 0xFFFFFFFF:
        # Undefined length item: no byte length is given to read_dataset
        ds = read_dataset(fp, is_implicit_VR, is_little_endian,
                          bytelength=None, parent_encoding=encoding,
                          at_top_level=False)
        ds.is_undefined_length_sequence_item = True
    else:
        ds = read_dataset(fp, is_implicit_VR, is_little_endian, length,
                          parent_encoding=encoding,
                          at_top_level=False)
        ds.is_undefined_length_sequence_item = False
    logger.debug("%08x: Finished sequence item" % (fp.tell() + offset, ))

    ds.seq_item_tell = seq_item_tell
    return ds
def read_sequence_item(fp, is_implicit_VR, is_little_endian, encoding,
                       offset=0):
    """Read and return a single sequence item, i.e. a Dataset

    Parameters
    ----------
    fp : file-like object
        Positioned at the start of the item's 8-byte tag/length header.
    is_implicit_VR : bool
        Passed through to ``read_dataset`` for the item's content.
    is_little_endian : bool
        Selects the byte order used to unpack the header; also passed
        through to ``read_dataset``.
    encoding
        Passed to ``read_dataset`` as ``parent_encoding``.
    offset : int, optional
        Added to ``fp.tell()`` wherever a file position is recorded or
        reported.

    Returns
    -------
    Dataset or None
        None if the tag read is the sequence delimiter, otherwise the
        dataset returned by ``read_dataset`` with
        ``is_undefined_length_sequence_item`` and ``seq_item_tell`` set.

    Raises
    ------
    IOError
        If the 8-byte header cannot be read or unpacked.
    """
    seq_item_tell = fp.tell() + offset
    if is_little_endian:
        tag_length_format = "<HHL"
    else:
        tag_length_format = ">HHL"
    try:
        bytes_read = fp.read(8)
        group, element, length = unpack(tag_length_format, bytes_read)
    except Exception:
        # Deliberately not BaseException: KeyboardInterrupt, SystemExit and
        # GeneratorExit must propagate rather than be reported as IOError.
        raise IOError("No tag to read at file position "
                      "{0:05x}".format(fp.tell() + offset))

    tag = (group, element)
    if tag == SequenceDelimiterTag:  # No more items, time to stop reading
        logger.debug(
            "{0:08x}: {1}".format(fp.tell() - 8 + offset, "End of Sequence"))
        if length != 0:
            logger.warning("Expected 0x00000000 after delimiter, found 0x%x, "
                           "at position 0x%x" % (
                               length, fp.tell() - 4 + offset))
        return None

    if tag != ItemTag:
        # Only warn; the item content is still read below
        logger.warning("Expected sequence item with tag %s at file position "
                       "0x%x" % (ItemTag, fp.tell() - 4 + offset))
    else:
        logger.debug("{0:08x}: {1} Found Item tag (start of item)".format(
            fp.tell() - 4 + offset, bytes2hex(bytes_read)))

    if length == 0xFFFFFFFF:
        # Undefined length item: no byte length is given to read_dataset
        ds = read_dataset(fp, is_implicit_VR, is_little_endian,
                          bytelength=None, parent_encoding=encoding)
        ds.is_undefined_length_sequence_item = True
    else:
        ds = read_dataset(fp, is_implicit_VR, is_little_endian, length,
                          parent_encoding=encoding)
        ds.is_undefined_length_sequence_item = False
    logger.debug("%08x: Finished sequence item" % (fp.tell() + offset,))

    ds.seq_item_tell = seq_item_tell
    return ds
def data_element_generator(fp,
                           is_implicit_VR,
                           is_little_endian,
                           stop_when=None,
                           defer_size=None,
                           encoding=default_encoding,
                           specific_tags=None):
    """Create a generator to efficiently return the raw data elements.

    Parameters
    ----------
    fp : file-like object
    is_implicit_VR : boolean
    is_little_endian : boolean
    stop_when : None, callable, optional
        If None (default), then the whole file is read.
        A callable which takes tag, VR, length,
        and returns True or False. If it returns True, the file is rewound
        to the start of the current data element and the generator returns.
    defer_size : int, str, None, optional
        See ``dcmread`` for parameter info.
    encoding :
        Encoding scheme
    specific_tags : list or None
        See ``dcmread`` for parameter info.

    Yields
    ------
    RawDataElement or DataElement
        A ``DataElement`` for undefined length elements that are read as a
        sequence, a ``RawDataElement`` built from
        (tag, VR, length, value, value_tell, is_implicit_VR,
        is_little_endian) otherwise. VR is None if implicit VR, length
        may be the DICOM "undefined length" 0xFFFFFFFF and value is the
        raw bytes (or None when reading of the value was deferred).
    """
    # Summary of DICOM standard PS3.5-2008 chapter 7:
    # If Implicit VR, data element is:
    #    tag, 4-byte length, value.
    #        The 4-byte length can be FFFFFFFF (undefined length)*
    #
    # If Explicit VR:
    #    if OB, OW, OF, SQ, UN, or UT:
    #       tag, VR, 2-bytes reserved (both zero), 4-byte length, value
    #           For all but UT, the length can be FFFFFFFF (undefined length)*
    #   else: (any other VR)
    #       tag, VR, (2 byte length), value
    # * for undefined length, a Sequence Delimitation Item marks the end
    #        of the Value Field.
    # Note, except for the special_VRs, both impl and expl VR use 8 bytes;
    #    the special VRs follow the 8 bytes with a 4-byte length

    # With a generator, state is stored, so we can break down
    #    into the individual cases, and not have to check them again for each
    #    data element
    endian_chr = "<" if is_little_endian else ">"

    if is_implicit_VR:
        element_struct = Struct(endian_chr + "HHL")
    else:  # Explicit VR
        # tag, VR, 2-byte length (or 0 if special VRs)
        element_struct = Struct(endian_chr + "HH2sH")
        # for special VRs; bound method kept for lookup speed
        extra_length_unpack = Struct(endian_chr + "L").unpack

    # Make local variables so have faster lookup
    fp_read = fp.read
    fp_tell = fp.tell
    logger_debug = logger.debug
    debugging = config.debugging
    element_struct_unpack = element_struct.unpack
    defer_size = size_in_bytes(defer_size)

    # Work out which tags (if any) the caller restricted the read to.
    # Specific Character Set is always added to the set so that it is read,
    # but is only yielded if the caller asked for it.
    tag_set = set()
    has_specific_char_set = True
    if specific_tags is not None:
        for wanted in specific_tags:
            if isinstance(wanted, (str, compat.text_type)):
                wanted = Tag(tag_for_keyword(wanted))
            if isinstance(wanted, BaseTag):
                tag_set.add(wanted)
        has_specific_char_set = Tag(0x08, 0x05) in tag_set
        tag_set.add(Tag(0x08, 0x05))
    has_tag_set = len(tag_set) > 0

    while True:
        # Read tag, VR, length, get ready to read value
        bytes_read = fp_read(8)
        if len(bytes_read) < 8:
            return  # at end of file
        if debugging:
            debug_msg = "{0:08x}: {1}".format(fp.tell() - 8,
                                              bytes2hex(bytes_read))

        if is_implicit_VR:
            # must reset VR each time; could have set last iteration (e.g. SQ)
            VR = None
            group, elem, length = element_struct_unpack(bytes_read)
        else:  # explicit VR
            group, elem, VR, length = element_struct_unpack(bytes_read)
            if not in_py2:
                VR = VR.decode(default_encoding)
            if VR in extra_length_VRs:
                bytes_read = fp_read(4)
                length = extra_length_unpack(bytes_read)[0]
                if debugging:
                    debug_msg += " " + bytes2hex(bytes_read)

        if debugging:
            debug_msg = "%-47s (%04x, %04x)" % (debug_msg, group, elem)
            if not is_implicit_VR:
                debug_msg += " %s " % VR
            if length != 0xFFFFFFFF:
                debug_msg += "Length: %d" % length
            else:
                debug_msg += "Length: Undefined length (FFFFFFFF)"
            logger_debug(debug_msg)

        # Positioned to read the value, but may not want to -- check stop_when
        value_tell = fp_tell()
        tag = TupleTag((group, elem))
        # XXX VR may be None here!! Should stop_when just take tag?
        if stop_when is not None and stop_when(tag, VR, length):
            if debugging:
                logger_debug("Reading ended by stop_when callback. "
                             "Rewinding to start of data element.")
            rewind_length = 8
            if not is_implicit_VR and VR in extra_length_VRs:
                rewind_length += 4
            fp.seek(value_tell - rewind_length)
            return

        # Reading the value
        # First case (most common): reading a value with a defined length
        if length != 0xFFFFFFFF:
            if has_tag_set and tag not in tag_set:
                # skip the tag if not in specific tags
                fp.seek(fp_tell() + length)
                continue

            # don't defer loading of Specific Character Set value as it is
            # needed immediately to get the character encoding for other tags
            if (defer_size is not None and length > defer_size and
                    tag != BaseTag(0x00080005)):
                # Flag as deferred by setting value to None, and skip bytes
                value = None
                logger_debug("Defer size exceeded. "
                             "Skipping forward to next data element.")
                fp.seek(fp_tell() + length)
            else:
                value = fp_read(length)
                if debugging:
                    dotdot = "..." if length > 12 else " "
                    logger_debug("%08x: %-34s %s %r %s" % (
                        value_tell, bytes2hex(value[:12]), dotdot,
                        value[:12], dotdot))

            # If the tag is (0008,0005) Specific Character Set, then store it
            if tag == BaseTag(0x00080005):
                from pydicom.values import convert_string
                encoding = convert_string(value, is_little_endian,
                                          encoding=default_encoding)
                # Store the encoding value in the generator
                # for use with future elements (SQs)
                encoding = convert_encodings(encoding)
                if not has_specific_char_set:
                    continue

            yield RawDataElement(tag, VR, length, value, value_tell,
                                 is_implicit_VR, is_little_endian)

        # Second case: undefined length - must seek to delimiter,
        # unless is SQ type, in which case is easier to parse it, because
        # undefined length SQs and items of undefined lengths can be nested
        # and it would be error-prone to read to the correct outer delimiter
        else:
            # Try to look up type to see if is a SQ
            # if private tag, won't be able to look it up in dictionary,
            #   in which case just ignore it and read the bytes unless it is
            #   identified as a Sequence
            if VR is None:
                try:
                    VR = dictionary_VR(tag)
                except KeyError:
                    # Look ahead to see if it consists of items
                    # and is thus a SQ
                    next_tag = TupleTag(unpack(endian_chr + "HH", fp_read(4)))
                    # Rewind the file
                    fp.seek(fp_tell() - 4)
                    if next_tag == ItemTag:
                        VR = 'SQ'

            if VR == 'SQ':
                if debugging:
                    msg = "{0:08x}: Reading/parsing undefined length sequence"
                    logger_debug(msg.format(fp_tell()))
                # The sequence is always parsed, even if it is then skipped
                seq = read_sequence(fp, is_implicit_VR,
                                    is_little_endian, length, encoding)
                if has_tag_set and tag not in tag_set:
                    continue
                yield DataElement(tag, VR, seq, value_tell,
                                  is_undefined_length=True)
            else:
                delimiter = SequenceDelimiterTag
                if debugging:
                    logger_debug("Reading undefined length data element")
                value = read_undefined_length_value(fp, is_little_endian,
                                                    delimiter, defer_size)

                # If the tag is (0008,0005) Specific Character Set,
                # then store it
                if tag == (0x08, 0x05):
                    from pydicom.values import convert_string
                    encoding = convert_string(value, is_little_endian,
                                              encoding=default_encoding)
                    # Store the encoding value in the generator for use
                    # with future elements (SQs)
                    encoding = convert_encodings(encoding)
                    if not has_specific_char_set:
                        continue

                # tags with undefined length are skipped after read
                if has_tag_set and tag not in tag_set:
                    continue

                yield RawDataElement(tag, VR, length, value, value_tell,
                                     is_implicit_VR, is_little_endian)
def data_element_generator(fp, is_implicit_VR, is_little_endian,
                           stop_when=None, defer_size=None,
                           encoding=default_encoding):
    """Create a generator to efficiently return the raw data elements.

    Parameters
    ----------
    fp : file-like object
    is_implicit_VR : boolean
    is_little_endian : boolean
    stop_when : None, callable, optional
        If None (default), then the whole file is read.
        A callable which takes tag, VR, length,
        and returns True or False. If it returns True, the file is rewound
        to the start of the current data element and the generator returns.
    defer_size : int, str, None, optional
        See ``read_file`` for parameter info.
        NOTE(review): the value is compared directly against the element
        length here, so a non-numeric value is presumably converted by the
        caller -- confirm.
    encoding :
        Encoding scheme

    Yields
    ------
    RawDataElement or DataElement
        A ``DataElement`` for undefined length elements that are read as a
        sequence, a ``RawDataElement`` built from
        (tag, VR, length, value, value_tell, is_implicit_VR,
        is_little_endian) otherwise. VR is None if implicit VR, length
        may be the DICOM "undefined length" 0xFFFFFFFF and value is the
        raw bytes (or None when reading of the value was deferred).
    """
    # Summary of DICOM standard PS3.5-2008 chapter 7:
    # If Implicit VR, data element is:
    #    tag, 4-byte length, value.
    #        The 4-byte length can be FFFFFFFF (undefined length)*
    # If Explicit VR:
    #    if OB, OW, OF, SQ, UN, or UT:
    #       tag, VR, 2-bytes reserved (both zero), 4-byte length, value
    #           For all but UT, the length can be FFFFFFFF (undefined length)*
    #   else: (any other VR)
    #       tag, VR, (2 byte length), value
    # * for undefined length, a Sequence Delimitation Item marks the end
    #        of the Value Field.
    # Note, except for the special_VRs, both impl and expl VR use 8 bytes;
    #    the special VRs follow the 8 bytes with a 4-byte length

    # With a generator, state is stored, so we can break down
    #    into the individual cases, and not have to check them again for each
    #    data element

    if is_little_endian:
        endian_chr = "<"
    else:
        endian_chr = ">"
    if is_implicit_VR:
        element_struct = Struct(endian_chr + "HHL")
    else:  # Explicit VR
        # tag, VR, 2-byte length (or 0 if special VRs)
        element_struct = Struct(endian_chr + "HH2sH")
        extra_length_struct = Struct(endian_chr + "L")  # for special VRs
        extra_length_unpack = extra_length_struct.unpack  # for lookup speed

    # Make local variables so have faster lookup
    fp_read = fp.read
    fp_tell = fp.tell
    logger_debug = logger.debug
    debugging = config.debugging
    element_struct_unpack = element_struct.unpack

    while True:
        # Read tag, VR, length, get ready to read value
        bytes_read = fp_read(8)
        if len(bytes_read) < 8:
            # at end of file. A plain return ends the generator; raising
            # StopIteration here is turned into RuntimeError by PEP 479.
            return
        if debugging:
            debug_msg = "{0:08x}: {1}".format(fp.tell() - 8,
                                              bytes2hex(bytes_read))

        if is_implicit_VR:
            # must reset VR each time; could have set last iteration (e.g. SQ)
            VR = None
            group, elem, length = element_struct_unpack(bytes_read)
        else:  # explicit VR
            group, elem, VR, length = element_struct_unpack(bytes_read)
            if not in_py2:
                VR = VR.decode(default_encoding)
            if VR in extra_length_VRs:
                bytes_read = fp_read(4)
                length = extra_length_unpack(bytes_read)[0]
                if debugging:
                    debug_msg += " " + bytes2hex(bytes_read)
        if debugging:
            debug_msg = "%-47s (%04x, %04x)" % (debug_msg, group, elem)
            if not is_implicit_VR:
                debug_msg += " %s " % VR
            if length != 0xFFFFFFFF:
                debug_msg += "Length: %d" % length
            else:
                debug_msg += "Length: Undefined length (FFFFFFFF)"
            logger_debug(debug_msg)

        # Positioned to read the value, but may not want to -- check stop_when
        value_tell = fp_tell()
        tag = TupleTag((group, elem))
        if stop_when is not None:
            # XXX VR may be None here!! Should stop_when just take tag?
            if stop_when(tag, VR, length):
                if debugging:
                    logger_debug("Reading ended by stop_when callback. "
                                 "Rewinding to start of data element.")
                rewind_length = 8
                if not is_implicit_VR and VR in extra_length_VRs:
                    rewind_length += 4
                fp.seek(value_tell - rewind_length)
                return

        # Reading the value
        # First case (most common): reading a value with a defined length
        if length != 0xFFFFFFFF:
            # Never defer Specific Character Set: its value is decoded just
            # below, which cannot work on the None used to flag deferral.
            if (defer_size is not None and length > defer_size and
                    tag != (0x08, 0x05)):
                # Flag as deferred by setting value to None, and skip bytes
                value = None
                logger_debug("Defer size exceeded. "
                             "Skipping forward to next data element.")
                fp.seek(fp_tell() + length)
            else:
                value = fp_read(length)
                if debugging:
                    dotdot = " "
                    if length > 12:
                        dotdot = "..."
                    logger_debug("%08x: %-34s %s %r %s" % (
                        value_tell, bytes2hex(value[:12]), dotdot,
                        value[:12], dotdot))

            # If the tag is (0008,0005) Specific Character Set, then store it
            if tag == (0x08, 0x05):
                from pydicom.values import convert_string
                encoding = convert_string(value, is_little_endian,
                                          encoding=default_encoding)
                # Store the encoding value in the generator for use with
                # future elements (SQs)
                encoding = convert_encodings(encoding)

            yield RawDataElement(tag, VR, length, value, value_tell,
                                 is_implicit_VR, is_little_endian)

        # Second case: undefined length - must seek to delimiter,
        # unless is SQ type, in which case is easier to parse it, because
        # undefined length SQs and items of undefined lengths can be nested
        # and it would be error-prone to read to the correct outer delimiter
        else:
            # Try to look up type to see if is a SQ
            # if private tag, won't be able to look it up in dictionary,
            #   in which case just ignore it and read the bytes unless it is
            #   identified as a Sequence
            if VR is None:
                try:
                    VR = dictionaryVR(tag)
                except KeyError:
                    # Look ahead to see if it consists of items and is thus
                    # a SQ
                    next_tag = TupleTag(unpack(endian_chr + "HH", fp_read(4)))
                    # Rewind the file
                    fp.seek(fp_tell() - 4)
                    if next_tag == ItemTag:
                        VR = 'SQ'

            if VR == 'SQ':
                if debugging:
                    msg = "{0:08x}: Reading/parsing undefined length sequence"
                    logger_debug(msg.format(fp_tell()))
                seq = read_sequence(fp, is_implicit_VR,
                                    is_little_endian, length, encoding)
                yield DataElement(tag, VR, seq, value_tell,
                                  is_undefined_length=True)
            else:
                delimiter = SequenceDelimiterTag
                if debugging:
                    logger_debug("Reading undefined length data element")
                value = read_undefined_length_value(fp, is_little_endian,
                                                    delimiter, defer_size)

                # If the tag is (0008,0005) Specific Character Set, then
                # store it
                if tag == (0x08, 0x05):
                    from pydicom.values import convert_string
                    encoding = convert_string(value, is_little_endian,
                                              encoding=default_encoding)
                    # Store the encoding value in the generator for use with
                    # future elements (SQs)
                    encoding = convert_encodings(encoding)

                yield RawDataElement(tag, VR, length, value, value_tell,
                                     is_implicit_VR, is_little_endian)
def data_element_generator(
    fp: BinaryIO,
    is_implicit_VR: bool,
    is_little_endian: bool,
    stop_when: Optional[Callable[[BaseTag, Optional[str], int], bool]] = None,
    defer_size: Optional[Union[int, str, float]] = None,
    encoding: Union[str, MutableSequence[str]] = default_encoding,
    specific_tags: Optional[List[BaseTag]] = None
) -> Iterator[Union[RawDataElement, DataElement]]:
    """Create a generator to efficiently return the raw data elements.

    .. note::

        This function is used internally - usually there is no need to call it
        from user code. To read data from a DICOM file, :func:`dcmread`
        shall be used instead.

    Parameters
    ----------
    fp : file-like
        The file-like to read from.
    is_implicit_VR : bool
        ``True`` if the data is encoded as implicit VR, ``False`` otherwise.
    is_little_endian : bool
        ``True`` if the data is encoded as little endian, ``False`` otherwise.
    stop_when : None, callable, optional
        If ``None`` (default), then the whole file is read. A callable which
        takes tag, VR, length, and returns ``True`` or ``False``. If it
        returns ``True``, the file is rewound to the start of the current
        data element and the generator returns.
    defer_size : int, str or float, optional
        See :func:`dcmread` for parameter info.
    encoding : Union[str, MutableSequence[str]]
        Encoding scheme
    specific_tags : list or None
        See :func:`dcmread` for parameter info.

    Yields
    -------
    RawDataElement or DataElement
        Yields DataElement for undefined length UN or SQ, RawDataElement
        otherwise.
    """
    # Summary of DICOM standard PS3.5-2008 chapter 7:
    # If Implicit VR, data element is:
    #    tag, 4-byte length, value.
    #        The 4-byte length can be FFFFFFFF (undefined length)*
    #
    # If Explicit VR:
    #    if OB, OW, OF, SQ, UN, or UT:
    #       tag, VR, 2-bytes reserved (both zero), 4-byte length, value
    #           For all but UT, the length can be FFFFFFFF (undefined length)*
    #   else: (any other VR)
    #       tag, VR, (2 byte length), value
    # * for undefined length, a Sequence Delimitation Item marks the end
    #        of the Value Field.
    # Note, except for the special_VRs, both impl and expl VR use 8 bytes;
    #    the special VRs follow the 8 bytes with a 4-byte length

    # With a generator, state is stored, so we can break down
    #    into the individual cases, and not have to check them again for each
    #    data element
    from pydicom.values import convert_string

    endian_chr = "<" if is_little_endian else ">"

    # assign implicit VR struct to variable as use later if VR assumed missing
    implicit_VR_struct = Struct(endian_chr + "HHL")
    if is_implicit_VR:
        element_struct = implicit_VR_struct
    else:  # Explicit VR
        # tag, VR, 2-byte length (or 0 if special VRs)
        element_struct = Struct(endian_chr + "HH2sH")
        # for special VRs; bound method kept for lookup speed
        extra_length_unpack = Struct(endian_chr + "L").unpack

    # Make local variables so have faster lookup
    fp_read = fp.read
    fp_tell = fp.tell
    logger_debug = logger.debug
    debugging = config.debugging
    element_struct_unpack = element_struct.unpack
    defer_size = size_in_bytes(defer_size)

    if specific_tags:
        tag_set = {Tag(tag) for tag in specific_tags}
    else:
        tag_set = set()
    has_tag_set = bool(tag_set)
    if has_tag_set:
        tag_set.add(Tag(0x00080005))  # Specific Character Set

    while True:
        # VR: Optional[str]

        # Read tag, VR, length, get ready to read value
        bytes_read = fp_read(8)
        if len(bytes_read) < 8:
            return  # at end of file

        if debugging:
            debug_msg = f"{fp.tell() - 8:08x}: {bytes2hex(bytes_read)}"

        if is_implicit_VR:
            # must reset VR each time; could have set last iteration (e.g. SQ)
            VR = None
            group, elem, length = element_struct_unpack(bytes_read)
        else:  # explicit VR
            group, elem, VR, length = element_struct_unpack(bytes_read)
            # defend against switching to implicit VR, some writer do in SQ's
            # issue 1067, issue 1035
            if not (b'AA' <= VR <= b'ZZ') and config.assume_implicit_vr_switch:
                # invalid VR, must be 2 cap chrs, assume implicit and continue
                VR = None
                group, elem, length = implicit_VR_struct.unpack(bytes_read)
            else:
                VR = VR.decode(default_encoding)
                if VR in extra_length_VRs:
                    bytes_read = fp_read(4)
                    length = extra_length_unpack(bytes_read)[0]
                    if debugging:
                        debug_msg += " " + bytes2hex(bytes_read)

        if debugging:
            debug_msg = "%-47s (%04x, %04x)" % (debug_msg, group, elem)
            if not is_implicit_VR:
                debug_msg += " %s " % VR
            if length != 0xFFFFFFFF:
                debug_msg += "Length: %d" % length
            else:
                debug_msg += "Length: Undefined length (FFFFFFFF)"
            logger_debug(debug_msg)

        # Positioned to read the value, but may not want to -- check stop_when
        value_tell = fp_tell()
        tag = TupleTag((group, elem))
        # XXX VR may be None here!! Should stop_when just take tag?
        if stop_when is not None and stop_when(tag, VR, length):
            if debugging:
                logger_debug("Reading ended by stop_when callback. "
                             "Rewinding to start of data element.")
            rewind_length = 8
            if not is_implicit_VR and VR in extra_length_VRs:
                rewind_length += 4
            fp.seek(value_tell - rewind_length)
            return

        # Reading the value
        # First case (most common): reading a value with a defined length
        if length != 0xFFFFFFFF:
            if has_tag_set and tag not in tag_set:
                # skip the tag if not in specific tags
                fp.seek(fp_tell() + length)
                continue

            # don't defer loading of Specific Character Set value as it is
            # needed immediately to get the character encoding for other tags
            if (defer_size is not None and length > defer_size and
                    tag != BaseTag(0x00080005)):
                # Flag as deferred by setting value to None, and skip bytes
                value = None
                logger_debug("Defer size exceeded. "
                             "Skipping forward to next data element.")
                fp.seek(fp_tell() + length)
            else:
                if length > 0:
                    value = fp_read(length)
                else:
                    # Zero-length element: use the VR's empty value
                    value = cast(
                        Optional[bytes], empty_value_for_VR(VR, raw=True)
                    )
                if debugging:
                    dotdot = "..." if length > 20 else " "
                    displayed_value = value[:20] if value else b''
                    logger_debug("%08x: %-34s %s %r %s" %
                                 (value_tell, bytes2hex(displayed_value),
                                  dotdot, displayed_value, dotdot))

            # If the tag is (0008,0005) Specific Character Set, then store it
            if tag == BaseTag(0x00080005):
                # *Specific Character String* is b'' for empty value
                encoding = convert_string(
                    cast(bytes, value) or b'', is_little_endian
                )
                # Store the encoding value in the generator
                # for use with future elements (SQs)
                encoding = convert_encodings(encoding)

            yield RawDataElement(tag, VR, length, value, value_tell,
                                 is_implicit_VR, is_little_endian)

        # Second case: undefined length - must seek to delimiter,
        # unless is SQ type, in which case is easier to parse it, because
        # undefined length SQs and items of undefined lengths can be nested
        # and it would be error-prone to read to the correct outer delimiter
        else:
            # VR UN with undefined length shall be handled as SQ
            # see PS 3.5, section 6.2.2
            if VR == 'UN':
                VR = 'SQ'
            # Try to look up type to see if is a SQ
            # if private tag, won't be able to look it up in dictionary,
            #   in which case just ignore it and read the bytes unless it is
            #   identified as a Sequence
            # NOTE: 'UN' has just been replaced by 'SQ' above, so in this
            # function only the ``VR is None`` part can be true here.
            if VR is None or (VR == 'UN' and config.replace_un_with_known_vr):
                try:
                    VR = dictionary_VR(tag)
                except KeyError:
                    # Look ahead to see if it consists of items
                    # and is thus a SQ
                    next_tag = _unpack_tag(fp_read(4), endian_chr)
                    # Rewind the file
                    fp.seek(fp_tell() - 4)
                    if next_tag == ItemTag:
                        VR = 'SQ'

            if VR == 'SQ':
                if debugging:
                    logger_debug(
                        f"{fp_tell():08X}: Reading/parsing undefined length "
                        "sequence"
                    )

                # The sequence is always parsed, even if it is then skipped
                seq = read_sequence(fp, is_implicit_VR,
                                    is_little_endian, length, encoding)
                if has_tag_set and tag not in tag_set:
                    continue

                yield DataElement(tag, VR, seq, value_tell,
                                  is_undefined_length=True)
            else:
                delimiter = SequenceDelimiterTag
                if debugging:
                    logger_debug("Reading undefined length data element")
                value = read_undefined_length_value(
                    fp, is_little_endian, delimiter, defer_size
                )

                # tags with undefined length are skipped after read
                if has_tag_set and tag not in tag_set:
                    continue

                yield RawDataElement(tag, VR, length, value, value_tell,
                                     is_implicit_VR, is_little_endian)
def _read_file_meta_info(fp):
    """Read and return the File Meta Information (group 2) dataset.

    `fp` must be positioned just after the 128 byte preamble and the
    'DICM' marker.

    The first data element is inspected to see whether it is the
    (0002,0000) group length element. The stored group length is not
    trusted: `fp` is rewound to where it started and the dataset is read
    with ``stop_when=not_group2``. The group length, when present, is
    only used to log a message if the position reached differs from the
    one it predicts.
    """
    # File meta info is always Little Endian, Explicit VR; the transfer
    # syntax stored in the meta info applies only afterwards.
    start_offset = fp.tell()  # needed to rewind once the peek is done
    debugging = config.debugging

    if debugging:
        logger.debug("Try to read group length info...")

    # Header of the first element: group, element, 2-char VR, 2-byte length
    header = fp.read(8)
    group, elem, VR, length = unpack("<HH2sH", header)
    if debugging:
        debug_msg = f"{fp.tell() - 8:08x}: {bytes2hex(header)}"
    if not in_py2:
        VR = VR.decode(default_encoding)

    # For VRs listed in `extra_length_VRs` the length is re-read from the
    # next 4 bytes as an unsigned 32-bit value.
    if VR in extra_length_VRs:
        long_length = fp.read(4)
        length = unpack("<L", long_length)[0]
        if debugging:
            debug_msg += " " + bytes2hex(long_length)

    if debugging:
        logger.debug(
            f"{debug_msg:<47s} ({group:04x}, {elem:04x}) {VR:2s} "
            f"Length: {length:d}"
        )

    # Work out where the regular dataset should start, if the group length
    # element is present.
    expected_ds_start = None
    if group == 2 and elem == 0:
        raw_value = fp.read(length)
        if debugging:
            logger.debug(
                f"{fp.tell() - length:08x}: {bytes2hex(raw_value)}"
            )
        group_length = unpack("<L", raw_value)[0]
        expected_ds_start = fp.tell() + group_length
        if debugging:
            msg = (
                f"value (group length) = {group_length:d}"
                f" regular dataset should start at {expected_ds_start:08x}"
            )
            logger.debug(" " * 10 + msg)
    elif debugging:
        logger.debug(" " * 10 + "(0002,0000) Group length not found.")

    # Changed in pydicom 0.9.7 -- the group length is not trusted; elements
    # are read until they are no longer group 2, and a message is logged if
    # group 2 ends somewhere other than where the group length says.

    # Rewind so the first data element becomes part of the file_meta dataset
    if debugging:
        logger.debug("Rewinding and reading whole dataset "
                     "including this first data element")
    fp.seek(start_offset)
    file_meta = read_dataset(
        fp,
        is_implicit_VR=False,
        is_little_endian=True,
        stop_when=not_group2,
    )

    end_offset = fp.tell()
    if expected_ds_start and end_offset != expected_ds_start:
        logger.info("*** Group length for file meta dataset "
                    "did not match end of group 2 data ***")
    elif debugging:
        logger.debug("--- End of file meta data found "
                     "as expected ---------")
    return file_meta