Beispiel #1
0
    def test_size_in_bytes(self):
        """Exercise the size_in_bytes() convenience function."""
        # Inputs that are None or already numeric come back untouched
        self.assertIsNone(size_in_bytes(None))
        self.assertEqual(1234, size_in_bytes(1234))

        # Strings are parsed; the cases cover mixed-case units and
        # zero, one or two spaces between the number and the unit
        parsed_cases = (
            (1234, '1234'),
            (4096, '4 kb'),
            (0x4000, '16 KB'),
            (0x300000, '3  MB'),
            (0x80000000, '2gB'),
        )
        for expected, text in parsed_cases:
            self.assertEqual(expected, size_in_bytes(text))

        # An unsupported unit or a unit placed before the number must raise
        for bad_text in ('2 TB', 'KB 2'):
            with self.assertRaises(ValueError):
                size_in_bytes(bad_text)
Beispiel #2
0
    def test_size_in_bytes(self):
        """Verify pass-through, parsing and rejection by size_in_bytes()."""
        # Pass-through: None and plain numbers are returned as given
        for unchanged in (1234,):
            self.assertEqual(unchanged, size_in_bytes(unchanged))
        self.assertIsNone(size_in_bytes(None))

        # Parsing: each string maps to the expected number of bytes
        expected_by_text = [
            ('1234', 1234),
            ('4 kb', 4096),
            ('16 KB', 0x4000),
            ('3  MB', 0x300000),
            ('2gB', 0x80000000),
        ]
        for text, expected in expected_by_text:
            self.assertEqual(expected, size_in_bytes(text))

        # Rejection: these strings are expected to raise ValueError
        with self.assertRaises(ValueError):
            size_in_bytes('2 TB')
        with self.assertRaises(ValueError):
            size_in_bytes('KB 2')
Beispiel #3
0
def read_undefined_length_value(fp,
                                is_little_endian,
                                delimiter_tag,
                                defer_size=None,
                                read_size=1024 * 8):
    """Read until the delimiter tag is found and return the value.

    The delimiter itself is not part of the returned value. On completion,
    the file is positioned at the first byte after the delimiter and its
    following four (expected zero) bytes.

    Parameters
    ----------
    fp : a file-like object
    is_little_endian : boolean
        True if file transfer syntax is little endian, else False.
    delimiter_tag : BaseTag
        Tag used as end marker for reading.
    defer_size : int, str, None, optional
        Size to avoid loading large elements in memory; it is converted
        with ``size_in_bytes``.
        See ``filereader.dcmread`` for more parameter info.
    read_size : int
        Number of bytes to read at one time. Must be greater than 3,
        because 3 bytes are re-read after every chunk to catch a
        delimiter that straddles a chunk boundary.

    Returns
    -------
    bytes or None
        The bytes found before the delimiter, or None if `defer_size` is
        set and at least that many bytes precede the delimiter.

    Raises
    ------
    ValueError
        If `read_size` is not greater than 3.
    EOFError
        If EOF is reached before delimiter found. The file position is
        reset to where reading started before the error is raised.
    """
    search_rewind = 3
    # With read_size <= search_rewind each iteration would rewind at least
    # as far as it read and the search would never advance.
    if read_size <= search_rewind:
        raise ValueError(
            "read_size must be greater than {0} bytes".format(search_rewind))

    data_start = fp.tell()

    if is_little_endian:
        bytes_format = b"<HH"
    else:
        bytes_format = b">HH"
    bytes_to_find = pack(bytes_format, delimiter_tag.group, delimiter_tag.elem)

    found = False
    eof = False
    value_chunks = []
    defer_size = size_in_bytes(defer_size)
    byte_count = 0  # for defer_size checks
    while not found:
        chunk_start = fp.tell()
        bytes_read = fp.read(read_size)
        if len(bytes_read) < read_size:
            # try again - if still don't get required amount,
            # this is the last block
            new_bytes = fp.read(read_size - len(bytes_read))
            bytes_read += new_bytes
            if len(bytes_read) < read_size:
                eof = True  # but will still check whatever we did get
        index = bytes_read.find(bytes_to_find)
        if index != -1:
            found = True
            new_bytes = bytes_read[:index]
            byte_count += len(new_bytes)
            if defer_size is None or byte_count < defer_size:
                value_chunks.append(new_bytes)
            fp.seek(chunk_start + index + 4)  # rewind to end of delimiter
            length = fp.read(4)
            if length != b"\0\0\0\0":
                # Only logged, not raised: reading continues regardless
                msg = ("Expected 4 zero bytes after undefined length delimiter"
                       " at pos {0:04x}")
                logger.error(msg.format(fp.tell() - 4))
        elif eof:
            fp.seek(data_start)
            raise EOFError(
                "End of file reached before delimiter {0!r} found".format(
                    delimiter_tag))
        else:
            # rewind a bit in case delimiter crossed read_size boundary
            fp.seek(fp.tell() - search_rewind)
            # accumulate the bytes read (not including the rewind)
            new_bytes = bytes_read[:-search_rewind]
            byte_count += len(new_bytes)
            if defer_size is None or byte_count < defer_size:
                value_chunks.append(new_bytes)
    # if get here then have found the byte string
    if defer_size is not None and byte_count >= defer_size:
        return None
    else:
        return b"".join(value_chunks)
Beispiel #4
0
def data_element_generator(fp,
                           is_implicit_VR,
                           is_little_endian,
                           stop_when=None,
                           defer_size=None):
    """Create a generator yielding the raw data elements read from `fp`.

    Parameters
    ----------
    fp : file-like object
        Must provide ``read()``, ``tell()`` and ``seek()``.
    is_implicit_VR : boolean
        True if the data is encoded as implicit VR, else False.
    is_little_endian : boolean
        True if the data is encoded as little endian, else False.
    stop_when : None, callable, optional
        If None (default), elements are read until fewer than 8 bytes
        remain. Otherwise a callable taking ``(group, elem)``; when it
        returns True the file is rewound to the start of the current data
        element and the generator finishes.
    defer_size : int, str, None, optional
        Converted with ``size_in_bytes``. Values with a defined length
        larger than this are skipped and yielded as None.

    Yields
    ------
    tuple
        ``((group, elem), VR, length, value, value_tell)``. `value` is None
        for deferred values and for undefined length sequences.

    Raises
    ------
    NotImplementedError
        If an undefined length element that is not identified as a
        sequence is encountered.
    """
    endian_chr = "<" if is_little_endian else ">"
    if is_implicit_VR:
        element_struct = Struct(endian_chr + "HHL")
    else:  # Explicit VR
        # tag, VR, 2-byte length (or 0 if special VRs)
        element_struct = Struct(endian_chr + "HH2sH")
        # 4-byte length that follows the 8-byte header for special VRs
        extra_length_unpack = Struct(endian_chr + "L").unpack

    # Make local variables so have faster lookup
    fp_read = fp.read
    fp_tell = fp.tell
    element_struct_unpack = element_struct.unpack
    defer_size = size_in_bytes(defer_size)

    while True:
        # Read tag, VR, length, get ready to read value
        bytes_read = fp_read(8)
        if len(bytes_read) < 8:
            # At end of file. A plain return ends the generator; raising
            # StopIteration here would surface as RuntimeError (PEP 479).
            return

        if is_implicit_VR:
            # must reset VR each time; could have set last iteration (e.g. SQ)
            VR = None
            group, elem, length = element_struct_unpack(bytes_read)
        else:  # explicit VR
            group, elem, VR, length = element_struct_unpack(bytes_read)
            if VR in extra_length_VRs_b:
                bytes_read = fp_read(4)
                length = extra_length_unpack(bytes_read)[0]

        # Positioned to read the value, but may not want to -- check stop_when
        value_tell = fp_tell()
        if stop_when is not None and stop_when(group, elem):
            # Rewind over the header just consumed (8 bytes, plus the
            # extra 4-byte length for the special explicit VRs)
            rewind_length = 8
            if not is_implicit_VR and VR in extra_length_VRs_b:
                rewind_length += 4
            fp.seek(value_tell - rewind_length)
            return

        # Reading the value
        # First case (most common): reading a value with a defined length
        if length != 0xFFFFFFFF:
            if defer_size is not None and length > defer_size:
                # Flag as deferred by setting value to None, and skip bytes
                value = None
                fp.seek(fp_tell() + length)
            else:
                value = fp_read(length)
            yield ((group, elem), VR, length, value, value_tell)

        # Second case: undefined length - must seek to delimiter,
        # unless is SQ type, in which case is easier to parse it, because
        # undefined length SQs and items of undefined lengths can be nested
        # and it would be error-prone to read to the correct outer delimiter
        else:
            # Try to look up type to see if is a SQ
            # if private tag, won't be able to look it up in dictionary,
            #   in which case just ignore it and read the bytes unless it is
            #   identified as a Sequence
            if VR is None:
                try:
                    # NOTE(review): the result is compared against b'SQ'
                    # below -- confirm dictionary_VR returns bytes here.
                    VR = dictionary_VR(TupleTag((group, elem)))
                except KeyError:
                    # Look ahead to see if it consists of items and
                    # is thus a SQ
                    next_tag = TupleTag(unpack(endian_chr + "HH", fp_read(4)))
                    # Rewind the file
                    fp.seek(fp_tell() - 4)
                    if next_tag == ItemTag:
                        VR = b'SQ'

            if VR == b'SQ':
                # The sequence itself is not parsed here; only its header
                # is reported and reading continues with what follows.
                yield ((group, elem), VR, length, None, value_tell)
            else:
                raise NotImplementedError("This reader does not handle "
                                          "undefined length except for SQ")
Beispiel #5
0
def dcmread(
    fp: Union[PathType, BinaryIO],
    defer_size: Optional[Union[str, int, float]] = None,
    stop_before_pixels: bool = False,
    force: bool = False,
    specific_tags: Optional[TagListType] = None
) -> Union[FileDataset, DicomDir]:
    """Read and parse a DICOM dataset stored in the DICOM File Format.

    Read a DICOM dataset stored in accordance with the :dcm:`DICOM File
    Format <part10/chapter_7.html>`. If the dataset is not stored in
    accordance with the File Format (i.e. the preamble and prefix are missing,
    there are missing required Type 1 *File Meta Information Group* elements
    or the entire *File Meta Information* is missing) then you will have to
    set `force` to ``True``.

    .. deprecated:: 2.2

        Returning a :class:`~pydicom.dicomdir.DicomDir` is deprecated and
        will be removed in v3.0. Use :class:`~pydicom.fileset.FileSet` instead.


    Examples
    --------
    Read and return a dataset stored in accordance with the DICOM File Format:

    >>> ds = pydicom.dcmread("CT_small.dcm")
    >>> ds.PatientName

    Read and return a dataset not in accordance with the DICOM File Format:

    >>> ds = pydicom.dcmread("rtplan.dcm", force=True)
    >>> ds.PatientName

    Use within a context manager:

    >>> with pydicom.dcmread("rtplan.dcm") as ds:
    ...     ds.PatientName

    Parameters
    ----------
    fp : str or PathLike or file-like
        Either a file-like object, a string containing the file name or the
        path to the file. The file-like object must have ``seek()``,
        ``read()`` and ``tell()`` methods and the caller is responsible for
        closing it (if required).
    defer_size : int, str or float, optional
        If not used then all elements are read into memory. If specified,
        then if a data element's stored value is larger than `defer_size`, the
        value is not read into memory until it is accessed in code. Should be
        the number of bytes to be read as :class:`int` or as a :class:`str`
        with units, e.g. ``'512 KB'``, ``'2 MB'``.
    stop_before_pixels : bool, optional
        If ``False`` (default), the full file will be read and parsed. Set
        ``True`` to stop before reading (7FE0,0010) *Pixel Data* (and all
        subsequent elements).
    force : bool, optional
        If ``False`` (default), raises an
        :class:`~pydicom.errors.InvalidDicomError` if the file is
        missing the *File Meta Information* header. Set to ``True`` to force
        reading even if no *File Meta Information* header is found.
    specific_tags : list of (int or str or 2-tuple of int), optional
        If used then only the supplied tags will be returned. The supplied
        elements can be tags or keywords. Note that the element (0008,0005)
        *Specific Character Set* is always returned if present - this ensures
        correct decoding of returned text values.

    Returns
    -------
    FileDataset or DicomDir
        An instance of :class:`~pydicom.dataset.FileDataset` that represents
        a parsed DICOM file, unless the dataset is a *Media Storage Directory*
        instance in which case it will be a
        :class:`~pydicom.dicomdir.DicomDir`.

    Raises
    ------
    InvalidDicomError
        If `force` is ``False`` and the file is not a valid DICOM file.
    TypeError
        If `fp` is ``None`` or of an unsupported type.

    See Also
    --------
    pydicom.dataset.FileDataset
        Data class that is returned.
    pydicom.filereader.read_partial
        Only read part of a DICOM file, stopping on given conditions.
    """
    # Open file if not already a file object
    caller_owns_file = True
    fp = path_from_pathlike(fp)
    if isinstance(fp, str):
        # caller provided a file name; we own the file handle
        caller_owns_file = False
        logger.debug("Reading file '{0}'".format(fp))
        fp = open(fp, 'rb')
    elif fp is None or not hasattr(fp, "read") or not hasattr(fp, "seek"):
        raise TypeError("dcmread: Expected a file path or a file-like, "
                        "but got " + type(fp).__name__)

    # Everything after the open runs inside try/finally so that a handle
    # we opened ourselves is closed even if logging or tag conversion fails.
    try:
        if config.debugging:
            logger.debug("\n" + "-" * 80)
            logger.debug("Call to dcmread()")
            msg = ("filename:'%s', defer_size='%s', "
                   "stop_before_pixels=%s, force=%s, specific_tags=%s")
            # Not every file-like has a ``name`` (e.g. io.BytesIO)
            file_name = getattr(fp, "name", "<unnamed file-like>")
            logger.debug(msg % (file_name, defer_size, stop_before_pixels,
                                force, specific_tags))
            if caller_owns_file:
                logger.debug("Caller passed file object")
            else:
                logger.debug("Caller passed file name")
            logger.debug("-" * 80)

        if specific_tags:
            specific_tags = [Tag(t) for t in specific_tags]

        specific_tags = cast(Optional[List[BaseTag]], specific_tags)

        # Iterate through all items and store them --include file meta
        # if present
        stop_when = None
        if stop_before_pixels:
            stop_when = _at_pixel_data

        dataset = read_partial(
            fp,
            stop_when,
            defer_size=size_in_bytes(defer_size),
            force=force,
            specific_tags=specific_tags,
        )
    finally:
        if not caller_owns_file:
            fp.close()
    # XXX need to store transfer syntax etc.
    return dataset
Beispiel #6
0
def data_element_generator(
    fp: BinaryIO,
    is_implicit_VR: bool,
    is_little_endian: bool,
    stop_when: Optional[Callable[[BaseTag, Optional[str], int], bool]] = None,
    defer_size: Optional[Union[int, str, float]] = None,
    encoding: Union[str, MutableSequence[str]] = default_encoding,
    specific_tags: Optional[List[BaseTag]] = None
) -> Iterator[Union[RawDataElement, DataElement]]:
    """Create a generator to efficiently return the raw data elements.

    .. note::

        This function is used internally - usually there is no need to call it
        from user code. To read data from a DICOM file, :func:`dcmread`
        shall be used instead.

    Parameters
    ----------
    fp : file-like
        The file-like to read from.
    is_implicit_VR : bool
        ``True`` if the data is encoded as implicit VR, ``False`` otherwise.
    is_little_endian : bool
        ``True`` if the data is encoded as little endian, ``False`` otherwise.
    stop_when : None, callable, optional
        If ``None`` (default), then the whole file is read. A callable which
        takes tag, VR, length, and returns ``True`` or ``False``. If it
        returns ``True``, the file is rewound to the start of the current
        data element and the generator finishes.
    defer_size : int, str or float, optional
        See :func:`dcmread` for parameter info.
    encoding : Union[str, MutableSequence[str]]
        Encoding scheme
    specific_tags : list or None
        See :func:`dcmread` for parameter info.

    Yields
    ------
    RawDataElement or DataElement
        Yields DataElement for undefined length UN or SQ, RawDataElement
        otherwise.
    """
    # Summary of DICOM standard PS3.5-2008 chapter 7:
    # If Implicit VR, data element is:
    #    tag, 4-byte length, value.
    #        The 4-byte length can be FFFFFFFF (undefined length)*
    #
    # If Explicit VR:
    #    if OB, OW, OF, SQ, UN, or UT:
    #       tag, VR, 2-bytes reserved (both zero), 4-byte length, value
    #           For all but UT, the length can be FFFFFFFF (undefined length)*
    #   else: (any other VR)
    #       tag, VR, (2 byte length), value
    # * for undefined length, a Sequence Delimitation Item marks the end
    #        of the Value Field.
    # Note, except for the special_VRs, both impl and expl VR use 8 bytes;
    #    the special VRs follow the 8 bytes with a 4-byte length

    # With a generator, state is stored, so we can break down
    #    into the individual cases, and not have to check them again for each
    #    data element
    from pydicom.values import convert_string

    if is_little_endian:
        endian_chr = "<"
    else:
        endian_chr = ">"

    # assign implicit VR struct to variable as use later if VR assumed missing
    implicit_VR_struct = Struct(endian_chr + "HHL")
    if is_implicit_VR:
        element_struct = implicit_VR_struct
    else:  # Explicit VR
        # tag, VR, 2-byte length (or 0 if special VRs)
        element_struct = Struct(endian_chr + "HH2sH")
        extra_length_struct = Struct(endian_chr + "L")  # for special VRs
        extra_length_unpack = extra_length_struct.unpack  # for lookup speed

    # Make local variables so have faster lookup
    fp_read = fp.read
    fp_tell = fp.tell
    logger_debug = logger.debug
    debugging = config.debugging
    element_struct_unpack = element_struct.unpack
    defer_size = size_in_bytes(defer_size)
    # (0008,0005) Specific Character Set; built once instead of per element
    charset_tag = BaseTag(0x00080005)

    tag_set = {Tag(tag) for tag in specific_tags} if specific_tags else set()
    has_tag_set = bool(tag_set)
    if has_tag_set:
        tag_set.add(Tag(0x00080005))  # Specific Character Set

    while True:
        # VR: Optional[str]

        # Read tag, VR, length, get ready to read value
        bytes_read = fp_read(8)
        if len(bytes_read) < 8:
            return  # at end of file

        if debugging:
            debug_msg = f"{fp_tell() - 8:08x}: {bytes2hex(bytes_read)}"

        if is_implicit_VR:
            # must reset VR each time; could have set last iteration (e.g. SQ)
            VR = None
            group, elem, length = element_struct_unpack(bytes_read)
        else:  # explicit VR
            group, elem, VR, length = element_struct_unpack(bytes_read)
            # defend against switching to implicit VR, some writer do in SQ's
            # issue 1067, issue 1035

            if not (b'AA' <= VR <= b'ZZ') and config.assume_implicit_vr_switch:
                # invalid VR, must be 2 cap chrs, assume implicit and continue
                VR = None
                group, elem, length = implicit_VR_struct.unpack(bytes_read)
            else:
                VR = VR.decode(default_encoding)
                if VR in extra_length_VRs:
                    bytes_read = fp_read(4)
                    length = extra_length_unpack(bytes_read)[0]
                    if debugging:
                        debug_msg += " " + bytes2hex(bytes_read)

        if debugging:
            debug_msg = "%-47s  (%04x, %04x)" % (debug_msg, group, elem)
            if not is_implicit_VR:
                debug_msg += " %s " % VR
            if length != 0xFFFFFFFF:
                debug_msg += "Length: %d" % length
            else:
                debug_msg += "Length: Undefined length (FFFFFFFF)"
            logger_debug(debug_msg)

        # Positioned to read the value, but may not want to -- check stop_when
        value_tell = fp_tell()
        tag = TupleTag((group, elem))
        if stop_when is not None:
            # XXX VR may be None here!! Should stop_when just take tag?
            if stop_when(tag, VR, length):
                if debugging:
                    logger_debug("Reading ended by stop_when callback. "
                                 "Rewinding to start of data element.")
                rewind_length = 8
                if not is_implicit_VR and VR in extra_length_VRs:
                    rewind_length += 4
                fp.seek(value_tell - rewind_length)
                return

        # Reading the value
        # First case (most common): reading a value with a defined length
        if length != 0xFFFFFFFF:
            if has_tag_set and tag not in tag_set:
                # skip the tag if not in specific tags
                fp.seek(fp_tell() + length)
                continue

            # don't defer loading of Specific Character Set value as it is
            # needed immediately to get the character encoding for other tags
            if (defer_size is not None and length > defer_size and
                    tag != charset_tag):
                # Flag as deferred by setting value to None, and skip bytes
                value = None
                if debugging:
                    logger_debug("Defer size exceeded. "
                                 "Skipping forward to next data element.")
                fp.seek(fp_tell() + length)
            else:
                value = (
                    fp_read(length) if length > 0
                    else cast(
                        Optional[bytes], empty_value_for_VR(VR, raw=True)
                    )
                )
                if debugging:
                    dotdot = "..." if length > 20 else "   "
                    displayed_value = value[:20] if value else b''
                    logger_debug("%08x: %-34s %s %r %s" %
                                 (value_tell, bytes2hex(displayed_value),
                                  dotdot, displayed_value, dotdot))

            # If the tag is (0008,0005) Specific Character Set, then store it
            if tag == charset_tag:
                # *Specific Character String* is b'' for empty value
                encoding = convert_string(
                    cast(bytes, value) or b'', is_little_endian
                )
                # Store the encoding value in the generator
                # for use with future elements (SQs)
                encoding = convert_encodings(encoding)

            yield RawDataElement(tag, VR, length, value, value_tell,
                                 is_implicit_VR, is_little_endian)

        # Second case: undefined length - must seek to delimiter,
        # unless is SQ type, in which case is easier to parse it, because
        # undefined length SQs and items of undefined lengths can be nested
        # and it would be error-prone to read to the correct outer delimiter
        else:
            # VR UN with undefined length shall be handled as SQ
            # see PS 3.5, section 6.2.2
            if VR == 'UN':
                VR = 'SQ'
            # Try to look up type to see if is a SQ
            # if private tag, won't be able to look it up in dictionary,
            #   in which case just ignore it and read the bytes unless it is
            #   identified as a Sequence
            # NOTE(review): VR can no longer be 'UN' at this point because of
            # the replacement just above, so only ``VR is None`` can be true.
            if VR is None or VR == 'UN' and config.replace_un_with_known_vr:
                try:
                    VR = dictionary_VR(tag)
                except KeyError:
                    # Look ahead to see if it consists of items
                    # and is thus a SQ
                    next_tag = _unpack_tag(fp_read(4), endian_chr)
                    # Rewind the file
                    fp.seek(fp_tell() - 4)
                    if next_tag == ItemTag:
                        VR = 'SQ'

            if VR == 'SQ':
                if debugging:
                    logger_debug(
                        f"{fp_tell():08X}: Reading/parsing undefined length "
                        "sequence"
                    )

                # The sequence is always parsed, even when it will be
                # skipped, so that the file ends up positioned after it
                seq = read_sequence(fp, is_implicit_VR,
                                    is_little_endian, length, encoding)
                if has_tag_set and tag not in tag_set:
                    continue

                yield DataElement(tag, VR, seq, value_tell,
                                  is_undefined_length=True)
            else:
                delimiter = SequenceDelimiterTag
                if debugging:
                    logger_debug("Reading undefined length data element")
                value = read_undefined_length_value(
                    fp, is_little_endian, delimiter, defer_size
                )

                # tags with undefined length are skipped after read
                if has_tag_set and tag not in tag_set:
                    continue

                yield RawDataElement(tag, VR, length, value, value_tell,
                                     is_implicit_VR, is_little_endian)
Beispiel #7
0
def dcmread(fp, defer_size=None, stop_before_pixels=False,
            force=False, specific_tags=None):
    """Read and parse a DICOM dataset stored in the DICOM File Format.

    Read a DICOM dataset stored in accordance with the DICOM File Format
    (DICOM Standard Part 10 Section 7). If the dataset is not stored in
    accordance with the File Format (i.e. the preamble and prefix are missing,
    there are missing required Type 1 File Meta Information Group elements
    or the entire File Meta Information is missing) then you will have to
    set `force` to True.

    Parameters
    ----------
    fp : str or file-like
        Either a file-like object, or a string containing the file name. If a
        file-like object, the caller is responsible for closing it.
    defer_size : int or str or None
        If None (default), all elements read into memory. If specified, then if
        a data element's stored value is larger than `defer_size`, the value is
        not read into memory until it is accessed in code. Specify an integer
        (bytes), or a string value with units, e.g. "512 KB", "2 MB".
    stop_before_pixels : bool
        If False (default), the full file will be read and parsed. Set True to
        stop before reading (7FE0,0010) 'Pixel Data' (and all subsequent
        elements).
    force : bool
        If False (default), raises an InvalidDicomError if the file is missing
        the File Meta Information header. Set to True to force reading even if
        no File Meta Information header is found.
    specific_tags : list or None
        If not None, only the tags in the list are returned. The list
        elements can be tags or tag names. Note that the tag Specific
        Character Set is always returned if present - this ensures correct
        decoding of returned text values.

    Returns
    -------
    FileDataset
        An instance of FileDataset that represents a parsed DICOM file.

    Raises
    ------
    InvalidDicomError
        If `force` is False and the file is not a valid DICOM file.

    See Also
    --------
    pydicom.dataset.FileDataset
        Data class that is returned.
    pydicom.filereader.read_partial
        Only read part of a DICOM file, stopping on given conditions.

    Examples
    --------
    Read and return a dataset stored in accordance with the DICOM File Format:

    >>> ds = pydicom.dcmread("rtplan.dcm")
    >>> ds.PatientName

    Read and return a dataset not in accordance with the DICOM File Format:

    >>> ds = pydicom.dcmread("rtplan.dcm", force=True)
    >>> ds.PatientName

    Use within a context manager:

    >>> with pydicom.dcmread("rtplan.dcm") as ds:
    ...     ds.PatientName
    """
    # Open file if not already a file object
    caller_owns_file = True
    if isinstance(fp, compat.string_types):
        # caller provided a file name; we own the file handle
        caller_owns_file = False
        try:
            logger.debug(u"Reading file '{0}'".format(fp))
        except Exception:
            logger.debug("Reading file '{0}'".format(fp))
        fp = open(fp, 'rb')

    # Everything after the open runs inside try/finally so that a handle
    # we opened ourselves is closed even if logging or the defer_size
    # conversion raises (e.g. ValueError for an unsupported unit).
    try:
        if config.debugging:
            logger.debug("\n" + "-" * 80)
            logger.debug("Call to dcmread()")
            msg = ("filename:'%s', defer_size='%s', "
                   "stop_before_pixels=%s, force=%s, specific_tags=%s")
            # Not every file-like has a ``name`` (e.g. io.BytesIO)
            file_name = getattr(fp, "name", "<unnamed file-like>")
            logger.debug(msg % (file_name, defer_size, stop_before_pixels,
                                force, specific_tags))
            if caller_owns_file:
                logger.debug("Caller passed file object")
            else:
                logger.debug("Caller passed file name")
            logger.debug("-" * 80)

        # Convert size to defer reading into bytes
        defer_size = size_in_bytes(defer_size)

        # Iterate through all items and store them --include file meta
        # if present
        stop_when = None
        if stop_before_pixels:
            stop_when = _at_pixel_data

        dataset = read_partial(fp, stop_when, defer_size=defer_size,
                               force=force, specific_tags=specific_tags)
    finally:
        if not caller_owns_file:
            fp.close()
    # XXX need to store transfer syntax etc.
    return dataset
Beispiel #8
0
def data_element_generator(fp,
                           is_implicit_VR,
                           is_little_endian,
                           stop_when=None,
                           defer_size=None,
                           encoding=default_encoding,
                           specific_tags=None):

    """Create a generator to efficiently return the raw data elements.

    Parameters
    ----------
    fp : file-like object
        Must provide ``read``, ``tell`` and ``seek``.
    is_implicit_VR : boolean
        True if the data elements carry no VR in their header.
    is_little_endian : boolean
        True if the data is encoded little endian; else big endian.
    stop_when : None, callable, optional
        If None (default), then the whole file is read.
        A callable which takes tag, VR, length,
        and returns True or False. If it returns True,
        the file is rewound to the start of the current data element
        and the generator stops.
    defer_size : int, str, None, optional
        See ``dcmread`` for parameter info.
    encoding :
        Encoding scheme handed to ``read_sequence`` for undefined length
        sequences. It is replaced once a (0008,0005) Specific Character
        Set element has been read.
    specific_tags : list or None
        See ``dcmread`` for parameter info.

    Yields
    ------
    RawDataElement or DataElement
        A ``RawDataElement`` built from the tag, the VR (None if implicit
        VR and the length is defined), the length as in the DICOM data
        element (could be DICOM "undefined length" 0xFFFFFFFF), the raw
        bytes from the file (not parsed into python types; None if the
        value was deferred), the file position of the value and the two
        encoding flags.
        Sequences of undefined length are parsed immediately and yielded
        as a ``DataElement`` instead.
    """
    # Summary of DICOM standard PS3.5-2008 chapter 7:
    # If Implicit VR, data element is:
    #    tag, 4-byte length, value.
    #        The 4-byte length can be FFFFFFFF (undefined length)*
    #
    # If Explicit VR:
    #    if OB, OW, OF, SQ, UN, or UT:
    #       tag, VR, 2-bytes reserved (both zero), 4-byte length, value
    #           For all but UT, the length can be FFFFFFFF (undefined length)*
    #   else: (any other VR)
    #       tag, VR, (2 byte length), value
    # * for undefined length, a Sequence Delimitation Item marks the end
    #        of the Value Field.
    # Note, except for the special_VRs, both impl and expl VR use 8 bytes;
    #    the special VRs follow the 8 bytes with a 4-byte length

    # With a generator, state is stored, so we can break down
    #    into the individual cases, and not have to check them again for each
    #    data element

    if is_little_endian:
        endian_chr = "<"
    else:
        endian_chr = ">"
    if is_implicit_VR:
        # group, element, 4-byte length
        element_struct = Struct(endian_chr + "HHL")
    else:  # Explicit VR
        # tag, VR, 2-byte length (or 0 if special VRs)
        element_struct = Struct(endian_chr + "HH2sH")
        extra_length_struct = Struct(endian_chr + "L")  # for special VRs
        extra_length_unpack = extra_length_struct.unpack  # for lookup speed

    # Make local variables so have faster lookup
    fp_read = fp.read
    fp_tell = fp.tell
    logger_debug = logger.debug
    debugging = config.debugging
    element_struct_unpack = element_struct.unpack
    defer_size = size_in_bytes(defer_size)

    # Normalise `specific_tags` into a set of tags; keywords are converted,
    # entries that are neither a keyword nor a BaseTag are ignored
    tag_set = set()
    if specific_tags is not None:
        for tag in specific_tags:
            if isinstance(tag, (str, compat.text_type)):
                tag = Tag(tag_for_keyword(tag))
            if isinstance(tag, BaseTag):
                tag_set.add(tag)
        # Specific Character Set is always kept when filtering by tag
        tag_set.add(Tag(0x08, 0x05))
    has_tag_set = len(tag_set) > 0

    while True:
        # Read tag, VR, length, get ready to read value
        bytes_read = fp_read(8)
        if len(bytes_read) < 8:
            return  # at end of file
        if debugging:
            debug_msg = "{0:08x}: {1}".format(fp.tell() - 8,
                                              bytes2hex(bytes_read))

        if is_implicit_VR:
            # must reset VR each time; could have set last iteration (e.g. SQ)
            VR = None
            group, elem, length = element_struct_unpack(bytes_read)
        else:  # explicit VR
            group, elem, VR, length = element_struct_unpack(bytes_read)
            if not in_py2:
                VR = VR.decode(default_encoding)
            if VR in extra_length_VRs:
                # the real length follows in the next 4 bytes
                bytes_read = fp_read(4)
                length = extra_length_unpack(bytes_read)[0]
                if debugging:
                    debug_msg += " " + bytes2hex(bytes_read)
        if debugging:
            debug_msg = "%-47s  (%04x, %04x)" % (debug_msg, group, elem)
            if not is_implicit_VR:
                debug_msg += " %s " % VR
            if length != 0xFFFFFFFF:
                debug_msg += "Length: %d" % length
            else:
                debug_msg += "Length: Undefined length (FFFFFFFF)"
            logger_debug(debug_msg)

        # Positioned to read the value, but may not want to -- check stop_when
        value_tell = fp_tell()
        tag = TupleTag((group, elem))
        if stop_when is not None:
            # XXX VR may be None here!! Should stop_when just take tag?
            if stop_when(tag, VR, length):
                if debugging:
                    logger_debug("Reading ended by stop_when callback. "
                                 "Rewinding to start of data element.")
                # header is 8 bytes, plus 4 when the extra length was read
                rewind_length = 8
                if not is_implicit_VR and VR in extra_length_VRs:
                    rewind_length += 4
                fp.seek(value_tell - rewind_length)
                return

        # Reading the value
        # First case (most common): reading a value with a defined length
        if length != 0xFFFFFFFF:
            if has_tag_set and tag not in tag_set:
                # skip the tag if not in specific tags
                fp.seek(fp_tell() + length)
                continue

            # don't defer loading of Specific Character Set value as it is
            # needed immediately to get the character encoding for other tags
            if (defer_size is not None and length > defer_size and
                    tag != BaseTag(0x00080005)):
                # Flag as deferred by setting value to None, and skip bytes
                value = None
                # NOTE(review): unlike the other debug output in this
                # function, this call is not guarded by `debugging`
                logger_debug("Defer size exceeded. "
                             "Skipping forward to next data element.")
                fp.seek(fp_tell() + length)
            else:
                value = fp_read(length)
                if debugging:
                    # log at most the first 12 bytes of the value
                    dotdot = "   "
                    if length > 12:
                        dotdot = "..."
                    logger_debug("%08x: %-34s %s %r %s" % (value_tell,
                                                           bytes2hex(
                                                               value[:12]),
                                                           dotdot,
                                                           value[:12], dotdot))

            # If the tag is (0008,0005) Specific Character Set, then store it
            if tag == BaseTag(0x00080005):
                from pydicom.values import convert_string
                encoding = convert_string(value, is_little_endian)
                # Store the encoding value in the generator
                # for use with future elements (SQs)
                encoding = convert_encodings(encoding)

            yield RawDataElement(tag, VR, length, value, value_tell,
                                 is_implicit_VR, is_little_endian)

        # Second case: undefined length - must seek to delimiter,
        # unless is SQ type, in which case is easier to parse it, because
        # undefined length SQs and items of undefined lengths can be nested
        # and it would be error-prone to read to the correct outer delimiter
        else:
            # Try to look up type to see if is a SQ
            # if private tag, won't be able to look it up in dictionary,
            #   in which case just ignore it and read the bytes unless it is
            #   identified as a Sequence
            if VR is None:
                try:
                    VR = dictionary_VR(tag)
                except KeyError:
                    # Look ahead to see if it consists of items
                    # and is thus a SQ
                    next_tag = TupleTag(unpack(endian_chr + "HH", fp_read(4)))
                    # Rewind the file
                    fp.seek(fp_tell() - 4)
                    if next_tag == ItemTag:
                        VR = 'SQ'

            if VR == 'SQ':
                if debugging:
                    msg = "{0:08x}: Reading/parsing undefined length sequence"
                    logger_debug(msg.format(fp_tell()))
                # the sequence is always parsed, even if it is filtered out
                # below, so that the file ends up positioned after it
                seq = read_sequence(fp, is_implicit_VR,
                                    is_little_endian, length, encoding)
                if has_tag_set and tag not in tag_set:
                    continue
                yield DataElement(tag, VR, seq, value_tell,
                                  is_undefined_length=True)
            else:
                delimiter = SequenceDelimiterTag
                if debugging:
                    logger_debug("Reading undefined length data element")
                value = read_undefined_length_value(fp, is_little_endian,
                                                    delimiter, defer_size)

                # If the tag is (0008,0005) Specific Character Set,
                # then store it
                if tag == (0x08, 0x05):
                    from pydicom.values import convert_string
                    encoding = convert_string(value, is_little_endian)
                    # Store the encoding value in the generator for use
                    # with future elements (SQs)
                    encoding = convert_encodings(encoding)

                # tags with undefined length are skipped after read
                if has_tag_set and tag not in tag_set:
                    continue
                yield RawDataElement(tag, VR, length, value, value_tell,
                                     is_implicit_VR, is_little_endian)
Beispiel #9
0
def data_element_generator(
    fp: BinaryIO,
    is_implicit_VR: bool,
    is_little_endian: bool,
    stop_when: Optional[Callable[[int, int], bool]] = None,
    defer_size: Optional[Union[str, int, float]] = None,
) -> Iterator[_ElementType]:
    """Yield the raw data elements found in `fp`, one at a time.

    Parameters
    ----------
    fp : file-like
        Source to read from; must provide ``read``, ``tell`` and ``seek``.
    is_implicit_VR : bool
        ``True`` if the element headers carry no VR.
    is_little_endian : bool
        ``True`` for little endian encoding, ``False`` for big endian.
    stop_when : callable or None, optional
        Called as ``stop_when(group, elem)`` before each value is read. If
        it returns a true value the file is rewound to the start of that
        element and the generator ends.
    defer_size : int, float, str or None, optional
        Converted with ``size_in_bytes``; values longer than the result are
        skipped and reported as ``None``.

    Yields
    ------
    tuple
        ``((group, elem), vr, length, value, value_tell)``. `vr` is the two
        bytes read from the header for explicit VR; for implicit VR it is
        ``None`` unless the element has undefined length. `value` is
        ``None`` for deferred values and for undefined length sequences.

    Raises
    ------
    NotImplementedError
        If an element of undefined length is not identified as a sequence.
    """
    byte_order = "<" if is_little_endian else ">"

    if is_implicit_VR:
        # group, element, 4-byte length
        header = Struct(byte_order + "HHL")
    else:
        # group, element, VR, 2-byte length (or 0 if special VRs)
        header = Struct(byte_order + "HH2sH")
        # special VRs carry their real length in 4 additional bytes
        unpack_long_length = Struct(byte_order + "L").unpack

    # Bind the hot attributes to locals for faster lookup inside the loop
    read = fp.read
    tell = fp.tell
    unpack_header = header.unpack
    defer_size = size_in_bytes(defer_size)

    while True:
        raw_header = read(8)
        if len(raw_header) < 8:
            # nothing (or not enough) left to read: end of file
            return

        # Number of bytes consumed by this element's header
        header_size = 8
        if is_implicit_VR:
            # reset on every pass; a previous pass may have set it to SQ
            vr = None
            group, elem, length = unpack_header(raw_header)
        else:
            group, elem, vr, length = unpack_header(raw_header)
            if vr in extra_length_VRs_b:
                length = unpack_long_length(read(4))[0]
                header_size += 4

        # The file now points at the value; give the caller a chance to stop
        value_tell = tell()
        if stop_when is not None and stop_when(group, elem):
            fp.seek(value_tell - header_size)
            return

        # Most common case: the value has a defined length
        if length != 0xFFFFFFFF:
            too_large = defer_size is not None and length > defer_size
            if too_large:
                # deferred values are reported as None and skipped over
                value = None
                fp.seek(tell() + length)
            else:
                value = read(length)
            yield ((group, elem), vr, length, value, value_tell)
            continue

        # Undefined length: only sequences are supported. For implicit VR
        # the VR comes from the dictionary; tags missing there (e.g. private
        # ones) are treated as a sequence if the value starts with an item.
        if vr is None:
            try:
                vr = dictionary_VR((group, elem)).encode('ascii')
            except KeyError:
                peeked = read(4)
                next_tag = TupleTag(
                    cast(
                        Tuple[int, int],
                        unpack(byte_order + "HH", peeked),
                    ))
                # undo the look-ahead
                fp.seek(tell() - 4)
                if next_tag == ItemTag:
                    vr = b'SQ'

        if vr != b'SQ':
            raise NotImplementedError(
                "This reader does not handle undefined length except "
                "for SQ")
        yield ((group, elem), vr, length, None, value_tell)
    def test_size_in_bytes(self):
        """Check how size_in_bytes() handles numbers, strings and bad input."""
        # None and infinity both come back as None
        for unlimited in (None, float('inf')):
            assert size_in_bytes(unlimited) is None
        # plain numbers are returned unchanged
        assert size_in_bytes(1234) == 1234

        # strings are parsed, with an optional case-insensitive unit
        parsed = (
            ('1234', 1234),
            ('4 kb', 4096),
            ('16 KB', 0x4000),
            ('3  MB', 0x300000),
            ('2gB', 0x80000000),
        )
        for text, expected in parsed:
            assert size_in_bytes(text) == expected

        # unknown units and misplaced units are rejected
        for malformed in ('2 TB', 'KB 2'):
            with pytest.raises(ValueError):
                size_in_bytes(malformed)
Beispiel #11
0
def data_element_generator(fp,
                           is_implicit_VR,
                           is_little_endian,
                           stop_when=None,
                           defer_size=None):
    """Create a generator yielding the raw data elements read from `fp`.

    Parameters
    ----------
    fp : file-like object
        Must provide ``read``, ``tell`` and ``seek``.
    is_implicit_VR : bool
        True if the element headers carry no VR.
    is_little_endian : bool
        True for little endian encoding, False for big endian.
    stop_when : None or callable, optional
        Called as ``stop_when(group, elem)`` before each value is read. If
        it returns a true value, the file is rewound to the start of that
        data element and the generator ends.
    defer_size : optional
        Converted with ``size_in_bytes``; values longer than the result
        are skipped and yielded as None.

    Yields
    ------
    tuple
        ``((group, elem), VR, length, value, value_tell)``. For explicit VR,
        `VR` is the two bytes read from the file. For implicit VR it is
        None when the length is defined. `value` is None for deferred
        values and for sequences of undefined length.

    Raises
    ------
    NotImplementedError
        If a data element of undefined length is not a sequence.
    """
    if is_little_endian:
        endian_chr = "<"
    else:
        endian_chr = ">"
    if is_implicit_VR:
        # tag, 4-byte length
        element_struct = Struct(endian_chr + "HHL")
    else:  # Explicit VR
        # tag, VR, 2-byte length (or 0 if special VRs)
        element_struct = Struct(endian_chr + "HH2sH")
        extra_length_struct = Struct(endian_chr + "L")  # for special VRs
        extra_length_unpack = extra_length_struct.unpack  # for lookup speed

    # Make local variables so have faster lookup
    fp_read = fp.read
    fp_tell = fp.tell
    element_struct_unpack = element_struct.unpack
    defer_size = size_in_bytes(defer_size)

    while True:
        # Read tag, VR, length, get ready to read value
        bytes_read = fp_read(8)
        if len(bytes_read) < 8:
            # At end of file. A plain return ends the generator; raising
            # StopIteration here would surface as RuntimeError (PEP 479).
            return

        if is_implicit_VR:
            # must reset VR each time; could have set last iteration (e.g. SQ)
            VR = None
            group, elem, length = element_struct_unpack(bytes_read)
        else:  # explicit VR
            group, elem, VR, length = element_struct_unpack(bytes_read)
            if VR in extra_length_VRs_b:
                # the real length follows in the next 4 bytes
                bytes_read = fp_read(4)
                length = extra_length_unpack(bytes_read)[0]

        # Positioned to read the value, but may not want to -- check stop_when
        value_tell = fp_tell()
        if stop_when is not None:
            if stop_when(group, elem):
                # header is 8 bytes, plus 4 when the extra length was read
                rewind_length = 8
                if not is_implicit_VR and VR in extra_length_VRs_b:
                    rewind_length += 4
                fp.seek(value_tell - rewind_length)
                return

        # Reading the value
        # First case (most common): reading a value with a defined length
        if length != 0xFFFFFFFF:
            if defer_size is not None and length > defer_size:
                # Flag as deferred by setting value to None, and skip bytes
                value = None
                fp.seek(fp_tell() + length)
            else:
                value = fp_read(length)
            yield ((group, elem), VR, length, value, value_tell)

        # Second case: undefined length - only sequences are supported,
        # because undefined length SQs and items of undefined lengths can
        # be nested and have to be parsed by the caller
        else:
            # Try to look up type to see if is a SQ
            # if private tag, won't be able to look it up in dictionary,
            #   in which case it is only accepted if it is identified
            #   as a Sequence by its first item
            if VR is None:
                try:
                    # The dictionary VR is text while this reader works
                    # with bytes VRs, so encode it before comparing
                    VR = dictionary_VR(
                        TupleTag((group, elem))).encode('ascii')
                except KeyError:
                    # Look ahead to see if it consists of items and
                    # is thus a SQ
                    next_tag = TupleTag(unpack(endian_chr + "HH", fp_read(4)))
                    # Rewind the file
                    fp.seek(fp_tell() - 4)
                    if next_tag == ItemTag:
                        VR = b'SQ'

            if VR == b'SQ':
                # the sequence is not parsed here; value is left as None
                yield ((group, elem), VR, length, None, value_tell)
            else:
                raise NotImplementedError("This reader does not handle "
                                          "undefined length except for SQ")
Beispiel #12
0
    def test_size_in_bytes(self):
        """Check how size_in_bytes() handles numbers, strings and bad input."""
        # None and infinity both come back as None
        for unlimited in (None, float('inf')):
            assert size_in_bytes(unlimited) is None
        # plain numbers are returned unchanged
        assert size_in_bytes(1234) == 1234

        # strings are parsed, with an optional case-insensitive unit
        parsed = (
            ('1234', 1234),
            ('4 kb', 4096),
            ('16 KB', 0x4000),
            ('3  MB', 0x300000),
            ('2gB', 0x80000000),
        )
        for text, expected in parsed:
            assert size_in_bytes(text) == expected

        # unknown units and misplaced units are rejected
        for malformed in ('2 TB', 'KB 2'):
            with pytest.raises(ValueError):
                size_in_bytes(malformed)
Beispiel #13
0
def dcmread(fp,
            defer_size=None,
            stop_before_pixels=False,
            force=False,
            specific_tags=None):
    """Read and parse a DICOM dataset stored in the DICOM File Format.

    Read a DICOM dataset stored in accordance with the :dcm:`DICOM File
    Format <part10/chapter_7.html>`. If the dataset is not stored in
    accordance with the File Format (i.e. the preamble and prefix are missing,
    there are missing required Type 1 *File Meta Information Group* elements
    or the entire *File Meta Information* is missing) then you will have to
    set `force` to ``True``.

    Parameters
    ----------
    fp : str or PathLike or file-like
        Either a file-like object, or a string containing the file name. If a
        file-like object, the caller is responsible for closing it.
    defer_size : int or str or None, optional
        If ``None`` (default), all elements are read into memory. If specified,
        then if a data element's stored value is larger than `defer_size`, the
        value is not read into memory until it is accessed in code. Specify an
        integer (bytes), or a string value with units, e.g. "512 KB", "2 MB".
    stop_before_pixels : bool, optional
        If ``False`` (default), the full file will be read and parsed. Set
        ``True`` to stop before reading (7FE0,0010) *Pixel Data* (and all
        subsequent elements).
    force : bool, optional
        If ``False`` (default), raises an
        :class:`~pydicom.errors.InvalidDicomError` if the file is
        missing the *File Meta Information* header. Set to ``True`` to force
        reading even if no *File Meta Information* header is found.
    specific_tags : list or None, optional
        If not ``None``, only the tags in the list are returned. The list
        elements can be tags or tag names. Note that the element (0008,0005)
        *Specific Character Set* is always returned if present - this ensures
        correct decoding of returned text values.

    Returns
    -------
    FileDataset
        An instance of :class:`~pydicom.dataset.FileDataset` that represents
        a parsed DICOM file.

    Raises
    ------
    InvalidDicomError
        If `force` is ``False`` and the file is not a valid DICOM file.
    TypeError
        If `fp` is ``None`` or of an unsupported type.

    See Also
    --------
    pydicom.dataset.FileDataset
        Data class that is returned.
    pydicom.filereader.read_partial
        Only read part of a DICOM file, stopping on given conditions.

    Examples
    --------
    Read and return a dataset stored in accordance with the DICOM File Format:

    >>> ds = pydicom.dcmread("rtplan.dcm")
    >>> ds.PatientName

    Read and return a dataset not in accordance with the DICOM File Format:

    >>> ds = pydicom.dcmread("rtplan.dcm", force=True)
    >>> ds.PatientName

    Use within a context manager:

    >>> with pydicom.dcmread("rtplan.dcm") as ds:
    ...     ds.PatientName
    """
    # Open file if not already a file object
    caller_owns_file = True
    fp = path_from_pathlike(fp)
    if isinstance(fp, str):
        # caller provided a file name; we own the file handle
        caller_owns_file = False
        logger.debug("Reading file '{0}'".format(fp))
        fp = open(fp, 'rb')
    elif fp is None or not hasattr(fp, "read") or not hasattr(fp, "seek"):
        raise TypeError("dcmread: Expected a file path or a file-like, "
                        "but got " + type(fp).__name__)

    if config.debugging:
        logger.debug("\n" + "-" * 80)
        logger.debug("Call to dcmread()")
        msg = ("filename:'%s', defer_size='%s', "
               "stop_before_pixels=%s, force=%s, specific_tags=%s")
        # Only `read` and `seek` are required of a file-like, so it may not
        # have a `name` (e.g. io.BytesIO); debug output must not fail on it
        filename = getattr(fp, 'name', '<no filename>')
        logger.debug(
            msg %
            (filename, defer_size, stop_before_pixels, force, specific_tags))
        if caller_owns_file:
            logger.debug("Caller passed file object")
        else:
            logger.debug("Caller passed file name")
        logger.debug("-" * 80)

    # Convert size to defer reading into bytes
    defer_size = size_in_bytes(defer_size)

    # Iterate through all items and store them --include file meta if present
    stop_when = None
    if stop_before_pixels:
        stop_when = _at_pixel_data
    try:
        dataset = read_partial(fp,
                               stop_when,
                               defer_size=defer_size,
                               force=force,
                               specific_tags=specific_tags)
    finally:
        # close the handle only if it was opened here, even on error
        if not caller_owns_file:
            fp.close()
    # XXX need to store transfer syntax etc.
    return dataset
Beispiel #14
0
def read_undefined_length_value(fp: BinaryIO,
                                is_little_endian: bool,
                                delimiter_tag: BaseTag,
                                defer_size: Optional[Union[int, float]] = None,
                                read_size: int = 1024 * 8) -> Optional[bytes]:
    """Read until `delimiter_tag` and return the value up to that point.

    When the delimiter is found by scanning, the file is left at the first
    byte after the delimiter and its following four zero bytes.

    Parameters
    ----------
    fp : file-like
        The file-like to read.
    is_little_endian : bool
        ``True`` if file transfer syntax is little endian, else ``False``.
    delimiter_tag : BaseTag
        Tag used as end marker for reading
    defer_size : int or None, optional
        Size to avoid loading large elements in memory. See
        :func:`~pydicom.filereader.dcmread` for more parameter info.
    read_size : int, optional
        Number of bytes to read at one time.

    Returns
    -------
    bytes or None
        The bytes read before the delimiter, or ``None`` if `defer_size` is
        set and the number of bytes read reached it. If the value could be
        read as encapsulated pixel data, whatever that reader returned.

    Raises
    ------
    EOFError
        If EOF is reached before delimiter found. The file is then reset
        to where the value started.
    """
    start_of_value = fp.tell()
    defer_size = size_in_bytes(defer_size)

    # An undefined length value is commonly encapsulated pixel data as
    # defined in PS3.5 section A.4, so try to parse it as such first. That
    # approach
    #  1. is not fooled by sequence delimiter tags that happen to be
    #     embedded in the data
    #  2. does not accumulate any data in memory if the element is large
    #     enough to be deferred
    #  3. does not accumulate data twice (in chunks and then joined)
    #
    # Some implementations deviate from the standard, which makes the
    # encapsulated pixel data parsing fail. In that case fall back to the
    # historical method of scanning the whole value for the delimiter.
    if delimiter_tag == SequenceDelimiterTag:
        was_value_found, value = _try_read_encapsulated_pixel_data(
            fp, is_little_endian, defer_size)
        if was_value_found:
            return value

    # A 4-byte delimiter split across two reads leaves at most 3 of its
    # bytes at the end of the first one
    overlap = 3

    tag_format = b"<HH" if is_little_endian else b">HH"
    marker = pack(tag_format, delimiter_tag.group, delimiter_tag.elem)

    pieces = []
    total = 0  # number of value bytes seen, for the defer_size checks
    hit_eof = False
    while True:
        block_start = fp.tell()
        block = fp.read(read_size)
        if len(block) < read_size:
            # short read: ask once more for the remainder; if it is still
            # short this is the last block, but it is searched all the same
            block += fp.read(read_size - len(block))
            if len(block) < read_size:
                hit_eof = True

        position = block.find(marker)
        if position != -1:
            payload = block[:position]
            total += len(payload)
            if defer_size is None or total < defer_size:
                pieces.append(payload)
            # move to just behind the delimiter tag and check its length
            fp.seek(block_start + position + 4)
            trailer = fp.read(4)
            if trailer != b"\0\0\0\0":
                msg = ("Expected 4 zero bytes after undefined length delimiter"
                       " at pos {0:04x}")
                logger.error(msg.format(fp.tell() - 4))
            break

        if hit_eof:
            fp.seek(start_of_value)
            raise EOFError(
                "End of file reached before delimiter {0!r} found".format(
                    delimiter_tag))

        # Step back a little in case the delimiter straddles the boundary
        # and keep everything except the bytes that will be read again
        fp.seek(fp.tell() - overlap)
        payload = block[:-overlap]
        total += len(payload)
        if defer_size is None or total < defer_size:
            pieces.append(payload)

    # The delimiter has been found at this point
    if defer_size is not None and total >= defer_size:
        return None
    return b"".join(pieces)
Beispiel #15
0
def data_element_generator(fp,
                           is_implicit_VR,
                           is_little_endian,
                           stop_when=None,
                           defer_size=None,
                           encoding=default_encoding,
                           specific_tags=None):
    """Create a generator to efficiently return the raw data elements.

    Data elements are read from `fp` one at a time, starting at its current
    position. The generator finishes when fewer than 8 bytes can be read for
    the next element header, or when `stop_when` returns True.

    Parameters
    ----------
    fp : file-like object
        Source of the encoded data; ``read``, ``tell`` and ``seek`` are used.
    is_implicit_VR : boolean
        True if the data elements are encoded as implicit VR, False if
        they are explicit VR.
    is_little_endian : boolean
        True if the data is little endian, False if big endian.
    stop_when : None, callable, optional
        If None (default), then the whole file is read.
        A callable which takes tag, VR, length,
        and returns True or False. If it returns True, `fp` is rewound to
        the start of the current data element and the generator returns.
        Note that VR is None when reading implicit VR.
    defer_size : int, str, None, optional
        See ``dcmread`` for parameter info.
    encoding :
        Encoding scheme passed on when parsing undefined length sequences.
        It is replaced by the value of (0008,0005) Specific Character Set
        as soon as that element has been read.
    specific_tags : list or None
        See ``dcmread`` for parameter info. Entries given as strings are
        looked up as keywords; entries that are neither strings nor tags
        are ignored.

    Yields
    ------
    RawDataElement
        For every element that is not an undefined length sequence, built
        from ``(tag, VR, length, value, value_tell, is_implicit_VR,
        is_little_endian)`` where:

        * VR is None if implicit VR (and the tag could not be looked up
          for an undefined length element), otherwise the VR read from
          the file
        * length is the length as in the DICOM data element (could be
          DICOM "undefined length" 0xFFFFFFFF)
        * value is the raw bytes from the DICOM file (not parsed into
          python types), or None if reading the value was deferred
        * value_tell is the file position of the start of the value
    DataElement
        For sequences of undefined length, which are parsed immediately
        with ``read_sequence``.
    """
    # Summary of DICOM standard PS3.5-2008 chapter 7:
    # If Implicit VR, data element is:
    #    tag, 4-byte length, value.
    #        The 4-byte length can be FFFFFFFF (undefined length)*
    #
    # If Explicit VR:
    #    if OB, OW, OF, SQ, UN, or UT:
    #       tag, VR, 2-bytes reserved (both zero), 4-byte length, value
    #           For all but UT, the length can be FFFFFFFF (undefined length)*
    #   else: (any other VR)
    #       tag, VR, (2 byte length), value
    # * for undefined length, a Sequence Delimitation Item marks the end
    #        of the Value Field.
    # Note, except for the special_VRs, both impl and expl VR use 8 bytes;
    #    the special VRs follow the 8 bytes with a 4-byte length

    # With a generator, state is stored, so we can break down
    #    into the individual cases, and not have to check them again for each
    #    data element

    if is_little_endian:
        endian_chr = "<"
    else:
        endian_chr = ">"
    if is_implicit_VR:
        element_struct = Struct(endian_chr + "HHL")
    else:  # Explicit VR
        # tag, VR, 2-byte length (or 0 if special VRs)
        element_struct = Struct(endian_chr + "HH2sH")
        extra_length_struct = Struct(endian_chr + "L")  # for special VRs
        extra_length_unpack = extra_length_struct.unpack  # for lookup speed

    # Make local variables so have faster lookup
    fp_read = fp.read
    fp_tell = fp.tell
    logger_debug = logger.debug
    debugging = config.debugging
    element_struct_unpack = element_struct.unpack
    defer_size = size_in_bytes(defer_size)

    # Build the set of tags the caller is interested in. Specific Character
    # Set (0008,0005) is always added so that the encoding can be picked up,
    # but `has_specific_char_set` remembers whether the caller asked for it,
    # i.e. whether it shall actually be yielded.
    tag_set = set()
    has_specific_char_set = True
    if specific_tags is not None:
        for tag in specific_tags:
            if isinstance(tag, (str, compat.text_type)):
                tag = Tag(tag_for_keyword(tag))
            if isinstance(tag, BaseTag):
                tag_set.add(tag)
        has_specific_char_set = Tag(0x08, 0x05) in tag_set
        tag_set.add(Tag(0x08, 0x05))
    has_tag_set = len(tag_set) > 0

    while True:
        # Read tag, VR, length, get ready to read value
        bytes_read = fp_read(8)
        if len(bytes_read) < 8:
            return  # at end of file
        if debugging:
            debug_msg = "{0:08x}: {1}".format(fp.tell() - 8,
                                              bytes2hex(bytes_read))

        if is_implicit_VR:
            # must reset VR each time; could have set last iteration (e.g. SQ)
            VR = None
            group, elem, length = element_struct_unpack(bytes_read)
        else:  # explicit VR
            group, elem, VR, length = element_struct_unpack(bytes_read)
            if not in_py2:
                VR = VR.decode(default_encoding)
            if VR in extra_length_VRs:
                # the 2 bytes unpacked as length were the reserved bytes;
                # the real length is in the following 4 bytes
                bytes_read = fp_read(4)
                length = extra_length_unpack(bytes_read)[0]
                if debugging:
                    debug_msg += " " + bytes2hex(bytes_read)
        if debugging:
            debug_msg = "%-47s  (%04x, %04x)" % (debug_msg, group, elem)
            if not is_implicit_VR:
                debug_msg += " %s " % VR
            if length != 0xFFFFFFFF:
                debug_msg += "Length: %d" % length
            else:
                debug_msg += "Length: Undefined length (FFFFFFFF)"
            logger_debug(debug_msg)

        # Positioned to read the value, but may not want to -- check stop_when
        value_tell = fp_tell()
        tag = TupleTag((group, elem))
        if stop_when is not None:
            # XXX VR may be None here!! Should stop_when just take tag?
            if stop_when(tag, VR, length):
                if debugging:
                    logger_debug("Reading ended by stop_when callback. "
                                 "Rewinding to start of data element.")
                # the element header is 8 bytes, plus 4 bytes for the
                # explicit VRs that carry an extra length field
                rewind_length = 8
                if not is_implicit_VR and VR in extra_length_VRs:
                    rewind_length += 4
                fp.seek(value_tell - rewind_length)
                return

        # Reading the value
        # First case (most common): reading a value with a defined length
        if length != 0xFFFFFFFF:
            if has_tag_set and tag not in tag_set:
                # skip the tag if not in specific tags
                fp.seek(fp_tell() + length)
                continue

            # don't defer loading of Specific Character Set value as it is
            # needed immediately to get the character encoding for other tags
            if (defer_size is not None and length > defer_size
                    and tag != BaseTag(0x00080005)):
                # Flag as deferred by setting value to None, and skip bytes
                value = None
                # NOTE(review): unlike the other debug messages, this one is
                # not guarded by `debugging`
                logger_debug("Defer size exceeded. "
                             "Skipping forward to next data element.")
                fp.seek(fp_tell() + length)
            else:
                value = fp_read(length)
                if debugging:
                    dotdot = "   "
                    if length > 12:
                        dotdot = "..."
                    logger_debug("%08x: %-34s %s %r %s" %
                                 (value_tell, bytes2hex(
                                     value[:12]), dotdot, value[:12], dotdot))

            # If the tag is (0008,0005) Specific Character Set, then store it
            if tag == BaseTag(0x00080005):
                from pydicom.values import convert_string
                encoding = convert_string(value,
                                          is_little_endian,
                                          encoding=default_encoding)
                # Store the encoding value in the generator
                # for use with future elements (SQs)
                encoding = convert_encodings(encoding)
                # the element was only read to get the encoding;
                # the caller did not ask for it
                if not has_specific_char_set:
                    continue

            yield RawDataElement(tag, VR, length, value, value_tell,
                                 is_implicit_VR, is_little_endian)

        # Second case: undefined length - must seek to delimiter,
        # unless is SQ type, in which case is easier to parse it, because
        # undefined length SQs and items of undefined lengths can be nested
        # and it would be error-prone to read to the correct outer delimiter
        else:
            # Try to look up type to see if is a SQ
            # if private tag, won't be able to look it up in dictionary,
            #   in which case just ignore it and read the bytes unless it is
            #   identified as a Sequence
            if VR is None:
                try:
                    VR = dictionary_VR(tag)
                except KeyError:
                    # Look ahead to see if it consists of items
                    # and is thus a SQ
                    next_tag = TupleTag(unpack(endian_chr + "HH", fp_read(4)))
                    # Rewind the file
                    fp.seek(fp_tell() - 4)
                    if next_tag == ItemTag:
                        VR = 'SQ'

            if VR == 'SQ':
                if debugging:
                    msg = "{0:08x}: Reading/parsing undefined length sequence"
                    logger_debug(msg.format(fp_tell()))
                # the sequence has to be parsed even if it is not wanted,
                # to move the file position past its end
                seq = read_sequence(fp, is_implicit_VR, is_little_endian,
                                    length, encoding)
                if has_tag_set and tag not in tag_set:
                    continue
                yield DataElement(tag,
                                  VR,
                                  seq,
                                  value_tell,
                                  is_undefined_length=True)
            else:
                delimiter = SequenceDelimiterTag
                if debugging:
                    logger_debug("Reading undefined length data element")
                value = read_undefined_length_value(fp, is_little_endian,
                                                    delimiter, defer_size)

                # If the tag is (0008,0005) Specific Character Set,
                # then store it
                # NOTE(review): `value` may be None here if the value was
                # deferred by read_undefined_length_value -- confirm that
                # convert_string copes with that
                if tag == (0x08, 0x05):
                    from pydicom.values import convert_string
                    encoding = convert_string(value,
                                              is_little_endian,
                                              encoding=default_encoding)
                    # Store the encoding value in the generator for use
                    # with future elements (SQs)
                    encoding = convert_encodings(encoding)
                    if not has_specific_char_set:
                        continue

                # tags with undefined length are skipped after read
                if has_tag_set and tag not in tag_set:
                    continue
                yield RawDataElement(tag, VR, length, value, value_tell,
                                     is_implicit_VR, is_little_endian)
Beispiel #16
0
def dcmread(fp,
            defer_size=None,
            stop_before_pixels=False,
            force=False,
            specific_tags=None):
    """Read and parse a DICOM dataset stored in the DICOM File Format.

    Read a DICOM dataset stored in accordance with the DICOM File Format
    (DICOM Standard Part 10 Section 7). If the dataset is not stored in
    accordance with the File Format (i.e. the preamble and prefix are missing,
    there are missing required Type 1 File Meta Information Group elements
    or the entire File Meta Information is missing) then you will have to
    set `force` to True.

    Parameters
    ----------
    fp : str or file-like
        Either a file-like object, or a string containing the file name. If a
        file-like object, the caller is responsible for closing it. If a file
        name, the file is opened and closed by this function.
    defer_size : int or str or None
        If None (default), all elements read into memory. If specified, then if
        a data element's stored value is larger than `defer_size`, the value is
        not read into memory until it is accessed in code. Specify an integer
        (bytes), or a string value with units, e.g. "512 KB", "2 MB".
    stop_before_pixels : bool
        If False (default), the full file will be read and parsed. Set True to
        stop before reading (7FE0,0010) 'Pixel Data' (and all subsequent
        elements).
    force : bool
        If False (default), raises an InvalidDicomError if the file is missing
        the File Meta Information header. Set to True to force reading even if
        no File Meta Information header is found.
    specific_tags : list or None
        If not None, only the tags in the list are returned. The list
        elements can be tags or tag names.

    Returns
    -------
    FileDataset
        An instance of FileDataset that represents a parsed DICOM file.

    Raises
    ------
    InvalidDicomError
        If `force` is False and the file is missing the File Meta
        Information header.

    See Also
    --------
    pydicom.dataset.FileDataset
        Data class that is returned.
    pydicom.filereader.read_partial
        Only read part of a DICOM file, stopping on given conditions.

    Examples
    --------
    Read and return a dataset stored in accordance with the DICOM File Format:

    >>> ds = pydicom.dcmread("rtplan.dcm")
    >>> ds.PatientName

    Read and return a dataset not in accordance with the DICOM File Format:

    >>> ds = pydicom.dcmread("rtplan.dcm", force=True)
    >>> ds.PatientName

    Use within a context manager:

    >>> with pydicom.dcmread("rtplan.dcm") as ds:
    ...     ds.PatientName
    """
    # Open file if not already a file object
    caller_owns_file = True
    if isinstance(fp, compat.string_types):
        # caller provided a file name; we own the file handle
        caller_owns_file = False
        try:
            logger.debug(u"Reading file '{0}'".format(fp))
        except Exception:
            logger.debug("Reading file '{0}'".format(fp))
        fp = open(fp, 'rb')

    # Everything after a successful open() is inside the try block, so that
    # a file opened here is closed even if the logging or the parsing of
    # `defer_size` fails, not only if reading the dataset fails.
    try:
        if config.debugging:
            logger.debug("\n" + "-" * 80)
            logger.debug("Call to dcmread()")
            msg = ("filename:'%s', defer_size='%s', "
                   "stop_before_pixels=%s, force=%s, specific_tags=%s")
            # not every file-like object (e.g. io.BytesIO) has a name
            file_name = getattr(fp, 'name', '<unnamed file-like object>')
            logger.debug(
                msg %
                (file_name, defer_size, stop_before_pixels, force,
                 specific_tags))
            if caller_owns_file:
                logger.debug("Caller passed file object")
            else:
                logger.debug("Caller passed file name")
            logger.debug("-" * 80)

        # Convert size to defer reading into bytes
        defer_size = size_in_bytes(defer_size)

        # Iterate through all items and store them --include file meta
        # if present
        stop_when = None
        if stop_before_pixels:
            stop_when = _at_pixel_data
        dataset = read_partial(fp,
                               stop_when,
                               defer_size=defer_size,
                               force=force,
                               specific_tags=specific_tags)
    finally:
        if not caller_owns_file:
            fp.close()
    # XXX need to store transfer syntax etc.
    return dataset
Beispiel #17
0
def read_undefined_length_value(fp,
                                is_little_endian,
                                delimiter_tag,
                                defer_size=None,
                                read_size=1024*8):
    """Read until the delimiter tag is found and return the value;
     ignore the delimiter.

    On completion, the file will be set to the first byte after the delimiter
    and its following four zero bytes. If the end of the file is reached
    before the delimiter is found, the file is set back to the position it
    had when the function was called.

    Parameters
    ----------
    fp : a file-like object
    is_little_endian : boolean
        True if file transfer syntax is little endian, else False.
    delimiter_tag : BaseTag
        tag used as end marker for reading
    defer_size : int, None, optional
        Size to avoid loading large elements in memory.
        See ``filereader.read_file`` for more parameter info.
    read_size : int
        Number of bytes to read at one time. Values of 3 or less are raised
        to 4, the smallest size with which the search can advance.

    Returns
    -------
    bytes or None
        The raw bytes found in front of the delimiter, or None if
        `defer_size` is given and at least that many bytes were found.

    Raises
    ------
    EOFError
        If EOF is reached before delimiter found.
    """
    data_start = fp.tell()
    search_rewind = 3
    # Each pass steps back `search_rewind` bytes, so a chunk has to be longer
    # than that; otherwise the file position would never advance.
    read_size = max(read_size, search_rewind + 1)

    if is_little_endian:
        bytes_format = b"<HH"
    else:
        bytes_format = b">HH"
    bytes_to_find = pack(bytes_format, delimiter_tag.group, delimiter_tag.elem)

    found = False
    eof = False
    value_chunks = []
    defer_size = size_in_bytes(defer_size)
    byte_count = 0  # for defer_size checks
    while not found:
        chunk_start = fp.tell()
        bytes_read = fp.read(read_size)
        if len(bytes_read) < read_size:
            # try again - if still don't get required amount,
            # this is the last block
            new_bytes = fp.read(read_size - len(bytes_read))
            bytes_read += new_bytes
            if len(bytes_read) < read_size:
                eof = True  # but will still check whatever we did get
        index = bytes_read.find(bytes_to_find)
        if index != -1:
            found = True
            new_bytes = bytes_read[:index]
            byte_count += len(new_bytes)
            # once the defer size is reached the value is discarded anyway,
            # so there is no need to collect more of it
            if defer_size is None or byte_count < defer_size:
                value_chunks.append(new_bytes)
            fp.seek(chunk_start + index + 4)  # rewind to end of delimiter
            length = fp.read(4)
            if length != b"\0\0\0\0":
                msg = ("Expected 4 zero bytes after undefined length delimiter"
                       " at pos {0:04x}")
                logger.error(msg.format(fp.tell() - 4))
        elif eof:
            fp.seek(data_start)
            raise EOFError("End of file reached before delimiter {0!r} found".
                           format(delimiter_tag))
        else:
            # rewind a bit in case delimiter crossed read_size boundary
            fp.seek(fp.tell() - search_rewind)
            # accumulate the bytes read (not including the rewind)
            new_bytes = bytes_read[:-search_rewind]
            byte_count += len(new_bytes)
            if defer_size is None or byte_count < defer_size:
                value_chunks.append(new_bytes)
    # if get here then have found the byte string
    if defer_size is not None and byte_count >= defer_size:
        return None
    else:
        return b"".join(value_chunks)