Example #1
0
def segment_map(filehandle):
    """Get the map of all segments in a jpeg file.

    1st parameter = file handle of jpeg file, open for binary read

    Returns a list of tuples corresponding to the segments in the file,
    in the order they occur in the file. Each tuple contains a segment
    ID and an absolute offset in the file; e.g., ('APP1-Exif', 2).
    APP1 segments are reported as 'APP1-Exif', 'APP1-XMP', 'APP1-XMPext'
    or 'APP1-unknown', depending on their identification string.

    Note: we stop at EOI (End Of Image) or SOS (Start Of Scan). This
    means that we ignore any segments that might occur after SOS -- a
    theoretical possibility, although we've never seen a Jpeg structured
    in that manner and stopping at SOS provides better performance
    because we don't have to process any image data.

    Parsing also stops (returning the segments found so far) when the
    file ends unexpectedly or when a segment declares a data size smaller
    than 2, which cannot be valid because the size includes its own two
    bytes.
    """
    # Markers that stand alone: they are not followed by a size field or
    # payload (TEM, RST0-RST7, SOI, EOI). Built once, outside the loop.
    standalone_markers = frozenset([
        b'\xff\x01', b'\xff\xd0', b'\xff\xd1', b'\xff\xd2',
        b'\xff\xd3', b'\xff\xd4', b'\xff\xd5', b'\xff\xd6',
        b'\xff\xd7', b'\xff\xd8', b'\xff\xd9',
    ])

    segments = [] # initialize the list of segments
    filehandle.seek(2, 0) # first segment starts right after the SOI

    while True:
        seg_mark = filehandle.read(2)
        if len(seg_mark) < 2:
            break # file parsing error: we've reached EOF unexpectedly

        seg_id = seginfo(seg_mark)['name']
        filepos = filehandle.tell() # position right after the marker
        seg_offset = filepos - 2 # absolute offset of the marker itself

        if seg_id == 'APP1':
            # determine whether APP1 format is Exif, XMP, or XMP Extended
            _ = filehandle.read(2) # skip over the data size value
            id_str = filehandle.read(35) # APP1 identification string
            if id_str.startswith(b'Exif\x00\x00'):
                segments.append(('APP1-Exif', seg_offset))
            elif id_str.startswith(b'http://ns.adobe.com/xap/1.0/\x00'):
                segments.append(('APP1-XMP', seg_offset))
            elif id_str.startswith(b'http://ns.adobe.com/xmp/extension/\x00'):
                segments.append(('APP1-XMPext', seg_offset))
            else:
                segments.append(('APP1-unknown', seg_offset))

            # rewind to just after the marker so the size field is read
            # again by the common code below
            filehandle.seek(filepos, 0)
        else:
            # non-APP1 segment, add it to the list
            segments.append((seg_id, seg_offset))

        if seg_id in ('EOI', 'SOS'):
            break # stop processing the image

        if seg_mark in standalone_markers:
            # No payload, so we're already positioned for the next segment
            # after reading the segment marker.
            continue

        dsbytes = filehandle.read(2)
        if len(dsbytes) < 2:
            break # file parsing error: we've reached EOF unexpectedly
        datasize = struct.unpack('>H', dsbytes)[0]
        if datasize < 2:
            # Malformed size: seeking by (datasize - 2) would move the file
            # position backwards and re-read the size bytes as a marker.
            break
        # skip forward to next segment, ready to repeat the loop
        filehandle.seek(datasize - 2, 1)

    return segments
Example #2
0
def segment_read(filehandle=None):
    """Convert a Jpeg segment to a dictionary.

    filehandle = Jpeg file open for binary read, positioned to first byte
                 of the segment

    Returns a dictionary with these keys:
        offset = offset of the segment within the Jpeg file
        segmark = the 2-byte segment marker
        segtype = name of this segment type (e.g., 'APP1')
        has_data = whether segment type has a data payload
        has_meta = whether segment type's payload contains metadata
        payload = segment's data payload (only stored when has_meta is
                  true; otherwise None)
        next_segment = offset of the segment that follows this one, or
                       None when reading should stop: an SOS or EOI
                       segment was reached, or the segment's data size
                       field is truncated or smaller than 2
    """

    # initialize dictionary object
    segdict = {}
    segdict['offset'] = filehandle.tell()
    segdict['segmark'] = filehandle.read(2)
    segdict['payload'] = None # default value
    segdict['next_segment'] = None # default value

    # get info about this segment type and copy to dictionary
    segtype_info = seginfo(segdict['segmark'])
    segdict['segtype'] = segtype_info['name']
    segdict['has_data'] = segtype_info['has_data']
    segdict['has_meta'] = segtype_info['has_meta']

    # Stop processing the file when SOS or EOI segment reached. We do this
    # because we're only interested in reading metadata, and want to maximize
    # performance for scanning large numbers of images quickly. The SOS
    # segment is different from the others in that its "data size" is
    # merely the size of the SOS header. The compressed data immediately
    # follows, and this is by far the largest segment in a typical Jpeg file.
    # Furthermore, the compressed data must actually be scanned and decoded
    # to find the EOI marker that follows, and we want to avoid the need to
    # read all of that data.

    # Note: if we terminate at SOS then we should never actually see an EOI,
    # but we're checking for EOI as well here to allow for the case where a
    # Jpeg file has no image data and only metadata -- we've not seen this
    # in an actual Jpeg, but it's theoretically possible and by checking for
    # both SOS and EOI here we will gracefully handle any such file.

    if segdict['segtype'] in ['SOS', 'EOI']:
        return segdict

    # if this segment type has no data payload, then the next segment
    # starts right after the 2-byte segment marker
    if not segdict['has_data']:
        segdict['next_segment'] = segdict['offset'] + 2
        return segdict

    # read data size; note that this size includes the 2-byte size
    # itself but doesn't include the 2-byte segment marker
    datasize_bytes = filehandle.read(2)
    if len(datasize_bytes) < 2:
        # file parsing error: we've reached EOF unexpectedly. Leave
        # next_segment as None so the caller stops, instead of letting
        # struct.unpack raise on the short buffer.
        return segdict
    datasize = struct.unpack('>H', datasize_bytes)[0]
    if datasize < 2:
        # Malformed size: (datasize - 2) would be negative, which turns
        # read() into "read to end of file" (or an error) and seek() into
        # a backwards move. Treat it as the end of parseable data.
        return segdict

    # if segment contains metadata, save a copy in the segment's dictionary
    if segdict['has_meta']:
        segdict['payload'] = filehandle.read(datasize - 2)
    else:
        # no metadata to save, so just skip past the data
        filehandle.seek(datasize - 2, 1)
    segdict['next_segment'] = filehandle.tell()

    return segdict