def segment_map(filehandle):
    """Get the map of all segments in a jpeg file.

    1st parameter = file handle of jpeg file, open for binary read

    Returns a list of tuples corresponding to the segments in the file, in
    the order they occur in the file. Each tuple contains a segment ID and
    the absolute offset of the segment marker in the file; e.g., ('DQT', 20).
    APP1 segments are further classified by their identification string and
    reported as 'APP1-Exif', 'APP1-XMP', 'APP1-XMPext' or 'APP1-unknown'.

    Note: we stop at EOI (End Of Image) or SOS (Start Of Scan). This means
    that we ignore any segments that might occur after SOS -- a theoretical
    possibility, although we've never seen a Jpeg structured in that manner
    and stopping at SOS provides better performance because we don't have
    to process any image data.

    Parsing also stops (returning the segments found so far) if the file
    ends unexpectedly or a segment declares a data size smaller than the
    2-byte size field itself, which can only occur in a malformed file.
    """
    # Markers that have no payload: after reading one of these we are
    # already positioned at the start of the next segment.
    no_payload_markers = (
        b'\xff\x01',
        b'\xff\xd0', b'\xff\xd1', b'\xff\xd2', b'\xff\xd3',
        b'\xff\xd4', b'\xff\xd5', b'\xff\xd6', b'\xff\xd7',
        b'\xff\xd8', b'\xff\xd9',
    )

    # APP1 identification strings and the segment ID each one maps to
    app1_formats = (
        (b'Exif\x00\x00', 'APP1-Exif'),
        (b'http://ns.adobe.com/xap/1.0/\x00', 'APP1-XMP'),
        (b'http://ns.adobe.com/xmp/extension/\x00', 'APP1-XMPext'),
    )

    segments = []
    filehandle.seek(2, 0)  # first segment starts right after the SOI

    while True:
        seg_mark = filehandle.read(2)
        if len(seg_mark) < 2:
            break  # file parsing error: we've reached EOF unexpectedly

        seg_id = seginfo(seg_mark)['name']
        after_marker = filehandle.tell()  # position of the data size field
        marker_offset = after_marker - 2

        if seg_id == 'APP1':
            # determine whether APP1 format is Exif, XMP, or XMP Extended
            filehandle.read(2)  # skip over the data size value
            id_str = filehandle.read(35)  # APP1 identification string
            app1_id = 'APP1-unknown'
            for prefix, name in app1_formats:
                if id_str.startswith(prefix):
                    app1_id = name
                    break
            segments.append((app1_id, marker_offset))
            # rewind to just after the marker so the size is read below
            filehandle.seek(after_marker, 0)
        else:
            # non-APP1 segment, add it to the list
            segments.append((seg_id, marker_offset))

        if seg_id in ('EOI', 'SOS'):
            break  # stop processing the image

        if seg_mark in no_payload_markers:
            continue  # nothing to skip; next marker follows immediately

        dsbytes = filehandle.read(2)
        if len(dsbytes) < 2:
            break  # file parsing error: we've reached EOF unexpectedly
        datasize = struct.unpack('>H', dsbytes)[0]
        if datasize < 2:
            # The size includes its own 2 bytes, so a smaller value is
            # invalid; seeking by a negative amount would re-parse bytes
            # we've already consumed and could loop forever.
            break

        # skip forward to next segment, ready to repeat the loop
        filehandle.seek(datasize - 2, 1)

    return segments
def segment_read(filehandle=None):
    """Convert a Jpeg segment to a dictionary.

    filehandle = Jpeg file open for binary read, positioned to first byte
                 of the segment

    Returns a dictionary with these keys:
    offset = offset of the segment within the Jpeg file
    segmark = the 2-byte segment marker
    segtype = name of this segment type (e.g., 'APP1')
    has_data = whether segment type has a data payload
    has_meta = whether segment type's payload contains metadata
    payload = segment's data payload; None unless the segment type
              contains metadata
    next_segment = absolute offset of the segment that follows this one,
                   or None when processing should stop: an SOS or EOI
                   segment was reached, the data size field is truncated,
                   or the data size is smaller than the 2-byte size field
                   itself (malformed file)
    """
    # initialize dictionary object
    segdict = {}
    segdict['offset'] = filehandle.tell()
    segdict['segmark'] = filehandle.read(2)
    segdict['payload'] = None  # default value
    segdict['next_segment'] = None  # default value

    # get info about this segment type and copy to dictionary
    segtype_info = seginfo(segdict['segmark'])
    for key, info_key in (('segtype', 'name'),
                          ('has_data', 'has_data'),
                          ('has_meta', 'has_meta')):
        segdict[key] = segtype_info[info_key]

    # Stop processing the file when SOS or EOI segment reached. We do this
    # because we're only interested in reading metadata, and want to maximize
    # performance for scanning large numbers of images quickly. The SOS
    # segment is different from the others in that its "data size" is
    # merely the size of the SOS header. The compressed data immediately
    # follows, and this is by far the largest segment in a typical Jpeg file.
    # Furthermore, the compressed data must actually be scanned and decoded
    # to find the EOI marker that follows, and we want to avoid the need to
    # read all of that data.
    # Note: if we terminate at SOS then we should never actually see an EOI,
    # but we're checking for EOI as well here to allow for the case where a
    # Jpeg file has no image data and only metadata -- we've not seen this
    # in an actual Jpeg, but it's theoretically possible and by checking for
    # both SOS and EOI here we will gracefully handle any such file.
    if segdict['segtype'] in ('SOS', 'EOI'):
        return segdict

    # if this segment type has no data payload, then the next segment
    # starts right after the 2-byte segment marker
    if not segdict['has_data']:
        segdict['next_segment'] = segdict['offset'] + 2
        return segdict

    # read data size; note that this size includes the 2-byte size
    # itself but doesn't include the 2-byte segment marker
    datasize_bytes = filehandle.read(2)
    if len(datasize_bytes) < 2:
        # file parsing error: EOF reached inside the size field; leave
        # next_segment as None so the caller stops here
        return segdict
    datasize = struct.unpack('>H', datasize_bytes)[0]
    if datasize < 2:
        # Malformed size. A negative length passed to read() would pull in
        # the entire rest of the file, and a negative relative seek would
        # move backwards, so stop instead.
        return segdict
    payload_size = datasize - 2

    if segdict['has_meta']:
        # segment contains metadata, save a copy in the segment's dictionary
        segdict['payload'] = filehandle.read(payload_size)
    else:
        # no metadata to save, so just skip past the data
        filehandle.seek(payload_size, 1)

    segdict['next_segment'] = filehandle.tell()
    return segdict