def test_file(self):
    """Tests if the file in the rar archive is the same as the
    extracted version.

    Also exercises seek()/tell()/read()/close() on RarStream against a
    plain file object opened on the extracted copy, including seeks
    past interesting offsets and relative-to-end seeks.
    NOTE(review): the fixture paths under self.path must exist; the
    magic offsets (3316/6316/3312/3336/6336) appear chosen around the
    3.316-byte archived nfo — TODO confirm against the fixtures.
    """
    rar_file = os.path.join(self.path, "store_little", "store_little.rar")
    txt_file = os.path.join(self.path, "txt", "little_file.txt")
    rs = RarStream(rar_file)
    with open(txt_file, "rb") as tfile:
        # whole-file read must match the extracted file byte for byte
        self.assertEqual(rs.read(), tfile.read())
    rar_file = os.path.join(self.path,
        "store_split_folder_old_srrsfv_windows", "winrar2.80.rar")
    txt_file = os.path.join(self.path, "txt", "unicode_dos.nfo")
    rs = RarStream(rar_file, "unicode_dos.nfo")  # 3.316 bytes
    with open(txt_file, "rb") as tfile:
        rs.seek(3316)
        # seek() must return the new position (file protocol)
        self.assertEqual(rs.seek(6316), rs.tell())
        rs.seek(3312)
        tfile.seek(3336, os.SEEK_SET)
        tfile.seek(6336, os.SEEK_SET)
        rs.read(4)
        rs.seek(0)
        tfile.seek(0)
        self.assertEqual(rs.read(), tfile.read())
        # relative seek from the end of the stream
        tfile.seek(-20, os.SEEK_END)
        self.assertEqual(rs.seek(-20, os.SEEK_END), tfile.tell())
        self.assertEqual(rs.read(), tfile.read())
        rs.close()
        self.assertEqual(rs.closed, True, "Stream not closed")
    txt_file = os.path.join(self.path, "txt", "unicode_mac.nfo")
    rs = RarStream(rar_file, "unicode_mac.nfo")
    with open(txt_file, "rb") as tfile:
        # position both streams mid-file before comparing the tails
        tfile.seek(3000)
        tfile.read()
        tfile.seek(333)
        rs.seek(333)
        self.assertEqual(rs.read(), tfile.read())
class M2tsReader(object):
    """Implements a simple Reader class that reads M2TS files.

    An .m2ts file is a sequence of 192-byte packets: a 4-byte
    TP_extra_header followed by a 188-byte MPEG transport stream packet
    that begins with the 0x47 synchronization byte.
    """
    def __init__(self, read_mode=M2tsReadMode.M2ts, path=None, stream=None,
                 match_offset=0, archived_file_name=""):
        assert path or stream
        if path:
            if is_rar(path):
                self._stream = RarStream(path, archived_file_name)
            else:
                self._stream = open(path, 'rb')
        elif stream:
            self._stream = stream
        self._stream.seek(0, 2)
        self._file_length = self._stream.tell()
        self.mode = read_mode
        self.read_done = True
        self.current_packet = None
        self.current_offset = 0

        # a valid m2ts file holds at least one full 192-byte packet
        if self._file_length < 192:
            raise InvalidDataException("File too small")

        # faster reconstructing when match_offset is provided
        if match_offset >= 8 and match_offset < self._file_length:
            # use lowest multiple of 192 <= offset as a starting point
            # FIX: the quotient itself was used as the seek position
            # before, which is not a packet boundary
            start = match_offset // PACKET_SIZE * PACKET_SIZE
            self._stream.seek(start)
            self.current_offset = start
        elif match_offset >= self._file_length:
            msg = "Invalid match offset for video: {0}".format(match_offset)
            raise InvalidMatchOffsetException(msg)
        else:
            # no useful matching offset against the main movie file
            self._stream.seek(0)

    def read(self):
        """Parses the next packet header; returns False at EOF.

        read() is invalid at this time: read_contents() or
        skip_contents() must be called before read() can be called again
        (except in SRS mode, where headers are stored back to back).
        """
        assert self.read_done or self.mode == M2tsReadMode.SRS
        self.read_done = False
        self._stream.seek(self.current_offset)

        # TP_extra_header (4 Bytes) + MPEG-2 transport stream header (4 B)
        header = self._stream.read(HEADER_SIZE)
        if not len(header):
            return False

        # FIX: the mode attribute was not compared before
        # (`if M2tsReadMode.M2ts:` was always truthy, so the SRS
        # branch below was unreachable)
        if self.mode == M2tsReadMode.M2ts:
            if self.current_offset + PACKET_SIZE > self._file_length:
                msg = "Invalid packet length at 0x{0:08X}"
                raise InvalidDataException(msg.format(self.current_offset))
        else:
            # SRS header data must be a multiple of 8
            if self.current_offset + HEADER_SIZE > self._file_length:
                raise InvalidDataException("Broken SRS file")

        # FIX: the sync byte is the first byte after the 4-byte
        # TP_extra_header (index 4, not 5), the comparison of an int
        # to b'\x47' was always False in Python 3, and the condition
        # must raise when the byte is NOT 0x47
        if header[4:5] != b'\x47':
            msg = "Invalid synchronization byte at 0x{0:08X}"
            raise InvalidDataException(msg.format(self.current_offset))

        packet = Packet(self.current_offset)
        packet.raw_header = header
        (byte8, ) = S_BYTE.unpack_from(header, 7)
        # two bits: bit 3 and 4 of last byte in the header
        packet.adaptation_field = (byte8 & 0x30) >> 4
        # last four bits of last byte in the header
        packet.continuity_counter = (byte8 & 0xF)
        (byte67, ) = S_SHORT.unpack_from(header, 5)
        # PID: lower 13 bits of bytes 6-7 of the TS header
        packet.pid = byte67 & 0x1FFF

        self.current_offset += PACKET_SIZE
        self.current_packet = packet
        return True

    def read_contents(self):
        """Reads the transport stream packet payload. (no 8B header)"""
        buff = b""
        if self.read_done:
            # payload already consumed: back up and read it again
            self._stream.seek(-PAYLOAD_SIZE, os.SEEK_CUR)
        self.read_done = True
        if self.mode != M2tsReadMode.SRS:
            buff = self._stream.read(PAYLOAD_SIZE)
        return buff

    def skip_contents(self):
        """Skips over the payload data to the next packet."""
        if not self.read_done:
            self.read_done = True
            if self.mode != M2tsReadMode.SRS:
                self._stream.seek(PAYLOAD_SIZE, os.SEEK_CUR)

    def close(self):
        try:  # close the file/stream; best effort
            self._stream.close()
        except:
            pass

    def __del__(self):
        try:  # close the file/stream; best effort
            self._stream.close()
        except:
            pass
class AsfReader(object):
    """Implements a simple Reader class that reads through WMV or
    WMV-SRS files one Object at a time.

    An ASF top-level object starts with a 16-byte GUID followed by a
    64-bit little-endian object size that includes the header itself.
    """
    def __init__(self, read_mode, path=None, stream=None,
                 archived_file_name=""):
        assert path or stream, "missing ASF reader input"
        if path:
            if is_rar(path):
                self._asf_stream = RarStream(path, archived_file_name)
            else:
                self._asf_stream = open(path, 'rb')
        elif stream:
            self._asf_stream = stream
        # determine total stream length once, then rewind
        self._asf_stream.seek(0, 2)
        self._file_length = self._asf_stream.tell()
        self._asf_stream.seek(0)
        self.mode = read_mode
        self.read_done = True
        self.current_object = None
        self.object_guid = None

    def read(self):
        """Parses the next top-level object header; returns False at EOF.

        Leaves the stream positioned at the START of the object, so
        read_contents()/move_to_child() can skip the raw header bytes.
        """
        # "Read() is invalid at this time", "MoveToChild(), ReadContents(), or
        # SkipContents() must be called before Read() can be called again")
        assert self.read_done or (self.mode == AsfReadMode.SRS
            and self.object_guid == GUID_DATA_OBJECT), \
            "AsfReader read() is invalid at this time"

        object_start_position = self._asf_stream.tell()
        self.current_object = None
        self.read_done = False

        # no room for GUID (16B) and size (8B) of the object
        if object_start_position + 24 > self._file_length:
            return False

        self._atom_header = self._asf_stream.read(24)
        # 16 bytes for GUID, 8 bytes for object size
        self.object_guid, size = struct.unpack("<16sQ", self._atom_header)

        # sanity check on object length
        # Skip check on GUID_DATA_OBJECT so we can still report expected size.
        # This is only applied on samples,
        # since a partial movie might still be useful.
        end_offset = object_start_position + size
        if (self.mode == AsfReadMode.Sample
            and self.object_guid != GUID_DATA_OBJECT
            and end_offset > self._file_length):
            raise InvalidDataException("Invalid object length at 0x%08X"
                % object_start_position)

        # these two objects carry extra fixed fields that belong to
        # the raw header (not to the payload)
        if self.object_guid == GUID_HEADER_OBJECT:
            self._atom_header += self._asf_stream.read(6)
        elif self.object_guid == GUID_DATA_OBJECT:
            self._atom_header += self._asf_stream.read(26)

        self.current_object = Object(size, self.object_guid)
        self.current_object.raw_header = self._atom_header
        self.current_object.start_pos = object_start_position

        # Calculate the size for the data object in SRS mode
        if (self.mode == AsfReadMode.SRS
            and self.object_guid == GUID_DATA_OBJECT):
            # size of the data object cannot be relied upon
            # so change size and end_offset
            o = self.current_object
            size = len(o.raw_header)
            i = 16 + 8 + 16
            (total_data_packets, ) = S_LONGLONG.unpack_from(o.raw_header, i)
            # data packet/media object size
            psize = (o.size - len(o.raw_header)) // total_data_packets
            rp_offsets = 0
            start = o.start_pos + len(o.raw_header)
            for i in range(total_data_packets):
                # calculate real packet size
                packet = AsfDataPacket()
                packet.data_file_offset = start + rp_offsets
                self._asf_stream.seek(packet.data_file_offset)
                # just read all of it to make it easier
                # SRS files are small anyway
                packet.data = self._asf_stream.read()
                # packet.data_size = len(data)  # psize
                s = asf_data_get_packet(packet, psize, AsfReadMode.SRS)
                rp_offsets += s
            # keep the original (stored) size around before patching it
            self.current_object.osize = self.current_object.size
            self.current_object.size = rp_offsets + size

        # rewind to the object start: read_contents()/move_to_child()
        # rely on this position to skip the raw header themselves
        self._asf_stream.seek(object_start_position, os.SEEK_SET)

        # New top-level objects should be added only between the
        # Data Object and Index Object(s).
        return True

    def read_contents(self):
        """Returns the object payload (header bytes excluded)."""
        # if read_done is set, we've already read or skipped it.
        # back up and read again?
        if self.read_done:
            self._asf_stream.seek(self.current_object.start_pos, os.SEEK_SET)
        self.read_done = True
        # skip header bytes
        hl = len(self.current_object.raw_header)
        self._asf_stream.seek(hl, os.SEEK_CUR)
        buff = self._asf_stream.read(self.current_object.size - hl)
        return buff

    def read_data_part(self, offset, length):
        """Reads `length` bytes at absolute `offset`; marks the object
        as consumed when the read ends exactly at the object boundary."""
        if (offset + length ==
            self.current_object.start_pos + self.current_object.size):
            self.read_done = True
        self._asf_stream.seek(offset, os.SEEK_SET)
        return self._asf_stream.read(length)

    def skip_contents(self):
        """Positions the stream at the start of the next object."""
        if not self.read_done:
            self.read_done = True
            self._asf_stream.seek(self.current_object.start_pos
                + self.current_object.size, os.SEEK_SET)

    def move_to_child(self):
        """Descends into a container object by skipping its raw header."""
        self.read_done = True
        # skip the header bytes
        hl = len(self.current_object.raw_header)
        self._asf_stream.seek(hl, os.SEEK_CUR)

    def close(self):
        try:  # close the file/stream; best effort
            self._asf_stream.close()
        except:
            pass

    def __del__(self):
        try:  # close the file/stream; best effort
            self._asf_stream.close()
        except:
            pass
class StreamReader(object):
    """Implements a simple Reader class that reads STREAM-SRS files.

    The whole file is indexed eagerly in __init__: each block is a
    4-byte ASCII marker followed by a 4-byte size (size includes the
    8-byte header itself).
    """
    def __init__(self, path=None, stream=None, archived_file_name=""):
        assert path or stream
        if path:
            if is_rar(path):
                self._stream = RarStream(path, archived_file_name)
            else:
                self._stream = open(path, 'rb')
        elif stream:
            self._stream = stream
        self._stream.seek(0, 2)
        self._file_length = self._stream.tell()
        self._stream.seek(0)
        self.current_block = None
        self.blocks = []

        # walk the block chain and build the index
        pos = 0
        while pos < self._file_length:
            if pos + 8 > self._file_length:
                raise InvalidDataException("SRS file too small!")
            # header: block signature
            marker = self._stream.read(4)
            if pos == 0 and marker not in (STREAM_MARKER, M2TS_MARKER):
                raise InvalidDataException("Not a stream or m2ts SRS file!")
            if marker not in (b"STRM", b"SRSF", b"SRST", b"M2TS", b"HDRS"):
                # NOTE: unknown markers are kept as bytes (not decoded)
                print("Unknown header block encountered")
            else:
                marker = marker.decode("ascii")
            # header: block size
            (size, ) = S_LONG.unpack(self._stream.read(4))
            block = Block(size, marker, pos)
            self.blocks.append(block)
            if _DEBUG:
                print(block)
            if size == 0 and pos != 0:
                # only allowed for the marker block
                raise InvalidDataException("SRS size field is zero")
            pos += size
            if pos > self._file_length:
                raise InvalidDataException("SRS file too small!")
            self._stream.seek(pos)
        self._stream.seek(0)

    def read(self):
        """Generator over the indexed blocks; tracks current_block."""
        for block in self.blocks:
            self.current_block = block
            yield block

    def read_contents(self):
        """Skips the marker and size fields"""
        self._stream.seek(self.current_block.start_pos + 8, os.SEEK_SET)
        return self._stream.read(self.current_block.size - 8)

    def close(self):
        try:  # close the file/stream; best effort
            self._stream.close()
        except:
            pass

    def __del__(self):
        try:  # close the file/stream; best effort
            self._stream.close()
        except:
            pass
class Mp3Reader(object):
    """Implements a simple Reader class that reads through MP3 or
    MP3-SRS files one block at a time.

    The constructor parses the whole file immediately and builds
    self.blocks: leading ID3v2 tag(s), trailing ID3v1 / Lyrics3 /
    Lyrics3v2 / APE tags, and the main MP3 (or SRS) data in between.
    """
    def __init__(self, path=None, stream=None, archived_file_name=""):
        assert path or stream
        if path:
            if is_rar(path):
                self._mp3_stream = RarStream(path, archived_file_name)
            else:
                self._mp3_stream = open(path, 'rb')
        elif stream:
            self._mp3_stream = stream
        self._mp3_stream.seek(0, 2)  # reset on ID3v2 tag search
        self._file_length = self._mp3_stream.tell()

        self.current_block = None
        self.blocks = []
        begin_main_content = 0

        # easier for corner case ("ID3" multiple times before sync)
        last_id3v2 = None

        # parse the whole file immediately!
        # 1) check for ID3v2 (beginning of mp3 file)
        # The ID3v2 tag size is the size of the complete tag after
        # unsychronisation, including padding, excluding the header but not
        # excluding the extended header (total tag size - 10). Only 28 bits
        # (representing up to 256MB) are used in the size description to avoid
        # the introduction of 'false syncsignals'.
        # http://id3.org/id3v2.4.0-structure
        while True:  # tag should be here only once
            # detect repeating leading ID3 tags in the srs files
            startpos = begin_main_content
            self._mp3_stream.seek(startpos, os.SEEK_SET)
            if self._mp3_stream.read(3) == b"ID3":
                # skip ID3v2 version (2 bytes) and flags (1 byte)
                self._mp3_stream.seek(3, os.SEEK_CUR)
                sbytes = self._mp3_stream.read(4)
                size = decode_id3_size(sbytes)
                tag_size = 10 + size  # 3 + 3 + 4
                last_id3v2 = Block(tag_size, "ID3", startpos)
                self.blocks.append(last_id3v2)
                begin_main_content += tag_size
            else:
                break

        # 2) check for ID3v1 (last 128 bytes of mp3 file)
        end_meta_data_offset = self._file_length
        self._mp3_stream.seek(-128, os.SEEK_END)
        idv1_start_offset = self._mp3_stream.tell()
        first = self._mp3_stream.read(3)
        if first == b"TAG":
            idv1_block = Block(128, "TAG", idv1_start_offset)
            self.blocks.append(idv1_block)
            end_meta_data_offset = idv1_start_offset

        # 3) check for http://id3.org/Lyrics3v2
        # "The Lyrics3 block, after the MP3 audio and before the ID3 tag,
        # begins with the word "LYRICSBEGIN" after which a number of field
        # records follows. The Lyrics3 block ends with a six character size
        # descriptor and the string "LYRICS200". The size value includes the
        # "LYRICSBEGIN" string, but does not include the 6 character size
        # descriptor and the trailing "LYRICS200" string.
        if end_meta_data_offset - 6 - 9 >= 0:
            self._mp3_stream.seek(end_meta_data_offset - 6 - 9, os.SEEK_SET)
            lyrics_footer = self._mp3_stream.read(6 + 9)
            if lyrics_footer[6:] == b"LYRICS200":
                lyrics_size = int(lyrics_footer[:6])  # only header + body
                lyrics3v2_block = Block(lyrics_size + 6 + 9, "LYRICS200",
                    end_meta_data_offset - (lyrics_size + 6 + 9))
                self.blocks.append(lyrics3v2_block)
                end_meta_data_offset -= (lyrics_size + 6 + 9)

        # 4) check for http://id3.org/Lyrics3
        if end_meta_data_offset - 9 >= 0:
            self._mp3_stream.seek(end_meta_data_offset - 9, os.SEEK_SET)
            if b"LYRICSEND" == self._mp3_stream.read(9):
                # search the start marker in the (max 5100 bytes) window
                self._mp3_stream.seek(end_meta_data_offset - 5100,
                                      os.SEEK_SET)
                lyrics_data = self._mp3_stream.read(5100)
                index = lyrics_data.find(b"LYRICSBEGIN")
                if index == -1:
                    raise InvalidDataException(
                        "Unable to find start of LyricsV1 block")
                start_block = end_meta_data_offset - 5100 + index
                lyrics3_block = Block(end_meta_data_offset - start_block,
                                      "LYRICS", start_block)
                self.blocks.append(lyrics3_block)
                end_meta_data_offset -= lyrics3_block.size

        # 5) APE tags
        # "Tag size in bytes including footer and all tag items excluding
        # the header to be as compatible as possible with APE Tags 1.000"
        # "An APEv1 tag at the end of a file must have at least a footer, APEv1
        # tags may never be used at the beginning of a file
        # (unlike APEv2 tags)."
        if end_meta_data_offset - 32 >= 0:
            self._mp3_stream.seek(end_meta_data_offset - 32, os.SEEK_SET)
            if b"APETAGEX" == self._mp3_stream.read(8):
                (version,) = S_LONG.unpack(self._mp3_stream.read(4))
                if version == 2000:
                    # APEv2 also has a 32-byte header before the items
                    header = 32
                else:  # 1000
                    header = 0
                (size,) = S_LONG.unpack(self._mp3_stream.read(4))
                start_block = end_meta_data_offset - size - header
                apev2_block = Block(end_meta_data_offset - start_block,
                                    "APE%s" % version, start_block)
                self.blocks.append(apev2_block)
                end_meta_data_offset -= apev2_block.size

        def marker_has_issues(marker):
            # True when `marker` cannot start MP3/RIFF/SRS main data
            if len(marker) != 4:
                return True
            (sync,) = BE_SHORT.unpack(marker[:2])
            sync_bytes = sync & 0xFFE0 == 0xFFE0
            if not sync_bytes and marker not in (b"RIFF", b"SRSF"):
                return True
            return False

        # in between is SRS or MP3 data
        self._mp3_stream.seek(begin_main_content, os.SEEK_SET)
        marker = self._mp3_stream.read(4)

        if last_id3v2 and marker_has_issues(marker):
            # problem with (angelmoon)-hes_all_i_want_cd_pg2k-bmi
            # The .mp3 files contain ID3+nfo before the real ID3 starts
            # And it's also a RIFF mp3, so it won't play without removing
            # the bad initial tag first.
            # This can cause the space between the "ID3" and the end tag
            # to be empty. (or just wrong)
            # Mickey_K.-Distracted-(DNR019F8)-WEB-2008-B2R has the 'ID3'
            # string in the ID3v2 tag for
            # 02-mickey_k.-distracted_-_dub_mix.mp3
            last_id3 = last_id3v2_before_sync(self._mp3_stream,
                                              self._file_length)
            dupe_id3_string = last_id3 != last_id3v2.start_pos
            after_v2_tag = last_id3 >= last_id3v2.start_pos + last_id3v2.size

            if dupe_id3_string and after_v2_tag:
                # another 'ID3' string found after id3v2 tag
                self._mp3_stream.seek(last_id3 + 3 + 3, os.SEEK_SET)
                sbytes = self._mp3_stream.read(4)
                size = decode_id3_size(sbytes)
                begin_main_content = last_id3 + 10 + size  # 3 + 3 + 4
                # add extra amount of data to the last block
                last_id3v2.size = begin_main_content - last_id3v2.start_pos
            elif dupe_id3_string and not after_v2_tag:
                # another 'ID3' string found inside first id3v2 tag
                if begin_main_content > self._file_length:
                    # first tag is corrupt by definition
                    # assume latter tag to be the good one: parse it
                    # skip 'ID3' + ID3v2 version (2 bytes) and flags (1 byte)
                    self._mp3_stream.seek(last_id3 + 6, os.SEEK_SET)
                    sbytes = self._mp3_stream.read(4)
                    size = decode_id3_size(sbytes)
                    tag_size = 10 + size  # 3 + 3 + 4
                    last_id3v2 = Block(tag_size, "ID3", last_id3)
                    self.blocks.append(last_id3v2)
                    begin_main_content = last_id3 + tag_size
            # re-read the marker now that begin_main_content moved
            self._mp3_stream.seek(begin_main_content, os.SEEK_SET)
            marker = self._mp3_stream.read(4)

        if not len(marker):
            # there still is something horribly wrong
            # (unless you think that an mp3 without any music data
            # is possible)
            raise InvalidDataException("Tagging f****d up big time!")
        (sync,) = BE_SHORT.unpack(marker[:2])
        main_size = end_meta_data_offset - begin_main_content
        if marker[:3] == b"SRS":
            # SRS data blocks
            cur_pos = begin_main_content
            while cur_pos < begin_main_content + main_size:
                self._mp3_stream.seek(cur_pos, os.SEEK_SET)
                # SRSF, SRST and SRSP
                try:
                    marker = self._mp3_stream.read(4)
                    # size includes the 8 bytes header
                    (size,) = S_LONG.unpack(self._mp3_stream.read(4))
                except:
                    raise InvalidDataException("Not enough SRS data")
                srs_block = Block(size, marker.decode("ascii"), cur_pos)
                self.blocks.append(srs_block)
                cur_pos += size
                if size == 0:
                    raise InvalidDataException("SRS size field is zero")
                if size > begin_main_content + main_size:
                    raise InvalidDataException("Broken SRS")
        elif sync & 0xFFE0 == 0xFFE0 or marker == b"RIFF":
            # first 11 bits all 1 for MP3 frame marker
            mp3_data_block = Block(main_size, "MP3", begin_main_content)
            self.blocks.append(mp3_data_block)
        else:
            print("WARNING: MP3 file is not valid!")
            data_block = Block(main_size, "MP3", begin_main_content)
            self.blocks.append(data_block)

        # the order of which we add blocks doesn't matter this way
        self.blocks.sort(key=lambda block: block.start_pos)

    def read(self):
        """Generator over the indexed blocks; tracks current_block."""
        for block in self.blocks:
            self.current_block = block
            if _DEBUG:
                print(block)
            yield block

    def read_contents(self):
        """Returns the full raw bytes of the current block."""
        self._mp3_stream.seek(self.current_block.start_pos, os.SEEK_SET)
        return self._mp3_stream.read(self.current_block.size)

    def read_part(self, size, offset=0):
        """Reads `size` bytes at `offset` relative to the block start."""
        if (self.current_block.start_pos + offset + size >
            self.current_block.start_pos + self.current_block.size):
            raise ValueError("Can't read beyond end of block.")
        self._mp3_stream.seek(self.current_block.start_pos + offset,
                              os.SEEK_SET)
        return self._mp3_stream.read(size)

    def close(self):
        try:  # close the file/stream; best effort
            self._mp3_stream.close()
        except:
            pass

    def __del__(self):
        try:  # close the file/stream; best effort
            self._mp3_stream.close()
        except:
            pass
class EbmlReader(object):
    """Implements a simple Reader class that reads through MKV or
    MKV-SRS files one element at a time.

    EBML elements are <variable-length ID><variable-length size><data>;
    the leading bits of the first byte encode each field's length.
    """
    def __init__(self, read_mode, path=None, stream=None,
                 archived_file_name=""):
        assert path or stream
        self.element_header = b""  # 12 bytes
        self._ebml_stream = None
        self.mode = None
        self.read_done = True

        self.current_element = None
        self.element_type = None

        # when not empty: an expected file size has been printed
        # to stderr already when data was missing
        self.expected_file_size = ""

        if path:
            if is_rar(path):
                self._ebml_stream = RarStream(path, archived_file_name)
            else:
                self._ebml_stream = open(path, 'rb')
        elif stream:
            self._ebml_stream = stream
        else:
            assert False
        self._ebml_stream.seek(0, 2)
        self._file_length = self._ebml_stream.tell()
        self._ebml_stream.seek(0)
        self.mode = read_mode

    def read(self):
        """Parses the next element header; returns False at EOF."""
        # "Read() is invalid at this time", "MoveToChild(), ReadContents(), or
        # SkipContents() must be called before Read() can be called again"
        assert self.read_done or (self.mode == EbmlReadMode.SRS and
            self.element_type == EbmlElementType.Block), "improper state"

        element_start_position = self._ebml_stream.tell()

        # too little data (+2: 1B element ID + 1B data size)
        if element_start_position + 2 > self._file_length:
            return False

        self.current_element = None
        self.read_done = False

        # 1) Element ID ---------------------------------------------------
        # length descriptor: the leading bits of the header
        # used to identify the length of the ID (ID: like xml tags)
        read_byte = self._ebml_stream.read(1)
        if not len(read_byte):
            return False  # raise ValueError("Missing data")
        (id_length_descriptor, ) = BE_BYTE.unpack(read_byte)
        id_length_descriptor = GetUIntLength(id_length_descriptor)
        self.element_header = read_byte
        self.element_header += self._ebml_stream.read(
            id_length_descriptor - 1)

        # 2) Data size ----------------------------------------------------
        read_byte = self._ebml_stream.read(1)
        if not len(read_byte):
            return False  # raise ValueError("Missing data")
        (data_length_descriptor, ) = BE_BYTE.unpack(read_byte)
        data_length_descriptor = GetUIntLength(data_length_descriptor)
        self.element_header += read_byte
        self.element_header += self._ebml_stream.read(
            data_length_descriptor - 1)
        assert id_length_descriptor + data_length_descriptor == len(
            self.element_header)

        # 3) Data ---------------------------------------------------------
        eh = self.element_header[0:id_length_descriptor]
        self.element_type = id_type_mapping.get(eh, EbmlElementType.Unknown)
        element_length = GetEbmlUInt(self.element_header,
            id_length_descriptor, data_length_descriptor)

        # sanity check on element length. skip check on Segment element so we
        # can still report expected size. this is only applied on samples
        # since a partial movie might still be useful
        endOffset = (element_start_position + id_length_descriptor +
                     data_length_descriptor + element_length)
        if (self.mode == EbmlReadMode.Sample and
            self.element_type != EbmlElementType.Segment and
            endOffset > self._file_length):
            if self.expected_file_size:
                msg = ("Invalid element length at 0x{0:08X}. "
                       "Expected size: {1} bytes".format(
                           element_start_position, self.expected_file_size))
                raise InvalidDataException(msg)
            else:
                msg = "Invalid element length at 0x{0:08X}"
                raise InvalidDataException(
                    msg.format(element_start_position))

        if self.element_type != EbmlElementType.Block:
            self.current_element = EbmlElement()
            self.current_element.raw_header = self.element_header
            self.current_element.element_start_pos = element_start_position
            self.current_element.length = element_length
        else:  # it's a block
            # first thing in the block is the track number
            trackDescriptor = self._ebml_stream.read(1)
            blockHeader = trackDescriptor
            trackDescriptor = GetUIntLength(
                BE_BYTE.unpack(trackDescriptor)[0])

            # incredibly unlikely the track number is > 1 byte,
            # but just to be safe...
            if trackDescriptor > 1:
                blockHeader += self._ebml_stream.read(trackDescriptor - 1)
            trackno = GetEbmlUInt(blockHeader, 0, trackDescriptor)

            # read in time code (2 bytes) and flags (1 byte)
            blockHeader += self._ebml_stream.read(3)
            timecode = ((BE_BYTE.unpack_from(blockHeader,
                            len(blockHeader) - 3)[0] << 8) +
                        BE_BYTE.unpack_from(blockHeader,
                            len(blockHeader) - 2)[0])

            # need to grab the flags (last byte of the header)
            # to check for lacing
            lace_type = (BE_BYTE.unpack_from(blockHeader,
                len(blockHeader) - 1)[0] & EbmlLaceType.EBML)
            data_length = element_length - len(blockHeader)
            frameSizes, bytesConsumed = GetBlockFrameLengths(
                lace_type, data_length, self._ebml_stream)
            if bytesConsumed > 0:
                # lacing sizes were read from the stream: fold them
                # into the raw block header
                newBlockHeader = blockHeader
                self._ebml_stream.seek(-bytesConsumed, os.SEEK_CUR)
                newBlockHeader += self._ebml_stream.read(bytesConsumed)
                blockHeader = newBlockHeader

            element_length -= len(blockHeader)

            self.current_element = BlockElement()
            self.current_element.track_number = trackno
            self.current_element.timecode = timecode
            self.current_element.frame_lengths = frameSizes
            self.current_element.raw_block_header = blockHeader
            self.current_element.raw_header = self.element_header
            self.current_element.element_start_pos = element_start_position
            self.current_element.length = element_length

        # the following line will write mkvinfo-like output from the parser
        # (extremely useful for debugging)
        # print("{0}: {3} + {1} bytes @ {2}".format(
        #     EbmlElementTypeName[self.element_type],
        #     element_length,  # without header
        #     element_start_position,
        #     len(self.element_header)))

        return True

    def read_contents(self):
        """Returns the element payload (None for Blocks in SRS mode)."""
        # if readReady is set, we've already read or skipped it.
        # back up and read again?
        if self.read_done:
            self._ebml_stream.seek(-self.current_element.length,
                                   os.SEEK_CUR)
        self.read_done = True
        buff = None
        if (self.mode != EbmlReadMode.SRS or
            self.element_type != EbmlElementType.Block):
            buff = self._ebml_stream.read(self.current_element.length)
        return buff

    def skip_contents(self):
        """Advances past the element payload without reading it."""
        if not self.read_done:
            self.read_done = True
            if (self.mode != EbmlReadMode.SRS or
                self.element_type != EbmlElementType.Block):
                self._ebml_stream.seek(self.current_element.length,
                                       os.SEEK_CUR)

    def move_to_child(self):
        """Descends into a container element (stays at payload start)."""
        if self.read_done:
            self._ebml_stream.seek(-self.current_element.length,
                                   os.SEEK_CUR)
        self.read_done = True

    def close(self):
        try:  # close the file/stream; best effort
            self._ebml_stream.close()
        except:
            pass

    def __del__(self):
        try:  # close the file/stream; best effort
            self._ebml_stream.close()
        except:
            pass
class FlacReader(object):
    """Implements a simple Reader class that reads through FLAC or
    FLAC-SRS files one block at a time.

    Handles the 'fLaC' marker, leading ID3v2 / trailing ID3v1 tags,
    metadata blocks and the frame-data region. read() always leaves
    the stream at the START of the block it just parsed.
    """
    def __init__(self, path=None, stream=None, archived_file_name=""):
        assert path or stream
        if path:
            if is_rar(path):
                self._flac_stream = RarStream(path, archived_file_name)
            else:
                self._flac_stream = open(path, 'rb')
        elif stream:
            self._flac_stream = stream
        self._flac_stream.seek(0, 2)
        self._file_length = self._flac_stream.tell()
        self._flac_stream.seek(0)

        self.read_done = True
        self.current_block = None
        self.block_type = None

    def read(self):
        """Parses the next block header; returns False at EOF."""
        assert self.read_done

        block_start_position = self._flac_stream.tell()
        self.current_block = None
        self.read_done = False

        if block_start_position == self._file_length:
            return False

        self._block_header = self._flac_stream.read(4)

        # METADATA_BLOCK_HEADER
        # <1> Last-metadata-block flag: '1' if this block is the last
        #     metadata block before the audio blocks, '0' otherwise.
        # <7> BLOCK_TYPE
        # <24> Length (in bytes) of metadata to follow
        #      (does not include the size of the METADATA_BLOCK_HEADER)

        if self._block_header == b"fLaC":
            self.block_type = "fLaC"
            self.current_block = Block(0, self.block_type)
            self.current_block.raw_header = b"fLaC"
            self.current_block.start_pos = block_start_position
            self._flac_stream.seek(block_start_position, os.SEEK_SET)
            return True

        # ID3v2
        if self._block_header.startswith(b"ID3"):
            self.block_type = "ID3"
            self._flac_stream.seek(block_start_position, os.SEEK_SET)
            raw_header = self._flac_stream.read(10)
            size = decode_id3_size(raw_header[6:10])
            self.current_block = Block(size, self.block_type)
            self.current_block.raw_header = raw_header
            self.current_block.start_pos = block_start_position
            self._flac_stream.seek(block_start_position, os.SEEK_SET)
            return True

        # ID3v1
        if self._block_header.startswith(b"TAG"):
            self.block_type = "TAG"
            self.current_block = Block(128, self.block_type)
            self.current_block.raw_header = b""
            self.current_block.start_pos = block_start_position
            self._flac_stream.seek(block_start_position, os.SEEK_SET)
            return True

        (self.block_type, ) = BE_BYTE.unpack_from(self._block_header, 0)
        if self.block_type == 0xFF:  # frame data
            # everything up to EOF (minus a trailing ID3v1 tag, if any)
            block_length = self._file_length - block_start_position
            # check for ID3v1 tag
            self._flac_stream.seek(self._file_length - 128)
            if self._flac_stream.read(3) == b"TAG":
                block_length -= 128
            # frame data has no raw header of its own
            self._block_header = b""
        else:
            (block_length, ) = BE_LONG.unpack(b"\x00" +
                                              self._block_header[1:])

        # sanity check on block length
        end_offset = block_start_position + block_length
        if (end_offset > self._file_length):
            raise InvalidDataException("Invalid block length at 0x%08X" %
                                       block_start_position)

        self.current_block = Block(block_length, self.block_type)
        self.current_block.raw_header = self._block_header
        self.current_block.start_pos = block_start_position

        self._flac_stream.seek(block_start_position, os.SEEK_SET)

        return True

    def read_contents(self):
        """Returns the block payload (raw header bytes excluded)."""
        # if read_done is set, we've already read or skipped it.
        # back up and read again?
        if self.read_done:
            self._flac_stream.seek(self.current_block.start_pos,
                                   os.SEEK_SET)
        self.read_done = True
        # skip header bytes
        hl = len(self.current_block.raw_header)
        self._flac_stream.seek(hl, os.SEEK_CUR)
        buff = self._flac_stream.read(self.current_block.size)
        return buff

    def skip_contents(self):
        """Positions the stream at the start of the next block."""
        if not self.read_done:
            self.read_done = True
            self._flac_stream.seek(self.current_block.start_pos +
                len(self.current_block.raw_header) +
                self.current_block.size, os.SEEK_SET)

    def read_part(self, size, offset=0):
        """idempotent operation

        Reads `size` bytes at `offset` into the payload without
        disturbing the stream position.
        """
        hl = len(self.current_block.raw_header)
        initial_offset = self._flac_stream.tell()
        if initial_offset != self.current_block.start_pos:
            self._flac_stream.seek(self.current_block.start_pos,
                                   os.SEEK_SET)
        self._flac_stream.seek(offset + hl, os.SEEK_CUR)
        data = self._flac_stream.read(size)
        self._flac_stream.seek(initial_offset, os.SEEK_SET)
        return data

    def close(self):
        try:  # close the file/stream; best effort
            self._flac_stream.close()
        except:
            pass

    def __del__(self):
        try:  # close the file/stream; best effort
            self._flac_stream.close()
        except:
            pass
class RiffReader(object):
    """Implements a simple Reader class that reads through AVI
    or AVI-SRS files one chunk at a time."""
    def __init__(self, read_mode, path=None, stream=None,
                 match_offset=0, archived_file_name=""):
        """read_mode: a RiffReadMode value (Sample/SRS/...)
        path: AVI file, or RAR archive containing one, to read from
        stream: already opened stream to read from instead of a path
        match_offset: absolute file offset to fast-forward to; raises
            InvalidMatchOffsetException when it is past the end of the file
        archived_file_name: name of the file inside the RAR archive"""
        # open the data source: plain file, file inside a RAR, or a stream
        if path:
            if is_rar(path):
                self._riff_stream = RarStream(path, archived_file_name)
            else:
                self._riff_stream = open(path, 'rb')
        elif stream:
            self._riff_stream = stream
        else:
            assert False  # either 'path' or 'stream' must be supplied
        # measure total stream size once
        self._riff_stream.seek(0, os.SEEK_END)
        self._file_length = self._riff_stream.tell()
        self.mode = read_mode
        self.read_done = True

        self.current_chunk = None
        self.chunk_type = None
        self.has_padding = False
        self.padding_byte = ""

        # faster reconstructing when match_offset is provided
        if match_offset >= 8 and match_offset < self._file_length:
            # -8 is there to add the chunk header for read()
            if self._is_valid_chunk_location(match_offset - 8):
                # yes! reconstruction will be fast
                self._riff_stream.seek(match_offset - 8, os.SEEK_SET)
            else:
                # match offset is not at the start boundary of a chunk
                chunk_offset = self._find_chunk_offset(match_offset)
                if _DEBUG:
                    print("Match offset doesn't start on a nice boundary.")
                    print("Chunk offset: {0}".format(chunk_offset))
                    print("Match offset: {0}".format(match_offset))
                assert chunk_offset < match_offset
                self._riff_stream.seek(chunk_offset, os.SEEK_SET)

            # re-initialisation
            # (_find_chunk_offset() above may have clobbered this state
            # by calling read())
            self.read_done = True
            self.current_chunk = None
            self.chunk_type = None
            self.has_padding = False
            self.padding_byte = ""
        elif match_offset >= self._file_length:
            msg = "Invalid match offset for video: {0}".format(match_offset)
            raise InvalidMatchOffsetException(msg)
        else:
            # no useful matching offset against the main movie file
            self._riff_stream.seek(0)

    def _is_valid_chunk_location(self, offset):
        """Checks whether a certain offset is a valid chunk location
        to start processing from.
        Based on Four Character Code."""
        self._riff_stream.seek(offset, os.SEEK_SET)
        fourcc = self._riff_stream.read(4)
        return fourCCValidator.match(fourcc)

    def _find_chunk_offset(self, match_offset):
        """Finds the start offset of the chunk for the match_offset.
        It uses the index at the end of the file."""
        self._riff_stream.seek(0, os.SEEK_SET)
        index_data = ""
        movi_start = 0

        # walk the top-level chunks until the index ('idx1') is found
        while self.read():
            fourcc = self.current_chunk.fourcc
            if fourcc == b"AVI ":
                # the index is in here
                self.move_to_child()
            elif fourcc == b"movi":
                # location where the index is relative to
                movi_start = self.current_chunk.chunk_start_pos
            elif self.chunk_type == RiffChunkType.Index:
                index_data = self.read_contents()
                break
            self.skip_contents()

        # https://msdn.microsoft.com/en-us/library/windows/desktop/dd318181(v=vs.85).aspx
        # we've found the index
        if movi_start and len(index_data):
            # read chunk positions until an _absolute_ file position larger
            # than our match offset is found
            offsets = []
            offset = 0
            idxpos = 0
            while offset < match_offset and idxpos + 16 <= len(index_data):
                (offset,) = S_LONG.unpack(index_data[idxpos + 8:idxpos + 12])
                offsets.append(offset)
                # each index entry is 16 bytes:
                # ckid, dwFlags, dwChunkOffset, dwChunkLength
                idxpos += 16

            # choose the last _relative_ chunk smaller than the match offset
            # the match offset is absolute form the beginning of the file
            for offset in reversed(offsets):
                start_offset = movi_start + 8 + offset
                if start_offset < match_offset:
                    if self._is_valid_chunk_location(start_offset):
                        return start_offset
                    else:
                        if _DEBUG:
                            print("AVI doesn't follow the 'idx1' spec.")
                        break

            # assume the AVI doesn't follow the specification
            # (some files store absolute offsets in the index instead)
            for offset in reversed(offsets):
                if offset < match_offset:
                    if self._is_valid_chunk_location(offset):
                        return offset
                    else:
                        if _DEBUG:
                            print("The index offset wasn't usable.")
                        return 0
            return 0
        return 0

    def read(self):
        """Reads the next chunk header; returns True on success, False at
        end of file. Raises InvalidDataException on a corrupt header."""
        # "Read() is invalid at this time", "MoveToChild(), ReadContents(), or
        # SkipContents() must be called before Read() can be called again");
        assert self.read_done or (self.mode == RiffReadMode.SRS
                                  and self.chunk_type == RiffChunkType.Movi)

        # includes 8 byte header
        chunk_start_position = self._riff_stream.tell()
        self.current_chunk = None
        self.read_done = False

        if chunk_start_position + 8 > self._file_length:
            return False

        chunk_header = self._riff_stream.read(8)
        # 4 bytes for fourcc, 4 for chunk length
        fourcc = chunk_header[:4]
        (chunk_length,) = S_LONG.unpack_from(chunk_header, 4)

        # might not keep this check
        # the length check should catch corruption on its own...
        if not fourCCValidator.match(fourcc):
            raise InvalidDataException("Invalid FourCC value (%r) at 0x%08X" %
                (fourcc, chunk_start_position))

        # sanity check on chunk length
        # Skip check on RIFF list so we can still report expected size.
        # This is only applied on samples,
        # since a partial movie might still be useful.
        endOffset = chunk_start_position + 8 + chunk_length
        if (self.mode == RiffReadMode.Sample and fourcc != b"RIFF" and
            endOffset > self._file_length):
            raise InvalidDataException("Invalid chunk length at 0x%08X" %
                (chunk_start_position + 4))

        # Lists
        if fourcc == b"RIFF" or fourcc == b"LIST":
            # if the fourcc indicates a list type (RIFF or LIST),
            # there is another fourcc code in the next 4 bytes
            listType = fourcc
            chunk_header += self._riff_stream.read(4)
            fourcc = chunk_header[8:12]
            chunk_length -= 4 # extra dwFourCC

            self.chunk_type = RiffChunkType.List
            self.current_chunk = RiffList()
            self.current_chunk.list_type = listType # RIFF list specific
            self.current_chunk.fourcc = fourcc
            self.current_chunk.length = chunk_length
            self.current_chunk.raw_header = chunk_header
            self.current_chunk.chunk_start_pos = chunk_start_position
        else: # Chunks
            # Chunk containing video, audio or subtitle data
            # (movi data chunk ids start with a 2-digit stream number)
            if chunk_header[:2].isdigit():
                self.current_chunk = MoviChunk()
                self.current_chunk.stream_number = int(fourcc[:2])
                self.chunk_type = RiffChunkType.Movi
            elif fourcc == b"idx1":
                self.current_chunk = RiffChunk()
                self.chunk_type = RiffChunkType.Index
            else:
                self.current_chunk = RiffChunk()
                self.chunk_type = RiffChunkType.Unknown
            self.current_chunk.fourcc = fourcc
            self.current_chunk.length = chunk_length
            self.current_chunk.raw_header = chunk_header
            self.current_chunk.chunk_start_pos = chunk_start_position
        # odd-sized chunks are padded to an even byte boundary
        self.has_padding = chunk_length % 2 == 1

        return True

    def read_contents(self):
        """Returns the current chunk's payload and leaves the stream
        positioned after it (and after its padding byte, if any).
        Returns None in SRS mode for movi chunks: their data is not
        stored in an SRS file."""
        # if read_done is set, we've already read or skipped it.
        # back up and read again?
        if self.read_done:
            self._riff_stream.seek(-self.current_chunk.length -
                (1 if self.has_padding else 0), os.SEEK_CUR)

        self.read_done = True
        buff = None

        if (self.mode != RiffReadMode.SRS or
            self.chunk_type != RiffChunkType.Movi):
            buff = self._riff_stream.read(self.current_chunk.length)
            if self.has_padding:
                # keep the padding byte: it is needed for reconstruction
                (self.padding_byte,) = S_BYTE.unpack(self._riff_stream.read(1))
        return buff

    def skip_contents(self):
        """Advances the stream past the current chunk (and its padding
        byte) without returning the payload."""
        if not self.read_done:
            self.read_done = True
            if (self.mode != RiffReadMode.SRS or
                self.chunk_type != RiffChunkType.Movi):
                self._riff_stream.seek(self.current_chunk.length, os.SEEK_CUR)
                if self.has_padding:
                    # keep the padding byte: it is needed for reconstruction
                    (self.padding_byte,) = S_BYTE.unpack(
                        self._riff_stream.read(1))

    def move_to_child(self):
        """Descends into the current RIFF/LIST chunk: the next read() will
        return its first child chunk."""
        # "MoveToChild() should only be called on a RIFF List");
        assert self.chunk_type == RiffChunkType.List
        self.read_done = True

    def close(self):
        """Closes the underlying file/stream; any error is ignored."""
        try: # close the file/stream
            self._riff_stream.close()
        except:
            pass

    def __del__(self):
        """Best-effort cleanup: close the underlying file/stream."""
        try: # close the file/stream
            self._riff_stream.close()
        except:
            pass