Example #1
0
    def test_file(self):
        """Tests if the data read through RarStream equals the extracted file.

        Three archived files are compared against their extracted copies,
        the last two while seeking around in both streams.
        """
        archive = os.path.join(self.path, "store_little", "store_little.rar")
        expected_path = os.path.join(self.path, "txt", "little_file.txt")
        stream = RarStream(archive)
        with open(expected_path, "rb") as expected:
            self.assertEqual(stream.read(), expected.read())

        archive = os.path.join(
            self.path, "store_split_folder_old_srrsfv_windows",
            "winrar2.80.rar")
        expected_path = os.path.join(self.path, "txt", "unicode_dos.nfo")
        # unicode_dos.nfo: 3.316 bytes
        stream = RarStream(archive, "unicode_dos.nfo")
        with open(expected_path, "rb") as expected:
            # move around at and past the end of the archived file
            stream.seek(3316)
            new_position = stream.seek(6316)
            self.assertEqual(new_position, stream.tell())
            stream.seek(3312)
            expected.seek(3336, os.SEEK_SET)
            expected.seek(6336, os.SEEK_SET)
            stream.read(4)

            # complete contents from the start
            stream.seek(0)
            expected.seek(0)
            self.assertEqual(stream.read(), expected.read())

            # only the last 20 bytes, positioned relative to the end
            expected.seek(-20, os.SEEK_END)
            self.assertEqual(stream.seek(-20, os.SEEK_END), expected.tell())
            self.assertEqual(stream.read(), expected.read())
        stream.close()
        self.assertEqual(stream.closed, True, "Stream not closed")

        # another file from the same archive, read from offset 333
        expected_path = os.path.join(self.path, "txt", "unicode_mac.nfo")
        stream = RarStream(archive, "unicode_mac.nfo")
        with open(expected_path, "rb") as expected:
            expected.seek(3000)
            expected.read()
            expected.seek(333)
            stream.seek(333)
            self.assertEqual(stream.read(), expected.read())
Example #2
0
 def test_folder_multiple(self):
     # Archive with a path and multiple files in a folder / split volumes:
     # one archived file is compared with its extracted copy and a few
     # other stream methods are called to increase code coverage.
     archive = os.path.join(self.path, self.folder,
                            "store_split_folder.rar")
     stream = RarStream(archive, "txt/users_manual4.00.txt")
     extracted = os.path.join(self.path, "txt", "users_manual4.00.txt")
     with open(extracted, "rb") as manual:
         self.assertEqual(stream.read(), manual.read())
         # both are at the end now: same position, which is the length
         self.assertEqual(stream.tell(), manual.tell())
         self.assertEqual(stream.length(), manual.tell())
         self.assertEqual(stream.readable(), True)
         self.assertEqual(stream.seekable(), True)
         # reading at the end keeps returning no data
         for _ in range(2):
             self.assertEqual(stream.read(), b"")
         stream.seek(0, os.SEEK_SET)
         stream.read(2)
         stream.seek(0, os.SEEK_END)
         # a negative absolute position is refused
         self.assertRaises(IndexError, stream.seek, -1)
     expected_names = [
         "txt\\empty_file.txt",
         "txt\\little_file.txt",
         "txt\\users_manual4.00.txt",
     ]
     self.assertEqual(stream.list_files(), expected_names)
     self.assertRaises(NotImplementedError, stream.readinto, "")
Example #3
0
class M2tsReader(object):
    """Implements a simple Reader class that reads M2TS files.

    The data is processed one packet at a time: read() parses the packet
    header (4 byte TP_extra_header + 4 byte MPEG-2 transport stream header)
    and read_contents() or skip_contents() handles the payload behind it.
    """
    def __init__(self,
                 read_mode=M2tsReadMode.M2ts,
                 path=None,
                 stream=None,
                 match_offset=0,
                 archived_file_name=""):
        """Opens the input and positions it on the first packet to read.

        read_mode: one of the M2tsReadMode values
        path: file to open; RAR archives are read through RarStream
        stream: opened seekable binary stream, used when no path is given
        match_offset: offset of interest in the data; when it is at least 8
            reading starts at the packet boundary at or before it,
            otherwise at the beginning
        archived_file_name: passed to RarStream when path is a RAR archive

        Raises InvalidDataException when the input is smaller than 192
        bytes and InvalidMatchOffsetException when match_offset is not
        located before the end of the input.
        """
        assert path or stream
        if path:
            if is_rar(path):
                self._stream = RarStream(path, archived_file_name)
            else:
                self._stream = open(path, 'rb')
        elif stream:
            self._stream = stream
        # the position at the end of the stream is the amount of data
        self._stream.seek(0, 2)
        self._file_length = self._stream.tell()
        self.mode = read_mode
        self.read_done = True

        self.current_packet = None
        self.current_offset = 0

        if self._file_length < 192:
            raise InvalidDataException("File too small")

        # faster reconstructing when match_offset is provided
        if match_offset >= 8 and match_offset < self._file_length:
            # use the lowest multiple of the packet size <= offset as a
            # starting point: the reader must begin on a packet boundary
            start = match_offset // PACKET_SIZE * PACKET_SIZE
            self._stream.seek(start)
            self.current_offset = start
        elif match_offset >= self._file_length:
            msg = "Invalid match offset for video: {0}".format(match_offset)
            raise InvalidMatchOffsetException(msg)
        else:
            # no useful matching offset against the main movie file
            self._stream.seek(0)

    def read(self):
        """Parses the header of the packet at the current offset.

        The result is stored in self.current_packet and the current offset
        moves on to the next packet.

        Returns True when a packet header was read and False when there is
        no more data. Raises InvalidDataException for truncated data or a
        wrong synchronization byte.
        """
        # read() is invalid at this time: read_contents() or
        # skip_contents() must be called before read() can be called again
        assert self.read_done or self.mode == M2tsReadMode.SRS

        self.read_done = False
        self._stream.seek(self.current_offset)
        # TP_extra_header (4 Bytes) + MPEG-2 transport stream header (4 B)
        header = self._stream.read(HEADER_SIZE)

        if not len(header):
            return False

        # the check depends on the mode of this reader, not on the
        # (constant) value of the enumeration member
        if self.mode == M2tsReadMode.M2ts:
            if self.current_offset + PACKET_SIZE > self._file_length:
                msg = "Invalid packet length at 0x{0:08X}"
                raise InvalidDataException(msg.format(self.current_offset))
        else:
            # SRS header data must be a multiple of 8
            if self.current_offset + HEADER_SIZE > self._file_length:
                raise InvalidDataException("Broken SRS file")

        # The transport stream header starts behind the 4 byte
        # TP_extra_header and its first byte must be the sync byte 0x47.
        # Slicing keeps the comparison bytes against bytes on Python 2 and 3.
        if header[4:5] != b'\x47':
            msg = "Invalid synchronization byte at 0x{0:08X}"
            raise InvalidDataException(msg.format(self.current_offset))

        packet = Packet(self.current_offset)
        packet.raw_header = header
        (byte8, ) = S_BYTE.unpack_from(header, 7)
        # two bits: bit 3 and 4 of last byte in the header
        packet.adaptation_field = (byte8 & 0x30) >> 4
        # last four bits of last byte in the header
        packet.continuity_counter = (byte8 & 0xF)
        (byte67, ) = S_SHORT.unpack_from(header, 5)
        # the PID is stored in the lowest 13 bits
        packet.pid = byte67 & 0x1FFF

        self.current_offset += PACKET_SIZE
        self.current_packet = packet

        return True

    def read_contents(self):
        """Reads the transport stream packet payload. (no 8B header)

        Returns the payload bytes, or b"" in SRS mode.
        """
        buff = b""
        if self.read_done:
            # the payload was read or skipped before: back up to read again
            self._stream.seek(-PAYLOAD_SIZE, os.SEEK_CUR)
        self.read_done = True
        if self.mode != M2tsReadMode.SRS:
            buff = self._stream.read(PAYLOAD_SIZE)
        return buff

    def skip_contents(self):
        """Skips over the payload data to the next packet."""
        if not self.read_done:
            self.read_done = True
            if self.mode != M2tsReadMode.SRS:
                self._stream.seek(PAYLOAD_SIZE, os.SEEK_CUR)

    def close(self):
        """Closes the underlying file/stream; failures are ignored."""
        try:
            self._stream.close()
        except Exception:
            # best effort: the stream can be closed or unusable already
            pass

    def __del__(self):
        try:  # close the file/stream
            self._stream.close()
        except Exception:
            # _stream does not exist when opening failed in __init__
            pass
Example #4
0
class AsfReader(object):
    """Implements a simple Reader class that reads through WMV
    or WMV-SRS files one Object at a time.

    read() parses the header of the next object; afterwards
    move_to_child(), read_contents() or skip_contents() must be called
    before read() can be used again.
    """
    def __init__(self,
                 read_mode,
                 path=None,
                 stream=None,
                 archived_file_name=""):
        """Opens the input and rewinds it to the start.

        read_mode: one of the AsfReadMode values
        path: file to open; RAR archives are read through RarStream
        stream: opened seekable binary stream, used when no path is given
        archived_file_name: passed to RarStream when path is a RAR archive
        """
        assert path or stream, "missing ASF reader input"
        if path:
            if is_rar(path):
                self._asf_stream = RarStream(path, archived_file_name)
            else:
                self._asf_stream = open(path, 'rb')
        elif stream:
            self._asf_stream = stream
        # the position at the end of the stream is the amount of data
        self._asf_stream.seek(0, 2)
        self._file_length = self._asf_stream.tell()
        self._asf_stream.seek(0)
        self.mode = read_mode

        self.read_done = True
        self.current_object = None
        self.object_guid = None

    def read(self):
        """Parses the header of the object at the current stream position.

        The object is stored in self.current_object and the stream is left
        at the start of the object.

        Returns True when an object header was read and False when there
        is no room left for one. Raises InvalidDataException in Sample mode
        when an object other than the data object ends beyond the end of
        the file.
        """
        # "Read() is invalid at this time", "MoveToChild(), ReadContents(), or
        # SkipContents() must be called before Read() can be called again")
        assert self.read_done or (self.mode == AsfReadMode.SRS and
                                  self.object_guid == GUID_DATA_OBJECT), \
                                  "AsfReader read() is invalid at this time"

        object_start_position = self._asf_stream.tell()
        self.current_object = None
        self.read_done = False

        # no room for GUID (16B) and size (8B) of the object
        if object_start_position + 24 > self._file_length:
            return False

        self._atom_header = self._asf_stream.read(24)
        # 16 bytes for GUID, 8 bytes for object size
        self.object_guid, size = struct.unpack("<16sQ", self._atom_header)

        # sanity check on object length
        # Skip check on GUID_DATA_OBJECT so we can still report expected size.
        # This is only applied on samples,
        # since a partial movie might still be useful.
        end_offset = object_start_position + size
        if (self.mode == AsfReadMode.Sample
                and self.object_guid != GUID_DATA_OBJECT
                and end_offset > self._file_length):
            raise InvalidDataException("Invalid object length at 0x%08X" %
                                       object_start_position)

        # these two objects have additional fields that are kept as part
        # of the raw header
        if self.object_guid == GUID_HEADER_OBJECT:
            self._atom_header += self._asf_stream.read(6)
        elif self.object_guid == GUID_DATA_OBJECT:
            self._atom_header += self._asf_stream.read(26)

        self.current_object = Object(size, self.object_guid)
        self.current_object.raw_header = self._atom_header
        self.current_object.start_pos = object_start_position

        # Calculate the size for the data object in SRS mode
        if (self.mode == AsfReadMode.SRS
                and self.object_guid == GUID_DATA_OBJECT):
            # size of the data object cannot be relied upon
            # so change size and end_offset
            o = self.current_object

            header_size = len(o.raw_header)
            # the packet count follows the GUID, the size and 16 more bytes
            count_offset = 16 + 8 + 16
            (total_data_packets, ) = S_LONGLONG.unpack_from(o.raw_header,
                                                            count_offset)
            # data packet/media object size
            if total_data_packets:
                psize = (o.size - header_size) // total_data_packets
            else:
                # no packets: nothing to divide and nothing to walk through
                psize = 0
            rp_offsets = 0
            start = o.start_pos + header_size
            for _ in range(total_data_packets):
                # calculate real packet size
                packet = AsfDataPacket()
                packet.data_file_offset = start + rp_offsets
                self._asf_stream.seek(packet.data_file_offset)
                # just read all of it to make it easier
                # SRS files are small anyway
                packet.data = self._asf_stream.read()

                rp_offsets += asf_data_get_packet(packet, psize,
                                                  AsfReadMode.SRS)

            # keep the size from the file and store the recalculated one
            self.current_object.osize = self.current_object.size
            self.current_object.size = rp_offsets + header_size

        # back to the start of the object: the other methods skip the header
        self._asf_stream.seek(object_start_position, os.SEEK_SET)

        # New top-level objects should be added only between the
        # Data Object and Index Object(s).

        return True

    def read_contents(self):
        """Returns the data of the current object without its raw header."""
        # if read_done is set, we've already read or skipped it.
        # back up and read again?
        if self.read_done:
            self._asf_stream.seek(self.current_object.start_pos, os.SEEK_SET)

        self.read_done = True

        # skip header bytes
        hl = len(self.current_object.raw_header)
        self._asf_stream.seek(hl, os.SEEK_CUR)
        buff = self._asf_stream.read(self.current_object.size - hl)
        return buff

    def read_data_part(self, offset, length):
        """Returns length bytes read from the absolute position offset.

        The current object is marked as done when the requested part ends
        exactly at the end of the object.
        """
        if (offset + length == self.current_object.start_pos +
                self.current_object.size):
            self.read_done = True
        self._asf_stream.seek(offset, os.SEEK_SET)
        return self._asf_stream.read(length)

    def skip_contents(self):
        """Moves to the end of the current object if not handled yet."""
        if not self.read_done:
            self.read_done = True
            self._asf_stream.seek(
                self.current_object.start_pos + self.current_object.size,
                os.SEEK_SET)

    def move_to_child(self):
        """Skips the header of the current object to reach its contents."""
        self.read_done = True
        # skip the header bytes
        hl = len(self.current_object.raw_header)
        self._asf_stream.seek(hl, os.SEEK_CUR)

    def close(self):
        """Closes the underlying file/stream; failures are ignored."""
        try:
            self._asf_stream.close()
        except Exception:
            # best effort: the stream can be closed or unusable already
            pass

    def __del__(self):
        try:  # close the file/stream
            self._asf_stream.close()
        except Exception:
            # _asf_stream does not exist when opening failed in __init__
            pass
Example #5
0
class StreamReader(object):
    """Implements a simple Reader class that reads STREAM-SRS files.

    All block headers (4 byte marker + 4 byte size) are parsed when the
    reader is created; read() iterates over the blocks that were found.
    """
    def __init__(self, path=None, stream=None, archived_file_name=""):
        """Opens the input and indexes all its blocks in self.blocks.

        path: file to open; RAR archives are read through RarStream
        stream: opened seekable binary stream, used when no path is given
        archived_file_name: passed to RarStream when path is a RAR archive

        Raises InvalidDataException when the data does not start with a
        stream or m2ts marker, when a block is truncated or when a block
        after the first one has a size of zero.
        """
        assert path or stream
        if path:
            if is_rar(path):
                self._stream = RarStream(path, archived_file_name)
            else:
                self._stream = open(path, 'rb')
        elif stream:
            self._stream = stream
        # the position at the end of the stream is the amount of data
        self._stream.seek(0, 2)
        self._file_length = self._stream.tell()
        self._stream.seek(0)

        self.current_block = None
        self.blocks = []

        pos = 0
        while pos < self._file_length:
            if pos + 8 > self._file_length:
                raise InvalidDataException("SRS file too small!")

            # header: block signature
            marker = self._stream.read(4)
            if pos == 0 and marker not in (STREAM_MARKER, M2TS_MARKER):
                raise InvalidDataException("Not a stream or m2ts SRS file!")
            if marker not in (b"STRM", b"SRSF", b"SRST", b"M2TS", b"HDRS"):
                print("Unknown header block encountered")
            else:
                marker = marker.decode("ascii")

            # header: block size
            (size, ) = S_LONG.unpack(self._stream.read(4))
            block = Block(size, marker, pos)
            self.blocks.append(block)
            if _DEBUG:
                print(block)

            if size == 0 and pos != 0:
                # only allowed for the marker block
                raise InvalidDataException("SRS size field is zero")

            # A marker block with size zero must still move the position
            # forward or this loop never ends: continue behind the 8 header
            # bytes that were read.
            # NOTE(review): assumes such a block consists of its header only
            pos += size if size else 8
            if pos > self._file_length:
                raise InvalidDataException("SRS file too small!")

            self._stream.seek(pos)
        self._stream.seek(0)

    def read(self):
        """Yields the blocks in file order and tracks the current block."""
        for block in self.blocks:
            self.current_block = block
            yield block

    def read_contents(self):
        """Returns the data of the current block.

        Skips the marker and size fields. A block that is not larger than
        its 8 byte header has no contents: b"" is returned.
        """
        self._stream.seek(self.current_block.start_pos + 8, os.SEEK_SET)
        # a negative amount would read everything up to the end of the file
        return self._stream.read(max(0, self.current_block.size - 8))

    def close(self):
        """Closes the underlying file/stream; failures are ignored."""
        try:
            self._stream.close()
        except Exception:
            # best effort: the stream can be closed or unusable already
            pass

    def __del__(self):
        try:  # close the file/stream
            self._stream.close()
        except Exception:
            # _stream does not exist when opening failed in __init__
            pass
Example #6
0
class Mp3Reader(object):
    """Implements a simple Reader class that reads through MP3
    or MP3-SRS files one block at a time.

    The whole input is parsed when the reader is created: the tags at the
    start and the end are located first and what remains in between is
    the SRS or MP3 data. read() iterates over the blocks that were found.
    """
    def __init__(self, path=None, stream=None, archived_file_name=""):
        """Opens the input and indexes all its blocks in self.blocks.

        path: file to open; RAR archives are read through RarStream
        stream: opened seekable binary stream, used when no path is given
        archived_file_name: passed to RarStream when path is a RAR archive

        Raises InvalidDataException when no main data is found, when the
        start of a Lyrics3 block is missing or when SRS blocks are broken.
        """
        assert path or stream
        if path:
            if is_rar(path):
                self._mp3_stream = RarStream(path, archived_file_name)
            else:
                self._mp3_stream = open(path, 'rb')
        elif stream:
            self._mp3_stream = stream
        self._mp3_stream.seek(0, 2)  # reset on ID3v2 tag search
        self._file_length = self._mp3_stream.tell()

        self.current_block = None

        self.blocks = []
        begin_main_content = 0

        # easier for corner case ("ID3" multiple times before sync)
        last_id3v2 = None

        # parse the whole file immediately!
        # 1) check for ID3v2 (beginning of mp3 file)
        # The ID3v2 tag size is the size of the complete tag after
        # unsychronisation, including padding, excluding the header but not
        # excluding the extended header (total tag size - 10). Only 28 bits
        # (representing up to 256MB) are used in the size description to avoid
        # the introduction of 'false syncsignals'.
        # http://id3.org/id3v2.4.0-structure
        while True:  # tag should be here only once
            # detect repeating leading ID3 tags in the srs files
            startpos = begin_main_content
            self._mp3_stream.seek(startpos, os.SEEK_SET)
            if self._mp3_stream.read(3) == b"ID3":
                # skip ID3v2 version (2 bytes) and flags (1 byte)
                self._mp3_stream.seek(3, os.SEEK_CUR)
                sbytes = self._mp3_stream.read(4)
                size = decode_id3_size(sbytes)

                tag_size = 10 + size  # 3 + 3 + 4
                last_id3v2 = Block(tag_size, "ID3", startpos)
                self.blocks.append(last_id3v2)
                begin_main_content += tag_size
            else:
                break

        # 2) check for ID3v1 (last 128 bytes of mp3 file)
        end_meta_data_offset = self._file_length
        # less data than the size of the tag: it cannot be there and
        # seeking to before the start of a file is an error
        if self._file_length >= 128:
            self._mp3_stream.seek(-128, os.SEEK_END)
            idv1_start_offset = self._mp3_stream.tell()
            if self._mp3_stream.read(3) == b"TAG":
                idv1_block = Block(128, "TAG", idv1_start_offset)
                self.blocks.append(idv1_block)
                end_meta_data_offset = idv1_start_offset

        # 3) check for http://id3.org/Lyrics3v2
        # "The Lyrics3 block, after the MP3 audio and before the ID3 tag,
        # begins with the word "LYRICSBEGIN" after which a number of field
        # records follows. The Lyrics3 block ends with a six character size
        # descriptor and the string "LYRICS200". The size value includes the
        # "LYRICSBEGIN" string, but does not include the 6 character size
        # descriptor and the trailing "LYRICS200" string.
        if end_meta_data_offset - 6 - 9 >= 0:
            self._mp3_stream.seek(end_meta_data_offset - 6 - 9, os.SEEK_SET)
            lyrics_footer = self._mp3_stream.read(6 + 9)
            if lyrics_footer[6:] == b"LYRICS200":
                lyrics_size = int(lyrics_footer[:6])  # only header + body
                lyrics3v2_block = Block(lyrics_size + 6 + 9, "LYRICS200",
                                        end_meta_data_offset -
                                        (lyrics_size + 6 + 9))
                self.blocks.append(lyrics3v2_block)
                end_meta_data_offset -= (lyrics_size + 6 + 9)

        # 4) check for http://id3.org/Lyrics3
        if end_meta_data_offset - 9 >= 0:
            self._mp3_stream.seek(end_meta_data_offset - 9, os.SEEK_SET)
            if b"LYRICSEND" == self._mp3_stream.read(9):
                # search the start in the last 5100 bytes before the end of
                # the block, but never from before the start of the data
                search_start = max(0, end_meta_data_offset - 5100)
                self._mp3_stream.seek(search_start, os.SEEK_SET)
                lyrics_data = self._mp3_stream.read(end_meta_data_offset -
                                                    search_start)
                index = lyrics_data.find(b"LYRICSBEGIN")
                if index == -1:
                    raise InvalidDataException(
                            "Unable to find start of LyricsV1 block")
                start_block = search_start + index
                lyrics3_block = Block(end_meta_data_offset - start_block,
                                      "LYRICS", start_block)
                self.blocks.append(lyrics3_block)
                end_meta_data_offset -= lyrics3_block.size

        # 5) APE tags
        # "Tag size in bytes including footer and all tag items excluding
        # the header to be as compatible as possible with APE Tags 1.000"
        # "An APEv1 tag at the end of a file must have at least a footer, APEv1
        # tags may never be used at the beginning of a file
        # (unlike APEv2 tags)."
        if end_meta_data_offset - 32 >= 0:
            self._mp3_stream.seek(end_meta_data_offset - 32, os.SEEK_SET)
            if b"APETAGEX" == self._mp3_stream.read(8):
                (version,) = S_LONG.unpack(self._mp3_stream.read(4))
                if version == 2000:
                    header = 32
                else:  # 1000
                    header = 0
                (size,) = S_LONG.unpack(self._mp3_stream.read(4))
                start_block = end_meta_data_offset - size - header
                apev2_block = Block(end_meta_data_offset - start_block,
                                    "APE%s" % version, start_block)
                self.blocks.append(apev2_block)
                end_meta_data_offset -= apev2_block.size

        def marker_has_issues(marker):
            """True if marker is no start of MP3, RIFF or SRS data."""
            if len(marker) != 4:
                return True
            (sync,) = BE_SHORT.unpack(marker[:2])
            sync_bytes = sync & 0xFFE0 == 0xFFE0
            if not sync_bytes and marker not in (b"RIFF", b"SRSF"):
                return True
            return False

        # in between is SRS or MP3 data
        self._mp3_stream.seek(begin_main_content, os.SEEK_SET)
        marker = self._mp3_stream.read(4)

        if last_id3v2 and marker_has_issues(marker):
            # problem with (angelmoon)-hes_all_i_want_cd_pg2k-bmi
            # The .mp3 files contain ID3+nfo before the real ID3 starts
            # And it's also a RIFF mp3, so it won't play without removing
            # the bad initial tag first.
            # This can cause the space between the "ID3" and the end tag
            # to be empty. (or just wrong)
            # Mickey_K.-Distracted-(DNR019F8)-WEB-2008-B2R has the 'ID3' string
            # in the ID3v2 tag for 02-mickey_k.-distracted_-_dub_mix.mp3
            last_id3 = last_id3v2_before_sync(self._mp3_stream,
                                              self._file_length)
            dupe_id3_string = last_id3 != last_id3v2.start_pos
            after_v2_tag = last_id3 >= last_id3v2.start_pos + last_id3v2.size
            if dupe_id3_string and after_v2_tag:
                # another 'ID3' string found after id3v2 tag
                self._mp3_stream.seek(last_id3 + 3 + 3, os.SEEK_SET)
                sbytes = self._mp3_stream.read(4)
                size = decode_id3_size(sbytes)

                begin_main_content = last_id3 + 10 + size  # 3 + 3 + 4
                # add extra amount of data to the last block
                last_id3v2.size = begin_main_content - last_id3v2.start_pos
            elif dupe_id3_string and not after_v2_tag:
                # another 'ID3' string found inside first id3v2 tag
                if begin_main_content > self._file_length:
                    # first tag is corrupt by definition
                    # assume latter tag to be the good one: parse it
                    # skip 'ID3' + ID3v2 version (2 bytes) and flags (1 byte)
                    self._mp3_stream.seek(last_id3 + 6, os.SEEK_SET)
                    sbytes = self._mp3_stream.read(4)
                    size = decode_id3_size(sbytes)
                    tag_size = 10 + size  # 3 + 3 + 4
                    last_id3v2 = Block(tag_size, "ID3", last_id3)
                    self.blocks.append(last_id3v2)
                    begin_main_content = last_id3 + tag_size

        self._mp3_stream.seek(begin_main_content, os.SEEK_SET)
        marker = self._mp3_stream.read(4)

        if not len(marker):
            # there still is something horribly wrong
            # (unless you think that an mp3 without any music data is possible)
            raise InvalidDataException("Tagging f****d up big time!")

        (sync,) = BE_SHORT.unpack(marker[:2])
        main_size = end_meta_data_offset - begin_main_content
        if marker[:3] == b"SRS":  # SRS data blocks
            cur_pos = begin_main_content
            while cur_pos < begin_main_content + main_size:
                self._mp3_stream.seek(cur_pos, os.SEEK_SET)
                # SRSF, SRST and SRSP
                try:
                    marker = self._mp3_stream.read(4)
                    # size includes the 8 bytes header
                    (size,) = S_LONG.unpack(self._mp3_stream.read(4))
                except Exception:
                    # unpacking fails when fewer than 4 bytes are left
                    raise InvalidDataException("Not enough SRS data")
                srs_block = Block(size, marker.decode("ascii"), cur_pos)
                self.blocks.append(srs_block)
                cur_pos += size
                if size == 0:
                    raise InvalidDataException("SRS size field is zero")
                if size > begin_main_content + main_size:
                    raise InvalidDataException("Broken SRS")
        elif sync & 0xFFE0 == 0xFFE0 or marker == b"RIFF":
            # first 11 bits all 1 for MP3 frame marker
            mp3_data_block = Block(main_size, "MP3", begin_main_content)
            self.blocks.append(mp3_data_block)
        else:
            print("WARNING: MP3 file is not valid!")
            data_block = Block(main_size, "MP3", begin_main_content)
            self.blocks.append(data_block)

        # the order of which we add blocks doesn't matter this way
        self.blocks.sort(key=lambda block: block.start_pos)

    def read(self):
        """Yields the blocks in file order and tracks the current block."""
        for block in self.blocks:
            self.current_block = block
            if _DEBUG:
                print(block)
            yield block

    def read_contents(self):
        """Returns all the data of the current block."""
        self._mp3_stream.seek(self.current_block.start_pos, os.SEEK_SET)
        return self._mp3_stream.read(self.current_block.size)

    def read_part(self, size, offset=0):
        """Returns size bytes of the current block, starting at offset.

        Raises ValueError when the requested part ends behind the block.
        """
        if (self.current_block.start_pos + offset + size >
                self.current_block.start_pos + self.current_block.size):
            raise ValueError("Can't read beyond end of block.")
        self._mp3_stream.seek(self.current_block.start_pos + offset,
                              os.SEEK_SET)
        return self._mp3_stream.read(size)

    def close(self):
        """Closes the underlying file/stream; failures are ignored."""
        try:
            self._mp3_stream.close()
        except Exception:
            # best effort: the stream can be closed or unusable already
            pass

    def __del__(self):
        try:  # close the file/stream
            self._mp3_stream.close()
        except Exception:
            # _mp3_stream does not exist when opening failed in __init__
            pass
Example #7
0
class EbmlReader(object):
    """Reader that walks through MKV or MKV-SRS data one element at a time.

    read() parses the header of the next element. After that
    move_to_child(), read_contents() or skip_contents() must be called
    before read() can be used again (not needed for blocks in SRS mode).
    """
    def __init__(self,
                 read_mode,
                 path=None,
                 stream=None,
                 archived_file_name=""):
        assert path or stream

        # parser state
        self.element_header = b""  # 12 bytes
        self._ebml_stream = None
        self.mode = None
        self.read_done = True
        self.current_element = None
        self.element_type = None
        # when not empty: an expected file size has been printed
        # to stderr already when data was missing
        self.expected_file_size = ""

        # input: a path gets priority over an opened stream
        if path:
            if is_rar(path):
                self._ebml_stream = RarStream(path, archived_file_name)
            else:
                self._ebml_stream = open(path, 'rb')
        elif stream:
            self._ebml_stream = stream
        else:
            assert False

        # the position at the end of the stream is the amount of data
        self._ebml_stream.seek(0, 2)
        self._file_length = self._ebml_stream.tell()
        self._ebml_stream.seek(0)
        self.mode = read_mode

    def read(self):
        """Parses the next element header into self.current_element.

        Returns False when no complete header is available anymore and
        True otherwise. For a block, the block header is parsed as well.
        """
        # "Read() is invalid at this time", "MoveToChild(), ReadContents(), or
        # SkipContents() must be called before Read() can be called again"
        assert self.read_done or (self.mode == EbmlReadMode.SRS
                                  and self.element_type
                                  == EbmlElementType.Block), "improper state"

        stream = self._ebml_stream
        start = stream.tell()

        # too little data (+2: 1B element ID + 1B data size)
        if start + 2 > self._file_length:
            return False

        self.current_element = None
        self.read_done = False

        # 1) Element ID: the leading bits of the first byte tell
        #    how long the ID is (ID: like xml tags)
        first = stream.read(1)
        if not first:
            return False
        (id_len, ) = BE_BYTE.unpack(first)
        id_len = GetUIntLength(id_len)
        self.element_header = first
        self.element_header += stream.read(id_len - 1)

        # 2) Data size: its length is encoded in the same way
        first = stream.read(1)
        if not first:
            return False
        (size_len, ) = BE_BYTE.unpack(first)
        size_len = GetUIntLength(size_len)
        self.element_header += first
        self.element_header += stream.read(size_len - 1)

        assert id_len + size_len == len(self.element_header)

        # 3) Data
        element_id = self.element_header[0:id_len]
        self.element_type = id_type_mapping.get(element_id,
                                                EbmlElementType.Unknown)
        element_length = GetEbmlUInt(self.element_header, id_len, size_len)

        # sanity check on element length.  skip check on Segment element so we
        # can still report expected size.  this is only applied on samples
        # since a partial movie might still be useful
        end_offset = start + id_len + size_len + element_length
        too_long = (self.mode == EbmlReadMode.Sample
                    and self.element_type != EbmlElementType.Segment
                    and end_offset > self._file_length)
        if too_long:
            if not self.expected_file_size:
                msg = "Invalid element length at 0x{0:08X}"
                raise InvalidDataException(msg.format(start))
            msg = ("Invalid element length at 0x{0:08X}. "
                   "Expected size: {1} bytes".format(
                       start, self.expected_file_size))
            raise InvalidDataException(msg)

        if self.element_type == EbmlElementType.Block:
            # first thing in the block is the track number
            block_header = stream.read(1)
            track_len = GetUIntLength(BE_BYTE.unpack(block_header)[0])

            # incredibly unlikely the track number is > 1 byte,
            # but just to be safe...
            if track_len > 1:
                block_header += stream.read(track_len - 1)

            track_number = GetEbmlUInt(block_header, 0, track_len)

            # time code (2 bytes) and flags (1 byte) end the fixed part
            block_header += stream.read(3)
            tail = len(block_header) - 3
            high = BE_BYTE.unpack_from(block_header, tail)[0]
            low = BE_BYTE.unpack_from(block_header, tail + 1)[0]
            timecode = (high << 8) + low

            # the flags (last byte of the header) tell the lacing type
            flags = BE_BYTE.unpack_from(block_header, tail + 2)[0]
            lace_type = flags & EbmlLaceType.EBML

            data_length = element_length - len(block_header)
            frame_sizes, consumed = GetBlockFrameLengths(
                lace_type, data_length, stream)
            if consumed > 0:
                # the bytes used for the frame lengths are header data too
                stream.seek(-consumed, os.SEEK_CUR)
                block_header += stream.read(consumed)

            element_length -= len(block_header)

            element = BlockElement()
            element.track_number = track_number
            element.timecode = timecode
            element.frame_lengths = frame_sizes
            element.raw_block_header = block_header
        else:
            element = EbmlElement()

        self.current_element = element
        element.raw_header = self.element_header
        element.element_start_pos = start
        element.length = element_length

        # the following lines will write mkvinfo-like output from the parser
        # (extremely useful for debugging)
        # print("{0}: {3} + {1} bytes @ {2}".format(
        #     EbmlElementTypeName[self.element_type],
        #     element_length,  # without header
        #     start,
        #     len(self.element_header)))

        return True

    def read_contents(self):
        """Returns the element data; None for a block in SRS mode."""
        # if read_done is set, we've already read or skipped it.
        # back up and read again?
        if self.read_done:
            self._ebml_stream.seek(-self.current_element.length, os.SEEK_CUR)
        self.read_done = True

        has_data = (self.mode != EbmlReadMode.SRS
                    or self.element_type != EbmlElementType.Block)
        if not has_data:
            return None
        return self._ebml_stream.read(self.current_element.length)

    def skip_contents(self):
        """Moves behind the element data when that did not happen yet."""
        if self.read_done:
            return
        self.read_done = True
        has_data = (self.mode != EbmlReadMode.SRS
                    or self.element_type != EbmlElementType.Block)
        if has_data:
            self._ebml_stream.seek(self.current_element.length, os.SEEK_CUR)

    def move_to_child(self):
        """Makes read() continue with the first child of this element."""
        if self.read_done:
            # the data was read or skipped already: go back to its start
            self._ebml_stream.seek(-self.current_element.length, os.SEEK_CUR)
        self.read_done = True

    def close(self):
        """Closes the underlying file/stream; all failures are ignored."""
        try:
            self._ebml_stream.close()
        except:
            pass

    def __del__(self):
        # close the file/stream, whatever state the reader is in
        try:
            self._ebml_stream.close()
        except:
            pass
Example #8
0
class FlacReader(object):
    """Implements a simple Reader class that reads through FLAC
    or FLAC-SRS files one block at a time.

    Usage protocol: call read() to parse the header of the next block, then
    read_contents() or skip_contents() to consume it before calling read()
    again. read_part() can be used in between to peek at block data without
    changing the stream position.

    Attributes:
        read_done: True once the current block was read or skipped.
        current_block: Block describing the block found by the last read().
        block_type: "fLaC", "ID3", "TAG" or the first header byte
            (0xFF for frame data).
    """

    def __init__(self, path=None, stream=None, archived_file_name=""):
        """Open the FLAC data from a path or use an already opened stream.

        Args:
            path: file to open; when is_rar() recognises it, the data is
                read through a RarStream instead of a plain file.
            stream: seekable binary stream, used when no path is given.
            archived_file_name: name of the file inside the RAR archive;
                only passed on when path is a RAR file.
        """
        assert path or stream
        if path:
            if is_rar(path):
                self._flac_stream = RarStream(path, archived_file_name)
            else:
                self._flac_stream = open(path, 'rb')
        elif stream:
            self._flac_stream = stream
        # determine the total size, then start reading from the beginning
        self._flac_stream.seek(0, os.SEEK_END)
        self._file_length = self._flac_stream.tell()
        self._flac_stream.seek(0)

        self.read_done = True
        self.current_block = None
        self.block_type = None

    def read(self):
        """Parse the header of the next block and store it in current_block.

        The stream is left at the start of the block (before its header).

        Returns:
            True when a block was found, False when the stream is at the
            end of the file.

        Raises:
            InvalidDataException: the block would extend past the end of
                the file.
        """
        assert self.read_done

        block_start_position = self._flac_stream.tell()
        self.current_block = None
        self.read_done = False

        if block_start_position == self._file_length:
            return False

        self._block_header = self._flac_stream.read(4)
        # METADATA_BLOCK_HEADER
        # <1>    Last-metadata-block flag: '1' if this block is the last
        #        metadata block before the audio blocks, '0' otherwise.
        # <7>    BLOCK_TYPE
        # <24>   Length (in bytes) of metadata to follow
        #        (does not include the size of the METADATA_BLOCK_HEADER)

        # stream marker: a header-only block without contents
        if self._block_header == b"fLaC":
            self.block_type = "fLaC"
            self.current_block = Block(0, self.block_type)
            self.current_block.raw_header = b"fLaC"
            self.current_block.start_pos = block_start_position
            self._flac_stream.seek(block_start_position, os.SEEK_SET)
            return True

        # ID3v2: 10 byte header of which the last 4 bytes encode the size
        if self._block_header.startswith(b"ID3"):
            self.block_type = "ID3"
            self._flac_stream.seek(block_start_position, os.SEEK_SET)
            raw_header = self._flac_stream.read(10)
            size = decode_id3_size(raw_header[6:10])
            self.current_block = Block(size, self.block_type)
            self.current_block.raw_header = raw_header
            self.current_block.start_pos = block_start_position
            self._flac_stream.seek(block_start_position, os.SEEK_SET)
            return True

        # ID3v1: fixed 128 byte block; the "TAG" marker is part of the data
        if self._block_header.startswith(b"TAG"):
            self.block_type = "TAG"
            self.current_block = Block(128, self.block_type)
            self.current_block.raw_header = b""
            self.current_block.start_pos = block_start_position
            self._flac_stream.seek(block_start_position, os.SEEK_SET)
            return True

        (self.block_type, ) = BE_BYTE.unpack_from(self._block_header, 0)
        if self.block_type == 0xFF:  # frame data
            # frame data runs until the end of the file...
            block_length = self._file_length - block_start_position
            # ...except for a trailing ID3v1 tag. Only probe for the tag
            # when the remaining data can actually hold one: this avoids a
            # negative seek on tiny files and keeps the probe from looking
            # at bytes that precede this block.
            if block_length >= 128:
                self._flac_stream.seek(self._file_length - 128)
                if self._flac_stream.read(3) == b"TAG":
                    block_length -= 128
            # frame data has no separate header
            self._block_header = b""
        else:
            # 24 bit big-endian length: pad to 4 bytes for unpacking
            (block_length, ) = BE_LONG.unpack(b"\x00" + self._block_header[1:])

        # sanity check on block length
        end_offset = block_start_position + block_length
        if end_offset > self._file_length:
            raise InvalidDataException("Invalid block length at 0x%08X" %
                                       block_start_position)

        self.current_block = Block(block_length, self.block_type)
        self.current_block.raw_header = self._block_header
        self.current_block.start_pos = block_start_position

        self._flac_stream.seek(block_start_position, os.SEEK_SET)

        return True

    def read_contents(self):
        """Return the data of the current block, without its header.

        When the block was already read or skipped, the stream is moved
        back to the start of the block and the data is read again.
        """
        # if read_done is set, we've already read or skipped it:
        # back up to read again
        if self.read_done:
            self._flac_stream.seek(self.current_block.start_pos, os.SEEK_SET)

        self.read_done = True

        # skip header bytes
        hl = len(self.current_block.raw_header)
        self._flac_stream.seek(hl, os.SEEK_CUR)
        buff = self._flac_stream.read(self.current_block.size)
        return buff

    def skip_contents(self):
        """Move the stream to the end of the current block.

        Does nothing when the block was already read or skipped.
        """
        if not self.read_done:
            self.read_done = True
            self._flac_stream.seek(
                self.current_block.start_pos +
                len(self.current_block.raw_header) + self.current_block.size,
                os.SEEK_SET)

    def read_part(self, size, offset=0):
        """Read part of the current block's data; idempotent operation.

        Args:
            size: number of bytes to read.
            offset: position to start reading from, relative to the first
                byte after the block header.

        Returns:
            The bytes read. The stream position is restored afterwards.
        """
        hl = len(self.current_block.raw_header)
        initial_offset = self._flac_stream.tell()
        if initial_offset != self.current_block.start_pos:
            self._flac_stream.seek(self.current_block.start_pos, os.SEEK_SET)
        self._flac_stream.seek(offset + hl, os.SEEK_CUR)
        data = self._flac_stream.read(size)
        self._flac_stream.seek(initial_offset, os.SEEK_SET)
        return data

    def close(self):
        """Close the underlying file/stream on a best-effort basis.

        Ordinary errors are ignored; KeyboardInterrupt and SystemExit are
        not swallowed.
        """
        try:
            self._flac_stream.close()
        except Exception:
            pass

    def __del__(self):
        """Close the stream on collection; the stream attribute may be
        missing when construction failed, so errors are ignored."""
        try:
            self._flac_stream.close()
        except Exception:
            pass
Example #9
0
class RiffReader(object):
	"""Implements a simple Reader class that reads through AVI
	or AVI-SRS files one chunk at a time.

	Usage protocol: call read() to parse the next chunk header, then
	read_contents(), skip_contents() or move_to_child() before calling
	read() again. In SRS mode read() may be called directly after a
	movi chunk.

	Attributes:
		mode: the read mode passed to the constructor.
		read_done: True once the current chunk was read or skipped.
		current_chunk: chunk object created by the last read().
		chunk_type: RiffChunkType of the current chunk.
		has_padding: True when the current chunk has an odd length.
		padding_byte: padding byte value read after the last odd chunk.
	"""
	def __init__(self, read_mode, path=None, stream=None, match_offset=0,
			archived_file_name=""):
		"""Open the RIFF data and position the stream for the first read().

		Args:
			read_mode: stored as self.mode; compared against RiffReadMode
				values while reading.
			path: file to open; read through a RarStream when is_rar()
				recognises it.
			stream: seekable binary stream, used when no path is given.
			match_offset: absolute file offset to start near. Offsets
				below 8 make reading start at the beginning of the file.
			archived_file_name: name of the file inside the RAR archive;
				only passed on when path is a RAR file.

		Raises:
			InvalidMatchOffsetException: match_offset lies at or beyond
				the end of the file.
		"""
		if path:
			if is_rar(path):
				self._riff_stream = RarStream(path, archived_file_name)
			else:
				self._riff_stream = open(path, 'rb')
		elif stream:
			self._riff_stream = stream
		else:
			assert False
		self._riff_stream.seek(0, os.SEEK_END)
		self._file_length = self._riff_stream.tell()
		self.mode = read_mode

		self.read_done = True
		self.current_chunk = None
		self.chunk_type = None
		self.has_padding = False
		self.padding_byte = ""

		# faster reconstructing when match_offset is provided
		if match_offset >= 8 and match_offset < self._file_length:
			# -8 is there to add the chunk header for read()
			if self._is_valid_chunk_location(match_offset - 8):
				# yes! reconstruction will be fast
				self._riff_stream.seek(match_offset - 8, os.SEEK_SET)
			else:
				# match offset is not at the start boundary of a chunk
				chunk_offset = self._find_chunk_offset(match_offset)
				if _DEBUG:
					print("Match offset doesn't start on a nice boundary.")
					print("Chunk offset: {0}".format(chunk_offset))
					print("Match offset: {0}".format(match_offset))
				assert chunk_offset < match_offset
				self._riff_stream.seek(chunk_offset, os.SEEK_SET)

			# re-initialisation: _find_chunk_offset() uses read() and
			# therefore changes the reader state
			self.read_done = True
			self.current_chunk = None
			self.chunk_type = None
			self.has_padding = False
			self.padding_byte = ""
		elif match_offset >= self._file_length:
			msg = "Invalid match offset for video: {0}".format(match_offset)
			raise InvalidMatchOffsetException(msg)
		else:
			# no useful matching offset against the main movie file
			self._riff_stream.seek(0)

	def _is_valid_chunk_location(self, offset):
		"""Checks whether a certain offset is a valid chunk location to
		start processing from. Based on Four Character Code.

		Returns the result of fourCCValidator.match() on the 4 bytes at
		offset (truthy when they match). Moves the stream position.
		"""
		self._riff_stream.seek(offset, os.SEEK_SET)
		fourcc = self._riff_stream.read(4)
		return fourCCValidator.match(fourcc)

	def _find_chunk_offset(self, match_offset):
		"""Finds the start offset of the chunk for the match_offset. It uses
		the index at the end of the file.

		Returns the absolute offset of a chunk that starts before
		match_offset, or 0 when no usable offset could be found.
		"""
		self._riff_stream.seek(0, os.SEEK_SET)
		index_data = b""
		movi_start = 0

		while self.read():
			fourcc = self.current_chunk.fourcc
			if fourcc == b"AVI ":
				# the index is in here
				self.move_to_child()
			elif fourcc == b"movi":
				# location where the index is relative to
				movi_start = self.current_chunk.chunk_start_pos
			elif self.chunk_type == RiffChunkType.Index:
				index_data = self.read_contents()
				break
			# no-op when the chunk was already handled above
			self.skip_contents()

		# https://msdn.microsoft.com/en-us/library/windows/desktop/dd318181(v=vs.85).aspx
		# we've found the index
		if movi_start and len(index_data):
			# read chunk positions until an _absolute_ file position larger
			# than our match offset is found
			offsets = []
			offset = 0
			idxpos = 0
			while offset < match_offset and idxpos + 16 <= len(index_data):
				(offset,) = S_LONG.unpack(index_data[idxpos + 8:idxpos + 12])
				offsets.append(offset)
				idxpos += 16  # ckid, dwFlags, dwChunkOffset, dwChunkLength

			# choose the last _relative_ chunk smaller than the match offset
			# the match offset is absolute from the beginning of the file
			for offset in reversed(offsets):
				start_offset = movi_start + 8 + offset
				if start_offset < match_offset:
					if self._is_valid_chunk_location(start_offset):
						return start_offset
					else:
						if _DEBUG:
							print("AVI doesn't follow the 'idx1' spec.")
						break

			# assume the AVI doesn't follow the specification:
			# treat the index offsets as absolute file positions
			for offset in reversed(offsets):
				if offset < match_offset:
					if self._is_valid_chunk_location(offset):
						return offset
					else:
						if _DEBUG:
							print("The index offset wasn't usable.")
						return 0
		return 0

	def read(self):
		"""Parse the next chunk header and store the chunk in current_chunk.

		For RIFF/LIST chunks the extra list fourcc is read as well and the
		stored length excludes those 4 bytes.

		Returns:
			True when a chunk header was read, False when fewer than
			8 bytes are left in the file.

		Raises:
			InvalidDataException: the fourcc is not valid, or (Sample mode
				only, non-RIFF chunks) the chunk extends past the end of
				the file.
		"""
		# "Read() is invalid at this time", "MoveToChild(), ReadContents(), or
		# SkipContents() must be called before Read() can be called again");
		assert self.read_done or (self.mode == RiffReadMode.SRS and
				self.chunk_type == RiffChunkType.Movi)

		# includes 8 byte header
		chunk_start_position = self._riff_stream.tell()
		self.current_chunk = None
		self.read_done = False

		if chunk_start_position + 8 > self._file_length:
			return False

		chunk_header = self._riff_stream.read(8)
		# 4 bytes for fourcc, 4 for chunk length
		fourcc = chunk_header[:4]
		(chunk_length,) = S_LONG.unpack_from(chunk_header, 4)

		# might not keep this check
		# the length check should catch corruption on its own...
		if not fourCCValidator.match(fourcc):
			raise InvalidDataException("Invalid FourCC value (%r) at 0x%08X" %
					(fourcc, chunk_start_position))

		# sanity check on chunk length
		# Skip check on RIFF list so we can still report expected size.
		# This is only applied on samples,
		# since a partial movie might still be useful.
		endOffset = chunk_start_position + 8 + chunk_length
		if (self.mode == RiffReadMode.Sample and
				fourcc != b"RIFF" and endOffset > self._file_length):
			raise InvalidDataException("Invalid chunk length at 0x%08X" %
					(chunk_start_position + 4))

		# Lists
		if fourcc == b"RIFF" or fourcc == b"LIST":
			# if the fourcc indicates a list type (RIFF or LIST),
			# there is another fourcc code in the next 4 bytes
			listType = fourcc
			chunk_header += self._riff_stream.read(4)
			fourcc = chunk_header[8:12]
			chunk_length -= 4  # extra dwFourCC

			self.chunk_type = RiffChunkType.List
			self.current_chunk = RiffList()
			self.current_chunk.list_type = listType  # RIFF list specific
			self.current_chunk.fourcc = fourcc
			self.current_chunk.length = chunk_length
			self.current_chunk.raw_header = chunk_header
			self.current_chunk.chunk_start_pos = chunk_start_position
		else:  # Chunks
			# Chunk containing video, audio or subtitle data:
			# the fourcc starts with the two digit stream number
			if chunk_header[:2].isdigit():
				self.current_chunk = MoviChunk()
				self.current_chunk.stream_number = int(fourcc[:2])
				self.chunk_type = RiffChunkType.Movi
			elif fourcc == b"idx1":
				self.current_chunk = RiffChunk()
				self.chunk_type = RiffChunkType.Index
			else:
				self.current_chunk = RiffChunk()
				self.chunk_type = RiffChunkType.Unknown
			self.current_chunk.fourcc = fourcc
			self.current_chunk.length = chunk_length
			self.current_chunk.raw_header = chunk_header
			self.current_chunk.chunk_start_pos = chunk_start_position
		# chunks with an odd length are followed by one padding byte
		self.has_padding = chunk_length % 2 == 1

		return True

	def read_contents(self):
		"""Return the data of the current chunk and consume its padding.

		When the chunk was already read or skipped, the stream is first
		moved back over the data (and padding byte) to read it again.

		Returns:
			The chunk data, or None for movi chunks in SRS mode (no data
			is read from the stream in that case).
		"""
		# if read_done is set, we've already read or skipped it:
		# back up to read again
		if self.read_done:
			self._riff_stream.seek(-self.current_chunk.length -
					(1 if self.has_padding else 0), os.SEEK_CUR)

		self.read_done = True
		buff = None

		if (self.mode != RiffReadMode.SRS or
				self.chunk_type != RiffChunkType.Movi):
			buff = self._riff_stream.read(self.current_chunk.length)

		if self.has_padding:
			(self.padding_byte,) = S_BYTE.unpack(self._riff_stream.read(1))

		return buff

	def skip_contents(self):
		"""Move the stream past the current chunk and its padding byte.

		Does nothing when the chunk was already read or skipped. For movi
		chunks in SRS mode only the padding byte is consumed.
		"""
		if not self.read_done:
			self.read_done = True
			if (self.mode != RiffReadMode.SRS
					or self.chunk_type != RiffChunkType.Movi):
				self._riff_stream.seek(self.current_chunk.length, os.SEEK_CUR)

			if self.has_padding:
				(self.padding_byte,) = S_BYTE.unpack(self._riff_stream.read(1))

	def move_to_child(self):
		"""Mark the current list as handled so that the next read()
		returns its first child chunk. The stream is not moved."""
		# "MoveToChild() should only be called on a RIFF List");
		assert self.chunk_type == RiffChunkType.List
		self.read_done = True

	def close(self):
		"""Close the underlying file/stream on a best-effort basis.

		Ordinary errors are ignored; KeyboardInterrupt and SystemExit are
		not swallowed.
		"""
		try:
			self._riff_stream.close()
		except Exception:
			pass

	def __del__(self):
		"""Close the stream on collection; the stream attribute may be
		missing when construction failed, so errors are ignored."""
		try:
			self._riff_stream.close()
		except Exception:
			pass