def read_from_file(file: io.FileIO, start: int, stop: int) -> bytes:
    assert stop > start
    file.seek(start)
    data = bytes()
    while file.tell() < stop:
        read_data = file.read(stop - file.tell())
        if read_data == b'':
            raise ReachEndOfFile('Read until the end of file')
        data += read_data
    assert len(data) == stop - start
    return data

def read_from_file(file_fd: io.FileIO, start: int, stop: int) -> bytes:
    length = stop - start
    assert length >= 0
    file_fd.seek(start)
    data = bytes()
    while file_fd.tell() < stop:
        read_data = file_fd.read(stop - file_fd.tell())
        if read_data == b'':
            raise ReachedEndOfFile('Read until the end of file')
        data += read_data
    assert len(data) == length
    return data

def read_from_file(file_fd: io.FileIO, start: int, stop: int) -> bytes:
    length = stop - start
    assert length >= 0
    file_fd.seek(start)
    data = bytes()
    while file_fd.tell() < stop:
        read_data = file_fd.read(stop - file_fd.tell())
        if read_data == b'':
            raise EndOfFileError('Read until the end of file_fd')
        data += read_data
    assert len(data) == length
    return data

def read_from_file(file_fd: io.FileIO, start: int, end: int) -> bytes:
    length = end - start
    assert length >= 0
    file_fd.seek(start)
    data = bytes()
    while file_fd.tell() < end:
        # The read() (when called with a positive argument), readinto() and
        # write() methods on this class will only make one system call.
        read_data = file_fd.read(end - file_fd.tell())
        if read_data == b'':
            raise EndOfFileError('read until the end of file_fd')
        data += read_data
    assert len(data) == length
    return data

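# A minimal usage sketch for the read_from_file helpers above. io.FileIO.read()
# makes at most one system call, so it may return fewer bytes than requested;
# that is why the helpers loop until the whole [start, stop) range is filled.
# The file name here is a hypothetical example.
import io

with io.FileIO('example.bin', 'rb') as f:
    chunk = read_from_file(f, 16, 32)  # bytes 16..31
    assert len(chunk) == 16
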
def load(self, file: FileIO):
    self.ptr = file.tell()
    self.is_leaf, self.keys = load(file)
    ptr_num = len(self.keys)
    if not self.is_leaf:
        ptr_num += (ptr_num + 1)
    ptrs = unpack('Q' * ptr_num, file.read(8 * ptr_num))
    if self.is_leaf:
        self.ptrs_value = list(ptrs)
    else:
        self.ptrs_value = list(ptrs[:len(self.keys)])
        self.ptrs_child = list(ptrs[len(self.keys):])
    self.size = file.tell() - self.ptr

def load(self, file: FileIO):
    self.ptr = file.tell()
    # IndexNode: [is_leaf, [..., key]] + ptrs_value + ptrs_child if not is_leaf
    self.is_leaf, self.keys = load(file)
    ptr_num = len(self.keys)
    if not self.is_leaf:
        ptr_num += (ptr_num + 1)
    ptrs = unpack('Q' * ptr_num, file.read(8 * ptr_num))
    if self.is_leaf:
        self.ptrs_value = list(ptrs)
    else:
        self.ptrs_value = list(ptrs[:len(self.keys)])
        self.ptrs_child = list(ptrs[len(self.keys):])
    self.size = file.tell() - self.ptr

def load(self, file: FileIO):
    self.ptr = file.tell()
    self.is_leaf, self.keys = load(file)
    ptr_num = len(self.keys)
    if not self.is_leaf:
        ptr_num += (ptr_num + 1)
    ptrs = unpack('Q' * ptr_num, file.read(8 * ptr_num))
    if self.is_leaf:
        self.ptrs_value = list(ptrs)
    else:
        ptr_num //= 2
        self.ptrs_value = list(ptrs[:ptr_num])
        self.ptrs_child = list(ptrs[ptr_num:])
    self.size = file.tell() - self.ptr

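# A sketch of the pointer arithmetic the B-tree-style loaders above assume:
# a node with k keys carries k value pointers, plus k + 1 child pointers when
# it is an internal node, all packed as 8-byte 'Q' values. The numbers below
# are illustrative only.
from struct import pack, unpack

k = 3                      # keys in a hypothetical internal node
ptr_num = k + (k + 1)      # 2k + 1 = 7 pointers in total
raw = pack('Q' * ptr_num, *range(ptr_num))
ptrs = unpack('Q' * ptr_num, raw)
ptrs_value, ptrs_child = list(ptrs[:k]), list(ptrs[k:])
assert len(ptrs_value) == 3 and len(ptrs_child) == 4
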
def usenet_reader(zp: FileIO):
    """
    An iterator that takes a ZipFile or other file-like object and yields the
    usenet posts in order according to RFC 1036 and later NetNews formats.

    :param io.FileIO zp: a file that contains a usenet or netnews mailbox
    :return str: A post, iteratively
    """
    outfile = b''
    line = True
    spot = zp.tell()
    while line:
        line = zp.readline()
        # an mbox-style "From " separator line marks the start of a new post
        if re.match(b'From [\\d-]+$', line):
            if outfile != b'':
                yield str(outfile), zp.tell() - spot
                spot = zp.tell()
                outfile = b''
        outfile += line
    # yield the final post, which has no separator line after it
    if outfile != b'':
        yield str(outfile), zp.tell() - spot

def read_id3(file_handle: FileIO, skip_v1: bool = False) -> ID3Base:
    id3 = ID3v2(file_handle)
    if id3.is_valid_id3 or skip_v1:
        return id3
    # Check for an id3v1 tag
    current_file_position = file_handle.tell()
    file_handle.seek(-128, SEEK_END)
    block = file_handle.read(128)
    id3 = ID3v1(block)
    file_handle.seek(current_file_position, SEEK_SET)
    return id3

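# A standalone sketch of the ID3v1 check above: an ID3v1 tag, when present,
# occupies the final 128 bytes of the file and starts with the marker b'TAG'.
# The ID3Base/ID3v1/ID3v2 classes used above are assumed to be defined
# elsewhere in this codebase.
import io
from os import SEEK_END

def has_id3v1(fh: io.FileIO) -> bool:
    pos = fh.tell()
    try:
        fh.seek(-128, SEEK_END)
        return fh.read(3) == b'TAG'
    finally:
        fh.seek(pos)  # restore the caller's position
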
class filestream_range_iterator(Iterable):
    """
    A class that mimics FileIO and implements an iterator that returns a
    fixed-sized sequence of bytes, beginning from `start` up to `end`.

    BBB: due to a possible bug in Zope>4, <=4.1.3, couldn't be subclass of
    FileIO as Iterators.filestream_iterator
    """

    def __init__(self, name, mode='rb', bufsize=-1, streamsize=1 << 16,
                 start=0, end=None):
        self._io = FileIO(name, mode=mode)
        self.streamsize = streamsize
        self.start = start
        self.end = end
        self._io.seek(start, 0)

    def __iter__(self):
        if self._io.closed:
            raise ValueError("I/O operation on closed file.")
        return self

    def __next__(self):
        if self.end is None:
            bytes = self.streamsize
        else:
            bytes = max(min(self.end - self._io.tell(), self.streamsize), 0)
        data = self._io.read(bytes)
        if not data:
            raise StopIteration
        return data

    next = __next__

    def close(self):
        self._io.close()

    # BBB: is it necessary to implement __len__ ?
    # def __len__(self)

    def read(self, size=-1):
        return self._io.read(size)

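# A usage sketch for filestream_range_iterator, with a hypothetical file name
# and consumer: stream bytes 1024..4095 of a file in at most 64 KiB chunks.
it = filestream_range_iterator('payload.bin', start=1024, end=4096)
try:
    for chunk in it:
        handle(chunk)  # hypothetical consumer
finally:
    it.close()
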
def read_from(cls, sector_file: io.FileIO) -> "SectorInfo":
    # Save current position in file
    tell = sector_file.tell()
    # Go to end of file minus size of length.
    sector_file.seek(-cls.length_parser.size, 2)
    length, = cls.length_parser.unpack_from_file(sector_file)
    objects = []
    if length:
        # Go back to saved position
        sector_file.seek(tell)
        for _ in range(length):
            objects.append(Object.read_from(sector_file))
    return SectorObjects(objects=objects)

def dump(self, file: FileIO):
    self.ptr = file.tell()
    file.write(bytes(self))

def _parse_cm(hf, data, progress_callback):
    failures = []

    # is it a bytes-like?
    try:
        with memoryview(data) as mview:
            _parse(hf, mview, progress_callback)
            yield hf
            return
    except (HprofError, BufferError):
        # _parse failed
        raise
    except Exception as e:  # pylint: disable=broad-except
        # we failed before calling _parse
        failures.append(('bytes-like?', e))

    # can it be mmapped?
    from mmap import mmap, ACCESS_READ
    from io import BufferedReader
    import os
    if isinstance(data, BufferedReader):
        fno = data.fileno()
        fsize = os.fstat(fno).st_size
        with mmap(fno, fsize, access=ACCESS_READ) as mapped:
            with memoryview(mapped) as mview:
                _parse(hf, mview, progress_callback)
                yield hf
                return

    # can it be read?
    try:
        from tempfile import TemporaryFile
        from io import FileIO
        underlying_file = FileIO(data.fileno(), closefd=False)
        insize = os.fstat(underlying_file.fileno()).st_size
        with TemporaryFile() as f:
            buf = bytearray(256 * 1024)
            fsize = 0
            while True:
                if progress_callback:
                    progress_callback('extracting',
                                      min(underlying_file.tell(), insize - 1),
                                      insize)
                nread = data.readinto(buf)
                if not nread:
                    break
                fsize += nread
                f.write(buf[:nread])
            f.flush()
            if progress_callback:
                progress_callback('extracting', insize, insize)
            with mmap(f.fileno(), fsize) as mapped:
                with memoryview(mapped) as mview:
                    _parse(hf, mview, progress_callback)
                    yield hf
                    return
    except BufferError:
        raise
    except Exception as e:  # pylint: disable=broad-except
        prev = e
        while prev is not None:
            if isinstance(prev, HprofError):
                raise e
            prev = prev.__context__  # pylint: disable=redefined-variable-type
        failures.append(('tmpfile?', e))

    raise TypeError('cannot handle `data` arg', data, *failures)

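# A minimal sketch of the mmap + memoryview pattern _parse_cm relies on: map
# a file read-only and hand the parser a zero-copy view of its bytes. The
# file name is a hypothetical example.
import os
from mmap import mmap, ACCESS_READ

with open('dump.hprof', 'rb') as fh:
    size = os.fstat(fh.fileno()).st_size
    with mmap(fh.fileno(), size, access=ACCESS_READ) as mapped:
        with memoryview(mapped) as mview:
            header = bytes(mview[:4])  # slice without copying the whole file
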
def load(self, file: FileIO):
    self.ptr = file.tell()
    indicator = file.read(1)
    assert indicator in (OP, ED)
    self.key, self.value = load(file)
    self.size = file.tell() - self.ptr

def _tell_bytes(self):
    '''
    Returns the file pointer position in bytes.
    '''
    return FileIO.tell(self)

def parse(f: FileIO, dbfile: str, use_prefix: bool = False) -> bool:
    prefix = "".join(
        c for c in Path(str(f.name)).name.replace(".cfg.bin", "").upper()
        if "A" <= c <= "Z" or "0" <= c <= "9" or c == "_")
    prefix += "_"

    magic = f.read(4)
    if magic != b"RDBN":
        logger.error("magic not found")
        return False
    header_size = int.from_bytes(f.read(2), "little")
    if header_size != 0x50:
        logger.error("header must be 0x50 bytes long")
        return False
    f.seek(0)
    header = parse_header(f.read(header_size))
    logger.debug(header)

    f.seek(header.header_size + header.body_size)
    strings = f.read()
    strings_table: Dict[int, str] = {}
    for i in strings.rstrip(b"\0").split(b"\0"):
        strings_table[binascii.crc32(i)] = i.decode()

    f.seek(header.header_size)
    tmp_tables: List[Table] = []
    nondata_strings: List[str] = []
    for i in range(header.table_count):
        name_crc, unk1, col_offset, col_count, zero1, zero2 = struct.unpack(
            "<2I 2H II", f.read(header.item_size))
        f.read(header.item_data_size - header.item_size)
        table = Table(
            id=name_crc,
            name=strings_table[name_crc],
            unk1=unk1,
            col_offset=col_offset,
            col_count=col_count,
            zero1=zero1,
            zero2=zero2,
            columns=[],
        )
        logger.debug(table)
        nondata_strings.append(table.name)
        tmp_tables.append(table)

    tmp_columns: List[Column] = []
    for i in range(header.column_count):
        name_crc, subid, id, size, offset, count = struct.unpack(
            "<I 2H 2I I", f.read(header.item_size))
        f.read(header.item_data_size - header.item_size)
        col = Column(
            id=name_crc,
            name=strings_table[name_crc],
            typeid=id,
            sub_typeid=subid,
            size=size,
            offset=offset,
            count=count,
        )
        logger.debug(col)
        nondata_strings.append(col.name)
        tmp_columns.append(col)
    columns = {x.name: x for x in tmp_columns}

    # table-column relationships
    for t in tmp_tables:
        for i in range(t.col_offset, t.col_offset + t.col_count):
            t.columns.append(tmp_columns[i])
        logger.debug("table {} consists of columns {}".format(
            t.name, [c.name for c in t.columns]))
    tables = {x.name: x for x in tmp_tables}

    tmp_lists: List[DataList] = []
    lists: Dict[str, DataList] = {}
    for i in range(header.list_count):
        idx, unk, offset, size, count, listname_crc = struct.unpack(
            "<2HIIII", f.read(header.item_size))
        f.read(header.item_data_size - header.item_size)
        nondata_strings.append(strings_table[listname_crc])
        list_ = DataList(
            id=listname_crc,
            index=idx,
            name=strings_table[listname_crc],
            unk=unk,
            offset=offset,
            size=size,
            count=count,
        )
        tmp_lists.append(list_)
        logger.debug(list_)
    lists = {x.name: x for x in tmp_lists}

    con = sqlite3.connect(dbfile)

    # list-table relationship
    list_table: Dict[str, str] = {}  # list_name -> table_name
    for l in lists.values():
        table_cand = next(
            (t.name for idx, t in enumerate(tmp_tables) if idx == l.index),
            None)
        if table_cand is None:
            logger.warning("table for list {} not found".format(l))
            return False
        list_table[l.name] = table_cand
        logger.debug("list {} is a list for table {}".format(
            l.name, table_cand))

    # list, table, and column ids <-> string table offset relations
    # --
    # All item ids I have discovered so far equal the crc32 of item names,
    # so this relation is not necessarily required.
    ids = [
        int.from_bytes(f.read(4), "little")
        for _ in range(header.id_name_table_size // 8)
    ]
    name_offsets = [
        int.from_bytes(f.read(4), "little")
        for _ in range(header.id_name_table_size // 8)
    ]
    all_items: Dict[int, str] = {i.id: i.name for i in lists.values()}
    all_items.update({i.id: i.name for i in tables.values()})
    all_items.update({i.id: i.name for i in columns.values()})
    for id, name_offset in zip(ids, name_offsets):
        name = all_items.get(id, None)
        if name is None:
            logger.warning(
                "id (crc32 of name) {} is not recorded, but found in id-name table"
                .format(id))
            continue
        name_ = strings[name_offset:].split(b"\0")[0].decode()
        if name != name_:
            logger.warning("name for id {} should be {} but {}".format(
                id, name_, name))

    list_sorter: Callable[[DataList], int] = lambda l: l.offset
    for l in sorted(lists.values(), key=list_sorter):
        f.seek(header.header_size + header.item_data_size *
               (header.table_count + header.column_count + header.list_count) +
               header.id_name_table_size + l.offset)
        table_name = list_table[l.name]
        table_name_sql = table_name if not use_prefix else (prefix + table_name)
        # fetch type name
        table_type = tables[table_name]
        logger.debug("list {} starts at 0x{:08x}".format(l.name, f.tell()))
        # type convertors
        convertors = [
            DBType(c.typeid, c.sub_typeid, c.name, c.size, c.count)
            for c in table_type.columns
        ]
        # get table information
        columns_sql = ", ".join("{} {}".format(c.name, c.sqlite_type)
                                for c in convertors)
        con.execute("CREATE TABLE IF NOT EXISTS {} ({});".format(
            table_name_sql, columns_sql))
        # insert information
        for i in range(l.count):
            row_data = f.read(l.size)
            row_out: List[Optional[Union[str, int, bytes]]] = []
            pos = 0
            last_offset: Optional[int] = None
            for col, conv in zip(table_type.columns, convertors):
                if last_offset:
                    pad = col.offset - last_offset
                    if pad > 0:
                        pos += pad
                        if i == 0:
                            logger.debug(
                                "{}-byte padding inserted at {}".format(
                                    pad, pos))
                    elif pad < 0:
                        logger.error("padding must not be negative")
                data = row_data[pos:pos + col.size * col.count]
                pos += col.size * col.count
                last_offset = col.offset + col.size
                if conv.id == 3 and conv.subid in (0x14, 0x15):
                    addr = conv.convert(data)
                    assert isinstance(addr, int)
                    if addr != 0 and addr != 0xFFFFFFFF and addr < len(strings):
                        # TODO: more accurate string offset detection
                        s = strings[addr:].split(b"\0")[0].decode()
                        if strings[addr - 1] != 0:
                            data = "[{:08x}]".format(addr)
                        elif s in nondata_strings:
                            data = "[{:08x}]".format(addr)
                        else:
                            # OK
                            data = s
                    else:
                        data = "[{:08x}]".format(addr)
                else:
                    data = conv.convert(data)
                row_out.append(data)
            if i == 0 and pos != l.size:
                logger.debug(
                    "data reading ends at {}, leaving {} bytes unread".format(
                        pos, l.size - pos))
                logger.debug(
                    "unread data (only the first row will be shown): {}".
                    format(row_data[pos:]))
            placeholder = ", ".join("?" * len(row_out))
            con.execute(
                "INSERT INTO {} VALUES ({});".format(table_name_sql,
                                                     placeholder), row_out)
        logger.debug("list {} ends at 0x{:08x}".format(l.name, f.tell()))

    con.commit()
    con.close()
    logger.debug("processing finished at 0x{:08x}".format(f.tell()))
    if f.tell() != header.header_size + header.body_size:
        logger.warning(
            "data parsing finished at 0x{:08x}, but the data seems to end at 0x{:08x}"
            .format(
                f.tell(),
                header.header_size + header.body_size,
            ))
    return True

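# A standalone sketch of the crc32-keyed string table used by parse(): names
# are stored NUL-separated at the end of the file, and items reference them
# by the crc32 of the raw name bytes. The sample blob below is illustrative
# only.
import binascii

strings = b"ITEM_TABLE\0PRICE\0NAME\0"
strings_table = {binascii.crc32(s): s.decode()
                 for s in strings.rstrip(b"\0").split(b"\0")}
assert strings_table[binascii.crc32(b"PRICE")] == "PRICE"
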
def load(self, file: FileIO):
    self.ptr = file.tell()
    indicator = file.read(1)
    assert unpack('B', indicator)[0] in (0, 1)
    self.key, self.value = load(file)
    self.size = file.tell() - self.ptr

class File(RawIOBase):
    'Create a file object wrapping an e[x]ploded zip file'

    HEADER = 0
    DATA = 1
    DESCRIPTOR = 2
    DIRECTORY = 3

    def __init__(self, path, flags, info, fh=None, base='.', depth=0):
        super(File, self).__init__()
        self.path = path
        self.flags = flags
        self.fh = fh
        self.info = info
        self.depth = depth
        self.cursor = 0
        self.offset = 0
        self.state = File.HEADER
        # stream item info
        self.stream_offset = 0
        self.zip_header = b''
        self.descriptor = b''
        # data file info
        self.data = None
        self.data_name = ''
        self.data_len = 0
        # streams
        prefix = os.path.join(base, 'meta', os.path.basename(path))
        self.stream = FileIO(prefix + '.stream', 'rb')
        self.dir = FileIO(prefix + '.dir', 'rb')
        self.data_dir = os.path.join(base, 'data')
        # init
        self._load_stream_item()
        self.lock = threading.Lock()

    def _load_stream_item(self):
        'Sets the next stream item as current.'
        if self.data:
            self.data.close()
            self.data = None
        # open the header so we can know the data file to open, and the
        # length of the var fields
        raw_header = self.stream.read(STREAM_ITEM.size)
        header = StreamItem._make(STREAM_ITEM.unpack(raw_header))
        var_fields = header.filename_len + header.extra_field_len
        # I would think that b2a_hex should decode the raw bytes...
        sha1 = b2a_hex(header.sha).decode('ascii')
        # only save the zip part of the header
        self.zip_header = (raw_header[:HEADER_DIFF] +
                           self.stream.read(var_fields))
        self.descriptor = self.stream.read(header.descriptor_len)
        self.data_name = path.join(*([self.data_dir] +
                                     list(sha1[:self.depth]) + [sha1]))

    def _open_data_file(self):
        self.data = FileIO(self.data_name, 'rb')
        self.data_len = self.data.seek(0, 2)
        self.data.seek(0)

    def close(self):
        self.stream.close()
        self.dir.close()
        if self.data:
            self.data.close()

    def fileno(self):
        return self.fh

    def isatty(self):
        return False

    def read(self, count=-1):
        if count < 0:
            return self.readall()
        elif count == 0:
            return b''
        state = self.state
        if state == File.HEADER:
            previous_offset = self.offset
            self.offset += count
            result = self.zip_header[previous_offset:self.offset]
            self.cursor += len(result)
            if self.offset >= len(self.zip_header):
                self.state = File.DATA
                if not self.data:
                    self._open_data_file()
            return result
        elif state == File.DATA:
            result = self.data.read(count)
            self.cursor += len(result)
            if self.data.tell() >= self.data_len:
                self.state = File.DESCRIPTOR
                self.offset = 0
            # empty data file (state will now be DESCRIPTOR)
            if not result:
                return self.read(count)
            return result
        elif state == File.DESCRIPTOR:
            previous_offset = self.offset
            self.offset += count
            result = self.descriptor[previous_offset:self.offset]
            self.cursor += len(result)
            if self.offset >= len(self.descriptor):
                if self.cursor >= self.info.directory_offset:
                    self.state = File.DIRECTORY
                    self.dir.seek(0)
                    self.stream_offset = None
                    if self.data:
                        self.data.close()
                        self.data = None
                else:
                    self.state = File.HEADER
                    self.offset = 0
                    self.stream_offset = self.stream.tell()
                    self._load_stream_item()
            # descriptor is optional (state will now be HEADER or DIRECTORY)
            if not result:
                return self.read(count)
            return result
        elif state == File.DIRECTORY:
            result = self.dir.read(count)
            self.cursor += len(result)
            return result
        else:
            raise RuntimeError('Invalid state: %r' % self.state)

    def readable(self):
        return True

    def readinto(self, b):
        count = len(b)
        if count == 0:
            return 0
        state = self.state
        if state == File.HEADER:
            header_len = len(self.zip_header)
            previous_offset = self.offset
            current_offset = self.offset = \
                min(previous_offset + count, header_len)
            read = current_offset - previous_offset
            b[:read] = self.zip_header[previous_offset:current_offset]
            self.cursor += read
            if current_offset == header_len:
                self.state = File.DATA
                if not self.data:
                    self._open_data_file()
            return read
        elif state == File.DATA:
            read = self.data.readinto(b)
            self.cursor += read
            if self.data.tell() >= self.data_len:
                self.state = File.DESCRIPTOR
                self.offset = 0
            # empty data file (state will now be DESCRIPTOR)
            if not read:
                return self.readinto(b)
            return read
        elif state == File.DESCRIPTOR:
            descriptor_len = len(self.descriptor)
            previous_offset = self.offset
            current_offset = self.offset = \
                min(previous_offset + count, descriptor_len)
            read = current_offset - previous_offset
            b[:read] = self.descriptor[previous_offset:current_offset]
            self.cursor += read
            if current_offset == descriptor_len:
                if self.cursor >= self.info.directory_offset:
                    self.state = File.DIRECTORY
                    self.dir.seek(0)
                    self.stream_offset = None
                    if self.data:
                        self.data.close()
                        self.data = None
                else:
                    self.state = File.HEADER
                    self.offset = 0
                    self.stream_offset = self.stream.tell()
                    self._load_stream_item()
            # descriptor is optional (state will now be HEADER or DIRECTORY)
            if not read:
                return self.readinto(b)
            return read
        elif state == File.DIRECTORY:
            read = self.dir.readinto(b)
            self.cursor += read
            return read
        else:
            raise RuntimeError('Invalid state: %r' % self.state)

    def seek(self, pos, offset=0):
        if offset == 1:
            pos += self.cursor
        elif offset == 2:
            pos += self.info.filesize
        if pos == self.cursor:
            return pos
        self.cursor = pos
        # skip directly to the central directory
        if pos >= self.info.directory_offset:
            if self.data:
                self.data.close()
                self.data = None
            self.state = File.DIRECTORY
            self.stream_offset = None
            self.dir.seek(pos - self.info.directory_offset)
            return pos
        # calculate the offset into the stream file
        z_offset, s_offset = self.info.jump_tree.find(pos).location
        additional = pos - z_offset
        # we're looking at a different data file
        # (load local header into memory)
        if s_offset != self.stream_offset:
            self.stream_offset = s_offset
            self.stream.seek(s_offset)
            self._load_stream_item()
        header_len = len(self.zip_header)
        if additional < header_len:
            self.state = File.HEADER
            self.offset = additional
            return pos
        # assume currently in the data file
        additional -= header_len
        self.state = File.DATA
        # if the file hasn't been opened yet, open it and find its size
        if not self.data:
            self._open_data_file()
        if additional < self.data_len:
            self.data.seek(additional)
        else:
            self.state = File.DESCRIPTOR
            self.offset = additional - self.data_len
        return pos

    def seekable(self):
        return True

    def tell(self):
        return self.cursor

    def writable(self):
        return False

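# A usage sketch: because File subclasses RawIOBase, it can be wrapped in
# io.BufferedReader for buffered access. `info` stands for the metadata
# object the class expects (directory_offset, filesize, jump_tree); its
# construction is outside this snippet, and the function name is hypothetical.
import io

def open_exploded_zip(path, info, base='.'):
    raw = File(path, flags=0, info=info, base=base)
    return io.BufferedReader(raw)
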
def load(self, file: FileIO):
    self.ptr = file.tell()
    indic = file.read(1)
    assert unpack('B', indic)[0] in (0, 1)
    self.key, self.value = load(file)
    self.size = file.tell() - self.ptr