Example #1
def read_from_file(file: io.FileIO, start: int, stop: int) -> bytes:
    assert stop > start
    file.seek(start)
    data = bytes()
    while file.tell() < stop:
        read_data = file.read(stop - file.tell())
        if read_data == b'':
            raise ReachEndOfFile('Read until the end of file')
        data += read_data
    assert len(data) == stop - start
    return data
Example #2
def read_from_file(file_fd: io.FileIO, start: int, stop: int) -> bytes:
    length = stop - start
    assert length >= 0
    file_fd.seek(start)
    data = bytes()
    while file_fd.tell() < stop:
        read_data = file_fd.read(stop - file_fd.tell())
        if read_data == b'':
            raise ReachedEndOfFile('Read until the end of file')
        data += read_data
    assert len(data) == length
    return data
Example #3
def read_from_file(file_fd: io.FileIO, start: int, stop: int) -> bytes:
    length = stop - start
    assert length >= 0
    file_fd.seek(start)
    data = bytes()
    while file_fd.tell() < stop:
        read_data = file_fd.read(stop - file_fd.tell())
        if read_data == b'':
            raise EndOfFileError('Read until the end of file_fd')
        data += read_data
    assert len(data) == length
    return data
Example #4
def read_from_file(file_fd: io.FileIO, start: int, end: int) -> bytes:
    length = end - start
    assert length >= 0
    file_fd.seek(start)
    data = bytes()
    while file_fd.tell() < end:
        # The read() (when called with a positive argument), readinto() and write() methods on this class will only make one system call.
        read_data = file_fd.read(end - file_fd.tell())
        if read_data == b'':
            raise EndOfFileError('read until the end of file_fd')
        data += read_data
    assert len(data) == length
    return data
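The loop in these read_from_file variants exists because, as the quoted comment notes, a single FileIO.read() call may return fewer bytes than requested. A minimal usage sketch, assuming one of the definitions above (and its end-of-file exception class) is in scope; the file name is hypothetical:

import io

with open('demo.bin', 'wb') as f:        # create a small file to read back
    f.write(bytes(range(256)))

with io.FileIO('demo.bin', 'r') as f:
    chunk = read_from_file(f, 16, 32)    # read bytes 16..31
assert chunk == bytes(range(16, 32))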
Example #5
    def load(self, file: FileIO):
        self.ptr = file.tell()
        self.is_leaf, self.keys = load(file)

        ptr_num = len(self.keys)
        if not self.is_leaf:
            ptr_num += (ptr_num + 1)
        ptrs = unpack('Q' * ptr_num, file.read(8 * ptr_num))

        if self.is_leaf:
            self.ptrs_value = list(ptrs)
        else:
            self.ptrs_value = list(ptrs[:len(self.keys)])
            self.ptrs_child = list(ptrs[len(self.keys):])
        self.size = file.tell() - self.ptr
Example #6
    def load(self, file: FileIO):
        self.ptr = file.tell()
        # IndexNode: [is_leaf, [..., key]] + ptrs_value + ptrs_child if not is_leaf
        self.is_leaf, self.keys = load(file)

        ptr_num = len(self.keys)
        if not self.is_leaf:
            ptr_num += (ptr_num + 1)
        ptrs = unpack('Q' * ptr_num, file.read(8 * ptr_num))

        if self.is_leaf:
            self.ptrs_value = list(ptrs)
        else:
            self.ptrs_value = list(ptrs[:len(self.keys)])
            self.ptrs_child = list(ptrs[len(self.keys):])
        self.size = file.tell() - self.ptr
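Per the IndexNode comment above, a non-leaf node with n keys stores n value pointers followed by n + 1 child pointers, each an 8-byte unsigned integer. A self-contained sketch of that decode step (key and pointer values are made up):

from struct import pack, unpack

keys = ['a', 'b', 'c']
is_leaf = False
ptr_num = len(keys)
if not is_leaf:
    ptr_num += ptr_num + 1                      # n value ptrs + (n + 1) child ptrs
raw = pack('Q' * ptr_num, *range(ptr_num))      # stand-in for file.read(8 * ptr_num)
ptrs = unpack('Q' * ptr_num, raw)
ptrs_value = list(ptrs[:len(keys)])
ptrs_child = list(ptrs[len(keys):])
assert len(ptrs_value) == 3 and len(ptrs_child) == 4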
Example #7
    def load(self, file: FileIO):
        self.ptr = file.tell()
        self.is_leaf, self.keys = load(file)

        ptr_num = len(self.keys)
        if not self.is_leaf:
            ptr_num += (ptr_num + 1)
        ptrs = unpack('Q' * ptr_num, file.read(8 * ptr_num))

        if self.is_leaf:
            self.ptrs_value = list(ptrs)
        else:
            ptr_num //= 2
            self.ptrs_value = list(ptrs[:ptr_num])
            self.ptrs_child = list(ptrs[ptr_num:])
        self.size = file.tell() - self.ptr
Example #8
def usenet_reader(zp: FileIO):
    """
    An iterator that takes a file-like object and yields the usenet posts it
    contains, in order, according to RFC 1036 and the later NetNews formats.
    :param io.FileIO zp: a file that contains a usenet or netnews mailbox
    :return: (post, size in bytes) tuples, one per post, iteratively
    """
    outfile = b''
    line = True
    spot = zp.tell()
    while line:
        line = zp.readline()
        if re.match(b'From [\\d-]+$', line):
            if outfile != b'':
                yield outfile.decode(errors='replace'), zp.tell() - spot
                spot = zp.tell()
            outfile = b''

        outfile += line
    # Emit the final post, which is not followed by another "From " separator.
    if outfile:
        yield outfile.decode(errors='replace'), zp.tell() - spot
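A hypothetical driver for usenet_reader (news.mbox is a made-up path; `re` must be imported for the generator itself):

with open('news.mbox', 'rb') as fp:
    for post, size in usenet_reader(fp):
        print('%d-byte post starting: %s' % (size, post[:40]))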
Example #9
    def read_id3(file_handle: FileIO, skip_v1: bool = False) -> ID3Base:
        id3 = ID3v2(file_handle)
        if id3.is_valid_id3 or skip_v1:
            return id3

        # Check for an id3v1 tag
        current_file_position = file_handle.tell()
        file_handle.seek(-128, SEEK_END)
        block = file_handle.read(128)
        id3 = ID3v1(block)

        file_handle.seek(current_file_position, SEEK_SET)
        return id3
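The save/seek-from-end/restore pattern used by read_id3, shown in isolation with an in-memory stand-in for the file handle:

import io, os

f = io.BytesIO(b'\x00' * 200 + b'TAG' + b'\x00' * 125)  # fake file with a 128-byte trailer
pos = f.tell()                  # save the current position, as read_id3 does
f.seek(-128, os.SEEK_END)       # jump to the trailing 128-byte block
block = f.read(128)
f.seek(pos, os.SEEK_SET)        # restore the saved position
assert block.startswith(b'TAG') and f.tell() == pos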
Example #10
class filestream_range_iterator(Iterable):
    """
    A class that mimics FileIO and implements an iterator that returns a
    fixed-sized sequence of bytes. Beginning from `start` to `end`.

    BBB: due to a possible bug in Zope>4, <=4.1.3, couldn't be subclass of FileIO
         as Iterators.filestream_iterator
    """

    def __init__(self, name, mode='rb', bufsize=-1, streamsize=1 << 16, start=0, end=None):
        self._io = FileIO(name, mode=mode)
        self.streamsize = streamsize
        self.start = start
        self.end = end
        self._io.seek(start, 0)

    def __iter__(self):
        if self._io.closed:
            raise ValueError("I/O operation on closed file.")
        return self

    def __next__(self):
        if self.end is None:
            count = self.streamsize
        else:
            count = max(min(self.end - self._io.tell(), self.streamsize), 0)
        data = self._io.read(count)
        if not data:
            raise StopIteration
        return data

    next = __next__

    def close(self):
        self._io.close()

    # BBB: is it necessary to implement __len__ ?
    # def __len__(self)

    def read(self, size=-1):
        return self._io.read(size)
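A minimal usage sketch for filestream_range_iterator, assuming the class above is in scope; payload.bin is a hypothetical path:

with open('payload.bin', 'wb') as f:
    f.write(b'\x00' * 10000)

it = filestream_range_iterator('payload.bin', start=100, end=5000)
total = sum(len(chunk) for chunk in it)     # chunks are at most streamsize bytes
it.close()
assert total == 4900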
Example #11
    def read_from(cls, sector_file: io.FileIO) -> "SectorObjects":

        # Save current position in file
        tell = sector_file.tell()

        # Go to end of file minus size of length.
        sector_file.seek(-cls.length_parser.size, 2)

        length, = cls.length_parser.unpack_from_file(sector_file)

        print(length)

        objects = []

        if length:
            # Go back to saved position
            sector_file.seek(tell)

            for _ in range(length):
                objects.append(Object.read_from(sector_file))

        return SectorObjects(objects=objects)
Example #12
 def dump(self, file: FileIO):
     self.ptr = file.tell()
     file.write(bytes(self))
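These dump/load snippets all use the same idiom: record file.tell() before the record and subtract afterwards to get its size. The idiom in isolation, with BytesIO standing in for the FileIO handle and a made-up record layout:

import io
from struct import pack

buf = io.BytesIO()
ptr = buf.tell()                 # remember where the record starts
buf.write(pack('<IQ', 7, 42))    # hypothetical record: a 4-byte and an 8-byte field
size = buf.tell() - ptr          # size of what was just written
assert ptr == 0 and size == 12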
Example #13
def _parse_cm(hf, data, progress_callback):
    failures = []

    # is it a bytes-like?
    try:
        with memoryview(data) as mview:
            _parse(hf, mview, progress_callback)
            yield hf
            return
    except (HprofError, BufferError):
        # _parse failed
        raise
    except Exception as e:  # pylint: disable=broad-except
        # we failed before calling _parse
        failures.append(('bytes-like?', e))

    # can it be mmapped?
    from mmap import mmap, ACCESS_READ
    from io import BufferedReader
    import os
    if isinstance(data, BufferedReader):
        fno = data.fileno()
        fsize = os.fstat(fno).st_size
        with mmap(fno, fsize, access=ACCESS_READ) as mapped:
            with memoryview(mapped) as mview:
                _parse(hf, mview, progress_callback)
                yield hf
                return

    # can it be read?
    try:
        from tempfile import TemporaryFile
        from io import FileIO
        underlying_file = FileIO(data.fileno(), closefd=False)
        insize = os.fstat(underlying_file.fileno()).st_size
        with TemporaryFile() as f:
            buf = bytearray(256 * 1024)
            fsize = 0
            while True:
                if progress_callback:
                    progress_callback('extracting',
                                      min(underlying_file.tell(), insize - 1),
                                      insize)
                nread = data.readinto(buf)
                if not nread:
                    break
                fsize += nread
                f.write(buf[:nread])
            f.flush()
            if progress_callback:
                progress_callback('extracting', insize, insize)
            with mmap(f.fileno(), fsize) as mapped:
                with memoryview(mapped) as mview:
                    _parse(hf, mview, progress_callback)
                    yield hf
                    return
    except BufferError:
        raise
    except Exception as e:  # pylint: disable=broad-except
        prev = e
        while prev is not None:
            if isinstance(prev, HprofError):
                raise e
            prev = prev.__context__  # pylint: disable=redefined-variable-type
        failures.append(('tmpfile?', e))

    raise TypeError('cannot handle `data` arg', data, *failures)
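A self-contained sketch of the mmap-plus-memoryview pattern _parse_cm uses for its tempfile fallback (the payload bytes are made up):

import os
from mmap import mmap, ACCESS_READ
from tempfile import TemporaryFile

with TemporaryFile() as f:
    f.write(b'hello hprof')
    f.flush()
    size = os.fstat(f.fileno()).st_size
    with mmap(f.fileno(), size, access=ACCESS_READ) as mapped:
        with memoryview(mapped) as mview:       # zero-copy view over the mapping
            assert bytes(mview[:5]) == b'hello'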
Example #14
 def load(self, file: FileIO):
     self.ptr = file.tell()
     indicator = file.read(1)
     assert indicator in (OP, ED)
     self.key, self.value = load(file)
     self.size = file.tell() - self.ptr
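The one-byte indicator check from these load() snippets, in isolation. OP and ED are assumed to be single-byte markers defined elsewhere; the values here are hypothetical:

import io
from struct import unpack

OP, ED = b'\x00', b'\x01'
buf = io.BytesIO(b'\x01payload')
indicator = buf.read(1)
assert indicator in (OP, ED)
assert unpack('B', indicator)[0] in (0, 1)   # the struct-based variant of the same check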
Example #15
    def _tell_bytes(self):
        '''
        Returns the file pointer position in bytes.
        '''

        return FileIO.tell(self)
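Calling the unbound FileIO.tell(self) only makes sense when the subclass overrides tell(); a hypothetical reconstruction of such a context:

from io import FileIO

class RecordFile(FileIO):
    RECORD_SIZE = 4                                    # hypothetical fixed record width

    def tell(self):
        return FileIO.tell(self) // self.RECORD_SIZE  # position in records

    def _tell_bytes(self):
        return FileIO.tell(self)                       # bypass the override: raw bytes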
Example #16
def parse(f: FileIO, dbfile: str, use_prefix: bool = False) -> bool:
    prefix = "".join(
        c for c in Path(str(f.name)).name.replace(".cfg.bin", "").upper()
        if "A" <= c <= "Z" or "0" <= c <= "9" or c == "_")
    prefix += "_"

    magic = f.read(4)
    if magic != b"RDBN":
        logger.error("magic not found")
        return False

    header_size = int.from_bytes(f.read(2), "little")
    if header_size != 0x50:
        logger.error("header must be 50 byte long")
        return False
    f.seek(0)
    header = parse_header(f.read(header_size))

    logger.debug(header)

    f.seek(header.header_size + header.body_size)
    strings = f.read()
    strings_table: Dict[int, str] = {}
    for i in strings.rstrip(b"\0").split(b"\0"):
        strings_table[binascii.crc32(i)] = i.decode()

    f.seek(header.header_size)

    tmp_tables: List[Table] = []
    nondata_strings: List[str] = []
    for i in range(header.table_count):
        name_crc, unk1, col_offset, col_count, zero1, zero2 = struct.unpack(
            "<2I 2H II", f.read(header.item_size))
        f.read(header.item_data_size - header.item_size)
        table = Table(
            id=name_crc,
            name=strings_table[name_crc],
            unk1=unk1,
            col_offset=col_offset,
            col_count=col_count,
            zero1=zero1,
            zero2=zero2,
            columns=[],
        )
        logger.debug(table)
        nondata_strings.append(table.name)
        tmp_tables.append(table)

    tmp_columns: List[Column] = []
    for i in range(header.column_count):
        name_crc, subid, id, size, offset, count = struct.unpack(
            "<I 2H 2I I", f.read(header.item_size))
        f.read(header.item_data_size - header.item_size)
        col = Column(
            id=name_crc,
            name=strings_table[name_crc],
            typeid=id,
            sub_typeid=subid,
            size=size,
            offset=offset,
            count=count,
        )
        logger.debug(col)
        nondata_strings.append(col.name)
        tmp_columns.append(col)
    columns = {x.name: x for x in tmp_columns}

    # table-column relationships
    for t in tmp_tables:
        for i in range(t.col_offset, t.col_offset + t.col_count):
            t.columns.append(tmp_columns[i])
        logger.debug("table {} consists of columns {}".format(
            t.name, [c.name for c in t.columns]))

    tables = {x.name: x for x in tmp_tables}

    tmp_lists: List[DataList] = []
    lists: Dict[str, DataList] = {}
    for i in range(header.list_count):
        idx, unk, offset, size, count, listname_crc = struct.unpack(
            "<2HIIII", f.read(header.item_size))
        f.read(header.item_data_size - header.item_size)
        nondata_strings.append(strings_table[listname_crc])
        list_ = DataList(
            id=listname_crc,
            index=idx,
            name=strings_table[listname_crc],
            unk=unk,
            offset=offset,
            size=size,
            count=count,
        )
        tmp_lists.append(list_)
        logger.debug(list_)
    lists = {x.name: x for x in tmp_lists}

    con = sqlite3.connect(dbfile)

    # list-table relationship
    list_table: Dict[str, str] = {}  # list_name -> table_name
    for l in lists.values():
        table_cand = next(
            (t.name for idx, t in enumerate(tmp_tables) if idx == l.index),
            None)
        if table_cand is None:
            logger.warning("table for list {} not found".format(l))
            return False
        list_table[l.name] = table_cand
        logger.debug("list {} is a list for tabel {}".format(
            l.name, table_cand))

    # list, table, and column ids <-> string table offset relations
    # --
    # All item ids discovered so far equal the crc32 of the item names,
    # so this relation is not strictly required.
    ids = [
        int.from_bytes(f.read(4), "little")
        for _ in range(header.id_name_table_size // 8)
    ]
    name_offsets = [
        int.from_bytes(f.read(4), "little")
        for _ in range(header.id_name_table_size // 8)
    ]
    all_items: Dict[int, str] = {i.id: i.name for i in lists.values()}
    all_items.update({i.id: i.name for i in tables.values()})
    all_items.update({i.id: i.name for i in columns.values()})
    for id, name_offset in zip(ids, name_offsets):
        name = all_items.get(id, None)
        if name is None:
            logger.warning(
                "id (crc32 of name) {} is not recorded, but found in id-name table"
                .format(id))
            continue
        name_ = strings[name_offset:].split(b"\0")[0].decode()
        if name != name_:
            logger.warning("name for id {} should be {} but {}".format(
                id, name_, name))

    list_sorter: Callable[[DataList], int] = lambda l: l.offset
    for l in sorted(lists.values(), key=list_sorter):
        f.seek(header.header_size + header.item_data_size *
               (header.table_count + header.column_count + header.list_count) +
               header.id_name_table_size + l.offset)
        table_name = list_table[l.name]
        table_name_sql = table_name if not use_prefix else (prefix +
                                                            table_name)

        # fetch type name
        table_type = tables[table_name]

        logger.debug("list {} starts at 0x{:08x}".format(l.name, f.tell()))

        # type convertors
        convertors = [
            DBType(c.typeid, c.sub_typeid, c.name, c.size, c.count)
            for c in table_type.columns
        ]

        # get table information
        columns = ", ".join("{} {}".format(c.name, c.sqlite_type)
                            for c in convertors)

        con.execute("CREATE TABLE IF NOT EXISTS {} ({});".format(
            table_name_sql, columns))

        # insert information
        for i in range(l.count):
            row_data = f.read(l.size)
            row_out: List[Optional[Union[str, int, bytes]]] = []
            pos = 0
            last_offset: Optional[int] = None
            for col, conv in zip(table_type.columns, convertors):
                if last_offset:
                    pad = col.offset - last_offset
                    if pad > 0:
                        pos += pad
                        if i == 0:
                            logger.debug(
                                "{}-byte padding inserted at {}".format(
                                    pad, pos))
                    elif pad < 0:
                        logger.error("padding could not be negative")
                data = row_data[pos:pos + col.size * col.count]
                pos += col.size * col.count
                last_offset = col.offset + col.size

                if conv.id == 3 and conv.subid in (0x14, 0x15):
                    addr = conv.convert(data)
                    assert isinstance(addr, int)
                    if addr != 0 and addr != 0xFFFFFFFF and addr < len(
                            strings):
                        # TODO: more accurate string offset detection
                        s = strings[addr:].split(b"\0")[0].decode()
                        if strings[addr - 1] != 0:
                            data = "[{:08x}]".format(addr)
                        elif s in nondata_strings:
                            data = "[{:08x}]".format(addr)
                        else:  # OK
                            data = s
                    else:
                        data = "[{:08x}]".format(addr)
                else:
                    data = conv.convert(data)
                row_out.append(data)
            if i == 0 and pos != l.size:
                logger.debug(
                    "data reading ends at {}, leaving {} byte unread".format(
                        pos, l.size - pos))
                logger.debug(
                    "unread data (only the first row will be shown): {}".
                    format(row_data[pos:]))
            placeholder = ", ".join("?" * len(row_out))
            con.execute(
                "INSERT INTO {} VALUES ({});".format(table_name_sql,
                                                     placeholder), row_out)
        logger.debug("list {} ends at 0x{:08x}".format(l.name, f.tell()))

    con.commit()
    con.close()
    logger.debug("proccessing finished at 0x{:08x}".format(f.tell()))
    if f.tell() != header.header_size + header.body_size:
        logger.warning(
            "data parsing finished at 0x{:08x}, but the data seems ends at {:08x}"
            .format(
                f.tell(),
                header.header_size + header.body_size,
            ))
    return True
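The crc32-keyed string table built near the top of parse(), shown in isolation with a made-up string block:

import binascii

strings = b'CHARA\x00ITEM\x00\x00'   # hypothetical NUL-separated string block
strings_table = {binascii.crc32(s): s.decode()
                 for s in strings.rstrip(b'\0').split(b'\0')}
assert strings_table[binascii.crc32(b'ITEM')] == 'ITEM'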
Example #17
 def load(self, file: FileIO):
     self.ptr = file.tell()
     indicator = file.read(1)
     assert unpack('B', indicator)[0] in (0, 1)
     self.key, self.value = load(file)
     self.size = file.tell() - self.ptr
Example #18
class File(RawIOBase):
    'Create a file object wrapping an e[x]ploded zip file'

    HEADER = 0
    DATA = 1
    DESCRIPTOR = 2
    DIRECTORY = 3

    def __init__(self, path, flags, info, fh=None, base='.', depth=0):
        super(File, self).__init__()

        self.path = path
        self.flags = flags
        self.fh = fh

        self.info = info
        self.depth = depth
        self.cursor = 0
        self.offset = 0
        self.state = File.HEADER

        # stream item info
        self.stream_offset = 0
        self.zip_header = b''
        self.descriptor = b''

        # data file info
        self.data = None
        self.data_name = ''
        self.data_len = 0

        # streams
        prefix = os.path.join(base, 'meta', os.path.basename(path))
        self.stream = FileIO(prefix + '.stream', 'rb')
        self.dir = FileIO(prefix + '.dir', 'rb')
        self.data_dir = os.path.join(base, 'data')

        # init
        self._load_stream_item()
        self.lock = threading.Lock()

    def _load_stream_item(self):
        'Sets the next stream item as current.'

        if self.data:
            self.data.close()
            self.data = None

        # open the header so we can know the data file to open, and the
        # length of the var fields
        raw_header = self.stream.read(STREAM_ITEM.size)
        header = StreamItem._make(STREAM_ITEM.unpack(raw_header))

        var_fields = header.filename_len + header.extra_field_len
        # I would think that b2a_hex should decode the raw bytes...
        sha1 = b2a_hex(header.sha).decode('ascii')

        # only save the zip part of the header
        self.zip_header = (raw_header[:HEADER_DIFF] +
                           self.stream.read(var_fields))

        self.descriptor = self.stream.read(header.descriptor_len)

        self.data_name = os.path.join(*([self.data_dir] +
                                        list(sha1[:self.depth]) + [sha1]))

    def _open_data_file(self):
        self.data = FileIO(self.data_name, 'rb')
        self.data_len = self.data.seek(0, 2)
        self.data.seek(0)

    def close(self):
        self.stream.close()
        self.dir.close()
        if self.data: self.data.close()

    def fileno(self):
        return self.fh

    def isatty(self):
        return False

    def read(self, count=-1):
        if count < 0: return self.readall()
        elif count == 0: return b''

        state = self.state
        if state == File.HEADER:
            previous_offset = self.offset
            self.offset += count

            result = self.zip_header[previous_offset:self.offset]
            self.cursor += len(result)

            if self.offset >= len(self.zip_header):
                self.state = File.DATA
                if not self.data: self._open_data_file()

            return result

        elif state == File.DATA:
            result = self.data.read(count)
            self.cursor += len(result)

            if self.data.tell() >= self.data_len:
                self.state = File.DESCRIPTOR
                self.offset = 0

            # empty data file (state will now be DESCRIPTOR)
            if not result: return self.read(count)

            return result

        elif state == File.DESCRIPTOR:
            previous_offset = self.offset
            self.offset += count

            result = self.descriptor[previous_offset:self.offset]
            self.cursor += len(result)

            if self.offset >= len(self.descriptor):
                if self.cursor >= self.info.directory_offset:
                    self.state = File.DIRECTORY
                    self.dir.seek(0)
                    self.stream_offset = None

                    if self.data:
                        self.data.close()
                        self.data = None

                else:
                    self.state = File.HEADER
                    self.offset = 0
                    self.stream_offset = self.stream.tell()
                    self._load_stream_item()

            # descriptor is optional (state will now be HEADER or DIRECTORY)
            if not result: return self.read(count)

            return result
        elif state == File.DIRECTORY:
            result = self.dir.read(count)
            self.cursor += len(result)

            return result
        else:
            raise RuntimeError('Invalid state: %r' % self.state)

    def readable(self):
        return True

    def readinto(self, b):
        count = len(b)
        if count == 0: return 0

        state = self.state
        if state == File.HEADER:
            header_len = len(self.zip_header)
            previous_offset = self.offset

            current_offset = self.offset = \
                    min(previous_offset + count, header_len)

            read = current_offset - previous_offset
            b[:read] = self.zip_header[previous_offset:current_offset]
            self.cursor += read

            if current_offset == header_len:
                self.state = File.DATA
                if not self.data: self._open_data_file()

            return read

        elif state == File.DATA:
            read = self.data.readinto(b)
            self.cursor += read

            if self.data.tell() >= self.data_len:
                self.state = File.DESCRIPTOR
                self.offset = 0

            # empty data file (state will now be DESCRIPTOR)
            if not read: return self.readinto(b)

            return read

        elif state == File.DESCRIPTOR:
            descriptor_len = len(self.descriptor)
            previous_offset = self.offset

            current_offset = self.offset = \
                    min(previous_offset + count, descriptor_len)

            read = current_offset - previous_offset
            b[:read] = self.descriptor[previous_offset:current_offset]
            self.cursor += read

            if current_offset == descriptor_len:
                if self.cursor >= self.info.directory_offset:
                    self.state = File.DIRECTORY
                    self.dir.seek(0)
                    self.stream_offset = None

                    if self.data:
                        self.data.close()
                        self.data = None

                else:
                    self.state = File.HEADER
                    self.offset = 0
                    self.stream_offset = self.stream.tell()
                    self._load_stream_item()

            # descriptor is optional (state will now be HEADER or DIRECTORY)
            if not read: return self.readinto(b)

            return read
        elif state == File.DIRECTORY:
            read = self.dir.readinto(b)
            self.cursor += read

            return read
        else:
            raise RuntimeError('Invalid state: %r' % self.state)

    def seek(self, pos, offset=0):
        if offset == 1:
            pos += self.cursor
        elif offset == 2:
            pos += self.info.filesize

        if pos == self.cursor: return pos
        self.cursor = pos

        # skip directly to the central directory
        if pos >= self.info.directory_offset:
            if self.data:
                self.data.close()
                self.data = None

            self.state = File.DIRECTORY
            self.stream_offset = None
            self.dir.seek(pos - self.info.directory_offset)
            return pos

        # calculate the offset into the stream file
        z_offset, s_offset = self.info.jump_tree.find(pos).location
        additional = pos - z_offset

        # we're looking at a different data file
        # (load local header into memory)
        if s_offset != self.stream_offset:
            self.stream_offset = s_offset
            self.stream.seek(s_offset)
            self._load_stream_item()

        header_len = len(self.zip_header)
        if additional < header_len:
            self.state = File.HEADER
            self.offset = additional
            return pos

        # assume currently in the data file
        additional -= header_len
        self.state = File.DATA

        # if the file hasn't been opened yet, open it and find its size
        if not self.data: self._open_data_file()

        if additional < self.data_len:
            self.data.seek(additional)
        else:
            self.state = File.DESCRIPTOR
            self.offset = additional - self.data_len

        return pos

    def seekable(self):
        return True

    def tell(self):
        return self.cursor

    def writable(self):
        return False
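File.seek resolves its second argument (named offset here, but acting as a whence flag) by hand; the same resolution as a standalone function, with hypothetical cursor and file-size values:

import os

def resolve_seek(pos, whence, cursor, filesize):
    if whence == os.SEEK_CUR:    # 1: relative to the current cursor
        pos += cursor
    elif whence == os.SEEK_END:  # 2: relative to the end of the file
        pos += filesize
    return pos                   # 0 (SEEK_SET): absolute, unchanged

assert resolve_seek(10, os.SEEK_CUR, cursor=90, filesize=1000) == 100
assert resolve_seek(-128, os.SEEK_END, cursor=0, filesize=1000) == 872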
Example #19
 def load(self, file: FileIO):
     self.ptr = file.tell()
     indic = file.read(1)
     assert unpack('B', indic)[0] in (0, 1)
     self.key, self.value = load(file)
     self.size = file.tell() - self.ptr