def get_data_by_index(fmdx, index):
    """Read a single record from an open dictionary file via an index entry.

    Args:
        fmdx: Binary file object; it is repositioned with ``seek`` and then
            read from.
        index: Mapping with the keys ``file_pos``, ``compressed_size``,
            ``record_block_type`` (0 = stored, 1 = LZO, 2 = zlib),
            ``record_start``, ``record_end`` and ``offset``.
            ``decompressed_size`` is additionally required for LZO blocks.

    Returns:
        The bytes of the record, sliced out of the decompressed record block.

    Raises:
        RuntimeError: The block is LZO-compressed but ``lzo`` is ``None``.
        ValueError: ``record_block_type`` is not 0, 1 or 2.
    """
    fmdx.seek(index['file_pos'])
    record_block_compressed = fmdx.read(index['compressed_size'])
    record_block_type = index['record_block_type']

    # The first 8 bytes (4-byte compression tag + 4-byte adler32) are not
    # part of the payload; the checksum is not verified here.
    payload = record_block_compressed[8:]

    if record_block_type == 0:
        # no compression
        record_block = payload
    elif record_block_type == 1:
        # lzo compression
        if lzo is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise RuntimeError("LZO compression is not supported")
        record_block = lzo.decompress(payload,
                                      initSize=index['decompressed_size'],
                                      blockSize=1308672)
    elif record_block_type == 2:
        # zlib compression
        record_block = zlib.decompress(payload)
    else:
        raise ValueError(
            "Unsupported record block type: %r" % (record_block_type,))

    # record_start / record_end are positions in the concatenation of all
    # decompressed blocks; ``offset`` rebases them onto this block.
    offset = index['offset']
    return record_block[index['record_start'] - offset:
                        index['record_end'] - offset]
def _decode_key_block(self, key_block_compressed, key_block_info_list):
    """Decompress every key block and collect the keys they contain.

    Args:
        key_block_compressed: Bytes holding all key blocks back to back.
            Each block starts with a 4-byte compression tag and a 4-byte
            big-endian adler32 of the decompressed block.
        key_block_info_list: Iterable of ``(compressed_size,
            decompressed_size)`` pairs, one per key block, in file order.

    Returns:
        The concatenation of ``self._split_key_block(block)`` over all
        decoded blocks. If an LZO block is met while ``lzo`` is ``None``,
        a message is printed and only the keys decoded so far are returned.

    Raises:
        ValueError: A block carries an unknown compression tag.
        AssertionError: A block's adler32 checksum does not match.
    """
    key_list = []
    start = 0
    for compressed_size, decompressed_size in key_block_info_list:
        end = start + compressed_size
        # 4 bytes : compression type
        key_block_type = key_block_compressed[start:start + 4]
        # 4 bytes : adler checksum of decompressed key block
        adler32 = unpack('>I', key_block_compressed[start + 4:start + 8])[0]
        payload = key_block_compressed[start + 8:end]

        if key_block_type == b'\x00\x00\x00\x00':
            key_block = payload
        elif key_block_type == b'\x01\x00\x00\x00':
            if lzo is None:
                print("LZO compression is not supported")
                break
            key_block = lzo.decompress(payload,
                                       initSize=decompressed_size,
                                       blockSize=1308672)
        elif key_block_type == b'\x02\x00\x00\x00':
            key_block = zlib.decompress(payload)
        else:
            # Without this branch a stale block from the previous iteration
            # would be decoded a second time.
            raise ValueError(
                "Unsupported key block compression type: %r"
                % (key_block_type,))

        # Verify before parsing so corrupt data is never split into keys.
        # The mask keeps the comparison valid where adler32 returns a
        # signed value.
        assert adler32 == zlib.adler32(key_block) & 0xffffffff
        # extract one single key block into a key list
        key_list += self._split_key_block(key_block)
        start = end
    return key_list
def get_index(self, check_block=True):
    """Build a per-key index of the record section, plus file metadata.

    For every entry of ``self._key_list`` the index stores where the record
    lives, using the same fields that ``get_data_by_index`` consumes.

    Args:
        check_block: When true, every block is decompressed and its adler32
            checksum and decompressed length are asserted. When false,
            compressed blocks are read but not decompressed.

    Returns:
        A dict with two keys:

        * ``'index_dict_list'`` - list of dicts, one per key, with
          ``file_pos`` (file position where the record block starts),
          ``compressed_size`` (bytes the block occupies in the file),
          ``decompressed_size``, ``record_block_type`` (0 = stored,
          1 = LZO, 2 = zlib), ``record_start``, ``key_text`` (decoded as
          UTF-8), ``offset`` and ``record_end``.
        * ``'meta'`` - dict with ``encoding``, ``stylesheet`` (JSON text),
          ``title`` and ``description``.

        If an LZO block is met while ``lzo`` is ``None``, a message is
        printed and the index built so far is returned.

    Raises:
        ValueError: A block carries an unknown compression tag.
        AssertionError: A header count or (with ``check_block``) a block
            checksum / length does not match.
    """
    # index list
    index_dict_list = []
    key_list = self._key_list

    # ``with`` guarantees the handle is released on every exit path.
    with open(self._fname, 'rb') as f:
        f.seek(self._record_block_offset)

        num_record_blocks = self._read_number(f)
        num_entries = self._read_number(f)
        assert num_entries == self._num_entries
        record_block_info_size = self._read_number(f)
        # Read to advance past the field; see the TODO after the loop.
        self._read_number(f)

        # record block info section
        record_block_info_list = []
        size_counter = 0
        for _ in range(num_record_blocks):
            compressed_size = self._read_number(f)
            decompressed_size = self._read_number(f)
            record_block_info_list.append((compressed_size, decompressed_size))
            size_counter += self._number_width * 2
        assert size_counter == record_block_info_size

        # actual record block data
        offset = 0  # position of the current block in the decompressed stream
        i = 0       # next key to place; keys continue across blocks
        for compressed_size, decompressed_size in record_block_info_list:
            # Remember where the block starts so it can be re-read later
            # with f.seek().
            current_pos = f.tell()
            record_block_compressed = f.read(compressed_size)
            # 4 bytes indicates block compression type
            record_block_type = record_block_compressed[:4]
            # 4 bytes adler checksum of uncompressed content
            adler32 = unpack('>I', record_block_compressed[4:8])[0]

            if record_block_type == b'\x00\x00\x00\x00':
                # no compression
                _type = 0
                record_block = record_block_compressed[8:]
            elif record_block_type == b'\x01\x00\x00\x00':
                # lzo compression
                _type = 1
                if lzo is None:
                    print("LZO compression is not supported")
                    break
                if check_block:
                    record_block = lzo.decompress(record_block_compressed[8:],
                                                  initSize=decompressed_size,
                                                  blockSize=1308672)
            elif record_block_type == b'\x02\x00\x00\x00':
                # zlib compression
                _type = 2
                if check_block:
                    record_block = zlib.decompress(record_block_compressed[8:])
            else:
                # Otherwise the previous block's type would be recorded.
                raise ValueError(
                    "Unsupported record block compression type: %r"
                    % (record_block_type,))

            if check_block:
                # The mask keeps the comparison valid where adler32 returns
                # a signed value.
                assert adler32 == zlib.adler32(record_block) & 0xffffffff
                assert len(record_block) == decompressed_size

            # split record block according to the offset info from key block
            while i < len(key_list):
                record_start, key_text = key_list[i]
                # reach the end of current record block
                if record_start - offset >= decompressed_size:
                    break
                # A record ends where the next key's record starts; the last
                # record runs to the end of the current block.
                if i < len(key_list) - 1:
                    record_end = key_list[i + 1][0]
                else:
                    record_end = decompressed_size + offset
                i += 1
                index_dict_list.append({
                    'file_pos': current_pos,
                    'compressed_size': compressed_size,
                    'decompressed_size': decompressed_size,
                    'record_block_type': _type,
                    'record_start': record_start,
                    'key_text': key_text.decode('utf-8'),
                    'offset': offset,
                    'record_end': record_end,
                })
            offset += decompressed_size
        # TODO: attention - the compressed block sizes are not checked
        # against the total size read from the section header.

    # Differs slightly from the mdd variant: the encoding and stylesheet
    # information has to be passed along as well.
    meta = {
        'encoding': self._encoding,
        'stylesheet': json.dumps(self._stylesheet),
        'title': self._title,
        'description': self._description,
    }
    return {"index_dict_list": index_dict_list, 'meta': meta}
def _decode_record_block(self):
    """Yield ``(key_text, record)`` for every key, decoding block by block.

    Each record is sliced out of its decompressed block, decoded with
    ``self._encoding`` (undecodable bytes ignored), stripped of NUL
    characters at both ends and re-encoded as UTF-8. When both
    ``self._substyle`` and ``self._stylesheet`` are truthy the record is
    additionally passed through ``self._substitute_stylesheet``.

    Yields:
        Tuples ``(key_text, record)`` where ``key_text`` is taken unchanged
        from ``self._key_list`` and ``record`` is the processed bytes.

    Raises:
        ValueError: A block carries an unknown compression tag.
        AssertionError: A header count, block checksum, block length or the
            total compressed size does not match.
    """
    key_list = self._key_list

    # ``with`` releases the handle even if the consumer abandons the
    # generator or a check fails midway.
    with open(self._fname, 'rb') as f:
        f.seek(self._record_block_offset)

        num_record_blocks = self._read_number(f)
        num_entries = self._read_number(f)
        assert num_entries == self._num_entries
        record_block_info_size = self._read_number(f)
        record_block_size = self._read_number(f)

        # record block info section
        record_block_info_list = []
        size_counter = 0
        for _ in range(num_record_blocks):
            compressed_size = self._read_number(f)
            decompressed_size = self._read_number(f)
            record_block_info_list.append((compressed_size, decompressed_size))
            size_counter += self._number_width * 2
        assert size_counter == record_block_info_size

        # actual record block data
        offset = 0  # position of the current block in the decompressed stream
        i = 0       # next key to emit; keys continue across blocks
        size_counter = 0
        for compressed_size, decompressed_size in record_block_info_list:
            record_block_compressed = f.read(compressed_size)
            # 4 bytes indicates block compression type
            record_block_type = record_block_compressed[:4]
            # 4 bytes adler checksum of uncompressed content
            adler32 = unpack('>I', record_block_compressed[4:8])[0]

            if record_block_type == b'\x00\x00\x00\x00':
                # no compression
                record_block = record_block_compressed[8:]
            elif record_block_type == b'\x01\x00\x00\x00':
                # lzo compression
                if lzo is None:
                    print("LZO compression is not supported")
                    # NOTE: stopping here leaves size_counter short, so the
                    # total-size assertion after the loop will fail.
                    break
                record_block = lzo.decompress(record_block_compressed[8:],
                                              initSize=decompressed_size,
                                              blockSize=1308672)
            elif record_block_type == b'\x02\x00\x00\x00':
                # zlib compression
                record_block = zlib.decompress(record_block_compressed[8:])
            else:
                # Otherwise the previous block would be decoded again.
                raise ValueError(
                    "Unsupported record block compression type: %r"
                    % (record_block_type,))

            # The mask keeps the comparison valid where adler32 returns a
            # signed value.
            assert adler32 == zlib.adler32(record_block) & 0xffffffff
            assert len(record_block) == decompressed_size

            # split record block according to the offset info from key block
            while i < len(key_list):
                record_start, key_text = key_list[i]
                # reach the end of current record block
                if record_start - offset >= len(record_block):
                    break
                # A record ends where the next key's record starts; the last
                # record runs to the end of the current block.
                if i < len(key_list) - 1:
                    record_end = key_list[i + 1][0]
                else:
                    record_end = len(record_block) + offset
                i += 1
                record = record_block[record_start - offset:record_end - offset]
                # convert to utf-8
                record = record.decode(self._encoding, errors='ignore') \
                               .strip(u'\x00').encode('utf-8')
                # substitute styles, if enabled
                if self._substyle and self._stylesheet:
                    record = self._substitute_stylesheet(record)
                yield key_text, record
            offset += len(record_block)
            size_counter += compressed_size
        assert size_counter == record_block_size
def get_index(self, check_block=True):
    """Build a per-key index of the record section of ``self._fname``.

    Args:
        check_block: When true, every block is decompressed and its adler32
            checksum and decompressed length are asserted. When false,
            blocks are read but not decompressed.

    Returns:
        A list of dicts, one per entry of ``self._key_list``, with the keys
        ``file_pos`` (file position where the record block starts),
        ``compressed_size``, ``decompressed_size``, ``record_block_type``
        (0 = stored, 1 = LZO, 2 = zlib), ``record_start``, ``key_text``
        (decoded as UTF-8), ``offset`` and ``record_end``.

    Raises:
        ValueError: A block carries an unknown compression tag.
        AssertionError: A header count, the total compressed size, or (with
            ``check_block``) a block checksum / length does not match.
    """
    index_dict_list = []
    key_list = self._key_list

    # ``with`` releases the handle even when one of the checks fails.
    with open(self._fname, 'rb') as f:
        f.seek(self._record_block_offset)

        num_record_blocks = self._read_number(f)
        num_entries = self._read_number(f)
        assert num_entries == self._num_entries
        record_block_info_size = self._read_number(f)
        record_block_size = self._read_number(f)

        # record block info section
        record_block_info_list = []
        size_counter = 0
        for _ in range(num_record_blocks):
            compressed_size = self._read_number(f)
            decompressed_size = self._read_number(f)
            record_block_info_list.append((compressed_size, decompressed_size))
            size_counter += self._number_width * 2
        # TODO: attention!!!
        assert size_counter == record_block_info_size

        # actual record block
        offset = 0  # position of the current block in the decompressed stream
        i = 0       # next key to place; keys continue across blocks
        size_counter = 0
        for compressed_size, decompressed_size in record_block_info_list:
            current_pos = f.tell()
            record_block_compressed = f.read(compressed_size)
            # 4 bytes: compression type
            record_block_type = record_block_compressed[:4]
            # 4 bytes: adler32 checksum of decompressed record block
            adler32 = unpack('>I', record_block_compressed[4:8])[0]

            if record_block_type == b'\x00\x00\x00\x00':
                _type = 0
                if check_block:
                    record_block = record_block_compressed[8:]
            elif record_block_type == b'\x01\x00\x00\x00':
                _type = 1
                if lzo is None:
                    print("LZO compression is not supported")
                    # NOTE: stopping here leaves size_counter short, so the
                    # total-size assertion after the loop will fail.
                    break
                if check_block:
                    # The payload starts right after the 8-byte block
                    # header; this function has no start/end offsets.
                    record_block = lzo.decompress(record_block_compressed[8:],
                                                  initSize=decompressed_size,
                                                  blockSize=1308672)
            elif record_block_type == b'\x02\x00\x00\x00':
                _type = 2
                if check_block:
                    record_block = zlib.decompress(record_block_compressed[8:])
            else:
                # Otherwise the previous block's type would be recorded.
                raise ValueError(
                    "Unsupported record block compression type: %r"
                    % (record_block_type,))

            if check_block:
                # The mask keeps the comparison valid where adler32 returns
                # a signed value.
                assert adler32 == zlib.adler32(record_block) & 0xffffffff
                assert len(record_block) == decompressed_size

            # split record block according to the offset info from key block
            while i < len(key_list):
                record_start, key_text = key_list[i]
                # reach the end of current record block
                if record_start - offset >= decompressed_size:
                    break
                # A record ends where the next key's record starts; the last
                # record runs to the end of the current block.
                if i < len(key_list) - 1:
                    record_end = key_list[i + 1][0]
                else:
                    record_end = decompressed_size + offset
                i += 1
                index_dict_list.append({
                    'file_pos': current_pos,
                    'compressed_size': compressed_size,
                    'decompressed_size': decompressed_size,
                    'record_block_type': _type,
                    'record_start': record_start,
                    'key_text': key_text.decode("utf-8"),
                    'offset': offset,
                    'record_end': record_end,
                })
            offset += decompressed_size
            size_counter += compressed_size
        assert size_counter == record_block_size
    return index_dict_list
def _decode_record_block(self):
    """Yield ``(key_text, data)`` for every key, decoding block by block.

    ``data`` is the raw slice of the decompressed record block belonging to
    the key; no text decoding is applied.

    Yields:
        Tuples ``(key_text, data)`` where ``key_text`` is taken unchanged
        from ``self._key_list`` and ``data`` is bytes.

    Raises:
        ValueError: A block carries an unknown compression tag.
        AssertionError: A header count, block checksum, block length or the
            total compressed size does not match.
    """
    key_list = self._key_list

    # ``with`` releases the handle even if the consumer abandons the
    # generator or a check fails midway.
    with open(self._fname, 'rb') as f:
        f.seek(self._record_block_offset)

        num_record_blocks = self._read_number(f)
        num_entries = self._read_number(f)
        assert num_entries == self._num_entries
        record_block_info_size = self._read_number(f)
        record_block_size = self._read_number(f)

        # record block info section
        record_block_info_list = []
        size_counter = 0
        for _ in range(num_record_blocks):
            compressed_size = self._read_number(f)
            decompressed_size = self._read_number(f)
            record_block_info_list.append((compressed_size, decompressed_size))
            size_counter += self._number_width * 2
        assert size_counter == record_block_info_size

        # actual record block
        offset = 0  # position of the current block in the decompressed stream
        i = 0       # next key to emit; keys continue across blocks
        size_counter = 0
        for compressed_size, decompressed_size in record_block_info_list:
            record_block_compressed = f.read(compressed_size)
            # 4 bytes: compression type
            record_block_type = record_block_compressed[:4]
            # 4 bytes: adler32 checksum of decompressed record block
            adler32 = unpack('>I', record_block_compressed[4:8])[0]

            if record_block_type == b'\x00\x00\x00\x00':
                record_block = record_block_compressed[8:]
            elif record_block_type == b'\x01\x00\x00\x00':
                if lzo is None:
                    print("LZO compression is not supported")
                    # NOTE: stopping here leaves size_counter short, so the
                    # total-size assertion after the loop will fail.
                    break
                # The payload starts right after the 8-byte block header;
                # this function has no start/end offsets.
                record_block = lzo.decompress(record_block_compressed[8:],
                                              initSize=decompressed_size,
                                              blockSize=1308672)
            elif record_block_type == b'\x02\x00\x00\x00':
                record_block = zlib.decompress(record_block_compressed[8:])
            else:
                # Otherwise the previous block would be decoded again.
                raise ValueError(
                    "Unsupported record block compression type: %r"
                    % (record_block_type,))

            # The mask keeps the comparison valid where adler32 returns a
            # signed value.
            assert adler32 == zlib.adler32(record_block) & 0xffffffff
            assert len(record_block) == decompressed_size

            # split record block according to the offset info from key block
            while i < len(key_list):
                record_start, key_text = key_list[i]
                # reach the end of current record block
                if record_start - offset >= len(record_block):
                    break
                # A record ends where the next key's record starts; the last
                # record runs to the end of the current block.
                if i < len(key_list) - 1:
                    record_end = key_list[i + 1][0]
                else:
                    record_end = len(record_block) + offset
                i += 1
                yield key_text, record_block[record_start - offset:
                                             record_end - offset]
            offset += len(record_block)
            size_counter += compressed_size
        assert size_counter == record_block_size