def encode(string, symbol_to_encoding_dict):
    """Encode ``string`` into bytes using a per-symbol code table.

    The result starts with a padding prefix of ``padding - 1`` one-bits
    followed by a single zero-bit, chosen so that prefix plus payload
    fill a whole number of bytes.  A payload that is already byte-aligned
    still receives a full 8-bit prefix, so a zero-bit terminator is always
    present.

    Args:
        string: sequence of symbols; it is iterated twice, and every
            symbol must be a key of ``symbol_to_encoding_dict``.
        symbol_to_encoding_dict: maps each symbol to its code; the code
            must support ``len()`` and be accepted by ``BitArray(...)``.

    Returns:
        Whatever ``BitArray.tobytes()`` yields for prefix + payload.

    Raises:
        KeyError: if a symbol has no entry in the table.
    """
    # First pass: payload length in bits, needed to size the prefix.
    payload_bits = sum(
        len(symbol_to_encoding_dict[symbol]) for symbol in string)
    padding = 8 - payload_bits % 8

    # Prefix is (padding - 1) ones terminated by one zero.
    bit_array = BitArray('1' * (padding - 1) + '0')

    # Second pass: append each symbol's code after the prefix.
    for symbol in string:
        bit_array += BitArray(symbol_to_encoding_dict[symbol])
    return bit_array.tobytes()
def __init__(self, buf, unitsize):
    """Wrap ``buf`` as ``count`` units of ``unitsize`` bytes each.

    Args:
        buf: object exposing ``nbytes``; stored as ``self.buf``.
        unitsize: size of one unit in bytes; must divide ``buf.nbytes``.

    Raises:
        InvalidArgument: if ``buf.nbytes`` is not a multiple of
            ``unitsize``.
    """
    if buf.nbytes % unitsize:
        raise InvalidArgument(
            "Buffer provided is not a multiple of unit size")

    bits_per_unit = unitsize * 8
    self.buf = buf
    self.count = buf.nbytes // unitsize
    # Bit view over the buffer: shape (count, bits_per_unit) with
    # strides (bits_per_unit, 1), starting at offset 0.
    self.bits = BitArray(self.buf, 0,
                         (self.count, bits_per_unit),
                         (bits_per_unit, 1))
def decode(binary_data, symbol_to_encoding_dict):
    """Decode bytes back into a string using a per-symbol code table.

    The input is expected to start with a run of one-bits terminated by a
    single zero-bit; everything after that zero is decoded with
    ``symbol_to_encoding_dict`` and the resulting symbols are joined.

    Args:
        binary_data: bytes-like data loaded via ``BitArray.frombytes``.
        symbol_to_encoding_dict: code table handed to ``BitArray.decode``.

    Returns:
        The decoded symbols concatenated into one ``str``.
    """
    bits = BitArray()
    bits.frombytes(binary_data)

    # Walk over the leading one-bits to find the zero terminator.
    terminator = 0
    while bits[terminator]:
        terminator += 1

    payload = bits[terminator + 1:]
    return ''.join(payload.decode(symbol_to_encoding_dict))
def derive_encoding(symbol_to_frequency_dict):
    """Build a prefix code table from symbol frequencies.

    Leaf nodes are created for every symbol and pushed onto a heap.  The
    two smallest nodes are repeatedly merged into a parent; every symbol
    below the first popped node gets a ``'0'`` prepended to its code and
    every symbol below the second gets a ``'1'``.  (Node ordering is
    defined by ``Node`` itself -- presumably by frequency; confirm.)

    Args:
        symbol_to_frequency_dict: non-empty mapping of symbol to
            frequency.

    Returns:
        dict mapping each symbol to a ``BitArray`` built from its code
        string, in the input's key order.

    NOTE(review): with a single symbol the merge loop never runs, so that
    symbol's code string stays empty.
    """
    assert len(symbol_to_frequency_dict) > 0

    code_strings = {symbol: '' for symbol in symbol_to_frequency_dict}

    heap = [Node().init_leaf(symbol, frequency)
            for symbol, frequency in symbol_to_frequency_dict.items()]
    heapq.heapify(heap)

    while len(heap) > 1:
        left_child = heapq.heappop(heap)
        right_child = heapq.heappop(heap)
        heapq.heappush(heap, Node().init_parent(left_child, right_child))

        # Codes are built from the leaves upwards, so the new bit is
        # prepended rather than appended.
        for bit, child in (('0', left_child), ('1', right_child)):
            for symbol in child.symbols_in_subtree():
                code_strings[symbol] = bit + code_strings[symbol]

    return {symbol: BitArray(code) for symbol, code in code_strings.items()}
def __init__(self, rect, heightmapTypes=[
        "MOTION_BLOCKING",
        "MOTION_BLOCKING_NO_LEAVES",
        "OCEAN_FLOOR",
        "WORLD_SURFACE"]):
    """Load the chunks covering ``rect`` and cache heightmaps and sections.

    Args:
        rect: ``(x, z, size_x, size_z)`` in block coordinates.
        heightmapTypes: names of the heightmaps to extract from each
            chunk's ``Level/Heightmaps`` compound.

    Populates:
        self.rect, self.chunkRect, self.heightmapTypes, self.nbtfile,
        self.heightmaps (name -> int array of shape
        ``(size_x, size_z)``) and self.sections (indexed ``[x][z][y]``).
    """
    self.rect = rect
    # Same rectangle in chunk coordinates (16 blocks per chunk):
    # first chunk x/z, then the number of chunks touched on each axis.
    self.chunkRect = (rect[0] >> 4, rect[1] >> 4,
                      ((rect[0] + rect[2] - 1) >> 4) - (rect[0] >> 4) + 1,
                      ((rect[1] + rect[3] - 1) >> 4) - (rect[1] >> 4) + 1)
    # Copy so the instance never aliases the shared default list.
    self.heightmapTypes = list(heightmapTypes)

    chunk_bytes = getChunks(*self.chunkRect, rtype='bytes')
    file_like = BytesIO(chunk_bytes)

    print("parsing NBT")
    self.nbtfile = nbt.nbt.NBTFile(buffer=file_like)

    # Position of the rect origin inside its first chunk.
    rectOffset = [rect[0] % 16, rect[1] % 16]

    # heightmaps
    self.heightmaps = {}
    for hmName in self.heightmapTypes:
        # The ``np.int`` alias was removed from NumPy; builtin ``int`` is
        # the dtype it used to stand for.
        self.heightmaps[hmName] = np.zeros((rect[2], rect[3]), dtype=int)

    # Sections are in x,z,y order!!! (reverse minecraft order :p)
    self.sections = [[[None for _ in range(16)]
                      for z in range(self.chunkRect[3])]
                     for x in range(self.chunkRect[2])]

    # heightmaps
    print("extracting heightmaps")
    for x in range(self.chunkRect[2]):
        for z in range(self.chunkRect[3]):
            # Chunks are stored row by row, x varying fastest.
            chunkID = x + z * self.chunkRect[2]
            hms = self.nbtfile['Chunks'][chunkID]['Level']['Heightmaps']
            for hmName in self.heightmapTypes:
                hmRaw = hms[hmName]
                # 9 bits per entry, one entry per column of the chunk.
                heightmapBitArray = BitArray(9, 16 * 16, hmRaw)
                heightmap = self.heightmaps[hmName]
                for cz in range(16):
                    for cx in range(16):
                        # Chunks cover whole 16x16 areas, so columns past
                        # the end of the rect raise IndexError and are
                        # skipped.
                        # NOTE(review): columns before the rect origin
                        # give negative indices, which NumPy wraps instead
                        # of rejecting; those writes appear to be
                        # overwritten by the in-range write that follows in
                        # iteration order -- confirm before reordering.
                        try:
                            heightmap[-rectOffset[0] + x * 16 + cx,
                                      -rectOffset[1] + z * 16 + cz] = \
                                heightmapBitArray.getAt(cz * 16 + cx)
                        except IndexError:
                            pass

    # sections
    print("extracting chunk sections")
    for x in range(self.chunkRect[2]):
        for z in range(self.chunkRect[3]):
            chunkID = x + z * self.chunkRect[2]
            chunkSections = \
                self.nbtfile['Chunks'][chunkID]['Level']['Sections']

            for section in chunkSections:
                y = section['Y'].value

                # Sections without block data stay ``None`` in the cache.
                if (not ('BlockStates' in section)
                        or len(section['BlockStates']) == 0):
                    continue

                palette = section['Palette']
                rawBlockStates = section['BlockStates']
                # Enough bits to index the palette, but never fewer than 4.
                bitsPerEntry = max(4, ceil(log2(len(palette))))
                blockStatesBitArray = BitArray(bitsPerEntry, 16 * 16 * 16,
                                               rawBlockStates)

                self.sections[x][z][y] = CachedSection(
                    palette, blockStatesBitArray)

    print("done")
def __init__(self, x1, z1, x2, z2,
             heightmapTypes=["MOTION_BLOCKING",
                             "MOTION_BLOCKING_NO_LEAVES",
                             "OCEAN_FLOOR",
                             "WORLD_SURFACE"]):
    """**Initialise WorldSlice with region and heightmaps**.

    Args:
        x1, z1: first corner of the region in block coordinates.
        x2, z2: opposite corner; the stored rect has size
            ``(x2 - x1, z2 - z1)``.
        heightmapTypes: names of the heightmaps to extract from each
            chunk's ``Level/Heightmaps`` compound.

    Populates:
        self.rect, self.chunkRect, self.heightmapTypes, self.nbtfile,
        self.heightmaps (name -> int array of shape
        ``(size_x, size_z)``) and self.sections (indexed ``[x][z][y]``).
    """
    self.rect = x1, z1, x2 - x1, z2 - z1
    # Same rectangle in chunk coordinates (16 blocks per chunk):
    # first chunk x/z, then the number of chunks touched on each axis.
    self.chunkRect = (self.rect[0] >> 4, self.rect[1] >> 4,
                      ((self.rect[0] + self.rect[2] - 1) >> 4)
                      - (self.rect[0] >> 4) + 1,
                      ((self.rect[1] + self.rect[3] - 1) >> 4)
                      - (self.rect[1] >> 4) + 1)
    # Copy so the instance never aliases the shared default list.
    self.heightmapTypes = list(heightmapTypes)

    t0 = time.perf_counter()
    chunk_bytes = getChunks(*self.chunkRect, rtype='bytes')

    # Flip to True to print how long each phase below takes.
    showPerf = False

    if showPerf:
        print(f"took {time.perf_counter() - t0}s")
        t0 = time.perf_counter()
    file_like = BytesIO(chunk_bytes)

    print("parsing NBT")
    self.nbtfile = nbt.nbt.NBTFile(buffer=file_like)
    if showPerf:
        print(f"took {time.perf_counter() - t0}s")
        t0 = time.perf_counter()

    # Position of the rect origin inside its first chunk.
    rectOffset = [self.rect[0] % 16, self.rect[1] % 16]

    # heightmaps
    self.heightmaps = {}
    for hmName in self.heightmapTypes:
        # The ``np.int`` alias was removed from NumPy; builtin ``int`` is
        # the dtype it used to stand for.
        self.heightmaps[hmName] = np.zeros(
            (self.rect[2], self.rect[3]), dtype=int)

    # Sections are in x,z,y order!!! (reverse minecraft order :p)
    self.sections = [[[None for _ in range(16)]
                      for z in range(self.chunkRect[3])]
                     for x in range(self.chunkRect[2])]

    # heightmaps
    print("extracting heightmaps")
    for x in range(self.chunkRect[2]):
        for z in range(self.chunkRect[3]):
            # Chunks are stored row by row, x varying fastest.
            chunkID = x + z * self.chunkRect[2]
            hms = self.nbtfile['Chunks'][chunkID]['Level']['Heightmaps']
            for hmName in self.heightmapTypes:
                hmRaw = hms[hmName]
                # 9 bits per entry, one entry per column of the chunk.
                heightmapBitArray = BitArray(9, 16 * 16, hmRaw)
                heightmap = self.heightmaps[hmName]
                for cz in range(16):
                    for cx in range(16):
                        # Chunks cover whole 16x16 areas, so columns past
                        # the end of the rect raise IndexError and are
                        # skipped.
                        # NOTE(review): columns before the rect origin
                        # give negative indices, which NumPy wraps instead
                        # of rejecting; those writes appear to be
                        # overwritten by the in-range write that follows in
                        # iteration order -- confirm before reordering.
                        try:
                            heightmap[-rectOffset[0] + x * 16 + cx,
                                      -rectOffset[1] + z * 16 + cz] \
                                = heightmapBitArray.getAt(cz * 16 + cx)
                        except IndexError:
                            pass
    if showPerf:
        print(f"took {time.perf_counter() - t0}s")
        t0 = time.perf_counter()

    # sections
    print("extracting chunk sections")
    for x in range(self.chunkRect[2]):
        for z in range(self.chunkRect[3]):
            chunkID = x + z * self.chunkRect[2]
            chunk = self.nbtfile['Chunks'][chunkID]
            chunkSections = chunk['Level']['Sections']

            for section in chunkSections:
                y = section['Y'].value

                # Sections without block data stay ``None`` in the cache.
                if (not ('BlockStates' in section)
                        or len(section['BlockStates']) == 0):
                    continue

                palette = section['Palette']
                rawBlockStates = section['BlockStates']
                # Enough bits to index the palette, but never fewer than 4.
                bitsPerEntry = max(4, ceil(log2(len(palette))))
                blockStatesBitArray = BitArray(
                    bitsPerEntry, 16 * 16 * 16, rawBlockStates)

                self.sections[x][z][y] = CachedSection(
                    palette, blockStatesBitArray)

    if showPerf:
        print(f"took {time.perf_counter() - t0}s")

    print("done")
def __init__(self, iterable=(), population=56, probes=6):
    """Create the bit set and add every item of ``iterable`` to it.

    Args:
        iterable: initial items; each one is passed to ``self.add``.
        population: number of bit positions; sizes ``self.data`` and
            defines ``self.population`` as ``range(population)``.
        probes: stored as ``self.probes`` (presumably the number of bit
            positions used per item -- confirm against ``add``).
    """
    # ``xrange`` exists only on Python 2; ``range`` is the lazy
    # equivalent on Python 3.
    self.population = range(population)
    self.probes = probes
    self.data = BitArray(population)
    for name in iterable:
        self.add(name)
# Debug script: fetch a 2x2 block of chunks starting at chunk (0, 0), dump
# the raw payload and print every block state of the first chunk.
# NOTE: the module-level name ``bytes`` shadows the builtin from here on.
bytes = getChunks(0, 0, 2, 2, rtype='bytes')
print(len(bytes))
print(bytes)
# print(getChunks(0, 0, 2, 2, rtype='text'))
print("")

file_like = BytesIO(bytes)
nbtfile = nbt.nbt.NBTFile(buffer=file_like)
print(nbtfile['Chunks'])

sections = nbtfile['Chunks'][0]['Level']['Sections']
print(sections)


def sectionIsEmpty(section):
    """Return True if ``section`` has no 'BlockStates' or an empty one."""
    if 'BlockStates' not in section:
        return True
    return len(section['BlockStates']) == 0


for section in sections:
    if sectionIsEmpty(section):
        continue

    palette = section['Palette']
    blockStates = section['BlockStates']
    # Enough bits to index the palette, but never fewer than 4.
    bitsPerEntry = max(4, ceil(log2(len(palette))))
    bitarray = BitArray(bitsPerEntry, 16*16*16, blockStates)

    def printBlock(blockStateID):
        """Print the palette entry that ``blockStateID`` refers to."""
        print(palette[blockStateID])

    # ``printBlock`` is handed to ``getAll`` as a callback.
    bitarray.getAll(printBlock)
import numpy as np
from bitarray import BitArray

# Demo: view a 10-byte buffer through BitArray with shape (10, 8) and
# strides (8, 1), starting at offset 0.
buf = np.arange(10, dtype=np.uint8)
offset, shape, strides = 0, (10, 8), (8, 1)
b = BitArray(buf, offset, shape, strides)

# Single element.
print(b[1, 0])
# Slice along the second axis.
# NOTE(review): 8:16 lies beyond the declared second-axis extent of 8 --
# confirm this is intentional.
print(b[0, 8:16])
# Full view.
print(b[:])
def huffman_compression(self, generate_encoding=False):
    """Compress ``index.csv`` into a bit-packed index plus a seek list.

    Reads ``{self.directory}/index.csv`` and writes, in the same
    directory, ``symbol_to_encoding_dict.pickle``, ``compressed_index``
    and ``compressed_seek_list.dawg``.  ``self.compressed_seek_list`` ends
    up holding the ``RecordDAWG`` that was saved.

    Args:
        generate_encoding: when true, count the characters of the index
            and derive the code table via ``Huffman.derive_encoding``;
            otherwise use the built-in table below.
    """
    index_path = f'{self.directory}/index.csv'

    if generate_encoding:
        # Count every character of the index, reading in fixed-size chunks.
        symbol_to_frequency_dict = Counter()
        chunk_size = 100000
        with self.report.measure('counting utf8 characters'):
            with open(index_path) as index_file:
                # read() returns '' at end of file, which stops iteration.
                chunks = iter(lambda: index_file.read(chunk_size), '')
                for chunk_no, chunk in enumerate(chunks, 1):
                    symbol_to_frequency_dict.update(chunk)
                    self.report.progress(
                        chunk_no,
                        f' chunks counted ({chunk_size} characters '
                        'each)',
                        100)
                # Newlines get no code of their own.
                symbol_to_frequency_dict.pop('\n', None)

        # derive huffman encoding from character counts
        with self.report.measure('deriving huffman encoding'):
            symbol_to_encoding_dict = Huffman.derive_encoding(
                symbol_to_frequency_dict)
        for key, value in symbol_to_encoding_dict.items():
            assert (len(key) == 1)
            # NOTE(review): ``symbol_to_encoding_list`` is not defined in
            # this function; it has to exist at module level or this
            # branch raises NameError -- confirm.
            symbol_to_encoding_list[ord(key[0])] = value
    else:
        # optimal encoding for guardian
        # character distribution should be similar for all datasets
        symbol_to_encoding_dict = {
            '\a': BitArray('1111'),
            ',': BitArray('001'),
            '0': BitArray('1000'),
            '1': BitArray('011'),
            '2': BitArray('010'),
            '3': BitArray('000'),
            '4': BitArray('1110'),
            '5': BitArray('1101'),
            '6': BitArray('1100'),
            '7': BitArray('1011'),
            '8': BitArray('1010'),
            '9': BitArray('1001'),
        }

    # Persist the table next to the index, whichever branch produced it.
    with open(f'{self.directory}/symbol_to_encoding_dict.pickle',
              mode='wb') as pickle_file:
        pickle.dump(symbol_to_encoding_dict, pickle_file,
                    pickle.HIGHEST_PROTOCOL)

    # save compressed index and corresponding seek_list
    with self.report.measure('saving compressed files'):
        self.compressed_seek_list = []
        with open(f'{self.directory}/compressed_index',
                  mode='wb') as compressed_index_file:
            byte_offset = 0
            index_lines = binary_read_line_generator_path(index_path)
            for line_no, orig_line in enumerate(index_lines, 1):
                # The term is the first field of the line.
                fields = next(csv.reader(
                    io.StringIO(orig_line),
                    delimiter=posting_list_separator))
                term = fields[0]
                # Only the remainder of the line is encoded.
                # NOTE(review): the +3 presumably skips the term's quoting
                # and the separator -- confirm against the index writer.
                line_without_term = orig_line[len(term) + 3:]
                encoded_line = Huffman.encode(line_without_term,
                                              symbol_to_encoding_dict)
                compressed_index_file.write(encoded_line)

                # Record where this term's bytes live in the output file.
                encoded_size = len(encoded_line)
                self.compressed_seek_list.append(
                    (term, (byte_offset, encoded_size)))
                self.report.progress(line_no, ' index lines compressed',
                                     100000)
                byte_offset += encoded_size

        self.compressed_seek_list = \
            RecordDAWG('>QQ', self.compressed_seek_list)
        self.compressed_seek_list.save(
            f'{self.directory}/compressed_seek_list.dawg')