def OutputBitList(bit_list, stream):
  """Writes a list of bits to |stream|, packed eight bits per byte.

  Within each byte the first bit of a group lands in the least significant
  position and later bits occupy increasingly significant positions.

  Args:
    bit_list: sequence of truthy/falsy flags; its length must be a multiple
        of 8.
    stream: object with a write() method that receives one packed byte per
        call.
  """
  # The caller is responsible for padding to a byte boundary.
  assert len(bit_list) % 8 == 0

  # NOTE(review): SplitChunk presumably yields consecutive groups of 8 bits;
  # confirm against code_generator_util.
  for octet in code_generator_util.SplitChunk(bit_list, 8):
    # Every set flag contributes a distinct power of two, so summing the
    # shifted ones is equivalent to OR-ing them together (LSB first).
    packed = sum(1 << position for position, flag in enumerate(octet) if flag)
    stream.write(struct.pack('B', packed))
def BuildBinaryData(matrix, mode_value_list, use_1byte_cost):
  """Serializes the connection matrix into its compressed binary form.

  Args:
    matrix: sequence of rows. Each row is a sequence of cells where a cell is
        either None (treated as empty) or a cost value. The number of rows is
        written to the header as both dimensions.
    mode_value_list: values written right after the header, each as an
        unsigned little-endian 16-bit integer (so each must be in 0..65535).
    use_1byte_cost: if true, every stored cost is written as a single byte
        after being divided by RESOLUTION_FOR_1BYTE (INVALID_COST is mapped
        to INVALID_1BYTE_COST); otherwise costs are written as 2 bytes each.

  Returns:
    The serialized data, as returned by the stream's getvalue().
  """
  # To compress the connection data, we use two-level succinct bit vector.
  #
  # The basic idea to compress the rid-lid matrix is compressing each row as
  # follows:
  # find the mode value of the row, and set the cells containing the value
  # empty, thus we get a sparse array.
  # We can compress sparse array by using succinct bit vector.
  # (Please see also storage/louds/simple_succinct_bit_vector_index and
  # storage/louds/bit_vector_based_array.)
  # In addition, we compress the bit vector, too. Fortunately the distribution
  # of bits is biased, so we group consecutive 8-bits and create another
  # bit vector, named chunk-bits;
  # - if no bits are 1, the corresponding bit is 0, otherwise 1.
  # By using the bit vector, we can compact the original bit vector by skipping
  # consecutive eight 0-bits. We can calculate the actual bit position in
  # the compact bit vector by using Rank1 operation on chunk-bits.
  #
  # The file format is as follows:
  # FILE_MAGIC (\xAB\xCD): 2bytes
  # Resolution: 2bytes
  # Num rids: 2bytes
  # Num lids: 2bytes
  # A list of mode values: 2bytes * rids (aligned to 32bits)
  # A list of row data.
  #
  # The row data format is as follows:
  # The size of compact bits in bytes: 2bytes
  # The size of values in bytes: 2bytes
  # chunk_bits, compact_bits, followed by values.
  if use_1byte_cost:
    resolution = RESOLUTION_FOR_1BYTE
  else:
    resolution = 1
  stream = StringIO.StringIO()

  # Output header.
  stream.write(FILE_MAGIC)
  matrix_size = len(matrix)
  assert 0 <= matrix_size <= 65535
  stream.write(struct.pack('<HHH', resolution, matrix_size, matrix_size))

  # Output mode value list.
  for value in mode_value_list:
    # '<H' holds at most 65535; the previous bound of 65536 let an
    # unpackable value slip past the check.
    assert 0 <= value <= 65535
    stream.write(struct.pack('<H', value))

  # 4 bytes alignment. The header is 8 bytes, so only an odd number of 2-byte
  # mode values leaves the stream misaligned.
  if len(mode_value_list) % 2:
    stream.write('\x00\x00')

  # Process each row:
  for row in matrix:
    chunk_bits = []
    compact_bits = []
    values = []

    for chunk in code_generator_util.SplitChunk(row, 8):
      if all(cost is None for cost in chunk):
        # All bits are 0, so output 0-chunk bit.
        chunk_bits.append(False)
        continue

      chunk_bits.append(True)
      for cost in chunk:
        if cost is None:
          compact_bits.append(False)
          continue

        compact_bits.append(True)
        if use_1byte_cost:
          if cost == INVALID_COST:
            cost = INVALID_1BYTE_COST
          else:
            # Floor division keeps an integer cost integral regardless of
            # whether true division is in effect.
            cost //= resolution
            # A regular cost must not collide with the invalid marker.
            assert cost != INVALID_1BYTE_COST
        values.append(cost)

    # 4 bytes alignment.
    while len(chunk_bits) % 32:
      chunk_bits.append(False)
    while len(compact_bits) % 32:
      compact_bits.append(False)
    if use_1byte_cost:
      while len(values) % 4:
        values.append(0)
      values_size = len(values)
    else:
      while len(values) % 2:
        values.append(0)
      values_size = len(values) * 2

    # Output the bits for a row. compact_bits is padded to a multiple of 32
    # above, so the floor division by 8 is exact.
    stream.write(struct.pack('<HH', len(compact_bits) // 8, values_size))
    OutputBitList(chunk_bits, stream)
    OutputBitList(compact_bits, stream)
    if use_1byte_cost:
      for value in values:
        assert 0 <= value <= 255
        stream.write(struct.pack('<B', value))
    else:
      for value in values:
        assert 0 <= value <= 65535
        stream.write(struct.pack('<H', value))

  return stream.getvalue()