def read_def(io_obj, daph, helper, metadata, out=None):
    """
    Read the definition levels from this page, if any.

    Returns a ``(levels, null_count)`` pair; ``levels`` is ``None`` when the
    column is required, or when the page turns out to contain no nulls.
    """
    levels = None
    nulls = 0
    if helper.is_required(metadata.path_in_schema):
        # required column: no definition levels are stored
        return levels, nulls
    max_def = helper.max_definition_level(metadata.path_in_schema)
    width = encoding.width_from_max_int(max_def)
    if width:
        # NB: num_values is index 1 for either type of page header
        levels = read_data(io_obj, parquet_thrift.Encoding.RLE,
                           daph.num_values, width, out=out)
    page_stats = daph.statistics
    chunk_stats = metadata.statistics
    if page_stats is not None and getattr(page_stats, "null_count", None) is not None:
        # page-level statistics give the null count directly
        nulls = page_stats.null_count
    elif (daph.num_values == metadata.num_values and chunk_stats
          and getattr(chunk_stats, "null_count", None) is not None):
        # single-page chunk: the chunk statistics apply to this page
        nulls = chunk_stats.null_count
    else:
        # fall back to counting entries below the maximum definition level
        nulls = daph.num_values - (levels == max_def).sum()
    if nulls == 0:
        levels = None
    return levels, nulls
def test_bitpack():
    """Round-trip random int32 values through bitpacked encode/decode."""
    for _ in range(10):
        count = np.random.randint(10, 100)
        values = np.random.randint(0, 15000, size=count, dtype=np.int32)
        width = cencoding.width_from_max_int(values.max())

        # encode into a scratch byte buffer
        scratch = np.zeros(900, dtype=np.uint8)
        enc = cencoding.NumpyIO(scratch)
        cencoding.encode_bitpacked(values, width, enc)

        # decode again from the start; first comes the varint header
        enc.seek(0)
        header = cencoding.read_unsigned_var_int(enc)
        decoded = np.zeros(300, dtype=np.int32)
        sink = cencoding.NumpyIO(decoded.view("uint8"))
        cencoding.read_bitpacked(enc, header, width, sink)

        assert (values == decoded[:count]).all()
        assert decoded[count:].sum() == 0  # zero padding
        assert sink.tell() // 8 - count < 8
def test_rle_bp():
    """Round-trip random values through the RLE / bit-packed hybrid codec."""
    for _ in range(10):
        count = np.random.randint(10, 100)
        values = np.random.randint(0, 15000, size=count, dtype=np.int32)

        decoded = np.empty(count + 5, dtype=np.int32)
        sink = cencoding.NumpyIO(decoded.view('uint8'))
        scratch = np.zeros(900, dtype=np.uint8)
        enc = cencoding.NumpyIO(scratch)
        width = cencoding.width_from_max_int(values.max())

        # without length
        cencoding.encode_rle_bp(values, width, enc)
        nbytes = enc.tell()
        enc.seek(0)

        cencoding.read_rle_bit_packed_hybrid(enc, width, length=nbytes, o=sink)
        assert (decoded[:count] == values).all()
def read_rep(io_obj, daph, helper, metadata, out=None):
    """
    Read the repetition levels from this page, if any.

    Returns ``None`` for non-nested columns (schema path of length one, or a
    zero maximum repetition level).
    """
    if len(metadata.path_in_schema) <= 1:
        return None
    max_rep = helper.max_repetition_level(metadata.path_in_schema)
    if not max_rep:
        return None
    width = encoding.width_from_max_int(max_rep)
    # NB: num_values is index 1 for either type of page header
    return read_data(io_obj, parquet_thrift.Encoding.RLE,
                     daph.num_values, width, out=out)
def read_data_page_v2(infile, schema_helper, se, data_header2, cmd, dic,
                      assign, num, use_cat, file_offset, ph, idx=None,
                      selfmade=False):
    """Decode one V2 data page directly into the output array.

    :param infile: open file, positioned at the start of the page payload
    :param schema_helper: schema traversal helper (max def/rep levels)
    :param se: schema element for this column
    :param data_header2: page header struct (DataPageHeaderV2)
    :param cmd: column metadata
    :param dic: any dictionary labels encountered
    :param assign: output array (all of it)
    :param num: offset, rows so far
    :param use_cat: output is categorical?
    :param file_offset: unused here — kept for call-site compatibility
    :param ph: overall page header (for compressed/uncompressed sizes)
    :param idx: one-element list holding the running row index for nested
        (max_rep > 0) columns; mutated in place
    :param selfmade: file was written by fastparquet? enables the
        byte-exact categorical fastpath
    :return: number of values consumed from the page

    test data "/Users/mdurant/Downloads/datapage_v2.snappy.parquet"
         a  b    c      d          e
    0  abc  1  2.0   True  [1, 2, 3]
    1  abc  2  3.0   True       None
    2  abc  3  4.0   True       None
    3  None 4  5.0  False  [1, 2, 3]
    4  abc  5  2.0   True     [1, 2]
    b is delta encoded; c is dict encoded
    """
    if data_header2.encoding not in [parquet_thrift.Encoding.PLAIN_DICTIONARY,
                                     parquet_thrift.Encoding.RLE_DICTIONARY,
                                     parquet_thrift.Encoding.RLE,
                                     parquet_thrift.Encoding.PLAIN,
                                     parquet_thrift.Encoding.DELTA_BINARY_PACKED
                                     ]:
        raise NotImplementedError
    # in V2 pages the rep/def levels are stored uncompressed before the data,
    # so subtract their lengths to get the (possibly compressed) data size
    size = (ph.compressed_page_size - data_header2.repetition_levels_byte_length -
            data_header2.definition_levels_byte_length)
    # absolute file position where the values section starts
    data = infile.tell() + data_header2.definition_levels_byte_length + \
        data_header2.repetition_levels_byte_length
    n_values = data_header2.num_values - data_header2.num_nulls

    max_rep = schema_helper.max_repetition_level(cmd.path_in_schema)
    if max_rep:
        # TODO: probably not functional
        bit_width = encoding.width_from_max_int(max_rep)
        io_obj = encoding.NumpyIO(infile.read(data_header2.repetition_levels_byte_length))
        repi = np.empty(data_header2.num_values, dtype="uint8")
        encoding.read_rle_bit_packed_hybrid(io_obj, bit_width, data_header2.num_values,
                                            encoding.NumpyIO(repi), itemsize=1)

    max_def = schema_helper.max_definition_level(cmd.path_in_schema)
    nullable = isinstance(assign.dtype, pd.core.arrays.masked.BaseMaskedDtype)
    if max_def and data_header2.num_nulls:
        bit_width = encoding.width_from_max_int(max_def)
        # not the same as read_data(), because we know the length
        io_obj = encoding.NumpyIO(infile.read(data_header2.definition_levels_byte_length))
        if nullable:
            # decode the definition levels straight into the pandas mask
            defi = assign._mask
        else:
            # TODO: in tabular data, nulls arrays could be reused for each column
            defi = np.empty(data_header2.num_values, dtype=np.uint8)
        encoding.read_rle_bit_packed_hybrid(io_obj, bit_width, data_header2.num_values,
                                            encoding.NumpyIO(defi), itemsize=1)
        if max_rep:
            # assemble_objects needs both arrays
            nulls = defi != max_def
        else:
            # turn levels into a boolean null mask in place (True == null)
            np.not_equal(defi.view("uint8"), max_def, out=defi)
            nulls = defi.view(np.bool_)
    infile.seek(data)

    # input and output element sizes match
    see = se.type_length == assign.dtype.itemsize * 8 or simple.get(se.type).itemsize == assign.dtype.itemsize
    # can read-into
    into0 = ((use_cat or converts_inplace(se) and see)
             and data_header2.num_nulls == 0
             and max_rep == 0
             and assign.dtype.kind != "O")
    # can decompress-into
    into = (data_header2.is_compressed and rev_map[cmd.codec] in decom_into
            and into0)
    if nullable:
        # operate on the raw values buffer; the mask was handled above
        assign = assign._data

    uncompressed_page_size = (ph.uncompressed_page_size -
                              data_header2.definition_levels_byte_length -
                              data_header2.repetition_levels_byte_length)
    if into0 and data_header2.encoding == parquet_thrift.Encoding.PLAIN and (
            not data_header2.is_compressed or
            cmd.codec == parquet_thrift.CompressionCodec.UNCOMPRESSED):
        # PLAIN read directly into output (a copy for remote files)
        infile.readinto(assign[num:num+n_values].view('uint8'))
        convert(assign[num:num+n_values], se)
    elif into and data_header2.encoding == parquet_thrift.Encoding.PLAIN:
        # PLAIN decompress directly into output
        decomp = decom_into[rev_map[cmd.codec]]
        decomp(infile.read(size), assign[num:num+data_header2.num_values].view('uint8'))
        convert(assign[num:num+n_values], se)
    elif data_header2.encoding == parquet_thrift.Encoding.PLAIN:
        # PLAIN, but with nulls or not in-place conversion
        codec = cmd.codec if data_header2.is_compressed else "UNCOMPRESSED"
        raw_bytes = decompress_data(infile.read(size), uncompressed_page_size, codec)
        values = read_plain(raw_bytes, cmd.type, n_values, width=se.type_length,
                            utf=se.converted_type == 0)
        if data_header2.num_nulls:
            if nullable:
                assign[num:num+data_header2.num_values][~nulls] = convert(values, se)
            else:
                assign[num:num+data_header2.num_values][nulls] = None  # or nan or nat
                assign[num:num+data_header2.num_values][~nulls] = convert(values, se)
        else:
            assign[num:num+data_header2.num_values] = convert(values, se)
    elif (use_cat and data_header2.encoding in [
            parquet_thrift.Encoding.PLAIN_DICTIONARY,
            parquet_thrift.Encoding.RLE_DICTIONARY,
    ]) or (data_header2.encoding == parquet_thrift.Encoding.RLE):
        # DICTIONARY or BOOL direct decode RLE into output (no nulls)
        codec = cmd.codec if data_header2.is_compressed else "UNCOMPRESSED"
        raw_bytes = np.empty(size, dtype='uint8')
        # TODO: small improvement possible by file.readinto and decompress_into if we
        #  don't first read raw_bytes but seek in the open file
        infile.readinto(raw_bytes)
        raw_bytes = decompress_data(raw_bytes, uncompressed_page_size, codec)
        pagefile = encoding.NumpyIO(raw_bytes)
        if data_header2.encoding != parquet_thrift.Encoding.RLE:
            # dictionary page: first byte is the index bit width, then a varint
            # TODO: check this bit; is the varint read only row byte-exact fastpath?
            bit_width = pagefile.read_byte()
            encoding.read_unsigned_var_int(pagefile)
        else:
            # RLE-encoded booleans: fixed width 1, skip the 4-byte length prefix
            bit_width = 1
            pagefile.seek(4, 1)
        if bit_width in [8, 16, 32] and selfmade:
            # special fastpath for cats
            outbytes = raw_bytes[pagefile.tell():]
            if len(outbytes) == assign[num:num+data_header2.num_values].nbytes:
                # the page bytes are exactly the categorical codes — memcpy
                assign[num:num+data_header2.num_values].view('uint8')[:] = outbytes
            else:
                if data_header2.num_nulls == 0:
                    assign[num:num+data_header2.num_values][:] = outbytes
                else:
                    assign[num:num+data_header2.num_values][~nulls] = outbytes
                    # -1 is the categorical code for "missing"
                    assign[num:num+data_header2.num_values][nulls] = -1
        else:
            if data_header2.num_nulls == 0:
                encoding.read_rle_bit_packed_hybrid(
                    pagefile,
                    bit_width,
                    uncompressed_page_size,
                    encoding.NumpyIO(assign[num:num+data_header2.num_values].view('uint8')),
                    itemsize=bit_width
                )
            else:
                # decode to a temporary, then scatter around the nulls
                temp = np.empty(data_header2.num_values, assign.dtype)
                encoding.read_rle_bit_packed_hybrid(
                    pagefile,
                    bit_width,
                    uncompressed_page_size,
                    encoding.NumpyIO(temp.view('uint8')),
                    itemsize=bit_width
                )
                if not nullable:
                    assign[num:num+data_header2.num_values][nulls] = None
                assign[num:num+data_header2.num_values][~nulls] = temp
    elif data_header2.encoding in [
            parquet_thrift.Encoding.PLAIN_DICTIONARY,
            parquet_thrift.Encoding.RLE_DICTIONARY
    ]:
        # DICTIONARY to be de-referenced, with or without nulls
        codec = cmd.codec if data_header2.is_compressed else "UNCOMPRESSED"
        compressed_bytes = infile.read(size)
        raw_bytes = decompress_data(compressed_bytes, uncompressed_page_size, codec)
        out = np.empty(n_values, dtype='uint8')
        pagefile = encoding.NumpyIO(raw_bytes)
        bit_width = pagefile.read_byte()
        encoding.read_rle_bit_packed_hybrid(
            pagefile,
            bit_width,
            uncompressed_page_size,
            encoding.NumpyIO(out),
            itemsize=1
        )
        if max_rep:
            # num_rows got filled, but consumed num_values data entries
            encoding._assemble_objects(
                assign[idx[0]:idx[0]+data_header2.num_rows], defi, repi, out, dic,
                d=True, null=True, null_val=False, max_defi=max_def, prev_i=0
            )
            idx[0] += data_header2.num_rows
        elif data_header2.num_nulls:
            if not nullable and assign.dtype != "O":
                assign[num:num+data_header2.num_values][nulls] = None  # may be unnecessary
            assign[num:num+data_header2.num_values][~nulls] = dic[out]
        else:
            assign[num:num+data_header2.num_values] = dic[out]
    elif data_header2.encoding == parquet_thrift.Encoding.DELTA_BINARY_PACKED:
        assert data_header2.num_nulls == 0, "null delta-int not implemented"
        codec = cmd.codec if data_header2.is_compressed else "UNCOMPRESSED"
        raw_bytes = decompress_data(infile.read(size), uncompressed_page_size, codec)
        if converts_inplace(se):
            # unpack straight into the output buffer, convert in place
            encoding.delta_binary_unpack(
                encoding.NumpyIO(raw_bytes),
                encoding.NumpyIO(assign[num:num+data_header2.num_values].view('uint8'))
            )
            convert(assign[num:num+data_header2.num_values], se)
        else:
            # unpack to int32 scratch, convert on assignment
            out = np.empty(data_header2.num_values, dtype='int32')
            encoding.delta_binary_unpack(
                encoding.NumpyIO(raw_bytes),
                encoding.NumpyIO(out.view('uint8'))
            )
            assign[num:num+data_header2.num_values] = convert(out, se)
    else:
        # codec = cmd.codec if data_header2.is_compressed else "UNCOMPRESSED"
        # raw_bytes = decompress_data(infile.read(size),
        #                             ph.uncompressed_page_size, codec)
        raise NotImplementedError
    return data_header2.num_values
def testWidths():
    """Test all possible widths for a single byte."""
    cases = [
        (0, 0),
        (1, 1),
        (2, 2), (2, 3),
        (3, 4), (3, 5), (3, 6), (3, 7),
        (4, 8), (4, 15),
        (5, 16), (5, 31),
        (6, 32), (6, 63),
        (7, 64), (7, 127),
        (8, 128), (8, 255),
    ]
    for expected, max_int in cases:
        assert expected == cencoding.width_from_max_int(max_int)