def testSingleByte():
    """Test reading a single byte value."""
    buf = np.frombuffer(struct.pack(b"<i", 0x7F), np.uint8)
    fo = cencoding.NumpyIO(buf)
    out = cencoding.read_unsigned_var_int(fo)
    assert 0x7F == out
    assert fo.tell() == 1

def read_data_page(f, helper, header, metadata, skip_nulls=False,
                   selfmade=False):
    """Read a data page: definitions, repetitions, values (in order).

    Only values are guaranteed to exist, e.g., for a top-level, required
    field.
    """
    daph = header.data_page_header
    raw_bytes = _read_page(f, header, metadata)
    io_obj = encoding.NumpyIO(raw_bytes)

    repetition_levels = read_rep(io_obj, daph, helper, metadata)

    if skip_nulls and not helper.is_required(metadata.path_in_schema):
        num_nulls = 0
        definition_levels = None
        skip_definition_bytes(io_obj, daph.num_values)
    else:
        definition_levels, num_nulls = read_def(io_obj, daph, helper, metadata)

    nval = daph.num_values - num_nulls
    se = helper.schema_element(metadata.path_in_schema)
    if daph.encoding == parquet_thrift.Encoding.PLAIN:
        width = se.type_length
        values = read_plain(io_obj.read(),
                            metadata.type,
                            int(daph.num_values - num_nulls),
                            width=width,
                            utf=se.converted_type == 0)
    elif daph.encoding in [parquet_thrift.Encoding.PLAIN_DICTIONARY,
                           parquet_thrift.Encoding.RLE_DICTIONARY,
                           parquet_thrift.Encoding.RLE]:
        # bit_width is stored as a single byte.
        if daph.encoding == parquet_thrift.Encoding.RLE:
            bit_width = se.type_length
        else:
            bit_width = io_obj.read_byte()
        if bit_width in [8, 16, 32] and selfmade:
            num = (encoding.read_unsigned_var_int(io_obj) >> 1) * 8
            values = np.frombuffer(io_obj.read(num * bit_width // 8),
                                   dtype='int%i' % bit_width)
        elif bit_width:
            if bit_width > 8:
                values = np.empty(daph.num_values - num_nulls, dtype=np.int32)
                o = encoding.NumpyIO(values.view('uint8'))
                encoding.read_rle_bit_packed_hybrid(
                    io_obj, bit_width, io_obj.len - io_obj.tell(), o=o,
                    itemsize=4)
            else:
                values = np.empty(daph.num_values - num_nulls, dtype=np.uint8)
                o = encoding.NumpyIO(values)
                encoding.read_rle_bit_packed_hybrid(
                    io_obj, bit_width, io_obj.len - io_obj.tell(), o=o,
                    itemsize=1)
            values = values.data[:nval]
        else:
            values = np.zeros(nval, dtype=np.int8)
    else:
        raise NotImplementedError('Encoding %s' % daph.encoding)
    return definition_levels, repetition_levels, values[:nval]

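# The ``(read_unsigned_var_int(io_obj) >> 1) * 8`` fastpath in read_data_page
# leans on the layout of a Parquet RLE/bit-packed hybrid run header: the low
# bit of the leading varint selects the run type, and the remaining bits give
# the run length. A minimal sketch of that header decode (the name is
# illustrative; this is not the cencoding implementation):
def _parse_hybrid_run_header(header):
    """Split a hybrid run header varint into (run_type, value_count)."""
    if header & 1:
        # bit-packed run: length is counted in groups of 8 values
        return "bit-packed", (header >> 1) * 8
    # RLE run: length is the repeat count of a single value
    return "rle", header >> 1
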
def testFourByte():
    """Test reading a four byte value."""
    buf = np.frombuffer(struct.pack(b"<BBBB", 0xFF, 0xFF, 0xFF, 0x7F), np.uint8)
    fo = cencoding.NumpyIO(buf)
    out = cencoding.read_unsigned_var_int(fo)
    assert 0x0FFFFFFF == out
    assert fo.tell() == 4

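# For reference, a minimal pure-Python sketch of the unsigned-varint
# (ULEB128) decode exercised by testSingleByte and testFourByte: each byte
# carries seven payload bits, and a set high bit flags a continuation byte.
# This is a stand-in for illustration, not the cencoding implementation.
def _uvarint_reference(data):
    """Return (value, bytes_consumed) for a varint at the start of data."""
    result = 0
    shift = 0
    for i, byte in enumerate(data):
        result |= (byte & 0x7F) << shift
        if not byte & 0x80:  # high bit clear: this was the final byte
            return result, i + 1
        shift += 7
    raise ValueError("truncated varint")

# matches the assertions above:
#   _uvarint_reference(b"\x7f") == (0x7F, 1)
#   _uvarint_reference(b"\xff\xff\xff\x7f") == (0x0FFFFFFF, 4)
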
def test_uvarint():
    """Round-trip random values through the unsigned-varint codec."""
    values = np.random.randint(0, 15000, size=100)
    buf = np.zeros(30, dtype=np.uint8)
    o = cencoding.NumpyIO(buf)
    for v in values:
        o.seek(0)
        cencoding.encode_unsigned_varint(v, o)
        o.seek(0)
        out = cencoding.read_unsigned_var_int(o)
        assert v == out

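# The matching pure-Python encoder sketch for the round-trip above (an
# illustration of the ULEB128 rules, not cencoding.encode_unsigned_varint
# itself): emit seven bits per byte, setting the high bit while more remain.
def _uvarint_encode_reference(value):
    out = bytearray()
    while True:
        byte = value & 0x7F
        value >>= 7
        if value:
            out.append(byte | 0x80)  # continuation bit: more bytes follow
        else:
            out.append(byte)
            return bytes(out)

# e.g. _uvarint_encode_reference(300) == b"\xac\x02"
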
def test_bitpack():
    """Round-trip random arrays through the bit-packing codec."""
    for _ in range(10):
        values = np.random.randint(0, 15000,
                                   size=np.random.randint(10, 100),
                                   dtype=np.int32)
        width = cencoding.width_from_max_int(values.max())
        buf = np.zeros(900, dtype=np.uint8)
        o = cencoding.NumpyIO(buf)
        cencoding.encode_bitpacked(values, width, o)
        o.seek(0)
        head = cencoding.read_unsigned_var_int(o)
        buf2 = np.zeros(300, dtype=np.int32)
        out = cencoding.NumpyIO(buf2.view("uint8"))
        cencoding.read_bitpacked(o, head, width, out)
        assert (values == buf2[:len(values)]).all()
        assert buf2[len(values):].sum() == 0  # zero padding
        assert out.tell() // 8 - len(values) < 8

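# Worked example of the width computation in test_bitpack: the pack width is
# the bit length of the largest value, so values below 15000 need at most 14
# bits, and 100 packed values of width 14 occupy ceil(100 * 14 / 8) = 175
# bytes, comfortably inside the 900-byte buffer. A hypothetical stand-in for
# width_from_max_int (assumed equivalent for non-negative ints):
def _width_reference(max_int):
    return max_int.bit_length()  # e.g. 7 -> 3, 8 -> 4, 14999 -> 14
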
def read_data_page_v2(infile, schema_helper, se, data_header2, cmd,
                      dic, assign, num, use_cat, file_offset, ph, idx=None,
                      selfmade=False):
    """
    :param infile: open file
    :param schema_helper:
    :param se: schema element
    :param data_header2: page header struct
    :param cmd: column metadata
    :param dic: any dictionary labels encountered
    :param assign: output array (all of it)
    :param num: offset, rows so far
    :param use_cat: output is categorical?
    :return: number of values in the page (data_header2.num_values)

    test data "/Users/mdurant/Downloads/datapage_v2.snappy.parquet"
          a  b    c      d          e
    0   abc  1  2.0   True  [1, 2, 3]
    1   abc  2  3.0   True       None
    2   abc  3  4.0   True       None
    3  None  4  5.0  False  [1, 2, 3]
    4   abc  5  2.0   True     [1, 2]
    b is delta encoded; c is dict encoded
    """
    if data_header2.encoding not in [
        parquet_thrift.Encoding.PLAIN_DICTIONARY,
        parquet_thrift.Encoding.RLE_DICTIONARY,
        parquet_thrift.Encoding.RLE,
        parquet_thrift.Encoding.PLAIN,
        parquet_thrift.Encoding.DELTA_BINARY_PACKED
    ]:
        raise NotImplementedError
    size = (ph.compressed_page_size
            - data_header2.repetition_levels_byte_length
            - data_header2.definition_levels_byte_length)
    data = (infile.tell() + data_header2.definition_levels_byte_length
            + data_header2.repetition_levels_byte_length)
    n_values = data_header2.num_values - data_header2.num_nulls

    max_rep = schema_helper.max_repetition_level(cmd.path_in_schema)
    if max_rep:
        # TODO: probably not functional
        bit_width = encoding.width_from_max_int(max_rep)
        io_obj = encoding.NumpyIO(
            infile.read(data_header2.repetition_levels_byte_length))
        repi = np.empty(data_header2.num_values, dtype="uint8")
        encoding.read_rle_bit_packed_hybrid(
            io_obj, bit_width, data_header2.num_values,
            encoding.NumpyIO(repi), itemsize=1)

    max_def = schema_helper.max_definition_level(cmd.path_in_schema)
    nullable = isinstance(assign.dtype, pd.core.arrays.masked.BaseMaskedDtype)
    if max_def and data_header2.num_nulls:
        bit_width = encoding.width_from_max_int(max_def)
        # not the same as read_data(), because we know the length
        io_obj = encoding.NumpyIO(
            infile.read(data_header2.definition_levels_byte_length))
        if nullable:
            defi = assign._mask
        else:
            # TODO: in tabular data, nulls arrays could be reused for each column
            defi = np.empty(data_header2.num_values, dtype=np.uint8)
        encoding.read_rle_bit_packed_hybrid(
            io_obj, bit_width, data_header2.num_values,
            encoding.NumpyIO(defi), itemsize=1)
        if max_rep:
            # assemble_objects needs both arrays
            nulls = defi != max_def
        else:
            np.not_equal(defi.view("uint8"), max_def, out=defi)
            nulls = defi.view(np.bool_)
    infile.seek(data)

    # input and output element sizes match
    see = (se.type_length == assign.dtype.itemsize * 8
           or simple.get(se.type).itemsize == assign.dtype.itemsize)
    # can read-into
    into0 = ((use_cat or converts_inplace(se) and see)
             and data_header2.num_nulls == 0
             and max_rep == 0 and assign.dtype.kind != "O")
    # can decompress-into
    into = (data_header2.is_compressed and rev_map[cmd.codec] in decom_into
            and into0)
    if nullable:
        assign = assign._data

    uncompressed_page_size = (ph.uncompressed_page_size
                              - data_header2.definition_levels_byte_length
                              - data_header2.repetition_levels_byte_length)
    if into0 and data_header2.encoding == parquet_thrift.Encoding.PLAIN and (
            not data_header2.is_compressed
            or cmd.codec == parquet_thrift.CompressionCodec.UNCOMPRESSED):
        # PLAIN read directly into output (a copy for remote files)
        infile.readinto(assign[num:num+n_values].view('uint8'))
        convert(assign[num:num+n_values], se)
    elif into and data_header2.encoding == parquet_thrift.Encoding.PLAIN:
        # PLAIN decompress directly into output
        decomp = decom_into[rev_map[cmd.codec]]
        decomp(infile.read(size),
               assign[num:num+data_header2.num_values].view('uint8'))
        convert(assign[num:num+n_values], se)
    elif data_header2.encoding == parquet_thrift.Encoding.PLAIN:
        # PLAIN, but with nulls or no in-place conversion
        codec = cmd.codec if data_header2.is_compressed else "UNCOMPRESSED"
        raw_bytes = decompress_data(infile.read(size),
                                    uncompressed_page_size, codec)
        values = read_plain(raw_bytes, cmd.type, n_values,
                            width=se.type_length,
                            utf=se.converted_type == 0)
        if data_header2.num_nulls:
            if nullable:
                assign[num:num+data_header2.num_values][~nulls] = convert(values, se)
            else:
                assign[num:num+data_header2.num_values][nulls] = None  # or nan or nat
                assign[num:num+data_header2.num_values][~nulls] = convert(values, se)
        else:
            assign[num:num+data_header2.num_values] = convert(values, se)
    elif (use_cat and data_header2.encoding in [
            parquet_thrift.Encoding.PLAIN_DICTIONARY,
            parquet_thrift.Encoding.RLE_DICTIONARY,
    ]) or (data_header2.encoding == parquet_thrift.Encoding.RLE):
        # DICTIONARY or BOOL direct decode RLE into output (no nulls)
        codec = cmd.codec if data_header2.is_compressed else "UNCOMPRESSED"
        raw_bytes = np.empty(size, dtype='uint8')
        # TODO: small improvement possible by file.readinto and decompress_into
        #  if we don't first read raw_bytes but seek in the open file
        infile.readinto(raw_bytes)
        raw_bytes = decompress_data(raw_bytes, uncompressed_page_size, codec)
        pagefile = encoding.NumpyIO(raw_bytes)
        if data_header2.encoding != parquet_thrift.Encoding.RLE:
            # TODO: check this bit; is the varint read only row byte-exact fastpath?
            bit_width = pagefile.read_byte()
            encoding.read_unsigned_var_int(pagefile)
        else:
            bit_width = 1
            pagefile.seek(4, 1)
        if bit_width in [8, 16, 32] and selfmade:
            # special fastpath for cats
            outbytes = raw_bytes[pagefile.tell():]
            if len(outbytes) == assign[num:num+data_header2.num_values].nbytes:
                assign[num:num+data_header2.num_values].view('uint8')[:] = outbytes
            else:
                if data_header2.num_nulls == 0:
                    assign[num:num+data_header2.num_values][:] = outbytes
                else:
                    assign[num:num+data_header2.num_values][~nulls] = outbytes
                    assign[num:num+data_header2.num_values][nulls] = -1
        else:
            if data_header2.num_nulls == 0:
                encoding.read_rle_bit_packed_hybrid(
                    pagefile,
                    bit_width,
                    uncompressed_page_size,
                    encoding.NumpyIO(assign[num:num+data_header2.num_values].view('uint8')),
                    itemsize=bit_width
                )
            else:
                temp = np.empty(data_header2.num_values, assign.dtype)
                encoding.read_rle_bit_packed_hybrid(
                    pagefile,
                    bit_width,
                    uncompressed_page_size,
                    encoding.NumpyIO(temp.view('uint8')),
                    itemsize=bit_width
                )
                if not nullable:
                    assign[num:num+data_header2.num_values][nulls] = None
                assign[num:num+data_header2.num_values][~nulls] = temp
    elif data_header2.encoding in [
            parquet_thrift.Encoding.PLAIN_DICTIONARY,
            parquet_thrift.Encoding.RLE_DICTIONARY
    ]:
        # DICTIONARY to be de-referenced, with or without nulls
        codec = cmd.codec if data_header2.is_compressed else "UNCOMPRESSED"
        compressed_bytes = infile.read(size)
        raw_bytes = decompress_data(compressed_bytes,
                                    uncompressed_page_size, codec)
        out = np.empty(n_values, dtype='uint8')
        pagefile = encoding.NumpyIO(raw_bytes)
        bit_width = pagefile.read_byte()
        encoding.read_rle_bit_packed_hybrid(
            pagefile,
            bit_width,
            uncompressed_page_size,
            encoding.NumpyIO(out),
            itemsize=1
        )
        if max_rep:
            # num_rows got filled, but consumed num_values data entries
            encoding._assemble_objects(
                assign[idx[0]:idx[0]+data_header2.num_rows],
                defi, repi, out, dic, d=True, null=True,
                null_val=False, max_defi=max_def, prev_i=0
            )
            idx[0] += data_header2.num_rows
        elif data_header2.num_nulls:
            if not nullable and assign.dtype != "O":
                assign[num:num+data_header2.num_values][nulls] = None  # may be unnecessary
            assign[num:num+data_header2.num_values][~nulls] = dic[out]
        else:
            assign[num:num+data_header2.num_values] = dic[out]
    elif data_header2.encoding == parquet_thrift.Encoding.DELTA_BINARY_PACKED:
        assert data_header2.num_nulls == 0, "null delta-int not implemented"
        codec = cmd.codec if data_header2.is_compressed else "UNCOMPRESSED"
        raw_bytes = decompress_data(infile.read(size),
                                    uncompressed_page_size, codec)
        if converts_inplace(se):
            encoding.delta_binary_unpack(
                encoding.NumpyIO(raw_bytes),
                encoding.NumpyIO(assign[num:num+data_header2.num_values].view('uint8'))
            )
            convert(assign[num:num+data_header2.num_values], se)
        else:
            out = np.empty(data_header2.num_values, dtype='int32')
            encoding.delta_binary_unpack(
                encoding.NumpyIO(raw_bytes),
                encoding.NumpyIO(out.view('uint8'))
            )
            assign[num:num+data_header2.num_values] = convert(out, se)
    else:
        # codec = cmd.codec if data_header2.is_compressed else "UNCOMPRESSED"
        # raw_bytes = decompress_data(infile.read(size),
        #                             ph.uncompressed_page_size, codec)
        raise NotImplementedError
    return data_header2.num_values
