Code example #1
File: core.py Project: joseignaciorc/fastparquet
def read_def(io_obj, daph, helper, metadata, out=None):
    """
    Read the definition levels from this page, if any.
    """
    definition_levels = None
    num_nulls = 0
    if not helper.is_required(metadata.path_in_schema):
        max_definition_level = helper.max_definition_level(
            metadata.path_in_schema)
        bit_width = encoding.width_from_max_int(max_definition_level)
        if bit_width:
            # NB: num_values is index 1 for either type of page header
            definition_levels = read_data(
                    io_obj, parquet_thrift.Encoding.RLE,
                    daph.num_values, bit_width, out=out)
        if (
                daph.statistics is not None
                and getattr(daph.statistics, "null_count", None) is not None
        ):
            num_nulls = daph.statistics.null_count
        elif (
                daph.num_values == metadata.num_values
                and metadata.statistics
                and getattr(metadata.statistics, "null_count", None) is not None
        ):
            num_nulls = metadata.statistics.null_count
        else:
            num_nulls = daph.num_values - (definition_levels ==
                                               max_definition_level).sum()
        if num_nulls == 0:
            definition_levels = None
    return definition_levels, num_nulls
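In the fallback branch above, the null count is derived from the levels themselves: a value is present only when its definition level equals `max_definition_level`, so subtracting the count of such levels from `num_values` gives the nulls. A minimal NumPy sketch of that arithmetic, with made-up levels for an optional, non-nested column (`max_definition_level == 1`):

import numpy as np

# Hypothetical definition levels for six values of an optional, non-nested
# column: level 1 = value present, level 0 = null.
definition_levels = np.array([1, 0, 1, 1, 0, 1], dtype=np.uint8)
max_definition_level = 1

num_nulls = len(definition_levels) - (definition_levels == max_definition_level).sum()
assert num_nulls == 2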
Code example #2
def test_bitpack():
    for _ in range(10):
        values = np.random.randint(0, 15000, size=np.random.randint(10, 100),
                                   dtype=np.int32)
        width = cencoding.width_from_max_int(values.max())
        buf = np.zeros(900, dtype=np.uint8)
        o = cencoding.NumpyIO(buf)
        cencoding.encode_bitpacked(values, width, o)
        o.seek(0)
        head = cencoding.read_unsigned_var_int(o)
        buf2 = np.zeros(300, dtype=np.int32)
        out = cencoding.NumpyIO(buf2.view("uint8"))
        cencoding.read_bitpacked(o, head, width, out)
        assert (values == buf2[:len(values)]).all()
        assert buf2[len(values):].sum() == 0  # zero padding
        assert out.tell() // 8 - len(values) < 8
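The round-trip above depends on Parquet's bit-packing order: each value occupies `width` bits written least-significant-bit first into a little-endian byte stream, and the final byte is zero-padded (hence the padding assertion). A pure-Python sketch of that packing; `pack_bits` is a hypothetical helper, not part of `cencoding`:

def pack_bits(values, width):
    # Pack each value into `width` bits, LSB first, per Parquet's bit-packing order.
    acc, nbits, out = 0, 0, bytearray()
    for v in values:
        acc |= (int(v) & ((1 << width) - 1)) << nbits
        nbits += width
        while nbits >= 8:
            out.append(acc & 0xFF)
            acc >>= 8
            nbits -= 8
    if nbits:
        out.append(acc & 0xFF)  # final partial byte, zero-padded
    return bytes(out)

assert pack_bits([1, 2, 3], 2) == bytes([0b00111001])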
Code example #3
def test_rle_bp():
    for _ in range(10):
        values = np.random.randint(0, 15000, size=np.random.randint(10, 100),
                                   dtype=np.int32)
        buf = np.empty(len(values) + 5, dtype=np.int32)
        out = cencoding.NumpyIO(buf.view('uint8'))
        buf2 = np.zeros(900, dtype=np.uint8)
        o = cencoding.NumpyIO(buf2)
        width = cencoding.width_from_max_int(values.max())

        # without length
        cencoding.encode_rle_bp(values, width, o)
        length = o.tell()
        o.seek(0)

        cencoding.read_rle_bit_packed_hybrid(o, width, length=length, o=out)
        assert (buf[:len(values)] == values).all()
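`read_rle_bit_packed_hybrid` parses the RLE/bit-packed hybrid format: each run begins with a varint header whose low bit selects the run type (0 = RLE, 1 = bit-packed), and an RLE run stores its repeat count in `header >> 1` followed by one value in ceil(width/8) bytes. A sketch decoding a single RLE run under those spec rules; `read_one_rle_run` is a hypothetical helper, not the `cencoding` API:

def read_one_rle_run(data, pos, width):
    # Varint run header: low bit 0 -> RLE run, header >> 1 is the repeat count.
    header, shift = 0, 0
    while True:
        byte = data[pos]
        pos += 1
        header |= (byte & 0x7F) << shift
        shift += 7
        if not byte & 0x80:
            break
    assert header & 1 == 0, "low bit 1 would mean a bit-packed run"
    count = header >> 1
    nbytes = (width + 7) // 8  # repeated value stored in ceil(width/8) bytes
    value = int.from_bytes(data[pos:pos + nbytes], "little")
    return [value] * count, pos + nbytes

values, _ = read_one_rle_run(bytes([0x06, 0x05]), 0, 3)  # header 6 -> 3 repeats of 5
assert values == [5, 5, 5]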
Code example #4
File: core.py Project: joseignaciorc/fastparquet
def read_rep(io_obj, daph, helper, metadata, out=None):
    """
    Read the repetition levels from this page, if any.
    """
    repetition_levels = None
    if len(metadata.path_in_schema) > 1:
        max_repetition_level = helper.max_repetition_level(
            metadata.path_in_schema)
        if max_repetition_level == 0:
            repetition_levels = None
        else:
            bit_width = encoding.width_from_max_int(max_repetition_level)
            # NB: num_values is index 1 for either type of page header
            repetition_levels = read_data(io_obj, parquet_thrift.Encoding.RLE,
                                          daph.num_values,
                                          bit_width,
                                          out=out)
    return repetition_levels
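Repetition levels exist only for nested columns (`len(path_in_schema) > 1`). For a one-level LIST column with `max_repetition_level == 1`, such as column `e` in the next example, the level is 0 at the start of each row and 1 for subsequent items within the same list. An illustrative sketch of that Dremel-style encoding (`rep_levels` is a hypothetical helper):

def rep_levels(rows):
    # Level 0 starts a new record; level 1 continues the current list.
    levels = []
    for row in rows:
        if not row:              # a null/empty row still contributes one level-0 slot
            levels.append(0)
        else:
            levels.append(0)
            levels.extend([1] * (len(row) - 1))
    return levels

assert rep_levels([[1, 2, 3], None, None, [1, 2, 3], [1, 2]]) == \
    [0, 1, 1, 0, 0, 0, 1, 1, 0, 1]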
Code example #5
File: core.py Project: joseignaciorc/fastparquet
def read_data_page_v2(infile, schema_helper, se, data_header2, cmd,
                      dic, assign, num, use_cat, file_offset, ph, idx=None,
                      selfmade=False):
    """
    :param infile: open file
    :param schema_helper: schema traversal helper (provides max definition/repetition levels)
    :param se: schema element
    :param data_header2: page header struct
    :param cmd: column metadata
    :param dic: any dictionary labels encountered
    :param assign: output array (all of it)
    :param num: offset, rows so far
    :param use_cat: output is categorical?
    :param file_offset: byte offset of this page in the file (unused in this body)
    :param ph: page header struct (source of the compressed/uncompressed sizes)
    :param idx: mutable row-position cursor, used when assembling nested output
    :param selfmade: whether fastparquet wrote this file (enables the byte-exact fastpath)
    :return: None

    test data "/Users/mdurant/Downloads/datapage_v2.snappy.parquet"
          a  b    c      d          e
    0   abc  1  2.0   True  [1, 2, 3]
    1   abc  2  3.0   True       None
    2   abc  3  4.0   True       None
    3  None  4  5.0  False  [1, 2, 3]
    4   abc  5  2.0   True     [1, 2]

    b is delta encoded; c is dict encoded

    """
    if data_header2.encoding not in [parquet_thrift.Encoding.PLAIN_DICTIONARY,
                                     parquet_thrift.Encoding.RLE_DICTIONARY,
                                     parquet_thrift.Encoding.RLE,
                                     parquet_thrift.Encoding.PLAIN,
                                     parquet_thrift.Encoding.DELTA_BINARY_PACKED
                                     ]:
        raise NotImplementedError
    size = (ph.compressed_page_size - data_header2.repetition_levels_byte_length -
            data_header2.definition_levels_byte_length)
    data = infile.tell() + data_header2.definition_levels_byte_length + data_header2.repetition_levels_byte_length
    n_values = data_header2.num_values - data_header2.num_nulls

    max_rep = schema_helper.max_repetition_level(cmd.path_in_schema)
    if max_rep:
        # TODO: probably not functional
        bit_width = encoding.width_from_max_int(max_rep)
        io_obj = encoding.NumpyIO(infile.read(data_header2.repetition_levels_byte_length))
        repi = np.empty(data_header2.num_values, dtype="uint8")
        encoding.read_rle_bit_packed_hybrid(io_obj, bit_width, data_header2.num_values,
                                            encoding.NumpyIO(repi), itemsize=1)

    max_def = schema_helper.max_definition_level(cmd.path_in_schema)

    nullable = isinstance(assign.dtype, pd.core.arrays.masked.BaseMaskedDtype)
    if max_def and data_header2.num_nulls:
        bit_width = encoding.width_from_max_int(max_def)
        # not the same as read_data(), because we know the length
        io_obj = encoding.NumpyIO(infile.read(data_header2.definition_levels_byte_length))
        if nullable:
            defi = assign._mask
        else:
            # TODO: in tabular data, nulls arrays could be reused for each column
            defi = np.empty(data_header2.num_values, dtype=np.uint8)
        encoding.read_rle_bit_packed_hybrid(io_obj, bit_width, data_header2.num_values,
                                            encoding.NumpyIO(defi), itemsize=1)
        if max_rep:
            # assemble_objects needs both arrays
            nulls = defi != max_def
        else:
            np.not_equal(defi.view("uint8"), max_def, out=defi)
            nulls = defi.view(np.bool_)
    infile.seek(data)

    # input and output element sizes match
    see = se.type_length == assign.dtype.itemsize * 8 or simple.get(se.type).itemsize == assign.dtype.itemsize
    # can read-into
    into0 = ((use_cat or converts_inplace(se) and see)
             and data_header2.num_nulls == 0
             and max_rep == 0 and assign.dtype.kind != "O")
    # can decompress-into
    into = (data_header2.is_compressed and rev_map[cmd.codec] in decom_into
            and into0)
    if nullable:
        assign = assign._data

    uncompressed_page_size = (ph.uncompressed_page_size - data_header2.definition_levels_byte_length -
                              data_header2.repetition_levels_byte_length)
    if into0 and data_header2.encoding == parquet_thrift.Encoding.PLAIN and (
            not data_header2.is_compressed or cmd.codec == parquet_thrift.CompressionCodec.UNCOMPRESSED
    ):
        # PLAIN read directly into output (a copy for remote files)
        infile.readinto(assign[num:num+n_values].view('uint8'))
        convert(assign[num:num+n_values], se)
    elif into and data_header2.encoding == parquet_thrift.Encoding.PLAIN:
        # PLAIN decompress directly into output
        decomp = decom_into[rev_map[cmd.codec]]
        decomp(infile.read(size), assign[num:num+data_header2.num_values].view('uint8'))
        convert(assign[num:num+n_values], se)
    elif data_header2.encoding == parquet_thrift.Encoding.PLAIN:
        # PLAIN, but with nulls or not in-place conversion
        codec = cmd.codec if data_header2.is_compressed else "UNCOMPRESSED"
        raw_bytes = decompress_data(infile.read(size),
                                    uncompressed_page_size, codec)
        values = read_plain(raw_bytes,
                            cmd.type,
                            n_values,
                            width=se.type_length,
                            utf=se.converted_type == 0)
        if data_header2.num_nulls:
            if nullable:
                assign[num:num+data_header2.num_values][~nulls] = convert(values, se)
            else:
                assign[num:num+data_header2.num_values][nulls] = None  # or nan or nat
                assign[num:num+data_header2.num_values][~nulls] = convert(values, se)
        else:
            assign[num:num+data_header2.num_values] = convert(values, se)
    elif (use_cat and data_header2.encoding in [
        parquet_thrift.Encoding.PLAIN_DICTIONARY,
        parquet_thrift.Encoding.RLE_DICTIONARY,
    ]) or (data_header2.encoding == parquet_thrift.Encoding.RLE):
        # DICTIONARY or BOOL direct decode RLE into output (no nulls)
        codec = cmd.codec if data_header2.is_compressed else "UNCOMPRESSED"
        raw_bytes = np.empty(size, dtype='uint8')
        # TODO: small improvement possible by file.readinto and decompress_into if we
        #  don't first read raw_bytes but seek in the open file
        infile.readinto(raw_bytes)
        raw_bytes = decompress_data(raw_bytes, uncompressed_page_size, codec)
        pagefile = encoding.NumpyIO(raw_bytes)
        if data_header2.encoding != parquet_thrift.Encoding.RLE:
            # TODO: check this bit; is the varint read only needed for the byte-exact fastpath?
            bit_width = pagefile.read_byte()
            encoding.read_unsigned_var_int(pagefile)
        else:
            bit_width = 1
            pagefile.seek(4, 1)
        if bit_width in [8, 16, 32] and selfmade:
            # special fastpath for cats
            outbytes = raw_bytes[pagefile.tell():]
            if len(outbytes) == assign[num:num+data_header2.num_values].nbytes:
                assign[num:num+data_header2.num_values].view('uint8')[:] = outbytes
            else:
                if data_header2.num_nulls == 0:
                    assign[num:num+data_header2.num_values][:] = outbytes
                else:
                    assign[num:num+data_header2.num_values][~nulls] = outbytes
                    assign[num:num+data_header2.num_values][nulls] = -1
        else:
            if data_header2.num_nulls == 0:
                encoding.read_rle_bit_packed_hybrid(
                    pagefile,
                    bit_width,
                    uncompressed_page_size,
                    encoding.NumpyIO(assign[num:num+data_header2.num_values].view('uint8')),
                    itemsize=bit_width
                )
            else:
                temp = np.empty(data_header2.num_values, assign.dtype)
                encoding.read_rle_bit_packed_hybrid(
                    pagefile,
                    bit_width,
                    uncompressed_page_size,
                    encoding.NumpyIO(temp.view('uint8')),
                    itemsize=bit_width
                )
                if not nullable:
                    assign[num:num+data_header2.num_values][nulls] = None
                assign[num:num+data_header2.num_values][~nulls] = temp

    elif data_header2.encoding in [
        parquet_thrift.Encoding.PLAIN_DICTIONARY,
        parquet_thrift.Encoding.RLE_DICTIONARY
    ]:
        # DICTIONARY to be de-referenced, with or without nulls
        codec = cmd.codec if data_header2.is_compressed else "UNCOMPRESSED"
        compressed_bytes = infile.read(size)
        raw_bytes = decompress_data(compressed_bytes, uncompressed_page_size, codec)
        out = np.empty(n_values, dtype='uint8')
        pagefile = encoding.NumpyIO(raw_bytes)
        bit_width = pagefile.read_byte()
        encoding.read_rle_bit_packed_hybrid(
            pagefile,
            bit_width,
            uncompressed_page_size,
            encoding.NumpyIO(out),
            itemsize=1
        )
        if max_rep:
            # num_rows got filled, but consumed num_values data entries
            encoding._assemble_objects(
                assign[idx[0]:idx[0]+data_header2.num_rows], defi, repi, out, dic, d=True,
                null=True, null_val=False, max_defi=max_def, prev_i=0
            )
            idx[0] += data_header2.num_rows
        elif data_header2.num_nulls:
            if not nullable and assign.dtype != "O":
                assign[num:num+data_header2.num_values][nulls] = None  # may be unnecessary
            assign[num:num+data_header2.num_values][~nulls] = dic[out]
        else:
            assign[num:num+data_header2.num_values] = dic[out]
    elif data_header2.encoding == parquet_thrift.Encoding.DELTA_BINARY_PACKED:
        assert data_header2.num_nulls == 0, "null delta-int not implemented"
        codec = cmd.codec if data_header2.is_compressed else "UNCOMPRESSED"
        raw_bytes = decompress_data(infile.read(size),
                                    uncompressed_page_size, codec)
        if converts_inplace(se):
            encoding.delta_binary_unpack(
                encoding.NumpyIO(raw_bytes),
                encoding.NumpyIO(assign[num:num+data_header2.num_values].view('uint8'))
            )
            convert(assign[num:num+data_header2.num_values], se)
        else:
            out = np.empty(data_header2.num_values, dtype='int32')
            encoding.delta_binary_unpack(
                encoding.NumpyIO(raw_bytes), encoding.NumpyIO(out.view('uint8'))
            )
            assign[num:num+data_header2.num_values] = convert(out, se)
    else:
        # codec = cmd.codec if data_header2.is_compressed else "UNCOMPRESSED"
        # raw_bytes = decompress_data(infile.read(size),
        #                             ph.uncompressed_page_size, codec)
        raise NotImplementedError
    return data_header2.num_values
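The size arithmetic at the top of `read_data_page_v2` reflects the DataPageV2 layout: the repetition- and definition-level sections are stored uncompressed, back to back, ahead of the (possibly compressed) values section. A sketch of the resulting byte ranges; `v2_sections` is a hypothetical helper mirroring that arithmetic:

def v2_sections(page_start, ph, dh2):
    # DataPageV2 body: [repetition levels][definition levels][values];
    # only the values section is subject to compression.
    rep_len = dh2.repetition_levels_byte_length
    def_len = dh2.definition_levels_byte_length
    values_size = ph.compressed_page_size - rep_len - def_len
    return ((page_start, rep_len),
            (page_start + rep_len, def_len),
            (page_start + rep_len + def_len, values_size))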
Code example #6
File: test_encoding.py Project: yohplala/fastparquet
def testWidths():
    """Test all possible widths for a single byte."""
    assert 0 == cencoding.width_from_max_int(0)
    assert 1 == cencoding.width_from_max_int(1)
    assert 2 == cencoding.width_from_max_int(2)
    assert 2 == cencoding.width_from_max_int(3)
    assert 3 == cencoding.width_from_max_int(4)
    assert 3 == cencoding.width_from_max_int(5)
    assert 3 == cencoding.width_from_max_int(6)
    assert 3 == cencoding.width_from_max_int(7)
    assert 4 == cencoding.width_from_max_int(8)
    assert 4 == cencoding.width_from_max_int(15)
    assert 5 == cencoding.width_from_max_int(16)
    assert 5 == cencoding.width_from_max_int(31)
    assert 6 == cencoding.width_from_max_int(32)
    assert 6 == cencoding.width_from_max_int(63)
    assert 7 == cencoding.width_from_max_int(64)
    assert 7 == cencoding.width_from_max_int(127)
    assert 8 == cencoding.width_from_max_int(128)
    assert 8 == cencoding.width_from_max_int(255)
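The assertion ladder matches Python's `int.bit_length()`: the width for a maximum value n is the number of bits needed to represent n, with 0 needing none. A pure-Python stand-in consistent with the test above (not the actual `cencoding` implementation):

def py_width_from_max_int(max_value):
    # bit_length(): 0 -> 0, 1 -> 1, 2-3 -> 2, 4-7 -> 3, ..., 128-255 -> 8
    return max_value.bit_length()

assert [py_width_from_max_int(n) for n in (0, 1, 3, 4, 15, 16, 255)] == [0, 1, 2, 3, 4, 5, 8]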