def test_type_schema_pickling():
    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        pa.struct([
            pa.field('a', 'int8'),
            pa.field('b', 'string')
        ]),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal128(12, 2),
        pa.field('a', 'string', metadata={b'foo': b'bar'})
    ]

    for val in cases:
        roundtripped = pickle.loads(pickle.dumps(val))
        assert val == roundtripped

    fields = []
    for i, f in enumerate(cases):
        if isinstance(f, pa.Field):
            fields.append(f)
        else:
            fields.append(pa.field('_f{}'.format(i), f))

    schema = pa.schema(fields, metadata={b'foo': b'bar'})
    roundtripped = pickle.loads(pickle.dumps(schema))
    assert schema == roundtripped
    def test_fixed_size_bytes(self):
        data = [b'foof', None, b'barb', b'2346']
        arr = pa.array(data, type=pa.binary(4))
        assert len(arr) == 4
        assert arr.null_count == 1
        assert arr.type == pa.binary(4)
        assert arr.to_pylist() == data
Example #3
def test_empty_cast():
    types = [
        pa.null(),
        pa.bool_(),
        pa.int8(),
        pa.int16(),
        pa.int32(),
        pa.int64(),
        pa.uint8(),
        pa.uint16(),
        pa.uint32(),
        pa.uint64(),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.date32(),
        pa.date64(),
        pa.binary(),
        pa.binary(length=4),
        pa.string(),
    ]

    for (t1, t2) in itertools.product(types, types):
        try:
            # ARROW-4766: ensure that casts between supported types
            # don't segfault on empty arrays of common types
            pa.array([], type=t1).cast(t2)
        except pa.lib.ArrowNotImplementedError:
            continue
Example #4
def test_type_to_pandas_dtype():
    M8_ns = np.dtype('datetime64[ns]')
    cases = [
        (pa.null(), np.float64),
        (pa.bool_(), np.bool_),
        (pa.int8(), np.int8),
        (pa.int16(), np.int16),
        (pa.int32(), np.int32),
        (pa.int64(), np.int64),
        (pa.uint8(), np.uint8),
        (pa.uint16(), np.uint16),
        (pa.uint32(), np.uint32),
        (pa.uint64(), np.uint64),
        (pa.float16(), np.float16),
        (pa.float32(), np.float32),
        (pa.float64(), np.float64),
        (pa.date32(), M8_ns),
        (pa.date64(), M8_ns),
        (pa.timestamp('ms'), M8_ns),
        (pa.binary(), np.object_),
        (pa.binary(12), np.object_),
        (pa.string(), np.object_),
        (pa.list_(pa.int8()), np.object_),
    ]
    for arrow_type, numpy_type in cases:
        assert arrow_type.to_pandas_dtype() == numpy_type
Example #5
def test_sequence_fixed_size_bytes():
    data = [b'foof', None, bytearray(b'barb'), b'2346']
    arr = pa.array(data, type=pa.binary(4))
    assert len(arr) == 4
    assert arr.null_count == 1
    assert arr.type == pa.binary(4)
    assert arr.to_pylist() == [b'foof', None, b'barb', b'2346']
Example #6
def test_cast_from_null():
    in_data = [None] * 3
    in_type = pa.null()
    out_types = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.list_(pa.int8())),
                   pa.field('c', pa.string())]),
        ]
    for out_type in out_types:
        _check_cast_case((in_data, in_type, in_data, out_type))

    out_types = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        ]
    in_arr = pa.array(in_data, type=pa.null())
    for out_type in out_types:
        with pytest.raises(NotImplementedError):
            in_arr.cast(out_type)
Example #7
File: jvm.py Project: rok/arrow
def field(jvm_field):
    """
    Construct a Field from a org.apache.arrow.vector.types.pojo.Field
    instance.

    Parameters
    ----------
    jvm_field: org.apache.arrow.vector.types.pojo.Field

    Returns
    -------
    pyarrow.Field
    """
    name = jvm_field.getName()
    jvm_type = jvm_field.getType()

    typ = None
    if not jvm_type.isComplex():
        type_str = jvm_type.getTypeID().toString()
        if type_str == 'Null':
            typ = pa.null()
        elif type_str == 'Int':
            typ = _from_jvm_int_type(jvm_type)
        elif type_str == 'FloatingPoint':
            typ = _from_jvm_float_type(jvm_type)
        elif type_str == 'Utf8':
            typ = pa.string()
        elif type_str == 'Binary':
            typ = pa.binary()
        elif type_str == 'FixedSizeBinary':
            typ = pa.binary(jvm_type.getByteWidth())
        elif type_str == 'Bool':
            typ = pa.bool_()
        elif type_str == 'Time':
            typ = _from_jvm_time_type(jvm_type)
        elif type_str == 'Timestamp':
            typ = _from_jvm_timestamp_type(jvm_type)
        elif type_str == 'Date':
            typ = _from_jvm_date_type(jvm_type)
        elif type_str == 'Decimal':
            typ = pa.decimal128(jvm_type.getPrecision(), jvm_type.getScale())
        else:
            raise NotImplementedError(
                "Unsupported JVM type: {}".format(type_str))
    else:
        # TODO: The following JVM types are not implemented:
        #       Struct, List, FixedSizeList, Union, Dictionary
        raise NotImplementedError(
            "JVM field conversion only implemented for primitive types.")

    nullable = jvm_field.isNullable()
    if jvm_field.getMetadata().isEmpty():
        metadata = None
    else:
        metadata = dict(jvm_field.getMetadata())
    return pa.field(name, typ, nullable, metadata)
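
# A hedged usage sketch (not from the module itself): invoking the converter
# needs a running JVM with the Arrow Java jars available, e.g. through jpype1.
# The classpath entry and the POJO class/method names below are assumptions
# following the Arrow Java API, not taken from this snippet.
import jpype
import pyarrow as pa
import pyarrow.jvm

jpype.startJVM(classpath=["arrow-vector.jar"])  # illustrative jar path
JField = jpype.JClass("org.apache.arrow.vector.types.pojo.Field")
JArrowType = jpype.JClass("org.apache.arrow.vector.types.pojo.ArrowType")
jvm_field = JField.nullable("name", JArrowType.Utf8.INSTANCE)
assert pyarrow.jvm.field(jvm_field) == pa.field("name", pa.string())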
Example #8
def test_is_binary_string():
    assert types.is_binary(pa.binary())
    assert not types.is_binary(pa.string())

    assert types.is_string(pa.string())
    assert types.is_unicode(pa.string())
    assert not types.is_string(pa.binary())

    assert types.is_fixed_size_binary(pa.binary(5))
    assert not types.is_fixed_size_binary(pa.binary())
Example #9
def test_bit_width():
    for ty, expected in [(pa.bool_(), 1),
                         (pa.int8(), 8),
                         (pa.uint32(), 32),
                         (pa.float16(), 16),
                         (pa.decimal128(19, 4), 128),
                         (pa.binary(42), 42 * 8)]:
        assert ty.bit_width == expected
    for ty in [pa.binary(), pa.string(), pa.list_(pa.int16())]:
        with pytest.raises(ValueError, match="fixed width"):
            ty.bit_width
Example #10
def test_convert_options():
    cls = ConvertOptions
    opts = cls()

    assert opts.check_utf8 is True
    opts.check_utf8 = False
    assert opts.check_utf8 is False

    assert opts.strings_can_be_null is False
    opts.strings_can_be_null = True
    assert opts.strings_can_be_null is True

    assert opts.column_types == {}
    # Pass column_types as mapping
    opts.column_types = {'b': pa.int16(), 'c': pa.float32()}
    assert opts.column_types == {'b': pa.int16(), 'c': pa.float32()}
    opts.column_types = {'v': 'int16', 'w': 'null'}
    assert opts.column_types == {'v': pa.int16(), 'w': pa.null()}
    # Pass column_types as schema
    schema = pa.schema([('a', pa.int32()), ('b', pa.string())])
    opts.column_types = schema
    assert opts.column_types == {'a': pa.int32(), 'b': pa.string()}
    # Pass column_types as sequence
    opts.column_types = [('x', pa.binary())]
    assert opts.column_types == {'x': pa.binary()}

    with pytest.raises(TypeError, match='DataType expected'):
        opts.column_types = {'a': None}
    with pytest.raises(TypeError):
        opts.column_types = 0

    assert isinstance(opts.null_values, list)
    assert '' in opts.null_values
    assert 'N/A' in opts.null_values
    opts.null_values = ['xxx', 'yyy']
    assert opts.null_values == ['xxx', 'yyy']

    assert isinstance(opts.true_values, list)
    opts.true_values = ['xxx', 'yyy']
    assert opts.true_values == ['xxx', 'yyy']

    assert isinstance(opts.false_values, list)
    opts.false_values = ['xxx', 'yyy']
    assert opts.false_values == ['xxx', 'yyy']

    opts = cls(check_utf8=False, column_types={'a': pa.null()},
               null_values=['N', 'nn'], true_values=['T', 'tt'],
               false_values=['F', 'ff'], strings_can_be_null=True)
    assert opts.check_utf8 is False
    assert opts.column_types == {'a': pa.null()}
    assert opts.null_values == ['N', 'nn']
    assert opts.false_values == ['F', 'ff']
    assert opts.true_values == ['T', 'tt']
    assert opts.strings_can_be_null is True
Example #11
def test_sequence_bytes():
    u1 = b'ma\xc3\xb1ana'
    data = [b'foo',
            u1.decode('utf-8'),  # unicode gets encoded,
            bytearray(b'bar'),
            None]
    for ty in [None, pa.binary()]:
        arr = pa.array(data, type=ty)
        assert len(arr) == 4
        assert arr.null_count == 1
        assert arr.type == pa.binary()
        assert arr.to_pylist() == [b'foo', u1, b'bar', None]
Example #12
def test_array_mixed_unicode_bytes():
    values = [u'qux', b'foo', bytearray(b'barz')]
    b_values = [b'qux', b'foo', b'barz']
    u_values = [u'qux', u'foo', u'barz']

    arr = pa.array(values)
    expected = pa.array(b_values, type=pa.binary())
    assert arr.type == pa.binary()
    assert arr.equals(expected)

    arr = pa.array(values, type=pa.string())
    expected = pa.array(u_values, type=pa.string())
    assert arr.type == pa.string()
    assert arr.equals(expected)
Example #13
def numpy_array_from_arrow_array(arrow_array):
    arrow_type = arrow_array.type
    buffers = arrow_array.buffers()
    assert len(buffers) == 2
    bitmap_buffer, data_buffer = buffers
    if isinstance(arrow_type, type(pyarrow.binary(1))):  # todo, is there a better way to typecheck?
        # mimics python/pyarrow/array.pxi::Array::to_numpy
        assert len(buffers) == 2
        dtype = "S" + str(arrow_type.byte_width)
        # arrow seems to do padding, check if it is all ok
        expected_length = arrow_type.byte_width * len(arrow_array)
        actual_length = len(buffers[-1])
        if actual_length < expected_length:
            raise ValueError('buffer is smaller (%d) than expected (%d)' % (actual_length, expected_length))
        array = np.frombuffer(buffers[-1], dtype, len(arrow_array))  # TODO: deal with offset? [arrow_array.offset:arrow_array.offset + len(arrow_array)]
    else:
        dtype = arrow_array.type.to_pandas_dtype()
    if np.bool_ == dtype:
        # TODO: this will also be a copy, we probably want to support bitmasks as well
        bitmap = np.frombuffer(data_buffer, np.uint8, len(data_buffer))
        array = numpy_mask_from_arrow_mask(bitmap, len(arrow_array))
    else:
        array = np.frombuffer(data_buffer, dtype, len(arrow_array))

    if bitmap_buffer is not None:
        bitmap = np.frombuffer(bitmap_buffer, np.uint8, len(bitmap_buffer))
        mask = numpy_mask_from_arrow_mask(bitmap, len(arrow_array))
        array = np.ma.MaskedArray(array, mask=mask)
    return array
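
# A minimal usage sketch, assuming numpy_mask_from_arrow_mask from the same
# module is in scope: nulls in the Arrow array come back as masked entries.
example = pyarrow.array([1.0, None, 3.0], type=pyarrow.float64())
masked = numpy_array_from_arrow_array(example)
assert masked.tolist() == [1.0, None, 3.0]
assert list(masked.mask) == [False, True, False]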
Example #14
def test_orcfile_empty():
    from pyarrow import orc
    f = orc.ORCFile(path_for_orc_example('TestOrcFile.emptyFile'))
    table = f.read()
    assert table.num_rows == 0
    schema = table.schema
    expected_schema = pa.schema([
        ('boolean1', pa.bool_()),
        ('byte1', pa.int8()),
        ('short1', pa.int16()),
        ('int1', pa.int32()),
        ('long1', pa.int64()),
        ('float1', pa.float32()),
        ('double1', pa.float64()),
        ('bytes1', pa.binary()),
        ('string1', pa.string()),
        ('middle', pa.struct([
            ('list', pa.list_(pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
                ]))),
            ])),
        ('list', pa.list_(pa.struct([
            ('int1', pa.int32()),
            ('string1', pa.string()),
            ]))),
        ('map', pa.list_(pa.struct([
            ('key', pa.string()),
            ('value', pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
                ])),
            ]))),
        ])
    assert schema == expected_schema
def test_fixed_size_binary():
    t0 = pa.binary(10)
    data = [b'fooooooooo', None, b'barooooooo', b'quxooooooo']
    a0 = pa.array(data, type=t0)

    table = pa.Table.from_arrays([a0],
                                 ['binary[10]'])
    _check_roundtrip(table)
Example #16
    def test_bytes(self):
        u1 = b"ma\xc3\xb1ana"
        data = [b"foo", u1.decode("utf-8"), None]  # unicode gets encoded
        arr = pyarrow.array(data)
        assert len(arr) == 3
        assert arr.null_count == 1
        assert arr.type == pyarrow.binary()
        assert arr.to_pylist() == [b"foo", u1, None]
    def test_fixed_size_bytes(self):
        values = [b'foo', None, b'bar', None, None, b'hey']
        df = pd.DataFrame({'strings': values})
        schema = pa.schema([pa.field('strings', pa.binary(3))])
        table = pa.Table.from_pandas(df, schema=schema)
        assert table.schema[0].type == schema[0].type
        assert table.schema[0].name == schema[0].name
        result = table.to_pandas()
        tm.assert_frame_equal(result, df)
Example #18
def test_cast_binary_to_utf8():
    binary_arr = pa.array([b'foo', b'bar', b'baz'], type=pa.binary())
    utf8_arr = binary_arr.cast(pa.utf8())
    expected = pa.array(['foo', 'bar', 'baz'], type=pa.utf8())

    assert utf8_arr.equals(expected)

    non_utf8_values = [u'mañana'.encode('utf-16-le')]
    non_utf8_binary = pa.array(non_utf8_values)
    assert non_utf8_binary.type == pa.binary()
    with pytest.raises(ValueError):
        non_utf8_binary.cast(pa.string())

    non_utf8_all_null = pa.array(non_utf8_values, mask=np.array([True]),
                                 type=pa.binary())
    # No error
    casted = non_utf8_all_null.cast(pa.string())
    assert casted.null_count == 1
Example #19
    def test_bytes(self):
        u1 = b'ma\xc3\xb1ana'
        data = [b'foo',
                u1.decode('utf-8'),  # unicode gets encoded
                None]
        arr = pyarrow.array(data)
        assert len(arr) == 3
        assert arr.null_count == 1
        assert arr.type == pyarrow.binary()
        assert arr.to_pylist() == [b'foo', u1, None]

    def test_bytes_to_binary(self):
        values = [u'qux', b'foo', None, 'bar', 'qux', np.nan]
        df = pd.DataFrame({'strings': values})

        table = pa.Table.from_pandas(df)
        assert table[0].type == pa.binary()

        values2 = [b'qux', b'foo', None, b'bar', b'qux', np.nan]
        expected = pd.DataFrame({'strings': values2})
        self._check_pandas_roundtrip(df, expected)
Example #21
def dataframe_with_lists(include_index=False):
    """
    Dataframe with list columns of every possible primitive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    arrays = OrderedDict()
    fields = []

    fields.append(pa.field('int64', pa.list_(pa.int64())))
    arrays['int64'] = [
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4],
        None,
        [],
        np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9] * 2,
                 dtype=np.int64)[::2]
    ]
    fields.append(pa.field('double', pa.list_(pa.float64())))
    arrays['double'] = [
        [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
        [0., 1., 2., 3., 4.],
        None,
        [],
        np.array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.] * 2)[::2],
    ]
    fields.append(pa.field('bytes_list', pa.list_(pa.binary())))
    arrays['bytes_list'] = [
        [b"1", b"f"],
        None,
        [b"1"],
        [b"1", b"2", b"3"],
        [],
    ]
    fields.append(pa.field('str_list', pa.list_(pa.string())))
    arrays['str_list'] = [
        [u"1", u"รค"],
        None,
        [u"1"],
        [u"1", u"2", u"3"],
        [],
    ]

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))
    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema
Example #22
    def test_fixed_size_bytes(self):
        data = [b'foof', None, b'barb']
        arr = pa.array(data, type=pa.binary(4))

        v = arr[0]
        assert isinstance(v, pa.FixedSizeBinaryValue)
        assert v.as_py() == b'foof'

        assert arr[1] is pa.NA

        v = arr[2].as_py()
        assert v == b'barb'
        assert isinstance(v, bytes)
Example #23
def test_types_hashable():
    types = [
        pa.null(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())])
    ]

    in_dict = {}
    for i, type_ in enumerate(types):
        assert hash(type_) == hash(type_)
        in_dict[type_] = i
        assert in_dict[type_] == i
Example #24
import contextlib

@contextlib.contextmanager
def allocate_bytes(pool, nbytes):
    """
    Temporarily allocate *nbytes* from the given *pool*.
    """
    arr = pa.array([b"x" * nbytes], type=pa.binary(), memory_pool=pool)
    # Fetch the values buffer from the varbinary array and release the rest,
    # to get the desired allocation amount
    buf = arr.buffers()[2]
    arr = None
    assert len(buf) == nbytes
    try:
        yield
    finally:
        buf = None
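
# Usage sketch: with the @contextlib.contextmanager decorator restored above,
# a test can assert on pool statistics while the buffer is alive (512 is a
# multiple of Arrow's 64-byte padding, so the pool grows by exactly 512).
pool = pa.default_memory_pool()
before = pool.bytes_allocated()
with allocate_bytes(pool, 512):
    assert pool.bytes_allocated() == before + 512
assert pool.bytes_allocated() == before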
Example #25
def get_many_types():
    # Returning them from a function is required because the pa.dictionary
    # type holds a pyarrow array, and test_array.py::test_total_bytes_allocated
    # checks that the default memory pool has zero allocated bytes
    return (
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())]),
        pa.struct([pa.field('a', pa.int32(), nullable=False),
                   pa.field('b', pa.int8(), nullable=False),
                   pa.field('c', pa.string())]),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        pa.union([pa.field('a', pa.binary(10), nullable=False),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        pa.dictionary(pa.int32(), pa.string())
    )
Example #26
def arrow_array_from_numpy_array(array):
    dtype = array.dtype
    mask = None
    if np.ma.isMaskedArray(array):
        mask = array.mask
    if dtype.kind == 'S':
        type = pyarrow.binary(dtype.itemsize)
        arrow_array = pyarrow.array(array, type, mask=mask)
    else:
        if dtype.isnative:
            arrow_array = pyarrow.array(array, mask=mask)
        else:
            # TODO: we copy here, but I guess we should not... or give some warning
            arrow_array = pyarrow.array(array.astype(dtype.newbyteorder('=')), mask=mask)
    return arrow_array
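
# Usage sketch (names as defined above): a masked fixed-width bytes array
# maps onto fixed_size_binary, with masked entries becoming nulls.
raw = np.ma.MaskedArray(np.array([b'abc', b'def'], dtype='S3'),
                        mask=[False, True])
fixed = arrow_array_from_numpy_array(raw)
assert fixed.type == pyarrow.binary(3)
assert fixed.to_pylist() == [b'abc', None]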
Example #27
def test_union_type():
    def check_fields(ty, fields):
        assert ty.num_children == len(fields)
        assert [ty[i] for i in range(ty.num_children)] == fields

    fields = [pa.field('x', pa.list_(pa.int32())),
              pa.field('y', pa.binary())]
    for mode in ('sparse', pa.lib.UnionMode_SPARSE):
        ty = pa.union(fields, mode=mode)
        assert ty.mode == 'sparse'
        check_fields(ty, fields)
    for mode in ('dense', pa.lib.UnionMode_DENSE):
        ty = pa.union(fields, mode=mode)
        assert ty.mode == 'dense'
        check_fields(ty, fields)
    for mode in ('unknown', 2):
        with pytest.raises(ValueError, match='Invalid union mode'):
            pa.union(fields, mode=mode)
Example #28
    def test_simple_nulls(self):
        # Infer various kinds of data, with nulls
        rows = (b"a,b,c,d,e\n"
                b"1,2,,,3\n"
                b"nan,-5,foo,,nan\n"
                b"4.5,#N/A,nan,,\xff\n")
        table = self.read_bytes(rows)
        schema = pa.schema([('a', pa.float64()),
                            ('b', pa.int64()),
                            ('c', pa.string()),
                            ('d', pa.null()),
                            ('e', pa.binary())])
        assert table.schema == schema
        assert table.to_pydict() == {
            'a': [1.0, None, 4.5],
            'b': [2, -5, None],
            'c': [u"", u"foo", u"nan"],
            'd': [None, None, None],
            'e': [b"3", b"nan", b"\xff"],
            }
def test_bq_to_arrow_data_type_w_struct(module_under_test, bq_type):
    fields = (
        schema.SchemaField("field01", "STRING"),
        schema.SchemaField("field02", "BYTES"),
        schema.SchemaField("field03", "INTEGER"),
        schema.SchemaField("field04", "INT64"),
        schema.SchemaField("field05", "FLOAT"),
        schema.SchemaField("field06", "FLOAT64"),
        schema.SchemaField("field07", "NUMERIC"),
        schema.SchemaField("field08", "BOOLEAN"),
        schema.SchemaField("field09", "BOOL"),
        schema.SchemaField("field10", "TIMESTAMP"),
        schema.SchemaField("field11", "DATE"),
        schema.SchemaField("field12", "TIME"),
        schema.SchemaField("field13", "DATETIME"),
        schema.SchemaField("field14", "GEOGRAPHY"),
    )
    field = schema.SchemaField("ignored_name", bq_type, mode="NULLABLE", fields=fields)
    actual = module_under_test.bq_to_arrow_data_type(field)
    expected = pyarrow.struct(
        (
            pyarrow.field("field01", pyarrow.string()),
            pyarrow.field("field02", pyarrow.binary()),
            pyarrow.field("field03", pyarrow.int64()),
            pyarrow.field("field04", pyarrow.int64()),
            pyarrow.field("field05", pyarrow.float64()),
            pyarrow.field("field06", pyarrow.float64()),
            pyarrow.field("field07", module_under_test.pyarrow_numeric()),
            pyarrow.field("field08", pyarrow.bool_()),
            pyarrow.field("field09", pyarrow.bool_()),
            pyarrow.field("field10", module_under_test.pyarrow_timestamp()),
            pyarrow.field("field11", pyarrow.date32()),
            pyarrow.field("field12", module_under_test.pyarrow_time()),
            pyarrow.field("field13", module_under_test.pyarrow_datetime()),
            pyarrow.field("field14", pyarrow.string()),
        )
    )
    assert pyarrow.types.is_struct(actual)
    assert actual.num_children == len(fields)
    assert actual.equals(expected)
Example #30
def test_type_for_alias():
    cases = [
        ('i1', pa.int8()),
        ('int8', pa.int8()),
        ('i2', pa.int16()),
        ('int16', pa.int16()),
        ('i4', pa.int32()),
        ('int32', pa.int32()),
        ('i8', pa.int64()),
        ('int64', pa.int64()),
        ('u1', pa.uint8()),
        ('uint8', pa.uint8()),
        ('u2', pa.uint16()),
        ('uint16', pa.uint16()),
        ('u4', pa.uint32()),
        ('uint32', pa.uint32()),
        ('u8', pa.uint64()),
        ('uint64', pa.uint64()),
        ('f4', pa.float32()),
        ('float32', pa.float32()),
        ('f8', pa.float64()),
        ('float64', pa.float64()),
        ('date32', pa.date32()),
        ('date64', pa.date64()),
        ('string', pa.string()),
        ('str', pa.string()),
        ('binary', pa.binary()),
        ('time32[s]', pa.time32('s')),
        ('time32[ms]', pa.time32('ms')),
        ('time64[us]', pa.time64('us')),
        ('time64[ns]', pa.time64('ns')),
        ('timestamp[s]', pa.timestamp('s')),
        ('timestamp[ms]', pa.timestamp('ms')),
        ('timestamp[us]', pa.timestamp('us')),
        ('timestamp[ns]', pa.timestamp('ns')),
    ]

    for val, expected in cases:
        assert pa.type_for_alias(val) == expected
@pytest.mark.parametrize(
    ('data', 'type', 'physical_type', 'min_value', 'max_value', 'null_count',
     'num_values', 'distinct_count'), [
         ([1, 2, 2, None, 4], pa.uint8(), 'INT32', 1, 4, 1, 4, 0),
         ([1, 2, 2, None, 4], pa.uint16(), 'INT32', 1, 4, 1, 4, 0),
         ([1, 2, 2, None, 4], pa.uint32(), 'INT32', 1, 4, 1, 4, 0),
         ([1, 2, 2, None, 4], pa.uint64(), 'INT64', 1, 4, 1, 4, 0),
         ([-1, 2, 2, None, 4], pa.int8(), 'INT32', -1, 4, 1, 4, 0),
         ([-1, 2, 2, None, 4], pa.int16(), 'INT32', -1, 4, 1, 4, 0),
         ([-1, 2, 2, None, 4], pa.int32(), 'INT32', -1, 4, 1, 4, 0),
         ([-1, 2, 2, None, 4], pa.int64(), 'INT64', -1, 4, 1, 4, 0),
         ([-1.1, 2.2, 2.3, None, 4.4
           ], pa.float32(), 'FLOAT', -1.1, 4.4, 1, 4, 0),
         ([-1.1, 2.2, 2.3, None, 4.4
           ], pa.float64(), 'DOUBLE', -1.1, 4.4, 1, 4, 0),
         (['', 'b', chr(1000), None, 'aaa'], pa.binary(), 'BYTE_ARRAY', b'',
          chr(1000).encode('utf-8'), 1, 4, 0),
         ([True, False, False, True, True
           ], pa.bool_(), 'BOOLEAN', False, True, 0, 5, 0),
         ([b'\x00', b'b', b'12', None, b'aaa'
           ], pa.binary(), 'BYTE_ARRAY', b'\x00', b'b', 1, 4, 0),
     ])
def test_parquet_column_statistics_api(data, type, physical_type, min_value,
                                       max_value, null_count, num_values,
                                       distinct_count):
    df = pd.DataFrame({'data': data})
    schema = pa.schema([pa.field('data', type)])
    table = pa.Table.from_pandas(df, schema=schema, safe=False)
    fileh = make_sample_file(table)

    meta = fileh.metadata
  def test_stats_pipeline_with_examples_with_no_values(self):
    record_batches = [
        pa.RecordBatch.from_arrays([
            pa.array([[]], type=pa.list_(pa.float32())),
            pa.array([[]], type=pa.list_(pa.binary())),
            pa.array([[]], type=pa.list_(pa.int32())),
            pa.array([[2]]),
        ], ['a', 'b', 'c', 'w']),
        pa.RecordBatch.from_arrays([
            pa.array([[]], type=pa.list_(pa.float32())),
            pa.array([[]], type=pa.list_(pa.binary())),
            pa.array([[]], type=pa.list_(pa.int32())),
            pa.array([[2]]),
        ], ['a', 'b', 'c', 'w']),
        pa.RecordBatch.from_arrays([
            pa.array([[]], type=pa.list_(pa.float32())),
            pa.array([[]], type=pa.list_(pa.binary())),
            pa.array([[]], type=pa.list_(pa.int32())),
            pa.array([[2]]),
        ], ['a', 'b', 'c', 'w'])
    ]

    expected_result = text_format.Parse(
        """
      datasets{
        num_examples: 3
        features {
          path {
            step: 'a'
          }
          type: FLOAT
          num_stats {
            common_stats {
              num_non_missing: 3
              num_values_histogram {
                buckets {
                  sample_count: 1.5
                }
                buckets {
                  sample_count: 1.5
                }
                type: QUANTILES
              }
              weighted_common_stats {
                num_non_missing: 6
              }
            }
          }
        }
        features {
          path {
            step: 'b'
          }
          type: STRING
          string_stats {
            common_stats {
              num_non_missing: 3
              num_values_histogram {
                buckets {
                  sample_count: 1.5
                }
                buckets {
                  sample_count: 1.5
                }
                type: QUANTILES
              }
              weighted_common_stats {
                num_non_missing: 6
              }
            }
          }
        }
        features {
          path {
            step: 'c'
          }
          type: INT
          num_stats {
            common_stats {
              num_non_missing: 3
              num_values_histogram {
                buckets {
                  sample_count: 1.5
                }
                buckets {
                  sample_count: 1.5
                }
                type: QUANTILES
              }
              weighted_common_stats {
                num_non_missing: 6
              }
            }
          }
        }
        features {
          path {
          step: 'w'
        }
        type: INT
        num_stats {
          common_stats {
            num_non_missing: 3
            num_missing: 0
            min_num_values: 1
            max_num_values: 1
            avg_num_values: 1.0
            tot_num_values: 3
            num_values_histogram {
              buckets {
                low_value: 1.0
                high_value: 1.0
                sample_count: 1.5
              }
              buckets {
                low_value: 1.0
                high_value: 1.0
                sample_count: 1.5
              }
              type: QUANTILES
            }
            weighted_common_stats {
                num_non_missing: 6.0
                avg_num_values: 1.0
                tot_num_values: 6.0
            }
          }
          mean: 2.0
          std_dev: 0.0
          min: 2.0
          max: 2.0
          median: 2.0
          histograms {
            buckets {
              low_value: 2.0
              high_value: 2.0
              sample_count: 3.0
            }
            type: STANDARD
          }
          histograms {
            buckets {
              low_value: 2.0
              high_value: 2.0
              sample_count: 3.0
            }
            type: QUANTILES
          }
          weighted_numeric_stats {
            mean: 2.0
            median: 2.0
            histograms {
              buckets {
                low_value: 2.0
                high_value: 2.0
                sample_count: 6.0
              }
              type: STANDARD
            }
            histograms {
              buckets {
                low_value: 2.0
                high_value: 2.0
                sample_count: 6.0
              }
              type: QUANTILES
            }
          }
        }
      }
    }
    """, statistics_pb2.DatasetFeatureStatisticsList())
    with beam.Pipeline() as p:
      options = stats_options.StatsOptions(
          weight_feature='w',
          num_top_values=1,
          num_rank_histogram_buckets=1,
          num_values_histogram_buckets=2,
          num_histogram_buckets=1,
          num_quantiles_histogram_buckets=1,
          epsilon=0.001)
      result = (
          p | beam.Create(record_batches)
          | stats_api.GenerateStatistics(options))
      util.assert_that(
          result,
          test_util.make_dataset_feature_stats_list_proto_equal_fn(
              self, expected_result))
Example #33
def test_sql(parameters, db_type):
    df = get_df()
    if db_type == "redshift":
        df.drop(["binary"], axis=1, inplace=True)
    engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}")
    wr.db.to_sql(
        df=df,
        con=engine,
        name="test_sql",
        schema=parameters[db_type]["schema"],
        if_exists="replace",
        index=False,
        index_label=None,
        chunksize=None,
        method=None,
        dtype={"iint32": sqlalchemy.types.Integer},
    )
    df = wr.db.read_sql_query(
        sql=f"SELECT * FROM {parameters[db_type]['schema']}.test_sql",
        con=engine)
    ensure_data_types(df, has_list=False)
    engine = wr.db.get_engine(
        db_type=db_type,
        host=parameters[db_type]["host"],
        port=parameters[db_type]["port"],
        database=parameters[db_type]["database"],
        user=parameters["user"],
        password=parameters["password"],
    )
    dfs = wr.db.read_sql_query(
        sql=f"SELECT * FROM {parameters[db_type]['schema']}.test_sql",
        con=engine,
        chunksize=1,
        dtype={
            "iint8": pa.int8(),
            "iint16": pa.int16(),
            "iint32": pa.int32(),
            "iint64": pa.int64(),
            "float": pa.float32(),
            "double": pa.float64(),
            "decimal": pa.decimal128(3, 2),
            "string_object": pa.string(),
            "string": pa.string(),
            "date": pa.date32(),
            "timestamp": pa.timestamp(unit="ns"),
            "binary": pa.binary(),
            "category": pa.float64(),
        },
    )
    for df in dfs:
        ensure_data_types(df, has_list=False)
    if db_type != "redshift":
        account_id = boto3.client("sts").get_caller_identity().get("Account")
        engine = wr.catalog.get_engine(
            connection=f"aws-data-wrangler-{db_type}", catalog_id=account_id)
        wr.db.to_sql(
            df=pd.DataFrame({"col0": [1, 2, 3]}, dtype="Int32"),
            con=engine,
            name="test_sql",
            schema=parameters[db_type]["schema"],
            if_exists="replace",
            index=True,
            index_label="index",
        )
        schema = None
        if db_type == "postgresql":
            schema = parameters[db_type]["schema"]
        df = wr.db.read_sql_table(con=engine,
                                  table="test_sql",
                                  schema=schema,
                                  index_col="index")
        assert len(df.index) == 3
        assert len(df.columns) == 1
Example #34
MANY_TYPES = [
    pa.null(),
    pa.bool_(),
    pa.int32(),
    pa.time32('s'),
    pa.time64('us'),
    pa.date32(),
    pa.timestamp('us'),
    pa.timestamp('us', tz='UTC'),
    pa.timestamp('us', tz='Europe/Paris'),
    pa.float16(),
    pa.float32(),
    pa.float64(),
    pa.decimal128(19, 4),
    pa.string(),
    pa.binary(),
    pa.binary(10),
    pa.list_(pa.int32()),
    pa.struct([
        pa.field('a', pa.int32()),
        pa.field('b', pa.int8()),
        pa.field('c', pa.string())
    ]),
    pa.union([pa.field('a', pa.binary(10)),
              pa.field('b', pa.string())],
             mode=pa.lib.UnionMode_DENSE),
    pa.union([pa.field('a', pa.binary(10)),
              pa.field('b', pa.string())],
             mode=pa.lib.UnionMode_SPARSE),
    # XXX Needs array pickling
    # pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])),
]
Example #35
import hypothesis as h
import hypothesis.strategies as st
import hypothesis.extra.numpy as npst
import hypothesis.extra.pytz as tzst
import numpy as np

import pyarrow as pa

# TODO(kszucs): alphanum_text, surrogate_text
custom_text = st.text(
    alphabet=st.characters(min_codepoint=0x41, max_codepoint=0x7E))

null_type = st.just(pa.null())
bool_type = st.just(pa.bool_())

binary_type = st.just(pa.binary())
string_type = st.just(pa.string())
large_binary_type = st.just(pa.large_binary())
large_string_type = st.just(pa.large_string())
fixed_size_binary_type = st.builds(pa.binary,
                                   st.integers(min_value=0, max_value=16))
binary_like_types = st.one_of(binary_type, string_type, large_binary_type,
                              large_string_type, fixed_size_binary_type)

signed_integer_types = st.sampled_from(
    [pa.int8(), pa.int16(), pa.int32(),
     pa.int64()])
unsigned_integer_types = st.sampled_from(
    [pa.uint8(), pa.uint16(),
     pa.uint32(), pa.uint64()])
integer_types = st.one_of(signed_integer_types, unsigned_integer_types)
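
# A small property-test sketch (not part of the original module) showing how
# these strategies drive a test via hypothesis's @h.given:
import pickle

@h.given(binary_like_types)
def test_binary_like_types_pickle_roundtrip(ty):
    assert pickle.loads(pickle.dumps(ty)) == ty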
Example #36
    def popbuffers(tpe, buffers, length):
        if isinstance(tpe, pyarrow.lib.DictionaryType):
            index = popbuffers(tpe.index_type, buffers, length)
            content = fromarrow(tpe.dictionary)
            if isinstance(index, awkwardlib.BitMaskedArray):
                return awkwardlib.BitMaskedArray(index.mask,
                                                 awkwardlib.IndexedArray(
                                                     index.content, content),
                                                 maskedwhen=index.maskedwhen,
                                                 lsborder=index.lsborder)
            else:
                return awkwardlib.IndexedArray(index, content)

        elif isinstance(tpe, pyarrow.lib.StructType):
            mask = buffers.pop(0)
            pairs = []
            for i in range(tpe.num_children):
                pairs.append(
                    (tpe[i].name, popbuffers(tpe[i].type, buffers, length)))
            out = awkwardlib.Table.frompairs(pairs,
                                             0)  # FIXME: better rowstart
            if mask is not None:
                mask = awkwardlib.numpy.frombuffer(mask,
                                                   dtype=ARROW_BITMASKTYPE)
                return awkwardlib.BitMaskedArray(mask,
                                                 out,
                                                 maskedwhen=False,
                                                 lsborder=True)
            else:
                return out

        elif isinstance(tpe, pyarrow.lib.ListType):
            mask = buffers.pop(0)
            offsets = awkwardlib.numpy.frombuffer(
                buffers.pop(0), dtype=ARROW_INDEXTYPE)[:length + 1]
            content = popbuffers(tpe.value_type, buffers, offsets[-1])
            out = awkwardlib.JaggedArray.fromoffsets(offsets, content)
            if mask is not None:
                mask = awkwardlib.numpy.frombuffer(mask,
                                                   dtype=ARROW_BITMASKTYPE)
                return awkwardlib.BitMaskedArray(mask,
                                                 out,
                                                 maskedwhen=False,
                                                 lsborder=True)
            else:
                return out

        elif isinstance(tpe, pyarrow.lib.UnionType) and tpe.mode == "sparse":
            mask = buffers.pop(0)
            tags = awkwardlib.numpy.frombuffer(buffers.pop(0),
                                               dtype=ARROW_TAGTYPE)[:length]
            assert buffers.pop(0) is None
            index = awkwardlib.numpy.arange(len(tags), dtype=ARROW_INDEXTYPE)
            contents = []
            for i in range(tpe.num_children):
                try:
                    sublength = index[tags == i][-1] + 1
                except IndexError:
                    sublength = 0
                contents.append(popbuffers(tpe[i].type, buffers, sublength))
            for i in range(len(contents)):
                these = index[tags == i]
                if len(these) == 0:
                    contents[i] = contents[i][0:0]
                else:
                    contents[i] = contents[i][:these[-1] + 1]
            out = awkwardlib.UnionArray(tags, index, contents)
            if mask is not None:
                mask = awkwardlib.numpy.frombuffer(mask,
                                                   dtype=ARROW_BITMASKTYPE)
                return awkwardlib.BitMaskedArray(mask,
                                                 out,
                                                 maskedwhen=False,
                                                 lsborder=True)
            else:
                return out

        elif isinstance(tpe, pyarrow.lib.UnionType) and tpe.mode == "dense":
            mask = buffers.pop(0)
            tags = awkwardlib.numpy.frombuffer(buffers.pop(0),
                                               dtype=ARROW_TAGTYPE)[:length]
            index = awkwardlib.numpy.frombuffer(buffers.pop(0),
                                                dtype=ARROW_INDEXTYPE)[:length]
            contents = []
            for i in range(tpe.num_children):
                try:
                    sublength = index[tags == i].max() + 1
                except ValueError:
                    sublength = 0
                contents.append(popbuffers(tpe[i].type, buffers, sublength))
            for i in range(len(contents)):
                these = index[tags == i]
                if len(these) == 0:
                    contents[i] = contents[i][0:0]
                else:
                    contents[i] = contents[i][:these.max() + 1]
            out = awkwardlib.UnionArray(tags, index, contents)
            if mask is not None:
                mask = awkwardlib.numpy.frombuffer(mask,
                                                   dtype=ARROW_BITMASKTYPE)
                return awkwardlib.BitMaskedArray(mask,
                                                 out,
                                                 maskedwhen=False,
                                                 lsborder=True)
            else:
                return out

        elif tpe == pyarrow.string():
            mask = buffers.pop(0)
            offsets = awkwardlib.numpy.frombuffer(
                buffers.pop(0), dtype=ARROW_INDEXTYPE)[:length + 1]
            content = awkwardlib.numpy.frombuffer(
                buffers.pop(0), dtype=ARROW_CHARTYPE)[:offsets[-1]]
            out = awkwardlib.StringArray.fromoffsets(offsets,
                                                     content[:offsets[-1]],
                                                     encoding="utf-8")
            if mask is not None:
                mask = awkwardlib.numpy.frombuffer(mask,
                                                   dtype=ARROW_BITMASKTYPE)
                return awkwardlib.BitMaskedArray(mask,
                                                 out,
                                                 maskedwhen=False,
                                                 lsborder=True)
            else:
                return out

        elif tpe == pyarrow.binary():
            mask = buffers.pop(0)
            offsets = awkwardlib.numpy.frombuffer(
                buffers.pop(0), dtype=ARROW_INDEXTYPE)[:length + 1]
            content = awkwardlib.numpy.frombuffer(
                buffers.pop(0), dtype=ARROW_CHARTYPE)[:offsets[-1]]
            out = awkwardlib.StringArray.fromoffsets(offsets,
                                                     content[:offsets[-1]],
                                                     encoding=None)
            if mask is not None:
                mask = awkwardlib.numpy.frombuffer(mask,
                                                   dtype=ARROW_BITMASKTYPE)
                return awkwardlib.BitMaskedArray(mask,
                                                 out,
                                                 maskedwhen=False,
                                                 lsborder=True)
            else:
                return out

        elif tpe == pyarrow.bool_():
            mask = buffers.pop(0)
            out = awkwardlib.numpy.unpackbits(
                awkwardlib.numpy.frombuffer(
                    buffers.pop(0), dtype=ARROW_CHARTYPE)).view(
                        awkwardlib.MaskedArray.BOOLTYPE)
            out = out.reshape(-1,
                              8)[:, ::-1].reshape(-1)[:length]  # lsborder=True
            if mask is not None:
                mask = awkwardlib.numpy.frombuffer(mask,
                                                   dtype=ARROW_BITMASKTYPE)
                return awkwardlib.BitMaskedArray(mask,
                                                 out,
                                                 maskedwhen=False,
                                                 lsborder=True)
            else:
                return out

        elif isinstance(tpe, pyarrow.lib.DataType):
            mask = buffers.pop(0)
            out = awkwardlib.numpy.frombuffer(
                buffers.pop(0), dtype=tpe.to_pandas_dtype())[:length]
            if mask is not None:
                mask = awkwardlib.numpy.frombuffer(mask,
                                                   dtype=ARROW_BITMASKTYPE)
                return awkwardlib.BitMaskedArray(mask,
                                                 out,
                                                 maskedwhen=False,
                                                 lsborder=True)
            else:
                return out

        else:
            raise NotImplementedError(repr(tpe))
Example #37
    result = arr.cast('i8')

    assert result.equals(expected)


@pytest.mark.parametrize(('ty', 'values'),
                         [('bool', [True, False, True]),
                          ('uint8', range(0, 255)), ('int8', range(0, 128)),
                          ('uint16', range(0, 10)), ('int16', range(0, 10)),
                          ('uint32', range(0, 10)), ('int32', range(0, 10)),
                          ('uint64', range(0, 10)), ('int64', range(0, 10)),
                          ('float', [0.0, 0.1, 0.2]),
                          ('double', [0.0, 0.1, 0.2]),
                          ('string', ['a', 'b', 'c']),
                          ('binary', [b'a', b'b', b'c']),
                          (pa.binary(3), [b'abc', b'bcd', b'cde'])])
def test_cast_identities(ty, values):
    arr = pa.array(values, type=ty)
    assert arr.cast(ty).equals(arr)


pickle_test_parametrize = pytest.mark.parametrize(
    ('data', 'typ'),
    [([True, False, True, True], pa.bool_()), ([1, 2, 4, 6], pa.int64()),
     ([1.0, 2.5, None], pa.float64()), (['a', None, 'b'], pa.string()),
     ([], None), ([[1, 2], [3]], pa.list_(pa.int64())),
     ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
     ([(1, 'a'), (2, 'c'), None
       ], pa.struct([pa.field('a', pa.int64()),
                     pa.field('b', pa.string())]))])
Example #38
def check_example_batch(batch):
    arr = batch.column(0)
    assert isinstance(arr, pa.ExtensionArray)
    assert arr.type.storage_type == pa.binary(3)
    assert arr.storage.to_pylist() == [b"foo", b"bar"]
    return arr
Example #39
def example_batch():
    ty = ParamExtType(3)
    storage = pa.array([b"foo", b"bar"], type=pa.binary(3))
    arr = pa.ExtensionArray.from_storage(ty, storage)
    return pa.RecordBatch.from_arrays([arr], ["exts"])
Example #40
    def test_fixed_size_bytes_does_not_accept_varying_lengths(self):
        values = [b'foo', None, b'ba', None, None, b'hey']
        df = pd.DataFrame({'strings': values})
        schema = pa.schema([pa.field('strings', pa.binary(3))])
        with pytest.raises(pa.ArrowInvalid):
            pa.Table.from_pandas(df, schema=schema)
Example #41
    def __init__(self, width):
        self._width = width
        pa.PyExtensionType.__init__(self, pa.binary(width))
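
# For context, a plausible completion of the ParamExtType class this __init__
# belongs to (matching its usage in the surrounding examples); PyExtensionType
# subclasses define __reduce__ so instances can be pickled.
import pyarrow as pa

class ParamExtType(pa.PyExtensionType):
    def __init__(self, width):
        self._width = width
        pa.PyExtensionType.__init__(self, pa.binary(width))

    @property
    def width(self):
        return self._width

    def __reduce__(self):
        return ParamExtType, (self.width,)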
Example #42
STR_TYPE_ARROW_TYPE_MAP = {
    'int8': pa.int8(),
    'int16': pa.int16(),
    'int32': pa.int32(),
    'int64': pa.int64(),
    'uint8': pa.uint8(),
    'uint16': pa.uint16(),
    'uint32': pa.uint32(),
    'uint64': pa.uint64(),
    'float32': pa.float32(),
    'float64': pa.float64(),
    'double': pa.float64(),
    'half_float': pa.float16(),
    'string': pa.string(),
    'binary': pa.binary(),
    'bool': pa.bool_(),
    'float': pa.float32(),
    'int': pa.int32(),
    'str': pa.string()
}


def _get_arrow_type_from_python_type(python_type):
    try:
        return PYTHON_TYPE_ARROW_TYPE_MAP[python_type]
    except KeyError:
        return None


def _get_arrow_type_from_str_type(cylon_str_type):
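    # The body is truncated in the snippet; presumably it mirrors the helper
    # above:
    try:
        return STR_TYPE_ARROW_TYPE_MAP[cylon_str_type]
    except KeyError:
        return None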
Example #43
def test_convert_options():
    cls = ConvertOptions
    opts = cls()

    check_options_class(cls,
                        check_utf8=[True, False],
                        strings_can_be_null=[False, True],
                        include_columns=[[], ['def', 'abc']],
                        include_missing_columns=[False, True],
                        auto_dict_encode=[False, True],
                        timestamp_parsers=[[], [ISO8601, '%y-%m']])

    assert opts.auto_dict_max_cardinality > 0
    opts.auto_dict_max_cardinality = 99999
    assert opts.auto_dict_max_cardinality == 99999

    assert opts.column_types == {}
    # Pass column_types as mapping
    opts.column_types = {'b': pa.int16(), 'c': pa.float32()}
    assert opts.column_types == {'b': pa.int16(), 'c': pa.float32()}
    opts.column_types = {'v': 'int16', 'w': 'null'}
    assert opts.column_types == {'v': pa.int16(), 'w': pa.null()}
    # Pass column_types as schema
    schema = pa.schema([('a', pa.int32()), ('b', pa.string())])
    opts.column_types = schema
    assert opts.column_types == {'a': pa.int32(), 'b': pa.string()}
    # Pass column_types as sequence
    opts.column_types = [('x', pa.binary())]
    assert opts.column_types == {'x': pa.binary()}

    with pytest.raises(TypeError, match='DataType expected'):
        opts.column_types = {'a': None}
    with pytest.raises(TypeError):
        opts.column_types = 0

    assert isinstance(opts.null_values, list)
    assert '' in opts.null_values
    assert 'N/A' in opts.null_values
    opts.null_values = ['xxx', 'yyy']
    assert opts.null_values == ['xxx', 'yyy']

    assert isinstance(opts.true_values, list)
    opts.true_values = ['xxx', 'yyy']
    assert opts.true_values == ['xxx', 'yyy']

    assert isinstance(opts.false_values, list)
    opts.false_values = ['xxx', 'yyy']
    assert opts.false_values == ['xxx', 'yyy']

    assert opts.timestamp_parsers == []
    opts.timestamp_parsers = [ISO8601]
    assert opts.timestamp_parsers == [ISO8601]

    opts = cls(column_types={'a': pa.null()},
               null_values=['N', 'nn'],
               true_values=['T', 'tt'],
               false_values=['F', 'ff'],
               auto_dict_max_cardinality=999,
               timestamp_parsers=[ISO8601, '%Y-%m-%d'])
    assert opts.column_types == {'a': pa.null()}
    assert opts.null_values == ['N', 'nn']
    assert opts.false_values == ['F', 'ff']
    assert opts.true_values == ['T', 'tt']
    assert opts.auto_dict_max_cardinality == 999
    assert opts.timestamp_parsers == [ISO8601, '%Y-%m-%d']
Example #44
all_array_types = [
    ('bool', [True, False, False, True, True]),
    ('uint8', np.arange(5)),
    ('int8', np.arange(5)),
    ('uint16', np.arange(5)),
    ('int16', np.arange(5)),
    ('uint32', np.arange(5)),
    ('int32', np.arange(5)),
    ('uint64', np.arange(5, 10)),
    ('int64', np.arange(5, 10)),
    ('float', np.arange(0, 0.5, 0.1)),
    ('double', np.arange(0, 0.5, 0.1)),
    ('string', ['a', 'b', None, 'ddd', 'ee']),
    ('binary', [b'a', b'b', b'c', b'ddd', b'ee']),
    (pa.binary(3), [b'abc', b'bcd', b'cde', b'def', b'efg']),
    (pa.list_(pa.int8()), [[1, 2], [3, 4], [5, 6], None, [9, 16]]),
    (pa.large_list(pa.int16()), [[1], [2, 3, 4], [5, 6], None, [9, 16]]),
    (pa.struct([('a', pa.int8()), ('b', pa.int8())]), [{
        'a': 1,
        'b': 2
    }, None, {
        'a': 3,
        'b': 4
    }, None, {
        'a': 5,
        'b': 6
    }]),
]

exported_functions = [
Example #45
""" A DoFn that coverts a batch of features into an Arrow table."""


import apache_beam as beam
import pyarrow as pa

from typing import Dict, List, Mapping, Text, Union
from tensorflow_metadata.proto.v0 import schema_pb2
from tensorflow_metadata.proto.v0 import statistics_pb2

_ARROW_TYPE_MAP = {
    ColumnType.UNKNOWN: pa.null(),
    ColumnType.INT: pa.list_(pa.int64()),
    ColumnType.FLOAT: pa.list_(pa.float32()),
    ColumnType.STRING: pa.list_(pa.binary()),
}


SimpleFeatureList = List[Union[int, str, float, bool]]
ColumnName = Union[bytes, Text]

@beam.typehints.with_input_types(List[SimpleFeatureList])
@beam.typehints.with_output_types(pa.RecordBatch)
class BatchedFeatureListsToRecordBatch(beam.DoFn):
    """A DoFn to convert a batch of input instances in a feature
    list format to an Arrow table. 
    """
    
    def __init__(
        self,
Example #46
    def __init__(self):
        pa.PyExtensionType.__init__(self, pa.binary(16))
Example #47
def test_array_mixed_unicode_bytes():
    check_array_mixed_unicode_bytes(pa.binary(), pa.string())
    check_array_mixed_unicode_bytes(pa.large_binary(), pa.large_string())
Example #48
def test_ext_array_errors():
    ty = ParamExtType(4)
    storage = pa.array([b"foo", b"bar"], type=pa.binary(3))
    with pytest.raises(TypeError, match="Incompatible storage type"):
        pa.ExtensionArray.from_storage(ty, storage)
Example #49
    def recurse(tpe, nullable):
        if isinstance(tpe, pyarrow.lib.DictionaryType):
            out = recurse(tpe.dictionary.type, nullable)
            if nullable:
                return awkward.type.OptionType(out)
            else:
                return out

        elif isinstance(tpe, pyarrow.lib.StructType):
            out = None
            for i in range(tpe.num_children):
                x = awkward.type.ArrayType(
                    tpe[i].name, recurse(tpe[i].type, tpe[i].nullable))
                if out is None:
                    out = x
                else:
                    out = out & x
            if nullable:
                return awkward.type.OptionType(out)
            else:
                return out

        elif isinstance(tpe, pyarrow.lib.ListType):
            out = awkward.type.ArrayType(float("inf"),
                                         recurse(tpe.value_type, nullable))
            if nullable:
                return awkward.type.OptionType(out)
            else:
                return out

        elif isinstance(tpe, pyarrow.lib.UnionType):
            out = None
            for i in range(tpe.num_children):
                x = recurse(tpe[i].type, nullable)
                if out is None:
                    out = x
                else:
                    out = out | x
            if nullable:
                return awkward.type.OptionType(out)
            else:
                return out

        elif tpe == pyarrow.string():
            if nullable:
                return awkward.type.OptionType(str)
            else:
                return str

        elif tpe == pyarrow.binary():
            if nullable:
                return awkward.type.OptionType(bytes)
            else:
                return bytes

        elif tpe == pyarrow.bool_():
            out = awkward.numpy.dtype(bool)
            if nullable:
                return awkward.type.OptionType(out)
            else:
                return out

        elif isinstance(tpe, pyarrow.lib.DataType):
            if nullable:
                return awkward.type.OptionType(tpe.to_pandas_dtype())
            else:
                return tpe.to_pandas_dtype()

        else:
            raise NotImplementedError(repr(tpe))
Example #50
def test_ext_array_to_pylist():
    ty = ParamExtType(3)
    storage = pa.array([b"foo", b"bar", None], type=pa.binary(3))
    arr = pa.ExtensionArray.from_storage(ty, storage)

    assert arr.to_pylist() == [b"foo", b"bar", None]
Example #51
def test_fixed_size_binary_byte_width():
    ty = pa.binary(5)
    assert ty.byte_width == 5
def test_fixed_size_bytes_does_not_accept_varying_lengths():
    data = [b'foo', None, b'barb', b'2346']
    with pytest.raises(pa.ArrowInvalid):
        pa.array(data, type=pa.binary(4))
Example #53
class KmvSketchTest(parameterized.TestCase):
    @parameterized.named_parameters(
        ("binary", [b"a", b"a", b"b", b"c", None], pa.binary()),
        ("large_binary", [b"a", b"a", b"b", b"c"], pa.large_binary()),
        ("string", ["a", "a", "b", "c", None], pa.string()),
        ("large_string", ["a", "a", "b", "c"], pa.large_string()),
        ("int8", [1, 1, 2, 3, None], pa.int8()),
        ("int16", [1, 1, 2, 3], pa.int16()),
        ("int32", [1, 1, 2, 3, None], pa.int32()),
        ("int64", [1, 1, 2, 3], pa.int64()),
        ("uint8", [1, 1, 2, 3], pa.uint8()),
        ("uint16", [1, None, 1, 2, 3], pa.uint16()),
        ("uint32", [1, 1, 2, 3], pa.uint32()),
        ("uint64", [1, 1, 2, 3, None], pa.uint64()),
    )
    def test_add(self, values, type_):
        sketch = _create_basic_sketch(pa.array(values, type=type_))
        num_unique = sketch.Estimate()

        self.assertEqual(3, num_unique)

    def test_add_unsupported_type(self):
        values = pa.array([True, False], pa.bool_())
        sketch = sketches.KmvSketch(_NUM_BUCKETS)
        with self.assertRaisesRegex(RuntimeError, "Unimplemented: bool"):
            sketch.AddValues(values)

    def test_merge(self):
        sketch1 = _create_basic_sketch(pa.array(["a", "b", "c", "a"]))
        sketch2 = _create_basic_sketch(pa.array(["d", "a"]))

        sketch1.Merge(sketch2)
        num_unique = sketch1.Estimate()

        self.assertEqual(4, num_unique)

    def test_merge_error(self):
        sketch1 = _create_basic_sketch(pa.array(["a", "b", "c", "a"]))
        sketch2 = _create_basic_sketch(pa.array(["d", "a"]), num_buckets=64)
        with self.assertRaisesRegex(
                Exception,
                "Both sketches must have the same number of buckets"):
            sketch1.Merge(sketch2)

    def test_picklable(self):
        sketch = _create_basic_sketch(pa.array(["a", "b", "c", "a"]))
        pickled = pickle.dumps(sketch, 2)
        self.assertIsInstance(pickled, bytes)
        unpickled = pickle.loads(pickled)
        self.assertIsInstance(unpickled, sketches.KmvSketch)

        num_unique = unpickled.Estimate()
        self.assertEqual(3, num_unique)

    def test_serialization(self):
        sketch = _create_basic_sketch(pa.array(["a", "b", "c", "a"]))

        serialized = sketch.Serialize()
        self.assertIsInstance(serialized, bytes)

        deserialized = sketches.KmvSketch.Deserialize(serialized)
        self.assertIsInstance(deserialized, sketches.KmvSketch)

        num_unique = deserialized.Estimate()
        self.assertEqual(3, num_unique)
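
# Note: `_create_basic_sketch` and `_NUM_BUCKETS` are defined elsewhere in the
# original module and are not shown in this excerpt. A plausible helper,
# consistent with how the tests above call it (the bucket count here is
# illustrative, not necessarily the project's actual value):
_NUM_BUCKETS = 128


def _create_basic_sketch(values, num_buckets=_NUM_BUCKETS):
    sketch = sketches.KmvSketch(num_buckets)
    sketch.AddValues(values)
    return sketch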
Example #54
 ],
 "type_schema": OrderedDict([
     ("a", int),
     ("b", float),
     ("c", str),
     ("d", np.ndarray),
     ("e", bytes),
 ]),
 "pyarrow_schema": pa.schema([
     ("a", pa.int64()),
     ("b", pa.float64()),
     ("c", pa.string()),
     ("d", pa.list_(pa.int64())),
     ("e", pa.binary()),
 ]) if pa is not None else None,
 "avro_schema": {
     "namespace": "example.avro",
     "name": "User",
     "type": "record",
     "fields": [
         {"name": "a", "type": "int"},
         {"name": "b",
def dataframe_with_lists(include_index=False, parquet_compatible=False):
    """
    Dataframe with list columns of every possible primitive type.

    Parameters
    ----------
    include_index : bool
        Append an '__index_level_0__' int64 field to the schema.
    parquet_compatible : bool
        Exclude types not supported by Parquet.

    Returns
    -------
    df : pandas.DataFrame
    schema : pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    arrays = OrderedDict()
    fields = []

    fields.append(pa.field('int64', pa.list_(pa.int64())))
    arrays['int64'] = [
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4],
        None,
        [],
        np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9] * 2,
                 dtype=np.int64)[::2]
    ]
    fields.append(pa.field('double', pa.list_(pa.float64())))
    arrays['double'] = [
        [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
        [0., 1., 2., 3., 4.],
        None,
        [],
        np.array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.] * 2)[::2],
    ]
    fields.append(pa.field('bytes_list', pa.list_(pa.binary())))
    arrays['bytes_list'] = [
        [b"1", b"f"],
        None,
        [b"1"],
        [b"1", b"2", b"3"],
        [],
    ]
    fields.append(pa.field('str_list', pa.list_(pa.string())))
    arrays['str_list'] = [
        ["1", "รค"],
        None,
        ["1"],
        ["1", "2", "3"],
        [],
    ]

    date_data = [
        [],
        [date(2018, 1, 1), date(2032, 12, 30)],
        [date(2000, 6, 7)],
        None,
        [date(1969, 6, 9), date(1972, 7, 3)]
    ]
    time_data = [
        [time(23, 11, 11), time(1, 2, 3), time(23, 59, 59)],
        [],
        [time(22, 5, 59)],
        None,
        [time(0, 0, 0), time(18, 0, 2), time(12, 7, 3)]
    ]

    temporal_pairs = [
        (pa.date32(), date_data),
        (pa.date64(), date_data),
        (pa.time32('s'), time_data),
        (pa.time32('ms'), time_data),
        (pa.time64('us'), time_data)
    ]
    if not parquet_compatible:
        temporal_pairs += [
            (pa.time64('ns'), time_data),
        ]

    for value_type, data in temporal_pairs:
        field_name = '{}_list'.format(value_type)
        field_type = pa.list_(value_type)
        field = pa.field(field_name, field_type)
        fields.append(field)
        arrays[field_name] = data

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))

    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema
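
# A possible round trip through Arrow using the helper above (a sketch;
# assumes pandas/numpy/pyarrow are imported as pd/np/pa as in the function):
df, schema = dataframe_with_lists()
table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
assert table.schema.equals(schema)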
Example #56
        "FLOAT64",
        pyarrow.float32().id: "FLOAT64",
        pyarrow.float64().id: "FLOAT64",
        pyarrow.time32("ms").id: "TIME",
        pyarrow.time64("ns").id: "TIME",
        pyarrow.timestamp("ns").id: "TIMESTAMP",
        pyarrow.date32().id: "DATE",
        pyarrow.date64().id: "DATETIME",  # because millisecond resolution
        pyarrow.binary().id: "BYTES",
        pyarrow.string().id: "STRING",  # also an alias for pyarrow.utf8()
        pyarrow.decimal128(38, scale=9).id: "NUMERIC",
        # The exact decimal scale and precision are not important; only the
        # type ID matters, and it is the same for all decimal128 instances.
    }

else:  # pragma: NO COVER
    BQ_TO_ARROW_SCALARS = {}  # pragma: NO COVER
    ARROW_SCALAR_IDS_TO_BQ = {}  # pragma: NO COVER
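
# Since the mapping above is keyed on pyarrow type IDs, every parameterization
# of a given type class resolves to the same BigQuery type. A brief sketch of
# a lookup (assuming pyarrow is available, i.e. the branch populating the
# mapping was taken):
assert ARROW_SCALAR_IDS_TO_BQ.get(pyarrow.timestamp("us").id) == "TIMESTAMP"
assert ARROW_SCALAR_IDS_TO_BQ.get(pyarrow.decimal128(10, 2).id) == "NUMERIC"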


def bq_to_arrow_struct_data_type(field):
Example #57
import os
import sys

import cx_Oracle

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# cx_Oracle => PyArrow type map
type_map = {
    cx_Oracle.DB_TYPE_BFILE: pa.binary(),
    cx_Oracle.DB_TYPE_BINARY_DOUBLE: pa.float64(),
    cx_Oracle.DB_TYPE_BINARY_FLOAT: pa.float64(),
    cx_Oracle.DB_TYPE_BLOB: pa.binary(),
    cx_Oracle.DB_TYPE_CHAR: pa.string(),
    cx_Oracle.DB_TYPE_CLOB: pa.binary(),
    # cx_Oracle.DB_TYPE_CURSOR
    cx_Oracle.DB_TYPE_DATE: pa.timestamp('ms'),
    # cx_Oracle.DB_TYPE_INTERVAL_DS
    cx_Oracle.DB_TYPE_LONG: pa.string(),
    cx_Oracle.DB_TYPE_LONG_RAW: pa.binary(),
    cx_Oracle.DB_TYPE_NCHAR: pa.string(),
    cx_Oracle.DB_TYPE_NCLOB: pa.binary(),
    # cx_Oracle.DB_TYPE_NUMBER: pa.float64(), # could reflect on precision/scale
    cx_Oracle.DB_TYPE_NVARCHAR: pa.string(),
    # cx_Oracle.DB_TYPE_OBJECT
    cx_Oracle.DB_TYPE_RAW: pa.binary(),
    cx_Oracle.DB_TYPE_ROWID: pa.string(),
    cx_Oracle.DB_TYPE_TIMESTAMP: pa.timestamp('ms'),
    cx_Oracle.DB_TYPE_TIMESTAMP_LTZ: pa.timestamp('ms'),
Example #58
def test_convert_options():
    cls = ConvertOptions
    opts = cls()

    assert opts.check_utf8 is True
    opts.check_utf8 = False
    assert opts.check_utf8 is False

    assert opts.strings_can_be_null is False
    opts.strings_can_be_null = True
    assert opts.strings_can_be_null is True

    assert opts.column_types == {}
    # Pass column_types as mapping
    opts.column_types = {'b': pa.int16(), 'c': pa.float32()}
    assert opts.column_types == {'b': pa.int16(), 'c': pa.float32()}
    opts.column_types = {'v': 'int16', 'w': 'null'}
    assert opts.column_types == {'v': pa.int16(), 'w': pa.null()}
    # Pass column_types as schema
    schema = pa.schema([('a', pa.int32()), ('b', pa.string())])
    opts.column_types = schema
    assert opts.column_types == {'a': pa.int32(), 'b': pa.string()}
    # Pass column_types as sequence
    opts.column_types = [('x', pa.binary())]
    assert opts.column_types == {'x': pa.binary()}

    with pytest.raises(TypeError, match='DataType expected'):
        opts.column_types = {'a': None}
    with pytest.raises(TypeError):
        opts.column_types = 0

    assert isinstance(opts.null_values, list)
    assert '' in opts.null_values
    assert 'N/A' in opts.null_values
    opts.null_values = ['xxx', 'yyy']
    assert opts.null_values == ['xxx', 'yyy']

    assert isinstance(opts.true_values, list)
    opts.true_values = ['xxx', 'yyy']
    assert opts.true_values == ['xxx', 'yyy']

    assert isinstance(opts.false_values, list)
    opts.false_values = ['xxx', 'yyy']
    assert opts.false_values == ['xxx', 'yyy']

    assert opts.include_columns == []
    opts.include_columns = ['def', 'abc']
    assert opts.include_columns == ['def', 'abc']

    assert opts.include_missing_columns is False
    opts.include_missing_columns = True
    assert opts.include_missing_columns is True

    opts = cls(check_utf8=False,
               column_types={'a': pa.null()},
               null_values=['N', 'nn'],
               true_values=['T', 'tt'],
               false_values=['F', 'ff'],
               strings_can_be_null=True,
               include_columns=['abc', 'def'],
               include_missing_columns=True)
    assert opts.check_utf8 is False
    assert opts.column_types == {'a': pa.null()}
    assert opts.null_values == ['N', 'nn']
    assert opts.false_values == ['F', 'ff']
    assert opts.true_values == ['T', 'tt']
    assert opts.strings_can_be_null is True
    assert opts.include_columns == ['abc', 'def']
    assert opts.include_missing_columns is True
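
# A minimal end-to-end sketch of these options in use (assumes ConvertOptions
# comes from pyarrow.csv, as the test implies):
import io

from pyarrow import csv

opts = csv.ConvertOptions(column_types={'x': pa.int16()},
                          null_values=['N/A'],
                          strings_can_be_null=True)
table = csv.read_csv(io.BytesIO(b"x,y\n1,N/A\n2,z\n"),
                     convert_options=opts)
# 'x' is read as int16; the "N/A" in 'y' becomes a null string value.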
Example #59
# consolidate with the logic from the parquet backend

_to_ibis_dtypes = {
    pa.int8(): dt.Int8,
    pa.int16(): dt.Int16,
    pa.int32(): dt.Int32,
    pa.int64(): dt.Int64,
    pa.uint8(): dt.UInt8,
    pa.uint16(): dt.UInt16,
    pa.uint32(): dt.UInt32,
    pa.uint64(): dt.UInt64,
    pa.float16(): dt.Float16,
    pa.float32(): dt.Float32,
    pa.float64(): dt.Float64,
    pa.string(): dt.String,
    pa.binary(): dt.Binary,
    pa.bool_(): dt.Boolean,
}


@dt.dtype.register(pa.DataType)
def from_pyarrow_primitive(arrow_type, nullable=True):
    return _to_ibis_dtypes[arrow_type](nullable=nullable)


@dt.dtype.register(pa.TimestampType)
def from_pyarrow_timestamp(arrow_type, nullable=True):
    return dt.TimestampType(timezone=arrow_type.tz)
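
# With these registrations, ibis's dt.dtype dispatcher translates pyarrow
# types directly; for example (a sketch based on the two functions above):
#
#     dt.dtype(pa.int64())
#     # -> dt.Int64(nullable=True)
#     dt.dtype(pa.timestamp('ns', tz='UTC'))
#     # -> dt.TimestampType(timezone='UTC')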


@sch.infer.register(pa.Schema)
Example #60
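# The parametrize decorators for test_string are cut off in this excerpt; by
# symmetry with test_binary below, they presumably pair each string type with
# its scalar class (the value list here is illustrative):
@pytest.mark.parametrize('value', ['foo', 'bar'])
@pytest.mark.parametrize(('ty', 'scalar_typ'),
                         [(pa.string(), pa.StringScalar),
                          (pa.large_string(), pa.LargeStringScalar)])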
def test_string(value, ty, scalar_typ):
    s = pa.scalar(value, type=ty)
    assert isinstance(s, scalar_typ)
    assert s.as_py() == value
    assert s.as_py() != 'something'
    assert repr(value) in repr(s)
    assert str(s) == str(value)

    buf = s.as_buffer()
    assert isinstance(buf, pa.Buffer)
    assert buf.to_pybytes() == value.encode()


@pytest.mark.parametrize('value', [b'foo', b'bar'])
@pytest.mark.parametrize(('ty', 'scalar_typ'),
                         [(pa.binary(), pa.BinaryScalar),
                          (pa.large_binary(), pa.LargeBinaryScalar)])
def test_binary(value, ty, scalar_typ):
    s = pa.scalar(value, type=ty)
    assert isinstance(s, scalar_typ)
    assert s.as_py() == value
    assert str(s) == str(value)
    assert repr(value) in repr(s)
    assert s != b'xxxxx'

    buf = s.as_buffer()
    assert isinstance(buf, pa.Buffer)
    assert buf.to_pybytes() == value