Example #1
def test_empty_cast():
    types = [
        pa.null(),
        pa.bool_(),
        pa.int8(),
        pa.int16(),
        pa.int32(),
        pa.int64(),
        pa.uint8(),
        pa.uint16(),
        pa.uint32(),
        pa.uint64(),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.date32(),
        pa.date64(),
        pa.binary(),
        pa.binary(length=4),
        pa.string(),
    ]

    for (t1, t2) in itertools.product(types, types):
        try:
            # ARROW-4766: Ensure that casts between supported types don't
            # segfault on empty arrays of common types
            pa.array([], type=t1).cast(t2)
        except pa.lib.ArrowNotImplementedError:
            continue
Example #2
def test_orcfile_empty():
    from pyarrow import orc
    f = orc.ORCFile(path_for_orc_example('TestOrcFile.emptyFile'))
    table = f.read()
    assert table.num_rows == 0
    schema = table.schema
    expected_schema = pa.schema([
        ('boolean1', pa.bool_()),
        ('byte1', pa.int8()),
        ('short1', pa.int16()),
        ('int1', pa.int32()),
        ('long1', pa.int64()),
        ('float1', pa.float32()),
        ('double1', pa.float64()),
        ('bytes1', pa.binary()),
        ('string1', pa.string()),
        ('middle', pa.struct([
            ('list', pa.list_(pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
                ]))),
            ])),
        ('list', pa.list_(pa.struct([
            ('int1', pa.int32()),
            ('string1', pa.string()),
            ]))),
        ('map', pa.list_(pa.struct([
            ('key', pa.string()),
            ('value', pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
                ])),
            ]))),
        ])
    assert schema == expected_schema
Example #3
    def __init__(self, values):
        if not isinstance(values, pa.ChunkedArray):
            raise ValueError

        assert values.type == pa.bool_()
        self._data = values
        self._dtype = ArrowBoolDtype()
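For context, a minimal hedged sketch of building an input this constructor accepts (assuming pyarrow is imported as pa, as in the surrounding examples): plain Python data must first be wrapped in a boolean pa.ChunkedArray.

import pyarrow as pa

# A boolean ChunkedArray is the only input the constructor above accepts;
# plain lists or pa.Array values must be wrapped with pa.chunked_array().
values = pa.chunked_array([pa.array([True, None, False], type=pa.bool_())])
assert isinstance(values, pa.ChunkedArray)
assert values.type == pa.bool_()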
Example #4
def test_sequence_numpy_boolean(seq):
    expected = [np.bool_(True), None, np.bool_(False), None]
    arr = pa.array(seq(expected))
    assert len(arr) == 4
    assert arr.null_count == 2
    assert arr.type == pa.bool_()
    assert arr.to_pylist() == expected
Example #5
def test_struct_from_tuples():
    ty = pa.struct([pa.field('a', pa.int32()),
                    pa.field('b', pa.string()),
                    pa.field('c', pa.bool_())])

    data = [(5, 'foo', True),
            (6, 'bar', False)]
    expected = [{'a': 5, 'b': 'foo', 'c': True},
                {'a': 6, 'b': 'bar', 'c': False}]
    arr = pa.array(data, type=ty)

    data_as_ndarray = np.empty(len(data), dtype=object)
    data_as_ndarray[:] = data
    arr2 = pa.array(data_as_ndarray, type=ty)
    assert arr.to_pylist() == expected

    assert arr.equals(arr2)

    # With omitted values
    data = [(5, 'foo', None),
            None,
            (6, None, False)]
    expected = [{'a': 5, 'b': 'foo', 'c': None},
                None,
                {'a': 6, 'b': None, 'c': False}]
    arr = pa.array(data, type=ty)
    assert arr.to_pylist() == expected

    # Invalid tuple size
    for tup in [(5, 'foo'), (), ('5', 'foo', True, None)]:
        with pytest.raises(ValueError, match="(?i)tuple size"):
            pa.array([tup], type=ty)
Example #6
 def test_boolean(self):
     expected = [True, None, False, None]
     arr = pyarrow.from_pylist(expected)
     assert len(arr) == 4
     assert arr.null_count == 2
     assert arr.type == pyarrow.bool_()
     assert arr.to_pylist() == expected
Example #7
def test_type_to_pandas_dtype():
    M8_ns = np.dtype('datetime64[ns]')
    cases = [
        (pa.null(), np.float64),
        (pa.bool_(), np.bool_),
        (pa.int8(), np.int8),
        (pa.int16(), np.int16),
        (pa.int32(), np.int32),
        (pa.int64(), np.int64),
        (pa.uint8(), np.uint8),
        (pa.uint16(), np.uint16),
        (pa.uint32(), np.uint32),
        (pa.uint64(), np.uint64),
        (pa.float16(), np.float16),
        (pa.float32(), np.float32),
        (pa.float64(), np.float64),
        (pa.date32(), M8_ns),
        (pa.date64(), M8_ns),
        (pa.timestamp('ms'), M8_ns),
        (pa.binary(), np.object_),
        (pa.binary(12), np.object_),
        (pa.string(), np.object_),
        (pa.list_(pa.int8()), np.object_),
    ]
    for arrow_type, numpy_type in cases:
        assert arrow_type.to_pandas_dtype() == numpy_type
Example #8
def test_struct_from_dicts_inference():
    expected_type = pa.struct([pa.field('a', pa.int64()),
                               pa.field('b', pa.string()),
                               pa.field('c', pa.bool_())])
    data = [{'a': 5, 'b': u'foo', 'c': True},
            {'a': 6, 'b': u'bar', 'c': False}]
    arr = pa.array(data)
    check_struct_type(arr.type, expected_type)
    assert arr.to_pylist() == data

    # With omitted values
    data = [{'a': 5, 'c': True},
            None,
            {},
            {'a': None, 'b': u'bar'}]
    expected = [{'a': 5, 'b': None, 'c': True},
                None,
                {'a': None, 'b': None, 'c': None},
                {'a': None, 'b': u'bar', 'c': None}]
    arr = pa.array(data)
    data_as_ndarray = np.empty(len(data), dtype=object)
    data_as_ndarray[:] = data
    arr2 = pa.array(data_as_ndarray)

    check_struct_type(arr.type, expected_type)
    assert arr.to_pylist() == expected
    assert arr.equals(arr2)

    # Nested
    expected_type = pa.struct([
        pa.field('a', pa.struct([pa.field('aa', pa.list_(pa.int64())),
                                 pa.field('ab', pa.bool_())])),
        pa.field('b', pa.string())])
    data = [{'a': {'aa': [5, 6], 'ab': True}, 'b': 'foo'},
            {'a': {'aa': None, 'ab': False}, 'b': None},
            {'a': None, 'b': 'bar'}]
    arr = pa.array(data)
    assert arr.to_pylist() == data

    # Edge cases
    arr = pa.array([{}])
    assert arr.type == pa.struct([])
    assert arr.to_pylist() == [{}]

    # Mixing structs and scalars is rejected
    with pytest.raises((pa.ArrowInvalid, pa.ArrowTypeError)):
        pa.array([1, {'a': 2}])
Example #9
    def test_boolean_no_nulls(self):
        num_values = 100

        np.random.seed(0)

        df = pd.DataFrame({'bools': np.random.randn(num_values) > 0})
        field = A.Field.from_py('bools', A.bool_())
        schema = A.Schema.from_fields([field])
        self._check_pandas_roundtrip(df, expected_schema=schema)
Example #10
    def test_boolean_no_nulls(self):
        num_values = 100

        np.random.seed(0)

        df = pd.DataFrame({'bools': np.random.randn(num_values) > 0})
        field = pa.field('bools', pa.bool_())
        schema = pa.schema([field])
        self._check_pandas_roundtrip(df, expected_schema=schema)
Example #11
def test_mixed_sequence_errors():
    with pytest.raises(ValueError, match="tried to convert to boolean"):
        pa.array([True, 'foo'], type=pa.bool_())

    with pytest.raises(ValueError, match="tried to convert to float32"):
        pa.array([1.5, 'foo'], type=pa.float32())

    with pytest.raises(ValueError, match="tried to convert to double"):
        pa.array([1.5, 'foo'])
Example #12
def test_struct_from_mixed_sequence():
    # It is forbidden to mix dicts and tuples when initializing a struct array
    ty = pa.struct([pa.field('a', pa.int32()),
                    pa.field('b', pa.string()),
                    pa.field('c', pa.bool_())])
    data = [(5, 'foo', True),
            {'a': 6, 'b': 'bar', 'c': False}]
    with pytest.raises(TypeError):
        pa.array(data, type=ty)
Example #13
File: jvm.py Project: rok/arrow
def field(jvm_field):
    """
    Construct a Field from a org.apache.arrow.vector.types.pojo.Field
    instance.

    Parameters
    ----------
    jvm_field: org.apache.arrow.vector.types.pojo.Field

    Returns
    -------
    pyarrow.Field
    """
    name = jvm_field.getName()
    jvm_type = jvm_field.getType()

    typ = None
    if not jvm_type.isComplex():
        type_str = jvm_type.getTypeID().toString()
        if type_str == 'Null':
            typ = pa.null()
        elif type_str == 'Int':
            typ = _from_jvm_int_type(jvm_type)
        elif type_str == 'FloatingPoint':
            typ = _from_jvm_float_type(jvm_type)
        elif type_str == 'Utf8':
            typ = pa.string()
        elif type_str == 'Binary':
            typ = pa.binary()
        elif type_str == 'FixedSizeBinary':
            typ = pa.binary(jvm_type.getByteWidth())
        elif type_str == 'Bool':
            typ = pa.bool_()
        elif type_str == 'Time':
            typ = _from_jvm_time_type(jvm_type)
        elif type_str == 'Timestamp':
            typ = _from_jvm_timestamp_type(jvm_type)
        elif type_str == 'Date':
            typ = _from_jvm_date_type(jvm_type)
        elif type_str == 'Decimal':
            typ = pa.decimal128(jvm_type.getPrecision(), jvm_type.getScale())
        else:
            raise NotImplementedError(
                "Unsupported JVM type: {}".format(type_str))
    else:
        # TODO: The following JVM types are not implemented:
        #       Struct, List, FixedSizeList, Union, Dictionary
        raise NotImplementedError(
            "JVM field conversion only implemented for primitive types.")

    nullable = jvm_field.isNullable()
    if jvm_field.getMetadata().isEmpty():
        metadata = None
    else:
        metadata = dict(jvm_field.getMetadata())
    return pa.field(name, typ, nullable, metadata)
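A real call needs a JVM bridge (e.g. jpype) to obtain an org.apache.arrow.vector.types.pojo.Field handle. As a hedged, self-contained sketch, the hypothetical stubs below mimic only the accessors field() reads, to show the expected result for a Bool field:

# Stub objects invented for illustration; not part of the jvm module.
class _FakeMeta(dict):
    def isEmpty(self):
        return True

class _FakeJvmType:
    def isComplex(self):
        return False
    def getTypeID(self):
        class _ID:
            def toString(self):
                return 'Bool'
        return _ID()

class _FakeJvmField:
    def getName(self):
        return 'flag'
    def getType(self):
        return _FakeJvmType()
    def isNullable(self):
        return True
    def getMetadata(self):
        return _FakeMeta()

assert field(_FakeJvmField()) == pa.field('flag', pa.bool_())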
Example #14
def test_bit_width():
    for ty, expected in [(pa.bool_(), 1),
                         (pa.int8(), 8),
                         (pa.uint32(), 32),
                         (pa.float16(), 16),
                         (pa.decimal128(19, 4), 128),
                         (pa.binary(42), 42 * 8)]:
        assert ty.bit_width == expected
    for ty in [pa.binary(), pa.string(), pa.list_(pa.int16())]:
        with pytest.raises(ValueError, match="fixed width"):
            ty.bit_width
Example #15
def test_bq_to_arrow_data_type_w_struct(module_under_test, bq_type):
    fields = (
        schema.SchemaField("field01", "STRING"),
        schema.SchemaField("field02", "BYTES"),
        schema.SchemaField("field03", "INTEGER"),
        schema.SchemaField("field04", "INT64"),
        schema.SchemaField("field05", "FLOAT"),
        schema.SchemaField("field06", "FLOAT64"),
        schema.SchemaField("field07", "NUMERIC"),
        schema.SchemaField("field08", "BOOLEAN"),
        schema.SchemaField("field09", "BOOL"),
        schema.SchemaField("field10", "TIMESTAMP"),
        schema.SchemaField("field11", "DATE"),
        schema.SchemaField("field12", "TIME"),
        schema.SchemaField("field13", "DATETIME"),
        schema.SchemaField("field14", "GEOGRAPHY"),
    )
    field = schema.SchemaField("ignored_name", bq_type, mode="NULLABLE", fields=fields)
    actual = module_under_test.bq_to_arrow_data_type(field)
    expected = pyarrow.struct(
        (
            pyarrow.field("field01", pyarrow.string()),
            pyarrow.field("field02", pyarrow.binary()),
            pyarrow.field("field03", pyarrow.int64()),
            pyarrow.field("field04", pyarrow.int64()),
            pyarrow.field("field05", pyarrow.float64()),
            pyarrow.field("field06", pyarrow.float64()),
            pyarrow.field("field07", module_under_test.pyarrow_numeric()),
            pyarrow.field("field08", pyarrow.bool_()),
            pyarrow.field("field09", pyarrow.bool_()),
            pyarrow.field("field10", module_under_test.pyarrow_timestamp()),
            pyarrow.field("field11", pyarrow.date32()),
            pyarrow.field("field12", module_under_test.pyarrow_time()),
            pyarrow.field("field13", module_under_test.pyarrow_datetime()),
            pyarrow.field("field14", pyarrow.string()),
        )
    )
    assert pyarrow.types.is_struct(actual)
    assert actual.num_children == len(fields)
    assert actual.equals(expected)
Example #16
 def test_column_types(self):
     # Ask for specific column types in ConvertOptions
     opts = ConvertOptions(column_types={'b': 'float32',
                                         'c': 'string',
                                         'd': 'boolean',
                                         'zz': 'null'})
     rows = b"a,b,c,d\n1,2,3,true\n4,-5,6,false\n"
     table = self.read_bytes(rows, convert_options=opts)
     schema = pa.schema([('a', pa.int64()),
                         ('b', pa.float32()),
                         ('c', pa.string()),
                         ('d', pa.bool_())])
     expected = {
         'a': [1, 4],
         'b': [2.0, -5.0],
         'c': ["3", "6"],
         'd': [True, False],
         }
     assert table.schema == schema
     assert table.to_pydict() == expected
     # Pass column_types as schema
     opts = ConvertOptions(
         column_types=pa.schema([('b', pa.float32()),
                                 ('c', pa.string()),
                                 ('d', pa.bool_()),
                                 ('zz', pa.bool_())]))
     table = self.read_bytes(rows, convert_options=opts)
     assert table.schema == schema
     assert table.to_pydict() == expected
     # One of the columns in column_types fails converting
     rows = b"a,b,c,d\n1,XXX,3,true\n4,-5,6,false\n"
     with pytest.raises(pa.ArrowInvalid) as exc:
         self.read_bytes(rows, convert_options=opts)
     err = str(exc.value)
     assert "In column #1: " in err
     assert "CSV conversion error to float: invalid value 'XXX'" in err
Example #17
 def test_simple_varied(self):
     # Infer various kinds of data
     rows = b"a,b,c,d\n1,2,3,0\n4.0,-5,foo,True\n"
     table = self.read_bytes(rows)
     schema = pa.schema([('a', pa.float64()),
                         ('b', pa.int64()),
                         ('c', pa.string()),
                         ('d', pa.bool_())])
     assert table.schema == schema
     assert table.to_pydict() == {
         'a': [1.0, 4.0],
         'b': [2, -5],
         'c': [u"3", u"foo"],
         'd': [False, True],
         }
Example #18
def test_from_numpy_dtype():
    cases = [
        (np.dtype('bool'), pa.bool_()),
        (np.dtype('int8'), pa.int8()),
        (np.dtype('int16'), pa.int16()),
        (np.dtype('int32'), pa.int32()),
        (np.dtype('int64'), pa.int64()),
        (np.dtype('uint8'), pa.uint8()),
        (np.dtype('uint16'), pa.uint16()),
        (np.dtype('uint32'), pa.uint32()),
        (np.dtype('float16'), pa.float16()),
        (np.dtype('float32'), pa.float32()),
        (np.dtype('float64'), pa.float64()),
        (np.dtype('U'), pa.string()),
        (np.dtype('S'), pa.binary()),
        (np.dtype('datetime64[s]'), pa.timestamp('s')),
        (np.dtype('datetime64[ms]'), pa.timestamp('ms')),
        (np.dtype('datetime64[us]'), pa.timestamp('us')),
        (np.dtype('datetime64[ns]'), pa.timestamp('ns'))
    ]

    for dt, pt in cases:
        result = pa.from_numpy_dtype(dt)
        assert result == pt

    # Things convertible to numpy dtypes work
    assert pa.from_numpy_dtype('U') == pa.string()
    assert pa.from_numpy_dtype(np.str_) == pa.string()
    assert pa.from_numpy_dtype('int32') == pa.int32()
    assert pa.from_numpy_dtype(bool) == pa.bool_()

    with pytest.raises(NotImplementedError):
        pa.from_numpy_dtype(np.dtype('O'))

    with pytest.raises(TypeError):
        pa.from_numpy_dtype('not_convertible_to_dtype')
Example #19
 def test_simple_varied(self):
     # Infer various kinds of data
     rows = (b'{"a": 1,"b": 2, "c": "3", "d": false}\n'
             b'{"a": 4.0, "b": -5, "c": "foo", "d": true}\n')
     table = self.read_bytes(rows)
     schema = pa.schema([('a', pa.float64()),
                         ('b', pa.int64()),
                         ('c', pa.string()),
                         ('d', pa.bool_())])
     assert table.schema == schema
     assert table.to_pydict() == {
         'a': [1.0, 4.0],
         'b': [2, -5],
         'c': [u"3", u"foo"],
         'd': [False, True],
         }
Example #20
    def test_structarray(self):
        ints = pa.array([None, 2, 3], type=pa.int64())
        strs = pa.array([u'a', None, u'c'], type=pa.string())
        bools = pa.array([True, False, None], type=pa.bool_())
        arr = pa.StructArray.from_arrays(
            [ints, strs, bools],
            ['ints', 'strs', 'bools'])

        expected = pd.Series([
            {'ints': None, 'strs': u'a', 'bools': True},
            {'ints': 2, 'strs': None, 'bools': False},
            {'ints': 3, 'strs': u'c', 'bools': None},
        ])

        series = pd.Series(arr.to_pandas())
        tm.assert_series_equal(series, expected)
Example #21
def test_structarray():
    ints = pa.array([None, 2, 3], type=pa.int64())
    strs = pa.array([u'a', None, u'c'], type=pa.string())
    bools = pa.array([True, False, None], type=pa.bool_())
    arr = pa.StructArray.from_arrays(
        [ints, strs, bools],
        ['ints', 'strs', 'bools'])

    expected = [
        {'ints': None, 'strs': u'a', 'bools': True},
        {'ints': 2, 'strs': None, 'bools': False},
        {'ints': 3, 'strs': u'c', 'bools': None},
    ]

    pylist = arr.to_pylist()
    assert pylist == expected, (pylist, expected)
Example #22
def test_table_flatten():
    ty1 = pa.struct([pa.field('x', pa.int16()),
                     pa.field('y', pa.float32())])
    ty2 = pa.struct([pa.field('nest', ty1)])
    a = pa.array([(1, 2.5), (3, 4.5)], type=ty1)
    b = pa.array([((11, 12.5),), ((13, 14.5),)], type=ty2)
    c = pa.array([False, True], type=pa.bool_())

    table = pa.Table.from_arrays([a, b, c], names=['a', 'b', 'c'])
    t2 = table.flatten()
    t2._validate()
    expected = pa.Table.from_arrays([
        pa.array([1, 3], type=pa.int16()),
        pa.array([2.5, 4.5], type=pa.float32()),
        pa.array([(11, 12.5), (13, 14.5)], type=ty1),
        c],
        names=['a.x', 'a.y', 'b.nest', 'c'])
    assert t2.equals(expected)
Example #23
 def test_simple_nulls(self):
     # Infer various kinds of data, with nulls
     rows = (b'{"a": 1, "b": 2, "c": null, "d": null, "e": null}\n'
             b'{"a": null, "b": -5, "c": "foo", "d": null, "e": true}\n'
             b'{"a": 4.5, "b": null, "c": "nan", "d": null,"e": false}\n')
     table = self.read_bytes(rows)
     schema = pa.schema([('a', pa.float64()),
                         ('b', pa.int64()),
                         ('c', pa.string()),
                         ('d', pa.null()),
                         ('e', pa.bool_())])
     assert table.schema == schema
     assert table.to_pydict() == {
         'a': [1.0, None, 4.5],
         'b': [2, -5, None],
         'c': [None, u"foo", u"nan"],
         'd': [None, None, None],
         'e': [None, True, False],
         }
Example #24
 def test_custom_bools(self):
     # Infer booleans with custom values
     opts = ConvertOptions(true_values=['T', 'yes'],
                           false_values=['F', 'no'])
     rows = (b"a,b,c\n"
             b"True,T,t\n"
             b"False,F,f\n"
             b"True,yes,yes\n"
             b"False,no,no\n"
             b"N/A,N/A,N/A\n")
     table = self.read_bytes(rows, convert_options=opts)
     schema = pa.schema([('a', pa.string()),
                         ('b', pa.bool_()),
                         ('c', pa.string())])
     assert table.schema == schema
     assert table.to_pydict() == {
         'a': ["True", "False", "True", "False", "N/A"],
         'b': [True, False, True, False, None],
         'c': ["t", "f", "yes", "no", "N/A"],
         }
Example #25
 def test_simple_nulls(self):
     # Infer various kinds of data, with nulls
     rows = (b"a,b,c,d,e,f\n"
             b"1,2,,,3,N/A\n"
             b"nan,-5,foo,,nan,TRUE\n"
             b"4.5,#N/A,nan,,\xff,false\n")
     table = self.read_bytes(rows)
     schema = pa.schema([('a', pa.float64()),
                         ('b', pa.int64()),
                         ('c', pa.string()),
                         ('d', pa.null()),
                         ('e', pa.binary()),
                         ('f', pa.bool_())])
     assert table.schema == schema
     assert table.to_pydict() == {
         'a': [1.0, None, 4.5],
         'b': [2, -5, None],
         'c': [u"", u"foo", u"nan"],
         'd': [None, None, None],
         'e': [b"3", b"nan", b"\xff"],
         'f': [None, True, False],
         }
Example #26
    def test_boolean_nulls(self):
        # pandas requires upcast to object dtype
        num_values = 100
        np.random.seed(0)

        mask = np.random.randint(0, 10, size=num_values) < 3
        values = np.random.randint(0, 10, size=num_values) < 5

        arr = pa.array(values, mask=mask)

        expected = values.astype(object)
        expected[mask] = None

        field = pa.field('bools', pa.bool_())
        schema = pa.schema([field])
        ex_frame = pd.DataFrame({'bools': expected})

        table = pa.Table.from_arrays([arr], ['bools'])
        assert table.schema.equals(schema)
        result = table.to_pandas()

        tm.assert_frame_equal(result, ex_frame)
Example #27
def test_struct_from_dicts():
    ty = pa.struct([pa.field('a', pa.int32()),
                    pa.field('b', pa.string()),
                    pa.field('c', pa.bool_())])
    arr = pa.array([], type=ty)
    assert arr.to_pylist() == []

    data = [{'a': 5, 'b': 'foo', 'c': True},
            {'a': 6, 'b': 'bar', 'c': False}]
    arr = pa.array(data, type=ty)
    assert arr.to_pylist() == data

    # With omitted values
    data = [{'a': 5, 'c': True},
            None,
            {},
            {'a': None, 'b': 'bar'}]
    arr = pa.array(data, type=ty)
    expected = [{'a': 5, 'b': None, 'c': True},
                None,
                {'a': None, 'b': None, 'c': None},
                {'a': None, 'b': 'bar', 'c': None}]
    assert arr.to_pylist() == expected
Example #28
def get_many_types():
    # returning them from a function is required because the pa.dictionary
    # type holds a pyarrow array, and test_array.py::test_total_bytes_allocated
    # checks that the default memory pool has zero allocated bytes
    return (
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())]),
        pa.struct([pa.field('a', pa.int32(), nullable=False),
                   pa.field('b', pa.int8(), nullable=False),
                   pa.field('c', pa.string())]),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        pa.union([pa.field('a', pa.binary(10), nullable=False),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        pa.dictionary(pa.int32(), pa.string())
    )
Example #29
def make_sorted_groups(sorting_table: pa.Table,
                       input_table: pa.Table) -> SortedGroups:
    if not sorting_table.num_columns:
        # Exactly one output group, even for empty-table input
        return SortedGroups(
            sorted_groups=pa.table({
                "A": [None]
            }).select([]),  # 1-row, 0-col table
            sorted_input_table=
            input_table,  # everything is one group (maybe 0-row)
            group_splits=np.array([], np.int64()),
        )

    # pyarrow 3.0.0 can't sort dictionary columns.
    # TODO make sort-dictionary work; nix this conversion
    sorting_table_without_dictionary = pa.table(
        [
            column.cast(pa.utf8())
            if pa.types.is_dictionary(column.type) else column
            for column in sorting_table.columns
        ],
        schema=pa.schema([
            pa.field(field.name, pa.utf8())
            if pa.types.is_dictionary(field.type) else field for field in [
                sorting_table.schema.field(i)
                for i in range(len(sorting_table.schema.names))
            ]
        ]),
    )
    indices = pa.compute.sort_indices(
        sorting_table_without_dictionary,
        sort_keys=[(c, "ascending")
                   for c in sorting_table_without_dictionary.column_names],
    )

    sorted_groups_with_dups_and_nulls = sorting_table.take(indices)
    # Behavior we ought to DEPRECATE: to mimic Pandas, we drop all groups that
    # contain NULL. This is mathematically sound for Pandas' "NA" (because if
    # all these unknown things are the same thing, doesn't that mean we know
    # something about them? -- reductio ad absurdum, QED). But Workbench's NULL
    # is a bit closer to SQL NULL, which means "whatever you say, pal".
    #
    # This null-dropping is for backwards compat. TODO make it optional ... and
    # eventually nix the option and always output NULL groups.
    nonnull_indices = indices.filter(
        find_nonnull_table_mask(sorted_groups_with_dups_and_nulls))

    if input_table.num_columns:
        sorted_input_table = input_table.take(nonnull_indices)
    else:
        # Don't .take() on a zero-column Arrow table: its .num_rows would change
        #
        # All rows are identical, so .slice() gives the table we want
        sorted_input_table = input_table.slice(0, len(nonnull_indices))

    sorted_groups_with_dups = sorting_table.take(nonnull_indices)

    # "is_dup": find each row in sorted_groups_with_dups that is _equal_ to
    # the row before it. (The first value compares the first and second row.)
    #
    # We start assuming all are equal; then we search for inequality
    if len(sorted_groups_with_dups):
        is_dup = pa.array(np.ones(len(sorted_groups_with_dups) - 1),
                          pa.bool_())
        for column in sorted_groups_with_dups.itercolumns():
            chunk = column.chunks[0]
            if pa.types.is_dictionary(chunk.type):
                chunk = chunk.indices
            first = chunk.slice(0, len(column) - 1)
            second = chunk.slice(1)
            # TODO when we support NULL groups:
            # both_null = pa.compute.and_(first.is_null(), second.is_null())
            # both_equal_if_not_null = pa.compute.equal(first, second)
            # both_equal = pa.compute.fill_null(both_equal_if_not_null, False)
            # value_is_dup = pa.compute.or_(both_null, both_equal)
            # ... and for now, it's simply:
            value_is_dup = pa.compute.equal(first, second)
            is_dup = pa.compute.and_(is_dup, value_is_dup)

        group_splits = np.where(~(is_dup.to_numpy(
            zero_copy_only=False)))[0] + 1

        sorted_groups = reencode_dictionaries(
            sorted_groups_with_dups.take(np.insert(group_splits, 0, 0)))
    else:
        sorted_groups = sorted_groups_with_dups
        group_splits = np.array([], np.int64())

    return SortedGroups(
        sorted_groups=sorted_groups,
        sorted_input_table=sorted_input_table,
        group_splits=group_splits,
    )
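A condensed, hedged sketch of the is_dup / group_splits idea above, applied to a single pre-sorted column (variable names here are chosen for illustration; assumes a pyarrow version with pyarrow.compute):

import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

col = pa.array(["a", "a", "b", "b", "b", "c"])  # already sorted
first = col.slice(0, len(col) - 1)
second = col.slice(1)
is_dup = pc.equal(first, second)  # does row i+1 equal row i?
group_splits = np.where(~is_dup.to_numpy(zero_copy_only=False))[0] + 1
assert list(group_splits) == [2, 5]  # groups are rows [0:2], [2:5], [5:6]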
Example #30
def test_sequence_mixed_numpy_python_bools(seq):
    values = np.array([True, False])
    arr = pa.array(seq([values[0], None, values[1], True, False]))
    assert arr.type == pa.bool_()
    assert arr.to_pylist() == [True, None, False, True, False]
Example #31
    _pyarrow_wrappers.Array_get_address,
    ctypes.c_double,
    addr_func_name="Array_get_address_c",
    override_module_name="katana.numba_support._pyarrow_wrappers",
)

###### Wrap chunked Arrow arrays for Numba

_array_type_map = {
    pyarrow.int64(): pyarrow.Int64Array,
    pyarrow.int32(): pyarrow.Int32Array,
    pyarrow.uint64(): pyarrow.UInt64Array,
    pyarrow.uint32(): pyarrow.UInt32Array,
    pyarrow.float64(): pyarrow.lib.DoubleArray,
    pyarrow.float32(): pyarrow.lib.FloatArray,
    pyarrow.bool_(): pyarrow.lib.BooleanArray,
}

_type_array_map = {a: t for t, a in _array_type_map.items()}

_arrow_ctypes_map = {
    pyarrow.int64(): ctypes.c_int64,
    pyarrow.int32(): ctypes.c_int32,
    pyarrow.uint64(): ctypes.c_uint64,
    pyarrow.uint32(): ctypes.c_uint32,
    pyarrow.float64(): ctypes.c_double,
    pyarrow.float32(): ctypes.c_float,
    pyarrow.bool_(): ctypes.c_bool,
}
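A quick illustrative lookup against the two maps above (assuming both are in scope): equal Arrow type instances hash alike, so freshly constructed types work as dictionary keys.

assert _array_type_map[pyarrow.bool_()] is pyarrow.lib.BooleanArray
assert _arrow_ctypes_map[pyarrow.bool_()] is ctypes.c_bool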

Example #32
import pytz
import hypothesis as h
import hypothesis.strategies as st
import hypothesis.extra.numpy as npst
import hypothesis.extra.pytz as tzst
import numpy as np

import pyarrow as pa

# TODO(kszucs): alphanum_text, surrogate_text
custom_text = st.text(
    alphabet=st.characters(min_codepoint=0x41, max_codepoint=0x7E))

null_type = st.just(pa.null())
bool_type = st.just(pa.bool_())

binary_type = st.just(pa.binary())
string_type = st.just(pa.string())
large_binary_type = st.just(pa.large_binary())
large_string_type = st.just(pa.large_string())

signed_integer_types = st.sampled_from(
    [pa.int8(), pa.int16(), pa.int32(),
     pa.int64()])
unsigned_integer_types = st.sampled_from(
    [pa.uint8(), pa.uint16(),
     pa.uint32(), pa.uint64()])
integer_types = st.one_of(signed_integer_types, unsigned_integer_types)

floating_types = st.sampled_from([pa.float16(), pa.float32(), pa.float64()])
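A hedged usage sketch (strategy names above assumed in scope; hypothesis's example() is meant for interactive exploration, not for tests):

scalar_types = st.one_of(null_type, bool_type, integer_types, floating_types)
print(scalar_types.example())  # e.g. DataType(int16)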
Example #33
import sys

try:
    import cython
    CYTHON = True
except ImportError:
    CYTHON = False

try:
    import pyarrow as pa
    from pyarrow import csv
    import numpy as np
    ARROW = True
except ImportError:
    ARROW = False
else:
    sqream_to_pa = {
        'ftBool': pa.bool_(),
        'ftUByte': pa.uint8(),
        'ftShort': pa.int16(),
        'ftInt': pa.int32(),
        'ftLong': pa.int64(),
        'ftFloat': pa.float32(),
        'ftDouble': pa.float64(),
        'ftDate': pa.timestamp('ns'),
        'ftDateTime': pa.timestamp('ns'),
        'ftVarchar': pa.string(),
        'ftBlob': pa.utf8()
    }

__version__ = '3.0.3'

WIN = sys.platform in ('win32', 'cygwin')
Example #34
                          ('uint16', range(0, 10)), ('int16', range(0, 10)),
                          ('uint32', range(0, 10)), ('int32', range(0, 10)),
                          ('uint64', range(0, 10)), ('int64', range(0, 10)),
                          ('float', [0.0, 0.1, 0.2]),
                          ('double', [0.0, 0.1, 0.2]),
                          ('string', ['a', 'b', 'c']),
                          ('binary', [b'a', b'b', b'c']),
                          (pa.binary(3), [b'abc', b'bcd', b'cde'])])
def test_cast_identities(ty, values):
    arr = pa.array(values, type=ty)
    assert arr.cast(ty).equals(arr)


pickle_test_parametrize = pytest.mark.parametrize(
    ('data', 'typ'),
    [([True, False, True, True], pa.bool_()), ([1, 2, 4, 6], pa.int64()),
     ([1.0, 2.5, None], pa.float64()), (['a', None, 'b'], pa.string()),
     ([], None), ([[1, 2], [3]], pa.list_(pa.int64())),
     ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
     ([(1, 'a'), (2, 'c'), None
       ], pa.struct([pa.field('a', pa.int64()),
                     pa.field('b', pa.string())]))])


@pickle_test_parametrize
def test_array_pickle(data, typ):
    # Allocate here so that we don't have any Arrow data allocated.
    # This is needed to ensure that allocator tests can be reliable.
    array = pa.array(data, type=typ)
    for proto in range(0, pickle.HIGHEST_PROTOCOL + 1):
        result = pickle.loads(pickle.dumps(array, proto))
        assert array.equals(result)
Example #35
def test_cast_to_int8():
    arr = pa.array([0, 1, 2], type='i4')  # assumed input for illustration
    expected = pa.array([0, 1, 2], type='i8')

    result = arr.cast('i8')

    assert result.equals(expected)


def test_simple_type_construction():
    result = pa.lib.TimestampType()
    with pytest.raises(TypeError):
        str(result)


@pytest.mark.parametrize(
    ('type', 'expected'),
    [(pa.null(), 'float64'), (pa.bool_(), 'bool'), (pa.int8(), 'int8'),
     (pa.int16(), 'int16'), (pa.int32(), 'int32'), (pa.int64(), 'int64'),
     (pa.uint8(), 'uint8'), (pa.uint16(), 'uint16'), (pa.uint32(), 'uint32'),
     (pa.uint64(), 'uint64'), (pa.float16(), 'float16'),
     (pa.float32(), 'float32'), (pa.float64(), 'float64'),
     (pa.date32(), 'date'), (pa.date64(), 'date'), (pa.binary(), 'bytes'),
     (pa.binary(length=4), 'bytes'), (pa.string(), 'unicode'),
     (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
     (pa.decimal128(18, 3), 'decimal'), (pa.timestamp('ms'), 'datetime'),
     (pa.timestamp('us', 'UTC'), 'datetimetz'), (pa.time32('s'), 'time'),
     (pa.time64('us'), 'time')])
def test_logical_type(type, expected):
    assert get_logical_type(type) == expected


def test_array_conversions_no_sentinel_values():
Example #36
    np.dtype("int32"): pd.Int32Dtype(),
    np.dtype("int64"): pd.Int64Dtype(),
    np.dtype("bool_"): pd.BooleanDtype(),
    np.dtype("object"): pd.StringDtype(),
}

pyarrow_dtypes_to_pandas_dtypes = {
    pa.uint8(): pd.UInt8Dtype(),
    pa.uint16(): pd.UInt16Dtype(),
    pa.uint32(): pd.UInt32Dtype(),
    pa.uint64(): pd.UInt64Dtype(),
    pa.int8(): pd.Int8Dtype(),
    pa.int16(): pd.Int16Dtype(),
    pa.int32(): pd.Int32Dtype(),
    pa.int64(): pd.Int64Dtype(),
    pa.bool_(): pd.BooleanDtype(),
    pa.string(): pd.StringDtype(),
}

pandas_dtypes_to_cudf_dtypes = {
    pd.UInt8Dtype(): np.dtype("uint8"),
    pd.UInt16Dtype(): np.dtype("uint16"),
    pd.UInt32Dtype(): np.dtype("uint32"),
    pd.UInt64Dtype(): np.dtype("uint64"),
    pd.Int8Dtype(): np.dtype("int8"),
    pd.Int16Dtype(): np.dtype("int16"),
    pd.Int32Dtype(): np.dtype("int32"),
    pd.Int64Dtype(): np.dtype("int64"),
    pd.BooleanDtype(): np.dtype("bool_"),
    pd.StringDtype(): np.dtype("object"),
}
Example #37
    new_index = ExplicitSecondaryIndex(column="another_col",
                                       index_dct={1: ["part_4"]})
    with pytest.raises(ValueError) as e:
        original_index.update(new_index)
    assert (
        str(e.value) ==
        "Trying to update an index with the wrong column. Got `another_col` but expected `col`"
    )


@pytest.mark.parametrize(
    "dtype",
    [
        pa.binary(),
        pa.bool_(),
        pa.date32(),
        pa.float32(),
        pa.float64(),
        pa.int64(),
        pa.int8(),
        pa.string(),
        pa.timestamp("ns"),
    ],
)
def test_index_empty(store, dtype):
    storage_key = "dataset_uuid/some_index.parquet"
    index1 = ExplicitSecondaryIndex(column="col",
                                    index_dct={},
                                    dtype=dtype,
                                    index_storage_key=storage_key)
Example #38
    pa.field("visit_id", pa.int64()),
    pa.field("instance_id", pa.uint32(), nullable=False),
    pa.field("extension_session_uuid", pa.string()),
    pa.field("event_ordinal", pa.int64()),
    pa.field("window_id", pa.int64()),
    pa.field("tab_id", pa.int64()),
    pa.field("frame_id", pa.int64()),
    pa.field("url", pa.string(), nullable=False),
    pa.field("top_level_url", pa.string()),
    pa.field("parent_frame_id", pa.int64()),
    pa.field("frame_ancestors", pa.string()),
    pa.field("method", pa.string(), nullable=False),
    pa.field("referrer", pa.string(), nullable=False),
    pa.field("headers", pa.string(), nullable=False),
    pa.field("request_id", pa.int64(), nullable=False),
    pa.field("is_XHR", pa.bool_()),
    pa.field("is_third_party_channel", pa.bool_()),
    pa.field("is_third_party_to_top_window", pa.bool_()),
    pa.field("triggering_origin", pa.string()),
    pa.field("loading_origin", pa.string()),
    pa.field("loading_href", pa.string()),
    pa.field("req_call_stack", pa.string()),
    pa.field("resource_type", pa.string(), nullable=False),
    pa.field("post_body", pa.string()),
    pa.field("post_body_raw", pa.string()),
    pa.field("time_stamp", pa.string(), nullable=False),
]
PQ_SCHEMAS["http_requests"] = pa.schema(fields)

# http_responses
fields = [
Example #39
    ], ["x", "y", "z"]),
         examples_text_proto=_ENCODE_TEST_EXAMPLES),
    dict(record_batch=pa.RecordBatch.from_arrays([
        pa.array([None, None, [b"a", b"b"]], type=pa.large_list(pa.binary())),
        pa.array([None, None, [1.0, 2.0]], type=pa.large_list(pa.float32())),
        pa.array([None, None, [4, 5]], type=pa.list_(pa.int64()))
    ], ["x", "y", "z"]),
         examples_text_proto=list(reversed(_ENCODE_TEST_EXAMPLES[:-1]))),
]

_INVALID_ENCODE_TYPE_CASES = [
    dict(record_batch=pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], ["a"]),
         error=RuntimeError,
         error_msg_regex="Expected ListArray or LargeListArray"),
    dict(record_batch=pa.RecordBatch.from_arrays(
        [pa.array([[True], [False]], type=pa.large_list(pa.bool_()))], ["a"]),
         error=RuntimeError,
         error_msg_regex="Bad field type"),
    dict(record_batch=pa.RecordBatch.from_arrays([
        pa.array([[b"a", b"b"], None, None, []],
                 type=pa.large_list(pa.large_binary())),
        pa.array([[1.0, 2.0], None, None, []],
                 type=pa.large_list(pa.float32())),
    ], ["x", "x"]),
         error=RuntimeError,
         error_msg_regex="RecordBatch contains duplicate column names")
]


class RecordBatchToExamplesTest(parameterized.TestCase):
    @parameterized.parameters(*_ENCODE_CASES)
Example #40
 def test_boolean_object_nulls(self):
     arr = np.array([False, None, True] * 100, dtype=object)
     df = pd.DataFrame({'bools': arr})
     field = pa.Field.from_py('bools', pa.bool_())
     schema = pa.Schema.from_fields([field])
     self._check_pandas_roundtrip(df, expected_schema=schema)
Example #41
    ('int64', range(0, 10)),
    ('float', [0.0, 0.1, 0.2]),
    ('double', [0.0, 0.1, 0.2]),
    ('string', ['a', 'b', 'c']),
    ('binary', [b'a', b'b', b'c']),
    (pa.binary(3), [b'abc', b'bcd', b'cde'])
])
def test_cast_identities(ty, values):
    arr = pa.array(values, type=ty)
    assert arr.cast(ty).equals(arr)


pickle_test_parametrize = pytest.mark.parametrize(
    ('data', 'typ'),
    [
        ([True, False, True, True], pa.bool_()),
        ([1, 2, 4, 6], pa.int64()),
        ([1.0, 2.5, None], pa.float64()),
        (['a', None, 'b'], pa.string()),
        ([], None),
        ([[1, 2], [3]], pa.list_(pa.int64())),
        ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
        ([(1, 'a'), (2, 'c'), None],
            pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))
    ]
)


@pickle_test_parametrize
def test_array_pickle(data, typ):
    # Allocate here so that we don't have any Arrow data allocated.
Example #42
    b = pa.array([0, 2], type=pa.int64())
    c = pa.array([0, 3], type=pa.int32())
    d = pa.array([0, 2, 0, 3], type=pa.int32())

    eq([a], [a])
    ne([a], [b])
    eq([a, c], [a, c])
    eq([a, c], [d])
    ne([c, a], [a, c])

    assert not pa.chunked_array([], type=pa.int32()).equals(None)


@pytest.mark.parametrize(
    ('data', 'typ'),
    [([True, False, True, True], pa.bool_()), ([1, 2, 4, 6], pa.int64()),
     ([1.0, 2.5, None], pa.float64()), (['a', None, 'b'], pa.string()),
     ([], pa.list_(pa.uint8())), ([[1, 2], [3]], pa.list_(pa.int64())),
     ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
     ([(1, 'a'), (2, 'c'), None
       ], pa.struct([pa.field('a', pa.int64()),
                     pa.field('b', pa.string())]))])
def test_chunked_array_pickle(data, typ):
    arrays = []
    while data:
        arrays.append(pa.array(data[:2], type=typ))
        data = data[2:]
    array = pa.chunked_array(arrays, type=typ)
    array.validate()
    result = pickle.loads(pickle.dumps(array))
    result.validate()
Example #43
    },
    "time": {
        "type": "long",
        "logicalType": "time-micros"
    },
    "timestamp": {
        "type": "long",
        "logicalType": "timestamp-micros"
    },
}
# This dictionary is duplicated in bigquery/google/cloud/bigquery/_pandas_helpers.py
# When modifying it be sure to update it there as well.
BQ_TO_ARROW_TYPES = {
    "int64": pyarrow.int64(),
    "float64": pyarrow.float64(),
    "bool": pyarrow.bool_(),
    "numeric": pyarrow.decimal128(38, 9),
    "string": pyarrow.utf8(),
    "bytes": pyarrow.binary(),
    "date": pyarrow.date32(),  # int32 days since epoch
    "datetime": pyarrow.timestamp("us"),
    "time": pyarrow.time64("us"),
    "timestamp": pyarrow.timestamp("us", tz="UTC"),
}
SCALAR_COLUMNS = [
    {
        "name": "int_col",
        "type": "int64"
    },
    {
        "name": "float_col",
Example #45
 def test_boolean(self):
     arr = pyarrow.from_pylist([True, None, False, None])
     assert len(arr) == 4
     assert arr.null_count == 2
     assert arr.type == pyarrow.bool_()
Example #46
import six
from pandas.api.types import (
    is_array_like,
    is_bool_dtype,
    is_int64_dtype,
    is_integer,
    is_integer_dtype,
)
from pandas.core.arrays import ExtensionArray
from pandas.core.dtypes.dtypes import ExtensionDtype

from ._algorithms import all_op, any_op, extract_isnull_bytemap

_python_type_map = {
    pa.null().id: six.text_type,
    pa.bool_().id: bool,
    pa.int8().id: int,
    pa.uint8().id: int,
    pa.int16().id: int,
    pa.uint16().id: int,
    pa.int32().id: int,
    pa.uint32().id: int,
    pa.int64().id: int,
    pa.uint64().id: int,
    pa.float16().id: float,
    pa.float32().id: float,
    pa.float64().id: float,
    pa.date32().id: datetime.date,
    pa.date64().id: datetime.date,
    pa.timestamp("ms").id: datetime.datetime,
    pa.binary().id: six.binary_type,
Example #47
 def test_boolean_object_nulls(self):
     arr = np.array([False, None, True] * 100, dtype=object)
     df = pd.DataFrame({'bools': arr})
     field = pa.field('bools', pa.bool_())
     schema = pa.schema([field])
     self._check_pandas_roundtrip(df, expected_schema=schema)
Example #48
from absl.testing import absltest
from absl.testing import parameterized


_MERGE_TEST_CASES = [
    dict(
        testcase_name="empty_input",
        inputs=[],
        expected_output=dict(),
    ),
    dict(
        testcase_name="basic_types",
        inputs=[
            {
                "bool": pa.array([False, None, True], type=pa.bool_()),
                "int64": pa.array([1, None, 3], type=pa.int64()),
                "uint64": pa.array([1, None, 3], type=pa.uint64()),
                "int32": pa.array([1, None, 3], type=pa.int32()),
                "uint32": pa.array([1, None, 3], type=pa.uint32()),
                "float": pa.array([1., None, 3.], type=pa.float32()),
                "double": pa.array([1., None, 3.], type=pa.float64()),
                "bytes": pa.array([b"abc", None, b"ghi"], type=pa.binary()),
                "large_bytes": pa.array([b"abc", None, b"ghi"],
                                        type=pa.large_binary()),
                "unicode": pa.array([u"abc", None, u"ghi"], type=pa.utf8()),
                "large_unicode": pa.array([u"abc", None, u"ghi"],
                                          type=pa.large_utf8()),
            },
            {
                "bool": pa.array([None, False], type=pa.bool_()),
Example #49
    def get_type_and_builtins(self, n, type_name):
        """
        Return a `(arrow type, list)` tuple where the arrow type
        corresponds to the given logical *type_name*, and the list
        is a list of *n* randomly generated Python objects compatible
        with the arrow type.
        """
        size = None

        if type_name in ('bool', 'decimal', 'ascii', 'unicode', 'int64 list'):
            kind = type_name
        elif type_name.startswith(('int', 'uint')):
            kind = 'int'
        elif type_name.startswith('float'):
            kind = 'float'
        elif type_name.startswith('struct'):
            kind = 'struct'
        elif type_name == 'binary':
            kind = 'varying binary'
        elif type_name.startswith('binary'):
            kind = 'fixed binary'
            size = int(type_name[6:])
            assert size > 0
        else:
            raise ValueError("unrecognized type %r" % (type_name,))

        if kind in ('int', 'float'):
            ty = getattr(pa, type_name)()
        elif kind == 'bool':
            ty = pa.bool_()
        elif kind == 'decimal':
            ty = pa.decimal128(9, 9)
        elif kind == 'fixed binary':
            ty = pa.binary(size)
        elif kind == 'varying binary':
            ty = pa.binary()
        elif kind in ('ascii', 'unicode'):
            ty = pa.string()
        elif kind == 'int64 list':
            ty = pa.list_(pa.int64())
        elif kind == 'struct':
            ty = pa.struct([pa.field('u', pa.int64()),
                            pa.field('v', pa.float64()),
                            pa.field('w', pa.bool_())])

        factories = {
            'int': self.generate_int_list,
            'float': self.generate_float_list,
            'bool': self.generate_bool_list,
            'decimal': self.generate_decimal_list,
            'fixed binary': partial(self.generate_fixed_binary_list,
                                    size=size),
            'varying binary': partial(self.generate_varying_binary_list,
                                      min_size=3, max_size=40),
            'ascii': partial(self.generate_ascii_string_list,
                             min_size=3, max_size=40),
            'unicode': partial(self.generate_unicode_string_list,
                               min_size=3, max_size=40),
            'int64 list': partial(self.generate_int_list_list,
                                  min_size=0, max_size=20),
            'struct': self.generate_dict_list,
            'struct from tuples': self.generate_tuple_list,
        }
        data = factories[kind](n)
        return ty, data
Example #50
import argparse
import csv
import re
import sys
from datetime import datetime
from base64 import standard_b64decode
import pyarrow as pa
import pyarrow.parquet as pq

PA_BOOL = pa.bool_()
PA_FLOAT32 = pa.float32()
PA_FLOAT64 = pa.float64()
PA_INT8 = pa.int8()
PA_INT16 = pa.int16()
PA_INT32 = pa.int32()
PA_INT64 = pa.int64()
PA_STRING = pa.string()
PA_TIMESTAMP = pa.timestamp('ns')
PA_BINARY = pa.binary()


def get_delimiter(csv_file, custom_delimiter=','):
    if csv_file[-4:] == '.tsv':
        return '\t'
    return custom_delimiter if custom_delimiter else ','


def sanitize_column_name(name):
    cleaned = re.sub('[^a-z0-9]', '_', name.lower())
    cleaned = re.sub('__*', '_', cleaned)
    cleaned = re.sub('^_*', '', cleaned)
Example #51
def test_sequence_numpy_boolean(seq):
    expected = [np.bool_(True), None, np.bool_(False), None]
    arr = pa.array(seq(expected))
    assert arr.type == pa.bool_()
    assert arr.to_pylist() == [True, None, False, None]
Example #52
)
xfail_bool_too_few_uniques = pytest.mark.xfail_by_type_filter(
    [pa.types.is_boolean], "Test requires at least 3 unique values")

test_types = [
    FletcherTestType(
        pa.string(),
        ["🙈", "Ö", "Č", "a", "B"] * 20,
        [None, "A"],
        ["B", "B", None, None, "A", "A", "B", "C"],
        ["B", "C", "A"],
        ["B", None, "A"],
        lambda: choices(list(string.ascii_letters), k=10),
    ),
    FletcherTestType(
        pa.bool_(),
        [True, False, True, True, False] * 20,
        [None, False],
        [True, True, None, None, False, False, True, False],
        [True, False, False],
        [True, None, False],
        lambda: choices([True, False], k=10),
    ),
    FletcherTestType(
        pa.int8(),
        # Use small values here so that np.prod stays in int32
        [2, 1, 1, 2, 1] * 20,
        [None, 1],
        [2, 2, None, None, -100, -100, 2, 100],
        [2, 100, -10],
        [2, None, -10],
Example #53
import pyarrow as pa

schema_fields = [
    pa.field("timestamp", pa.date64(), False),
    pa.field("timezone", pa.uint64(), False).with_metadata({
        "illex_MIN": "0",
        "illex_MAX": "1024"
    }),
    pa.field("vin", pa.uint64(), False),
    pa.field("odometer", pa.uint64(), False).with_metadata({
        "illex_MIN": "0",
        "illex_MAX": "1000"
    }),
    pa.field("hypermiling", pa.bool_(), False),
    pa.field("avgspeed", pa.uint64(), False).with_metadata({
        "illex_MIN": "0",
        "illex_MAX": "200"
    }),
    pa.field(
        "sec_in_band",
        pa.list_(
            pa.field("item", pa.uint64(), False).with_metadata({
                "illex_MIN":
                "0",
                "illex_MAX":
                "4192"
            }), 12), False),
    pa.field(
        "miles_in_time_range",
        pa.list_(
            pa.field("item", pa.uint64(), False).with_metadata({
Example #54
    def csv_to_table(self,
                     csv_path,
                     table_name,
                     read=None,
                     parse=None,
                     convert=None,
                     con=None,
                     auto_infer=False):
        ' Pyarrow CSV reader documentation: https://arrow.apache.org/docs/python/generated/pyarrow.csv.read_csv.html '

        if not ARROW:
            return "Optional pyarrow dependency not found. To install: pip3 install pyarrow"

        sqream_to_pa = {
            'ftBool': pa.bool_(),
            'ftUByte': pa.uint8(),
            'ftShort': pa.int16(),
            'ftInt': pa.int32(),
            'ftLong': pa.int64(),
            'ftFloat': pa.float32(),
            'ftDouble': pa.float64(),
            'ftDate': pa.timestamp('ns'),
            'ftDateTime': pa.timestamp('ns'),
            'ftVarchar': pa.string(),
            'ftBlob': pa.utf8()
        }

        start = time.time()
        # Get table metadata
        con = con or self
        con.execute(f'select * from {table_name} where 1=0')

        # Map column names to pyarrow types and set Arrow's CSV parameters
        sqream_col_types = [col_type[0] for col_type in con.col_type_tups]
        column_types = zip(
            con.col_names,
            [sqream_to_pa[col_type[0]] for col_type in con.col_type_tups])
        read = read or csv.ReadOptions(column_names=con.col_names)
        parse = parse or csv.ParseOptions(delimiter='|')
        convert = convert or csv.ConvertOptions(
            column_types=None if auto_infer else column_types)

        # Read CSV to in-memory arrow format
        csv_arrow = csv.read_csv(csv_path,
                                 read_options=read,
                                 parse_options=parse,
                                 convert_options=convert).combine_chunks()
        num_chunks = len(csv_arrow[0].chunks)
        numpy_cols = []

        # For each column, get the numpy representation for quick packing
        for col_type, col in zip(sqream_col_types, csv_arrow):
            # Only one chunk after combine_chunks()
            col = col.chunks[0]
            if col_type in ('ftVarchar', 'ftBlob', 'ftDate', 'ftDateTime'):
                col = col.to_pandas()
            else:
                col = col.to_numpy()

            numpy_cols.append(col)

        print(f'total loading csv: {time.time()-start}')
        start = time.time()

        # Insert columns into SQream
        col_num = csv_arrow.shape[1]
        con.executemany(
            f'insert into {table_name} values ({"?,"*(col_num-1)}?)',
            numpy_cols)
        print(f'total inserting csv: {time.time()-start}')
Example #55
def test_struct_from_dicts_inference():
    expected_type = pa.struct([
        pa.field('a', pa.int64()),
        pa.field('b', pa.string()),
        pa.field('c', pa.bool_())
    ])
    data = [{
        'a': 5,
        'b': u'foo',
        'c': True
    }, {
        'a': 6,
        'b': u'bar',
        'c': False
    }]
    arr = pa.array(data)
    check_struct_type(arr.type, expected_type)
    assert arr.to_pylist() == data

    # With omitted values
    data = [{'a': 5, 'c': True}, None, {}, {'a': None, 'b': u'bar'}]
    expected = [{
        'a': 5,
        'b': None,
        'c': True
    }, None, {
        'a': None,
        'b': None,
        'c': None
    }, {
        'a': None,
        'b': u'bar',
        'c': None
    }]
    arr = pa.array(data)
    data_as_ndarray = np.empty(len(data), dtype=object)
    data_as_ndarray[:] = data
    arr2 = pa.array(data_as_ndarray)

    check_struct_type(arr.type, expected_type)
    assert arr.to_pylist() == expected
    assert arr.equals(arr2)

    # Nested
    expected_type = pa.struct([
        pa.field(
            'a',
            pa.struct([
                pa.field('aa', pa.list_(pa.int64())),
                pa.field('ab', pa.bool_())
            ])),
        pa.field('b', pa.string())
    ])
    data = [{
        'a': {
            'aa': [5, 6],
            'ab': True
        },
        'b': 'foo'
    }, {
        'a': {
            'aa': None,
            'ab': False
        },
        'b': None
    }, {
        'a': None,
        'b': 'bar'
    }]
    arr = pa.array(data)
    assert arr.to_pylist() == data

    # Edge cases
    arr = pa.array([{}])
    assert arr.type == pa.struct([])
    assert arr.to_pylist() == [{}]

    # Mixing structs and scalars is rejected
    with pytest.raises((pa.ArrowInvalid, pa.ArrowTypeError)):
        pa.array([1, {'a': 2}])
Example #56
    def __init__(
        self,
        name: str,
        values: "Union[np.array, List[Optional[Any]]]" = None,
        nullable: bool = True,
        dtype: "Optional[DataType]" = None,
    ):
        """

        Parameters
        ----------
        name
            Name of the series
        values
            Values of the series
        nullable
            If nullable.
                None values in a list will be interpreted as missing.
                NaN values in a numpy array will be interpreted as missing. Note that missing and NaNs are not the same
                in Polars
            Series creation may be faster if set to False and there are no null values.
        """
        # assume the first input were the values
        if values is None and not isinstance(name, str):
            values = name
            name = ""
        if values.__class__ == self.__class__:
            values.rename(name)
            self._s = values._s
            return

        self._s: PySeries
        # series path
        if isinstance(values, Series):
            self._from_pyseries(values)
            return
        elif isinstance(values, dict):
            raise ValueError(
                f"Constructing a Series with a dict is not supported for {values}"
            )
        elif isinstance(values, pa.Array):
            self._s = self.from_arrow(name, values)._s
            return

        # castable to numpy
        if not isinstance(values, np.ndarray) and not nullable:
            values = np.array(values)

        if dtype is not None:
            if dtype == Int8:
                self._s = PySeries.new_i8(name, values)
            elif dtype == Int16:
                self._s = PySeries.new_i16(name, values)
            elif dtype == Int32:
                self._s = PySeries.new_i32(name, values)
            elif dtype == Int64:
                self._s = PySeries.new_i64(name, values)
            elif dtype == UInt8:
                self._s = PySeries.new_u8(name, values)
            elif dtype == UInt16:
                self._s = PySeries.new_u16(name, values)
            elif dtype == UInt32:
                self._s = PySeries.new_u32(name, values)
            elif dtype == UInt64:
                self._s = PySeries.new_u64(name, values)
            elif dtype == Float32:
                self._s = PySeries.new_f32(name, values)
            elif dtype == Float64:
                self._s = PySeries.new_f64(name, values)
            elif dtype == Boolean:
                self._s = PySeries.new_bool(name, values)
            elif dtype == Utf8:
                self._s = PySeries.new_str(name, values)
            else:
                raise ValueError(
                    f"dtype {dtype} not yet supported when creating a Series")
            return

        # numpy path
        if isinstance(values, np.ndarray):
            # the native backend expects a contiguous buffer; copy if needed
            if not values.data.contiguous:
                values = np.ascontiguousarray(values)
            if len(values.shape) > 1:
                self._s = PySeries.new_object(name, values)
                return
            dtype = values.dtype
            if dtype == np.int64:
                self._s = PySeries.new_i64(name, values)
            elif dtype == np.int32:
                self._s = PySeries.new_i32(name, values)
            elif dtype == np.int16:
                self._s = PySeries.new_i16(name, values)
            elif dtype == np.int8:
                self._s = PySeries.new_i8(name, values)
            elif dtype == np.float32:
                self._s = PySeries.new_f32(name, values, nullable)
            elif dtype == np.float64:
                self._s = PySeries.new_f64(name, values, nullable)
            elif len(values) > 0 and isinstance(values[0], str):
                self._s = PySeries.new_str(name, values)
            elif dtype == np.bool_:  # np.bool is deprecated in favor of np.bool_
                self._s = PySeries.new_bool(name, values)
            elif dtype == np.uint8:
                self._s = PySeries.new_u8(name, values)
            elif dtype == np.uint16:
                self._s = PySeries.new_u16(name, values)
            elif dtype == np.uint32:
                self._s = PySeries.new_u32(name, values)
            elif dtype == np.uint64:
                self._s = PySeries.new_u64(name, values)
            else:
                self._s = PySeries.new_object(name, values)
            return
        # list path
        else:
            first_value = _find_first_non_none(values)
            # order is important, as booleans are instances of int in Python
            if isinstance(first_value, bool):
                self._s = PySeries.new_opt_bool(name, values)
            elif isinstance(first_value, int):
                self._s = PySeries.new_opt_i64(name, values)
            elif isinstance(first_value, float):
                self._s = PySeries.new_opt_f64(name, values)
            elif isinstance(first_value, str):
                self._s = PySeries.new_str(name, values)
            # make list array
            elif isinstance(first_value, (list, tuple)):
                inner_value = _find_first_non_none(first_value)

                # a mixed input such as `[[12], "foo", 9]` is expected to fail;
                # in that case we catch the exception and fall back to an object type
                try:
                    if isinstance(inner_value, bool):
                        arrow_array = pa.array(values,
                                               pa.large_list(pa.bool_()))
                    elif isinstance(inner_value, int):
                        arrow_array = pa.array(values,
                                               pa.large_list(pa.int64()))
                    elif isinstance(inner_value, float):
                        arrow_array = pa.array(values,
                                               pa.large_list(pa.float64()))
                    elif isinstance(inner_value, str):
                        arrow_array = pa.array(values,
                                               pa.large_list(pa.large_utf8()))
                    else:
                        self._s = PySeries.new_object(name, values)
                        return
                    self._s = Series.from_arrow(name, arrow_array)._s

                except pa.lib.ArrowInvalid:
                    self._s = PySeries.new_object(name, values)
            else:
                self._s = PySeries.new_object(name, values)
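
# A minimal usage sketch for the constructor above (illustrative only; it
# assumes the package is importable as `pl` and that the dtype names match
# the ones dispatched on above).
import numpy as np
import polars as pl

s1 = pl.Series("ints", [1, 2, None])  # list path: None marks a missing value
s2 = pl.Series("floats", np.array([1.0, np.nan, 3.0]))  # numpy path: NaN is missing
s3 = pl.Series("flags", [True, False, True], dtype=pl.Boolean)  # explicit dtype path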
Example #57
# In the following, we use the JSON serialization of the Field objects in Java.
# This ensures that we don't rely on the exact mechanics of how they are
# constructed in Java code, and it lets us define them as test parameters
# without having to invoke the JVM.
#
# The specifications were created using:
#
#   om = jpype.JClass('com.fasterxml.jackson.databind.ObjectMapper')()
#   field = …  # Code to instantiate the field
#   jvm_spec = om.writeValueAsString(field)
@pytest.mark.parametrize(
    'typ,jvm_spec',
    [
        (pa.null(), '{"name":"null"}'),
        (pa.bool_(), '{"name":"bool"}'),
        (pa.int8(), '{"name":"int","bitWidth":8,"isSigned":true}'),
        (pa.int16(), '{"name":"int","bitWidth":16,"isSigned":true}'),
        (pa.int32(), '{"name":"int","bitWidth":32,"isSigned":true}'),
        (pa.int64(), '{"name":"int","bitWidth":64,"isSigned":true}'),
        (pa.uint8(), '{"name":"int","bitWidth":8,"isSigned":false}'),
        (pa.uint16(), '{"name":"int","bitWidth":16,"isSigned":false}'),
        (pa.uint32(), '{"name":"int","bitWidth":32,"isSigned":false}'),
        (pa.uint64(), '{"name":"int","bitWidth":64,"isSigned":false}'),
        (pa.float16(), '{"name":"floatingpoint","precision":"HALF"}'),
        (pa.float32(), '{"name":"floatingpoint","precision":"SINGLE"}'),
        (pa.float64(), '{"name":"floatingpoint","precision":"DOUBLE"}'),
        (pa.time32('s'), '{"name":"time","unit":"SECOND","bitWidth":32}'),
        (pa.time32('ms'),
         '{"name":"time","unit":"MILLISECOND","bitWidth":32}'),
        (pa.time64('us'),
         '{"name":"time","unit":"MICROSECOND","bitWidth":64}'),
    ])
Example #58
def test_is_boolean():
    assert types.is_boolean(pa.bool_())
    assert not types.is_boolean(pa.int8())
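
# A small extension sketch (not in the original test): pyarrow.types exposes
# analogous predicates for other type families.
assert types.is_integer(pa.int8())
assert types.is_floating(pa.float64())
assert not types.is_string(pa.bool_())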
Example #59
def test_try_type_and_type_forbidden(self):
    with self.assertRaises(AssertionError):
        _ = pa.array(TypedSequence([1, 2, 3], try_type=pa.bool_(), type=pa.int64()))
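
# For contrast, a minimal sketch (not part of the original suite) of the
# allowed usage, assuming this TypedSequence implements __arrow_array__:
# pass only one of `type` and `try_type`. An incompatible `try_type` is
# attempted and then falls back to the inferred type.
arr = pa.array(TypedSequence([1, 2, 3], try_type=pa.bool_()))
assert arr.type == pa.int64()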
Example #60
logger = logging.getLogger(__name__)

PROJECT_PQ = pa.schema([
    pa.field('aha_id', pa.string()),
    pa.field('reference_prefix', pa.string()),
    pa.field('name', pa.string()),
    pa.field('last_release_num', pa.int32()),
    pa.field('last_feature_num', pa.int32()),
    pa.field('last_idea_num', pa.int32()),
    pa.field('position', pa.int32()),
    pa.field('positioning_customer', pa.string()),
    pa.field('positioning_problem', pa.string()),
    pa.field('positioning_benefit1', pa.string()),
    pa.field('positioning_benefit2', pa.string()),
    pa.field('positioning_benefit3', pa.string()),
    pa.field('product_line', pa.bool_()),
    pa.field('product_line_type', pa.string()),
    pa.field('capacity_planning_enabled', pa.bool_()),
    pa.field('ideas_scoring_system_id', pa.string()),
    pa.field('ideas_default_user_id', pa.string()),
    pa.field('default_capacity_units', pa.int32()),
    pa.field('default_feature_remaining_estimate', pa.bool_()),
    pa.field('last_page_num', pa.int32()),
    pa.field('color', pa.int32()),
    pa.field('workflow_screen_enabled', pa.bool_()),
    pa.field('competitor_scoring_system_id', pa.string()),
    pa.field('initiative_workflow_id', pa.string()),
    pa.field('strategic_imperative_workflow_id', pa.string()),
    pa.field('estimated_time_as_work_done', pa.bool_()),
    pa.field('last_epic_num', pa.int32()),
    pa.field('configuration', pa.string()),