Esempio n. 1
0
def test_data_type():
    """Check ``DiscreteFactor.data_type()`` before and after fitting.

    Calling ``data_type()`` on an unfitted factor must raise ``ValueError``.
    After fitting, the expected dictionary index type is int8 for 2 and for
    128 categories, and int16 for 129 categories.
    """
    a = pbn.DiscreteFactor("A", [])
    with pytest.raises(ValueError) as ex:
        a.data_type()
    # This used to be a bare expression, so the message was never verified.
    assert "DiscreteFactor factor not fitted." in str(ex.value)

    def categorical_frame(categories):
        """Return a 100-row frame whose column 'A' samples `categories`."""
        picks = categories[np.random.randint(len(categories), size=100)]
        values = pd.Categorical(picks, categories=categories, ordered=False)
        return pd.DataFrame({'A': values})

    # (categories, expected dictionary index type)
    cases = [
        (np.asarray(["a1", "a2"]), pa.int8()),
        (np.asarray(["a" + str(i) for i in range(1, 129)]), pa.int8()),
        (np.asarray(["a" + str(i) for i in range(1, 130)]), pa.int16()),
    ]
    for categories, index_type in cases:
        a.fit(categorical_frame(categories))
        assert a.data_type() == pa.dictionary(index_type, pa.string())
Esempio n. 2
0
def test_schema_merge():
    """``pa.unify_schemas`` merges compatible schemas and rejects conflicts."""
    qux = pa.field('qux', pa.bool_())
    quux = pa.field('quux', pa.dictionary(pa.int32(), pa.string()))
    base_fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8())),
    ]

    first = pa.schema(base_fields)
    second = pa.schema([pa.field('foo', pa.int32()), qux])
    third = pa.schema([quux])
    # Same field names as `second`, but 'foo' is int64 instead of int32.
    conflicting = pa.schema([pa.field('foo', pa.int64()), qux])

    merged = pa.schema(base_fields + [qux, quux])

    # ARROW-14002: a tuple of schemas must be accepted as well as a list.
    for schemas in ([first, second, third], (first, second, third)):
        assert pa.unify_schemas(schemas).equals(merged)

    with pytest.raises(pa.ArrowInvalid):
        pa.unify_schemas([second, conflicting])
Esempio n. 3
0
    def test_column_types_dict(self):
        """Read CSV columns as the dict-encoded types set in ConvertOptions."""
        # Dictionary value type requested for each column (int32 indices).
        value_types = {
            'a': pa.utf8(),
            'b': pa.int64(),
            'c': pa.decimal128(11, 2),
            'd': pa.large_utf8(),
        }
        column_types = [(name, pa.dictionary(pa.int32(), value_type))
                        for name, value_type in value_types.items()]

        csv_data = (b"a,b,c,d\n"
                    b"abc,123456,1.0,zz\n"
                    b"defg,123456,0.5,xx\n"
                    b"abc,N/A,1.0,xx\n")
        table = self.read_bytes(
            csv_data,
            convert_options=ConvertOptions(column_types=dict(column_types)))

        assert table.schema == pa.schema(column_types)
        assert table.to_pydict() == {
            'a': ["abc", "defg", "abc"],
            'b': [123456, 123456, None],
            'c': [Decimal("1.00"), Decimal("0.50"), Decimal("1.00")],
            'd': ["zz", "xx", "xx"],
        }

        # Unsupported index type
        column_types[0] = ('a', pa.dictionary(pa.int8(), pa.utf8()))
        bad_options = ConvertOptions(column_types=dict(column_types))
        with pytest.raises(NotImplementedError):
            self.read_bytes(csv_data, convert_options=bad_options)
Esempio n. 4
0
def test_table(n, types=None, offset=None, length=None, nullable=True):
    """Build a table holding one generated column per requested type.

    For every type a column named ``str(type)`` is produced with
    ``TestArrayGenerator(n, type, False)``. When `nullable` is true a second
    column named ``str(type) + ' (null)'`` is produced with the generator's
    last argument set to ``True``. If `offset` is given, each array is
    replaced by ``array.slice(offset, length)`` before being added.

    When `types` is ``None`` a built-in list of primitive, temporal, binary,
    dictionary and nested types is used.
    """
    if types is None:
        int32_field = pyarrow.field('int32', pyarrow.int32())
        dictionary_pairs = (
            (pyarrow.int32, pyarrow.string),
            (pyarrow.int64, pyarrow.int64),
        )

        types = [pyarrow.null(), pyarrow.bool_()]
        types += [make() for make in (
            pyarrow.int8, pyarrow.int16, pyarrow.int32, pyarrow.int64,
            pyarrow.uint8, pyarrow.uint16, pyarrow.uint32, pyarrow.uint64,
            pyarrow.float16, pyarrow.float32, pyarrow.float64,
            pyarrow.date32, pyarrow.date64,
        )]
        types += [pyarrow.timestamp(unit) for unit in ('s', 'ms', 'us', 'ns')]
        types += [pyarrow.time32(unit) for unit in ('s', 'ms')]
        types += [pyarrow.time64(unit) for unit in ('us', 'ns')]
        types += [pyarrow.string(), pyarrow.binary(), pyarrow.binary(4)]
        # Third positional argument is passed as True first, then False.
        types += [pyarrow.dictionary(index(), value(), flag)
                  for flag in (True, False)
                  for index, value in dictionary_pairs]
        types += [
            pyarrow.list_(pyarrow.int32()),
            pyarrow.struct([int32_field]),
            pyarrow.list_(pyarrow.struct([int32_field])),
            pyarrow.struct(
                [pyarrow.field('int32', pyarrow.list_(pyarrow.int32()))]),
        ]

    # The plain column always comes first, optionally followed by the
    # ' (null)' variant.
    null_flags = (False, True) if nullable else (False,)

    columns = []
    for dtype in types:
        for with_nulls in null_flags:
            label = str(dtype) + (' (null)' if with_nulls else '')
            values = TestArrayGenerator(n, dtype, with_nulls).array
            if offset is not None:
                values = values.slice(offset, length)
            columns.append(pyarrow.column(label, values))

    return pyarrow.Table.from_arrays(columns)
Esempio n. 5
0
def test_dictionary_ordered_equals():
    """Dictionary types differ when `ordered` or the index type differs."""
    # Python side checking of ARROW-6345
    reference = pa.dictionary('int32', 'binary', ordered=True)
    unordered = pa.dictionary('int32', 'binary', ordered=False)
    int8_indexed = pa.dictionary('int8', 'binary', ordered=True)
    identical = pa.dictionary('int32', 'binary', ordered=True)

    for other in (unordered, int8_indexed):
        assert not reference.equals(other)
    assert reference.equals(identical)
Esempio n. 6
0
def test_dictionary_type():
    """Check index type, dictionary values and `ordered` on dictionary types."""
    ty0 = pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c']))
    assert ty0.index_type == pa.int32()
    assert isinstance(ty0.dictionary, pa.Array)
    assert ty0.dictionary.to_pylist() == ['a', 'b', 'c']
    assert ty0.ordered is False

    ty1 = pa.dictionary(pa.int8(), pa.array([1.0, 2.0]), ordered=True)
    assert ty1.index_type == pa.int8()
    # Previously re-checked ty0 here, leaving ty1.dictionary unverified.
    assert isinstance(ty1.dictionary, pa.Array)
    assert ty1.dictionary.to_pylist() == [1.0, 2.0]
    assert ty1.ordered is True
def test_dictionary_type():
    """Check index type, dictionary values and `ordered` on dictionary types."""
    ty0 = pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c']))
    assert ty0.index_type == pa.int32()
    assert isinstance(ty0.dictionary, pa.Array)
    assert ty0.dictionary.to_pylist() == ['a', 'b', 'c']
    assert ty0.ordered is False

    # Dictionary indices must be an integer type; the float32 index used
    # here before is not a valid dictionary index type.
    ty1 = pa.dictionary(pa.int8(), pa.array([1.0, 2.0]), ordered=True)
    assert ty1.index_type == pa.int8()
    # Previously re-checked ty0 here, leaving ty1.dictionary unverified.
    assert isinstance(ty1.dictionary, pa.Array)
    assert ty1.dictionary.to_pylist() == [1.0, 2.0]
    assert ty1.ordered is True
Esempio n. 8
0
    def test_auto_dict_encode(self):
        """Check ``auto_dict_encode`` with cardinality limits and UTF8 checks."""
        convert = ConvertOptions(auto_dict_encode=True)
        valid_csv = "a,b\nab,1\ncdé,2\ncdé,3\nab,4".encode()
        int_column = ('b', pa.int64())
        dict_schema = pa.schema(
            [('a', pa.dictionary(pa.int32(), pa.string())), int_column])
        contents = {
            'a': ["ab", "cdé", "cdé", "ab"],
            'b': [1, 2, 3, 4],
        }

        def read_and_check(expected_schema):
            # Reads with the current state of `convert`, which is mutated
            # between calls.
            result = self.read_bytes(valid_csv, convert_options=convert)
            assert result.schema == expected_schema
            assert result.to_pydict() == contents

        read_and_check(dict_schema)

        convert.auto_dict_max_cardinality = 2
        read_and_check(dict_schema)

        # Cardinality above max => plain-encoded
        convert.auto_dict_max_cardinality = 1
        read_and_check(pa.schema([('a', pa.string()), int_column]))

        # With invalid UTF8, not checked
        convert.auto_dict_max_cardinality = 50
        convert.check_utf8 = False
        invalid_csv = b"a,b\nab,1\ncd\xff,2\nab,3"
        unchecked = self.read_bytes(invalid_csv,
                                    convert_options=convert,
                                    validate_full=False)
        assert unchecked.schema == dict_schema
        dictionary = unchecked['a'].chunk(0).dictionary
        assert len(dictionary) == 2
        assert dictionary[0].as_py() == "ab"
        assert dictionary[1].as_buffer() == b"cd\xff"

        # With invalid UTF8, checked
        convert.check_utf8 = True
        checked = self.read_bytes(invalid_csv, convert_options=convert)
        binary_schema = pa.schema(
            [('a', pa.dictionary(pa.int32(), pa.binary())), int_column])
        assert checked.schema == binary_schema
        assert checked.to_pydict() == {
            'a': [b"ab", b"cd\xff", b"ab"],
            'b': [1, 2, 3],
        }
Esempio n. 9
0
def test_cast_from_null():
    """Cast an all-null array to many types; some targets are unimplemented."""
    nulls = [None] * 3
    null_type = pa.null()

    struct_type = pa.struct([pa.field('a', pa.int32()),
                             pa.field('b', pa.list_(pa.int8())),
                             pa.field('c', pa.string())])
    castable = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.large_list(pa.uint8()),
        pa.decimal128(19, 4),
    ]
    castable += [pa.timestamp('us', tz=zone)
                 for zone in (None, 'UTC', 'Europe/Paris')]
    castable.append(struct_type)

    for target in castable:
        # Input and expected output are the same list of nulls.
        _check_cast_case((nulls, null_type, nulls, target))

    def binary_or_string_union(mode):
        return pa.union([pa.field('a', pa.binary(10)),
                         pa.field('b', pa.string())], mode=mode)

    unsupported = [
        pa.dictionary(pa.int32(), pa.string()),
        binary_or_string_union(pa.lib.UnionMode_DENSE),
        binary_or_string_union(pa.lib.UnionMode_SPARSE),
    ]
    null_array = pa.array(nulls, type=pa.null())
    for target in unsupported:
        with pytest.raises(NotImplementedError):
            null_array.cast(target)
Esempio n. 10
0
def test_dictionary_delta(stream_fixture):
    """Compare writer statistics with and without dictionary deltas."""
    dict_type = pa.dictionary(pa.int8(), pa.utf8())
    rows = [
        ["foo", "foo", None],
        ["foo", "bar", "foo"],  # potential delta
        ["foo", "bar"],
        ["foo", None, "bar", "quux"],  # potential delta
        ["bar", "quux"],  # replacement
    ]

    batches = []
    for values in rows:
        column = pa.array(values, type=dict_type)
        batches.append(
            pa.RecordBatch.from_arrays([column], names=['dicts']))
    schema = batches[0].schema

    def write_batches():
        # Uses whatever options are set on the fixture at call time.
        sink = pa.MockOutputStream()
        with stream_fixture._get_writer(sink, schema) as writer:
            for record_batch in batches:
                writer.write_batch(record_batch)
            return writer.stats

    def check_stats(stats, replaced, deltas):
        assert stats.num_record_batches == 5
        assert stats.num_dictionary_batches == 4
        assert stats.num_replaced_dictionaries == replaced
        assert stats.num_dictionary_deltas == deltas

    check_stats(write_batches(), replaced=3, deltas=0)

    stream_fixture.use_legacy_ipc_format = None
    stream_fixture.options = pa.ipc.IpcWriteOptions(
        emit_dictionary_deltas=True)
    check_stats(write_batches(), replaced=1, deltas=2)
Esempio n. 11
0
 def decode(encoding, type_spec):
     """Build a ``DataType`` from a serialized type specification.

     A dict spec is dispatched on its ``'type'`` entry ('duration',
     'timestamp', 'list' or 'dict'); nested value/index types are decoded
     through ``encoding.decode('dtype', ...)``. A non-dict spec is matched
     against a few known names and otherwise handed to ``np.dtype``.

     Raises:
         ValueError: if a dict spec has an unrecognised ``'type'``.
     """
     if isinstance(type_spec, dict):
         kind = type_spec['type']
         if kind == 'duration':
             return DataType(pa.duration(type_spec['unit']))
         if kind == 'timestamp':
             return DataType(pa.timestamp(type_spec['unit']))
         if kind == 'list':
             element = encoding.decode('dtype', type_spec['value_type']).arrow
             return DataType(pa.list_(element))
         if kind == 'dict':
             values = encoding.decode('dtype', type_spec["value_type"]).arrow
             indices = encoding.decode('dtype', type_spec["index_type"]).arrow
             ordered = type_spec["ordered"]
             return DataType(pa.dictionary(indices, values, ordered))
         raise ValueError(f'Do not understand type {type_spec}')

     if type_spec == 'string':
         return DataType(pa.string())
     if type_spec == 'large_string':
         return DataType(pa.large_string())
     # TODO: find a proper way to support all arrow types
     if type_spec == 'timestamp[ms]':
         return DataType(pa.timestamp('ms'))
     return DataType(np.dtype(type_spec))
Esempio n. 12
0
 def test_arrow_schema_category_column(self):
     """A dictionary-of-strings field is expected to render as "text"."""
     category_type = pa.dictionary(pa.int32(), pa.string())
     arrow_schema = pa.schema([pa.field("A", category_type)])
     rendered = arrow_schema_to_render_columns(arrow_schema)
     self.assertEqual(rendered, {"A": RenderColumn("A", "text", None)})
Esempio n. 13
0
def test_cast_from_null():
    """Cast an all-null array to many types; some targets are unimplemented."""
    nulls = [None] * 3
    null_type = pa.null()

    supported_targets = (
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([
            pa.field('a', pa.int32()),
            pa.field('b', pa.list_(pa.int8())),
            pa.field('c', pa.string()),
        ]),
    )
    for target in supported_targets:
        # Input and expected output are the same list of nulls.
        _check_cast_case((nulls, null_type, nulls, target))

    union_modes = (pa.lib.UnionMode_DENSE, pa.lib.UnionMode_SPARSE)
    unsupported_targets = [pa.dictionary(pa.int32(), pa.string())]
    unsupported_targets.extend(
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=mode)
        for mode in union_modes)

    null_array = pa.array(nulls, type=pa.null())
    for target in unsupported_targets:
        with pytest.raises(NotImplementedError):
            null_array.cast(target)
Esempio n. 14
0
def test_dictionary_type():
    """Check `index_type`, `value_type` and `ordered` of dictionary types."""
    # (constructed type, expected index type, expected value type, ordered)
    cases = [
        (pa.dictionary(pa.int32(), pa.string()),
         pa.int32(), pa.string(), False),
        (pa.dictionary(pa.int8(), pa.float64(), ordered=True),
         pa.int8(), pa.float64(), True),
        # construct from non-arrow objects
        (pa.dictionary('int8', 'string'),
         pa.int8(), pa.string(), False),
    ]
    for dict_type, index_type, value_type, ordered in cases:
        assert dict_type.index_type == index_type
        assert dict_type.value_type == value_type
        assert dict_type.ordered is ordered
Esempio n. 15
0
def get_dataframe(parameters, use_threads):
    """Generate a ``pa.Table`` of synthetic data described by `parameters`.

    Parameters
    ----------
    parameters
        Object providing ``seed``, ``num_rows`` and ``column_parameters``.
        Each entry of ``column_parameters`` is read for ``dtype``,
        ``generator``, ``null_frequency`` and ``is_sorted``; its
        ``generator`` attribute is replaced in place by the result of
        calling it.
    use_threads
        When false, columns are generated one after another in this
        process; otherwise ``_generate_column`` is dispatched through
        ``Pool(pa.cpu_count())``.

    Returns
    -------
    pa.Table
        Columns are named ``"0"``, ``"1"``, ... by position. If any column
        has ``is_sorted`` set, the table is round-tripped through pandas
        and sorted by those columns.
    """
    # Initialize seeds
    if parameters.seed is not None:
        np.random.seed(parameters.seed)

    # For each column, use a generic Mimesis producer to create an Iterable
    # for generating data
    for i, column_params in enumerate(parameters.column_parameters):
        if column_params.dtype is None:
            # No explicit dtype: the generator factory receives a seeded
            # Generic("en") provider.
            column_params.generator = column_params.generator(
                Generic("en", seed=parameters.seed))
        else:
            column_params.generator = column_params.generator()

    # Get schema for each column.
    # "category" columns become int64-indexed dictionaries whose value type
    # is inferred from one value drawn from the generator; columns without
    # a dtype are inferred the same way; otherwise the given dtype is used.
    # NOTE(review): next(iter(...)) presumably consumes one value from the
    # generator -- confirm this is acceptable for reproducibility.
    schema = pa.schema([
        pa.field(
            name=str(i),
            type=pa.dictionary(
                index_type=pa.int64(),
                value_type=pa.from_numpy_dtype(
                    type(next(iter(column_params.generator)))),
            ) if isinstance(column_params.dtype, str)
            and column_params.dtype == "category" else pa.from_numpy_dtype(
                type(next(iter(column_params.generator)))
                if column_params.dtype is None else column_params.dtype),
            nullable=column_params.null_frequency > 0,
        ) for i, column_params in enumerate(parameters.column_parameters)
    ])

    # Initialize column data and which columns should be sorted
    column_data = [None] * len(parameters.column_parameters)
    columns_to_sort = [
        str(i) for i, column_params in enumerate(parameters.column_parameters)
        if column_params.is_sorted
    ]
    # Generate data
    if not use_threads:
        for i, column_params in enumerate(parameters.column_parameters):
            column_data[i] = _generate_column(column_params,
                                              parameters.num_rows)
    else:
        # One task per column, spread over pa.cpu_count() pool workers.
        pool = Pool(pa.cpu_count())
        column_data = pool.starmap(
            _generate_column,
            [(column_params, parameters.num_rows)
             for i, column_params in enumerate(parameters.column_parameters)],
        )
        pool.close()
        pool.join()
    # Assemble the Arrow table; only when sorting is requested is it
    # converted to a pandas DataFrame, sorted, and converted back.
    tbl = pa.Table.from_arrays(
        column_data,
        schema=schema,
    )
    if columns_to_sort:
        tbl = tbl.to_pandas()
        tbl = tbl.sort_values(columns_to_sort)
        tbl = pa.Table.from_pandas(tbl, schema)
    return tbl
def get_many_types():
    """Return a tuple of assorted Arrow data types."""
    # These are built inside a function on purpose: the pa.dictionary type
    # holds a pyarrow array, and a test in test_array.py checks that the
    # default memory pool has zero allocated bytes.
    scalar_types = (
        pa.null(), pa.bool_(), pa.int32(), pa.time32('s'), pa.time64('us'),
        pa.date32(), pa.timestamp('us'), pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'), pa.float16(), pa.float32(),
        pa.float64(), pa.decimal128(19, 4), pa.string(), pa.binary(),
        pa.binary(10),
    )

    def abc_struct(nullable):
        # Fields 'a' and 'b' take the given nullability; 'c' keeps the
        # default.
        return pa.struct([
            pa.field('a', pa.int32(), nullable=nullable),
            pa.field('b', pa.int8(), nullable=nullable),
            pa.field('c', pa.string()),
        ])

    def ab_union(mode, nullable=True):
        # Only field 'a' takes the given nullability.
        return pa.union(
            [pa.field('a', pa.binary(10), nullable=nullable),
             pa.field('b', pa.string())],
            mode=mode)

    nested_types = (
        pa.list_(pa.int32()),
        abc_struct(True),
        abc_struct(False),
        ab_union(pa.lib.UnionMode_DENSE),
        ab_union(pa.lib.UnionMode_SPARSE),
        ab_union(pa.lib.UnionMode_SPARSE, nullable=False),
        pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])),
    )
    return scalar_types + nested_types
Esempio n. 17
0
def test_dictionary_type():
    """Check `index_type`, `value_type` and `ordered` of dictionary types."""
    def check(dict_type, index_type, value_type, ordered):
        assert dict_type.index_type == index_type
        assert dict_type.value_type == value_type
        assert dict_type.ordered is ordered

    check(pa.dictionary(pa.int32(), pa.string()),
          pa.int32(), pa.string(), False)

    check(pa.dictionary(pa.int8(), pa.float64(), ordered=True),
          pa.int8(), pa.float64(), True)

    # construct from non-arrow objects
    check(pa.dictionary('int8', 'string'),
          pa.int8(), pa.string(), False)
Esempio n. 18
0
def test_dict_col(tmpdir):
    """Round-trip a dictionary column through Parquet and filter on it."""
    # Write a small Parquet file with one dictionary-encoded column.
    target = tmpdir / 'sample_arrow_dict.parquet'
    arrow_schema = pa.schema({
        'col1': pa.int32(),
        'col2': pa.float32(),
        'col3': pa.dictionary(pa.int16(), pa.string()),
    })
    columns = {
        'col1': range(10),
        'col2': np.random.randn(10),
        'col3': list(np.random.choice(['A', 'B', 'C'], 10)),
    }
    pq.write_table(pa.table(columns, schema=arrow_schema), target)

    # Load df
    frame = vaex.open(target)
    col3_type = frame.dtypes["col3"].arrow
    assert isinstance(col3_type, pa.lib.DictionaryType)

    # Filter
    frame = frame._future()
    only_a = frame[frame["col3"] == 'A']
    assert only_a["col3"].unique() == ["A"]
Esempio n. 19
0
def test_arrow():
    """Convert a Series to Arrow and load a dictionary array into a Series."""
    values = [1, 2, 3, None]
    assert Series("a", values).to_arrow() == pa.array(values)

    dict_type = pa.dictionary(pa.int32(), pa.utf8())
    dict_array = pa.array(["foo", "bar"], dict_type)
    assert pl.Series("a", dict_array).dtype == pl.Utf8
Esempio n. 20
0
def test_filesystem_factory(mockfs, paths_or_selector):
    """Build a dataset via ``FileSystemDatasetFactory`` and check its output."""
    read_options = ds.ParquetReadOptions(dictionary_columns={"str"})
    file_format = ds.ParquetFileFormat(read_options=read_options)

    partition_schema = pa.schema([
        pa.field('group', pa.int32()),
        pa.field('key', pa.string()),
    ])
    options = ds.FileSystemFactoryOptions('subdir')
    options.partitioning = ds.DirectoryPartitioning(partition_schema)
    assert options.partition_base_dir == 'subdir'
    assert options.ignore_prefixes == ['.', '_']
    assert options.exclude_invalid_files is False

    factory = ds.FileSystemDatasetFactory(
        mockfs, paths_or_selector, file_format, options)
    inspected_schema = factory.inspect()

    expected_schema = pa.schema([
        pa.field('i64', pa.int64()),
        pa.field('f64', pa.float64()),
        pa.field('str', pa.dictionary(pa.int32(), pa.string())),
        pa.field('const', pa.int64()),
        pa.field('group', pa.int32()),
        pa.field('key', pa.string()),
    ])
    # inspect() is deliberately called a second time here.
    assert factory.inspect().equals(expected_schema, check_metadata=False)

    assert isinstance(factory.inspect_schemas(), list)
    assert isinstance(factory.finish(inspected_schema), ds.FileSystemDataset)
    assert factory.root_partition.equals(ds.ScalarExpression(True))

    dataset = factory.finish()
    assert isinstance(dataset, ds.FileSystemDataset)
    assert len(list(dataset.scan())) == 2

    scanner = ds.Scanner(dataset)
    counts = [0, 1, 2, 3, 4]
    expected_i64 = pa.array(counts, type=pa.int64())
    expected_f64 = pa.array(counts, type=pa.float64())
    expected_str = pa.DictionaryArray.from_arrays(
        pa.array(counts, type=pa.int32()),
        pa.array("0 1 2 3 4".split(), type=pa.string()))

    partitions = zip(scanner.scan(), [1, 2], ['xxx', 'yyy'])
    for task, group, key in partitions:
        expected_columns = [
            expected_i64,
            expected_f64,
            expected_str,
            pa.array([group - 1] * 5, type=pa.int64()),  # const
            pa.array([group] * 5, type=pa.int32()),  # group
            pa.array([key] * 5, type=pa.string()),  # key
        ]
        for batch in task.execute():
            assert batch.num_columns == 6
            for position, expected_column in enumerate(expected_columns):
                assert batch[position].equals(expected_column)

    table = dataset.to_table()
    assert isinstance(table, pa.Table)
    assert len(table) == 10
    assert table.num_columns == 6
Esempio n. 21
0
def test_dictionary_python():
    """
    Python -> Rust -> Python round trip of a dictionary-encoded array.
    """
    original = pa.array(
        ["a", None, "b", None, "a"],
        type=pa.dictionary(pa.int8(), pa.string()),
    )
    returned = rust.round_trip_array(original)
    assert original == returned
    # Drop both references explicitly, as the test has always done.
    del original, returned
Esempio n. 22
0
 def _dtype_to_arrow(cls, dtype):
     """Map a dtype (or dtype name) to an Arrow type.

     Returns ``None`` for ``None``. The name "category" maps to an
     int32-indexed dictionary of strings, "string" maps to ``pa.string()``,
     and any other name is passed to ``pa.from_numpy_dtype``.
     """
     if dtype is None:
         return None

     # Accept either a plain string or an object exposing `.name`.
     type_name = dtype.name if not isinstance(dtype, str) else dtype

     if type_name == "category":
         return pa.dictionary(index_type=pa.int32(), value_type=pa.string())
     if type_name == "string":
         return pa.string()
     return pa.from_numpy_dtype(type_name)
Esempio n. 23
0
def test_dictionary_type():
    """Check index type, dictionary values and `ordered` on dictionary types."""
    ty0 = pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c']))
    assert ty0.index_type == pa.int32()
    assert isinstance(ty0.dictionary, pa.Array)
    assert ty0.dictionary.to_pylist() == ['a', 'b', 'c']
    assert ty0.ordered is False

    ty1 = pa.dictionary(pa.int8(), pa.array([1.0, 2.0]), ordered=True)
    assert ty1.index_type == pa.int8()
    # Previously re-checked ty0 here, leaving ty1.dictionary unverified.
    assert isinstance(ty1.dictionary, pa.Array)
    assert ty1.dictionary.to_pylist() == [1.0, 2.0]
    assert ty1.ordered is True

    # construct from non-arrow objects
    ty2 = pa.dictionary('int8', ['a', 'b', 'c', 'd'])
    assert ty2.index_type == pa.int8()
    assert isinstance(ty2.dictionary, pa.Array)
    assert ty2.dictionary.to_pylist() == ['a', 'b', 'c', 'd']
    assert ty2.ordered is False
Esempio n. 24
0
 def test_text_dictionary_zero_chunks_is_valid(self):
     """A table built from zero batches with a dictionary column validates."""
     text_dictionary = pyarrow.dictionary(pyarrow.int32(), pyarrow.string())
     arrow_schema = pyarrow.schema([("A", text_dictionary)])
     empty_table = pyarrow.Table.from_batches([], arrow_schema)
     validate(empty_table, TableMetadata(0, [Text("A")]))
Esempio n. 25
0
def test_cat_int_types_3500() -> None:
    """Load dictionary arrays with int8 and uint8 indices as Categorical."""
    with pl.StringCache():
        # Create an enum / categorical / dictionary typed pyarrow array.
        # Most simply done by creating a pandas categorical series first.
        categories = pd.Series(["a", "a", "b"], dtype="category")
        arrow_categories = pa.Array.from_pandas(categories)

        # Each category can be stored as a signed or an unsigned 8-bit
        # integer: pandas uses Int8 while DuckDB uses UInt8.
        for index_type in (pa.int8(), pa.uint8()):
            dict_type = pa.dictionary(index_type=index_type,
                                      value_type=pa.utf8())
            loaded = pl.from_arrow(arrow_categories.cast(dict_type))
            reference = pl.Series(["a", "a", "b"]).cast(pl.Categorical)
            assert loaded.series_equal(reference)
Esempio n. 26
0
def test_empty_table():
    """``Schema.empty_table()`` yields a zero-row table with that schema."""
    fields = [
        pa.field('f0', pa.int64()),
        pa.field('f1', pa.dictionary(pa.int32(), pa.string())),
        pa.field('f2', pa.list_(pa.list_(pa.int64()))),
    ]
    schema = pa.schema(fields)

    empty = schema.empty_table()

    assert isinstance(empty, pa.Table)
    assert empty.num_rows == 0
    assert empty.schema == schema
Esempio n. 27
0
 def schema(cls):
     """Return the Arrow schema, tagged with type metadata "BetfairTicker"."""
     columns = {
         "instrument_id": pa.dictionary(pa.int8(), pa.string()),
         "ts_event": pa.int64(),
         "ts_init": pa.int64(),
         "last_traded_price": pa.string(),
         "traded_volume": pa.string(),
     }
     return pa.schema(columns, metadata={"type": "BetfairTicker"})
Esempio n. 28
0
def test_arrow():
    """Check Series <-> Arrow conversion for plain, dictionary and list data."""
    values = [1, 2, 3, None]
    assert pl.Series("a", values).to_arrow() == pa.array(values)

    dict_array = pa.array(["foo", "bar"],
                          pa.dictionary(pa.int32(), pa.utf8()))
    assert pl.Series("a", dict_array).dtype == pl.Categorical

    list_array = pa.array([["foo"], ["foo", "bar"]], pa.list_(pa.utf8()))
    assert pl.from_arrow(list_array).dtype == pl.List
Esempio n. 29
0
def test_schema_repr_with_dictionaries():
    """``repr`` of a schema spells out the dictionary field's parameters."""
    sch = pa.schema([
        pa.field('one', pa.dictionary(pa.int16(), pa.string())),
        pa.field('two', pa.int32()),
    ])

    expected_lines = [
        "one: dictionary<values=string, indices=int16, ordered=0>",
        "two: int32",
    ]
    assert repr(sch) == "\n".join(expected_lines)
def test_dictionary_type():
    """Check dictionary type attributes and rejection of a bad index type."""
    # (constructed type, expected index type, expected value type, ordered)
    cases = [
        (pa.dictionary(pa.int32(), pa.string()),
         pa.int32(), pa.string(), False),
        (pa.dictionary(pa.int8(), pa.float64(), ordered=True),
         pa.int8(), pa.float64(), True),
        # construct from non-arrow objects
        (pa.dictionary('int8', 'string'),
         pa.int8(), pa.string(), False),
        # allow unsigned integers for index type
        (pa.dictionary(pa.uint32(), pa.string()),
         pa.uint32(), pa.string(), False),
    ]
    for dict_type, index_type, value_type, ordered in cases:
        assert dict_type.index_type == index_type
        assert dict_type.value_type == value_type
        assert dict_type.ordered is ordered

    # invalid index type raises
    with pytest.raises(TypeError):
        pa.dictionary(pa.string(), pa.int64())
Esempio n. 31
0
def test_schema_repr_with_dictionaries():
    """``repr`` of a schema spells out the dictionary field's parameters."""
    dict_field = pa.field('one', pa.dictionary(pa.int16(), pa.string()))
    int_field = pa.field('two', pa.int32())
    sch = pa.schema([dict_field, int_field])

    expected = (
        "one: dictionary<values=string, indices=int16, ordered=0>\n"
        "two: int32"
    )
    assert repr(sch) == expected
Esempio n. 32
0
    def test_dict(self):
        """
        Python -> Rust -> Python round trip of a dictionary-encoded array.
        """
        dict_type = pyarrow.dictionary(pyarrow.int64(), pyarrow.utf8())
        original = pyarrow.array(["a", "a", "b", None, "c"], dict_type)

        returned = arrow_pyarrow_integration_testing.round_trip_array(
            original)

        returned.validate(full=True)
        assert original.to_pylist() == returned.to_pylist()
        assert original.type == returned.type
Esempio n. 33
0
def test_build_table_infer_type():
    """``make_column`` infers the Arrow type from the supplied values."""
    table = make_table(
        make_column("A", ["x"]),
        make_column("B", [datetime.date(2021, 4, 7)]),
        make_column("C", [datetime.datetime(2021, 4, 7, 19, 24, 1, 1)]),
        make_column("D", [1.0]),
        make_column("dict", ["x"], dictionary=True),
    )

    expected_types = {
        "A": pa.string(),
        "B": pa.date32(),
        "C": pa.timestamp("ns"),
        "D": pa.float64(),
        "dict": pa.dictionary(pa.int32(), pa.string()),
    }
    for column_name, expected_type in expected_types.items():
        assert table[column_name].type == expected_type
Esempio n. 34
0
def create_cems_schema():
    """Build the explicit Arrow schema used for the EPA CEMS data.

    Edit this function to change the column types of the generated parquet
    files.

    Parquet's internal representation has no unsigned numbers or 16-bit
    ints, so everything numeric is kept as int32 or float32.

    Returns:
        pyarrow.schema: An Arrow schema for the EPA CEMS data.

    """
    def field_maker(arrow_type, nullable):
        # Returns a callable that turns a column name into a pa.field.
        return partial(pa.field, type=arrow_type, nullable=nullable)

    int_nullable = field_maker(pa.int32(), True)
    int_not_null = field_maker(pa.int32(), False)
    str_not_null = field_maker(pa.string(), False)
    # Timestamp resolution is hourly, but second is the largest allowed.
    timestamp = field_maker(pa.timestamp("s", tz="UTC"), False)
    # (float32 can accurately hold integers up to 16,777,216 so no need for
    # float64)
    float_nullable = field_maker(pa.float32(), True)
    float_not_null = field_maker(pa.float32(), False)
    dict_nullable = field_maker(
        pa.dictionary(pa.int8(), pa.string(), ordered=False), True)

    # Column order here is the field order of the resulting schema.
    columns = [
        ("state", dict_nullable),
        ("plant_id_eia", int_not_null),
        ("unitid", str_not_null),
        ("operating_datetime_utc", timestamp),
        ("operating_time_hours", float_nullable),
        ("gross_load_mw", float_not_null),
        ("steam_load_1000_lbs", float_nullable),
        ("so2_mass_lbs", float_nullable),
        ("so2_mass_measurement_code", dict_nullable),
        ("nox_rate_lbs_mmbtu", float_nullable),
        ("nox_rate_measurement_code", dict_nullable),
        ("nox_mass_lbs", float_nullable),
        ("nox_mass_measurement_code", dict_nullable),
        ("co2_mass_tons", float_nullable),
        ("co2_mass_measurement_code", dict_nullable),
        ("heat_content_mmbtu", float_not_null),
        ("facility_id", int_nullable),
        ("unit_id_epa", int_nullable),
        ("year", int_not_null),
    ]
    return pa.schema([make_field(name) for name, make_field in columns])
Esempio n. 35
0
def test_schema_repr_with_dictionaries():
    """``repr`` of a schema also lists the dictionary's values."""
    dictionary_values = pa.array(['foo', 'bar', 'baz'], type=pa.string())
    sch = pa.schema([
        pa.field('one', pa.dictionary(pa.int16(), dictionary_values)),
        pa.field('two', pa.int32()),
    ])

    expected_lines = [
        "one: dictionary<values=string, indices=int16, ordered=0>",
        '  dictionary: ["foo", "bar", "baz"]',
        "two: int32",
    ]
    assert repr(sch) == "\n".join(expected_lines)
Esempio n. 36
0
def get_many_types():
    """Return a tuple of assorted Arrow data types."""
    # The types are built inside a function rather than at module level.
    # The reason originally noted here: a pa.dictionary type can hold a
    # pyarrow array, and a test in test_array.py checks that the default
    # memory pool has zero allocated bytes.
    def abc_struct(nullable):
        # Fields 'a' and 'b' take the given nullability; 'c' keeps the
        # default.
        return pa.struct([pa.field('a', pa.int32(), nullable=nullable),
                          pa.field('b', pa.int8(), nullable=nullable),
                          pa.field('c', pa.string())])

    def ab_union(mode, nullable=True):
        # Only field 'a' takes the given nullability.
        return pa.union([pa.field('a', pa.binary(10), nullable=nullable),
                         pa.field('b', pa.string())], mode=mode)

    many_types = [
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
    ]
    many_types += [pa.timestamp('us', tz=zone)
                   for zone in (None, 'UTC', 'Europe/Paris')]
    many_types += [
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        abc_struct(True),
        abc_struct(False),
        ab_union(pa.lib.UnionMode_DENSE),
        ab_union(pa.lib.UnionMode_SPARSE),
        ab_union(pa.lib.UnionMode_SPARSE, nullable=False),
        pa.dictionary(pa.int32(), pa.string()),
    ]
    return tuple(many_types)
Esempio n. 37
0
def test_is_dictionary():
    """``types.is_dictionary`` accepts dictionary types and rejects int32."""
    dict_type = pa.dictionary(pa.int32(), pa.string())
    plain_type = pa.int32()

    assert types.is_dictionary(dict_type)
    assert not types.is_dictionary(plain_type)
Esempio n. 38
0
def test_dictionary_type():
    """A dictionary type exposes its index type and its dictionary values."""
    letters = ['a', 'b', 'c']
    dict_type = pa.dictionary(pa.int32(), pa.array(letters))

    assert dict_type.index_type == pa.int32()
    assert dict_type.dictionary.to_pylist() == letters
Esempio n. 39
0
def test_is_dictionary():
    """``types.is_dictionary`` accepts dictionary types and rejects int32."""
    dict_type = pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c']))
    plain_type = pa.int32()

    assert types.is_dictionary(dict_type)
    assert not types.is_dictionary(plain_type)