def test_data_type():
    a = pbn.DiscreteFactor("A", [])

    with pytest.raises(ValueError) as ex:
        a.data_type()
    assert "DiscreteFactor factor not fitted." in str(ex.value)

    categories = np.asarray(["a1", "a2"])
    a_values = pd.Categorical(categories[np.random.randint(len(categories), size=100)],
                              categories=categories, ordered=False)
    df = pd.DataFrame({'A': a_values})
    a.fit(df)
    assert a.data_type() == pa.dictionary(pa.int8(), pa.string())

    # 128 categories (codes 0..127) still fit in int8 indices...
    categories = np.asarray(["a" + str(i) for i in range(1, 129)])
    a_values = pd.Categorical(categories[np.random.randint(len(categories), size=100)],
                              categories=categories, ordered=False)
    df = pd.DataFrame({'A': a_values})
    a.fit(df)
    assert a.data_type() == pa.dictionary(pa.int8(), pa.string())

    # ...but 129 categories need int16 indices.
    categories = np.asarray(["a" + str(i) for i in range(1, 130)])
    a_values = pd.Categorical(categories[np.random.randint(len(categories), size=100)],
                              categories=categories, ordered=False)
    df = pd.DataFrame({'A': a_values})
    a.fit(df)
    assert a.data_type() == pa.dictionary(pa.int16(), pa.string())

def test_schema_merge():
    a = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ])
    b = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('qux', pa.bool_())
    ])
    c = pa.schema([
        pa.field('quux', pa.dictionary(pa.int32(), pa.string()))
    ])
    d = pa.schema([
        pa.field('foo', pa.int64()),
        pa.field('qux', pa.bool_())
    ])

    result = pa.unify_schemas([a, b, c])
    expected = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8())),
        pa.field('qux', pa.bool_()),
        pa.field('quux', pa.dictionary(pa.int32(), pa.string()))
    ])
    assert result.equals(expected)

    # 'foo' has conflicting types (int32 vs int64)
    with pytest.raises(pa.ArrowInvalid):
        pa.unify_schemas([b, d])

    # ARROW-14002: Try with tuple instead of list
    result = pa.unify_schemas((a, b, c))
    assert result.equals(expected)

def test_column_types_dict(self):
    # Ask for dict-encoded column types in ConvertOptions
    column_types = [('a', pa.dictionary(pa.int32(), pa.utf8())),
                    ('b', pa.dictionary(pa.int32(), pa.int64())),
                    ('c', pa.dictionary(pa.int32(), pa.decimal128(11, 2))),
                    ('d', pa.dictionary(pa.int32(), pa.large_utf8()))]
    opts = ConvertOptions(column_types=dict(column_types))
    rows = (b"a,b,c,d\n"
            b"abc,123456,1.0,zz\n"
            b"defg,123456,0.5,xx\n"
            b"abc,N/A,1.0,xx\n")
    table = self.read_bytes(rows, convert_options=opts)

    schema = pa.schema(column_types)
    expected = {
        'a': ["abc", "defg", "abc"],
        'b': [123456, 123456, None],
        'c': [Decimal("1.00"), Decimal("0.50"), Decimal("1.00")],
        'd': ["zz", "xx", "xx"],
    }
    assert table.schema == schema
    assert table.to_pydict() == expected

    # Unsupported index type
    column_types[0] = ('a', pa.dictionary(pa.int8(), pa.utf8()))
    opts = ConvertOptions(column_types=dict(column_types))
    with pytest.raises(NotImplementedError):
        table = self.read_bytes(rows, convert_options=opts)

def test_table(n, types=None, offset=None, length=None, nullable=True):
    if types is None:
        types = [
            pyarrow.null(),
            pyarrow.bool_(),
            pyarrow.int8(),
            pyarrow.int16(),
            pyarrow.int32(),
            pyarrow.int64(),
            pyarrow.uint8(),
            pyarrow.uint16(),
            pyarrow.uint32(),
            pyarrow.uint64(),
            pyarrow.float16(),
            pyarrow.float32(),
            pyarrow.float64(),
            pyarrow.date32(),
            pyarrow.date64(),
            pyarrow.timestamp('s'),
            pyarrow.timestamp('ms'),
            pyarrow.timestamp('us'),
            pyarrow.timestamp('ns'),
            pyarrow.time32('s'),
            pyarrow.time32('ms'),
            pyarrow.time64('us'),
            pyarrow.time64('ns'),
            pyarrow.string(),
            pyarrow.binary(),
            pyarrow.binary(4),
            pyarrow.dictionary(pyarrow.int32(), pyarrow.string(), True),
            pyarrow.dictionary(pyarrow.int64(), pyarrow.int64(), True),
            pyarrow.dictionary(pyarrow.int32(), pyarrow.string(), False),
            pyarrow.dictionary(pyarrow.int64(), pyarrow.int64(), False),
            pyarrow.list_(pyarrow.int32()),
            pyarrow.struct([pyarrow.field('int32', pyarrow.int32())]),
            pyarrow.list_(
                pyarrow.struct([pyarrow.field('int32', pyarrow.int32())])),
            pyarrow.struct(
                [pyarrow.field('int32', pyarrow.list_(pyarrow.int32()))]),
        ]
    data = list()
    for t in types:
        name = str(t)
        array = TestArrayGenerator(n, t, False).array
        if offset is not None:
            array = array.slice(offset, length)
        data.append(pyarrow.column(name, array))
        if nullable:
            name = str(t) + ' (null)'
            array = TestArrayGenerator(n, t, True).array
            if offset is not None:
                array = array.slice(offset, length)
            data.append(pyarrow.column(name, array))
    return pyarrow.Table.from_arrays(data)

def test_dictionary_ordered_equals():
    # Python side checking of ARROW-6345
    d1 = pa.dictionary('int32', 'binary', ordered=True)
    d2 = pa.dictionary('int32', 'binary', ordered=False)
    d3 = pa.dictionary('int8', 'binary', ordered=True)
    d4 = pa.dictionary('int32', 'binary', ordered=True)

    assert not d1.equals(d2)
    assert not d1.equals(d3)
    assert d1.equals(d4)

def test_dictionary_type():
    ty0 = pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c']))
    assert ty0.index_type == pa.int32()
    assert isinstance(ty0.dictionary, pa.Array)
    assert ty0.dictionary.to_pylist() == ['a', 'b', 'c']
    assert ty0.ordered is False

    ty1 = pa.dictionary(pa.int8(), pa.array([1.0, 2.0]), ordered=True)
    assert ty1.index_type == pa.int8()
    assert isinstance(ty1.dictionary, pa.Array)
    assert ty1.dictionary.to_pylist() == [1.0, 2.0]
    assert ty1.ordered is True

def test_dictionary_type():
    ty0 = pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c']))
    assert ty0.index_type == pa.int32()
    assert isinstance(ty0.dictionary, pa.Array)
    assert ty0.dictionary.to_pylist() == ['a', 'b', 'c']
    assert ty0.ordered is False

    ty1 = pa.dictionary(pa.float32(), pa.array([1.0, 2.0]), ordered=True)
    assert ty1.index_type == pa.float32()
    assert isinstance(ty1.dictionary, pa.Array)
    assert ty1.dictionary.to_pylist() == [1.0, 2.0]
    assert ty1.ordered is True

def test_auto_dict_encode(self):
    opts = ConvertOptions(auto_dict_encode=True)
    rows = "a,b\nab,1\ncdé,2\ncdé,3\nab,4".encode()
    table = self.read_bytes(rows, convert_options=opts)
    schema = pa.schema([('a', pa.dictionary(pa.int32(), pa.string())),
                        ('b', pa.int64())])
    expected = {
        'a': ["ab", "cdé", "cdé", "ab"],
        'b': [1, 2, 3, 4],
    }
    assert table.schema == schema
    assert table.to_pydict() == expected

    opts.auto_dict_max_cardinality = 2
    table = self.read_bytes(rows, convert_options=opts)
    assert table.schema == schema
    assert table.to_pydict() == expected

    # Cardinality above max => plain-encoded
    opts.auto_dict_max_cardinality = 1
    table = self.read_bytes(rows, convert_options=opts)
    assert table.schema == pa.schema([('a', pa.string()),
                                      ('b', pa.int64())])
    assert table.to_pydict() == expected

    # With invalid UTF8, not checked
    opts.auto_dict_max_cardinality = 50
    opts.check_utf8 = False
    rows = b"a,b\nab,1\ncd\xff,2\nab,3"
    table = self.read_bytes(rows, convert_options=opts,
                            validate_full=False)
    assert table.schema == schema
    dict_values = table['a'].chunk(0).dictionary
    assert len(dict_values) == 2
    assert dict_values[0].as_py() == "ab"
    assert dict_values[1].as_buffer() == b"cd\xff"

    # With invalid UTF8, checked
    opts.check_utf8 = True
    table = self.read_bytes(rows, convert_options=opts)
    schema = pa.schema([('a', pa.dictionary(pa.int32(), pa.binary())),
                        ('b', pa.int64())])
    expected = {
        'a': [b"ab", b"cd\xff", b"ab"],
        'b': [1, 2, 3],
    }
    assert table.schema == schema
    assert table.to_pydict() == expected

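# For context, not part of the suite above: a minimal standalone sketch of
# pyarrow.csv's auto-dict-encoding on a low-cardinality string column. The
# column names and rows below are illustrative assumptions, not from the tests.
def example_auto_dict_encode():
    import io

    import pyarrow as pa
    from pyarrow import csv

    rows = b"city,count\nparis,1\nlondon,2\nparis,3\n"
    opts = csv.ConvertOptions(auto_dict_encode=True)
    table = csv.read_csv(io.BytesIO(rows), convert_options=opts)
    # The string column comes back dictionary-encoded with int32 indices.
    assert table.schema.field("city").type == pa.dictionary(pa.int32(),
                                                            pa.string())
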
def test_cast_from_null():
    in_data = [None] * 3
    in_type = pa.null()
    out_types = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.large_list(pa.uint8()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.list_(pa.int8())),
                   pa.field('c', pa.string())]),
    ]
    for out_type in out_types:
        _check_cast_case((in_data, in_type, in_data, out_type))

    out_types = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_SPARSE),
    ]
    in_arr = pa.array(in_data, type=pa.null())
    for out_type in out_types:
        with pytest.raises(NotImplementedError):
            in_arr.cast(out_type)

def test_dictionary_delta(stream_fixture):
    ty = pa.dictionary(pa.int8(), pa.utf8())
    data = [["foo", "foo", None],
            ["foo", "bar", "foo"],          # potential delta
            ["foo", "bar"],
            ["foo", None, "bar", "quux"],   # potential delta
            ["bar", "quux"],                # replacement
            ]
    batches = [
        pa.RecordBatch.from_arrays([pa.array(v, type=ty)], names=['dicts'])
        for v in data]
    schema = batches[0].schema

    def write_batches():
        with stream_fixture._get_writer(pa.MockOutputStream(),
                                        schema) as writer:
            for batch in batches:
                writer.write_batch(batch)
        return writer.stats

    st = write_batches()
    assert st.num_record_batches == 5
    assert st.num_dictionary_batches == 4
    assert st.num_replaced_dictionaries == 3
    assert st.num_dictionary_deltas == 0

    stream_fixture.use_legacy_ipc_format = None
    stream_fixture.options = pa.ipc.IpcWriteOptions(
        emit_dictionary_deltas=True)
    st = write_batches()
    assert st.num_record_batches == 5
    assert st.num_dictionary_batches == 4
    assert st.num_replaced_dictionaries == 1
    assert st.num_dictionary_deltas == 2

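# For context, not part of the suite above: a minimal sketch of enabling
# dictionary deltas on an IPC stream writer, assuming a pyarrow version where
# IpcWriteOptions(emit_dictionary_deltas=...) is available. With deltas on,
# values appended to a dictionary are written as a delta batch instead of
# replacing the whole dictionary.
def example_emit_dictionary_deltas():
    import pyarrow as pa

    ty = pa.dictionary(pa.int8(), pa.utf8())
    batch = pa.record_batch([pa.array(["foo", "bar"], type=ty)],
                            names=["dicts"])
    sink = pa.BufferOutputStream()
    options = pa.ipc.IpcWriteOptions(emit_dictionary_deltas=True)
    with pa.ipc.new_stream(sink, batch.schema, options=options) as writer:
        writer.write_batch(batch)
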
def decode(encoding, type_spec):
    if isinstance(type_spec, dict):
        if type_spec['type'] == 'duration':
            return DataType(pa.duration(type_spec['unit']))
        elif type_spec['type'] == 'timestamp':
            return DataType(pa.timestamp(type_spec['unit']))
        elif type_spec['type'] == 'list':
            sub = encoding.decode('dtype', type_spec['value_type']).arrow
            return DataType(pa.list_(sub))
        elif type_spec['type'] == 'dict':
            value_type = encoding.decode('dtype', type_spec["value_type"]).arrow
            index_type = encoding.decode('dtype', type_spec["index_type"]).arrow
            bool_ordered = type_spec["ordered"]
            return DataType(pa.dictionary(index_type, value_type, bool_ordered))
        else:
            raise ValueError(f'Do not understand type {type_spec}')
    if type_spec == 'string':
        return DataType(pa.string())
    if type_spec == 'large_string':
        return DataType(pa.large_string())
    # TODO: find a proper way to support all arrow types
    if type_spec == 'timestamp[ms]':
        return DataType(pa.timestamp('ms'))
    else:
        return DataType(np.dtype(type_spec))

def test_arrow_schema_category_column(self):
    self.assertEqual(
        arrow_schema_to_render_columns(
            pa.schema([pa.field("A", pa.dictionary(pa.int32(), pa.string()))])
        ),
        {"A": RenderColumn("A", "text", None)},
    )

def test_cast_from_null():
    in_data = [None] * 3
    in_type = pa.null()
    out_types = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.list_(pa.int8())),
                   pa.field('c', pa.string())]),
    ]
    for out_type in out_types:
        _check_cast_case((in_data, in_type, in_data, out_type))

    out_types = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_SPARSE),
    ]
    in_arr = pa.array(in_data, type=pa.null())
    for out_type in out_types:
        with pytest.raises(NotImplementedError):
            in_arr.cast(out_type)

def test_dictionary_type():
    ty0 = pa.dictionary(pa.int32(), pa.string())
    assert ty0.index_type == pa.int32()
    assert ty0.value_type == pa.string()
    assert ty0.ordered is False

    ty1 = pa.dictionary(pa.int8(), pa.float64(), ordered=True)
    assert ty1.index_type == pa.int8()
    assert ty1.value_type == pa.float64()
    assert ty1.ordered is True

    # construct from non-arrow objects
    ty2 = pa.dictionary('int8', 'string')
    assert ty2.index_type == pa.int8()
    assert ty2.value_type == pa.string()
    assert ty2.ordered is False

def get_dataframe(parameters, use_threads):
    # Initialize seeds
    if parameters.seed is not None:
        np.random.seed(parameters.seed)

    # For each column, use a generic Mimesis producer to create an Iterable
    # for generating data
    for i, column_params in enumerate(parameters.column_parameters):
        if column_params.dtype is None:
            column_params.generator = column_params.generator(
                Generic("en", seed=parameters.seed))
        else:
            column_params.generator = column_params.generator()

    # Get schema for each column
    schema = pa.schema([
        pa.field(
            name=str(i),
            type=pa.dictionary(
                index_type=pa.int64(),
                value_type=pa.from_numpy_dtype(
                    type(next(iter(column_params.generator)))),
            )
            if isinstance(column_params.dtype, str)
            and column_params.dtype == "category"
            else pa.from_numpy_dtype(
                type(next(iter(column_params.generator)))
                if column_params.dtype is None
                else column_params.dtype),
            nullable=column_params.null_frequency > 0,
        )
        for i, column_params in enumerate(parameters.column_parameters)
    ])

    # Initialize column data and which columns should be sorted
    column_data = [None] * len(parameters.column_parameters)
    columns_to_sort = [
        str(i)
        for i, column_params in enumerate(parameters.column_parameters)
        if column_params.is_sorted
    ]

    # Generate data
    if not use_threads:
        for i, column_params in enumerate(parameters.column_parameters):
            column_data[i] = _generate_column(column_params,
                                              parameters.num_rows)
    else:
        pool = Pool(pa.cpu_count())
        column_data = pool.starmap(
            _generate_column,
            [(column_params, parameters.num_rows)
             for i, column_params in enumerate(parameters.column_parameters)],
        )
        pool.close()
        pool.join()

    # Convert to Pandas DataFrame and sort columns appropriately
    tbl = pa.Table.from_arrays(column_data, schema=schema)
    if columns_to_sort:
        tbl = tbl.to_pandas()
        tbl = tbl.sort_values(columns_to_sort)
        tbl = pa.Table.from_pandas(tbl, schema)
    return tbl

def get_many_types():
    # Returning these from a function is required because the pa.dictionary
    # type holds a pyarrow array, and test_array.py::test_total_bytes_allocated
    # checks that the default memory pool has zero allocated bytes.
    return (pa.null(),
            pa.bool_(),
            pa.int32(),
            pa.time32('s'),
            pa.time64('us'),
            pa.date32(),
            pa.timestamp('us'),
            pa.timestamp('us', tz='UTC'),
            pa.timestamp('us', tz='Europe/Paris'),
            pa.float16(),
            pa.float32(),
            pa.float64(),
            pa.decimal128(19, 4),
            pa.string(),
            pa.binary(),
            pa.binary(10),
            pa.list_(pa.int32()),
            pa.struct([pa.field('a', pa.int32()),
                       pa.field('b', pa.int8()),
                       pa.field('c', pa.string())]),
            pa.struct([pa.field('a', pa.int32(), nullable=False),
                       pa.field('b', pa.int8(), nullable=False),
                       pa.field('c', pa.string())]),
            pa.union([pa.field('a', pa.binary(10)),
                      pa.field('b', pa.string())],
                     mode=pa.lib.UnionMode_DENSE),
            pa.union([pa.field('a', pa.binary(10)),
                      pa.field('b', pa.string())],
                     mode=pa.lib.UnionMode_SPARSE),
            pa.union([pa.field('a', pa.binary(10), nullable=False),
                      pa.field('b', pa.string())],
                     mode=pa.lib.UnionMode_SPARSE),
            pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])))

def test_dict_col(tmpdir):
    # Create the file if necessary
    parquet_path = tmpdir / 'sample_arrow_dict.parquet'
    schema = pa.schema({
        'col1': pa.int32(),
        'col2': pa.float32(),
        'col3': pa.dictionary(pa.int16(), pa.string()),
    })
    table = pa.table(
        {
            'col1': range(10),
            'col2': np.random.randn(10),
            'col3': list(np.random.choice(['A', 'B', 'C'], 10)),
        },
        schema=schema)
    pq.write_table(table, parquet_path)

    # Load df
    df = vaex.open(parquet_path)
    dtypes = df.dtypes
    assert isinstance(dtypes["col3"].arrow, pa.lib.DictionaryType)

    # Filter
    df = df._future()
    dff1 = df[df["col3"] == 'A']
    assert dff1["col3"].unique() == ["A"]

def test_arrow():
    a = Series("a", [1, 2, 3, None])
    out = a.to_arrow()
    assert out == pa.array([1, 2, 3, None])

    a = pa.array(["foo", "bar"], pa.dictionary(pa.int32(), pa.utf8()))
    s = pl.Series("a", a)
    assert s.dtype == pl.Utf8

def test_filesystem_factory(mockfs, paths_or_selector):
    format = ds.ParquetFileFormat(
        read_options=ds.ParquetReadOptions(dictionary_columns={"str"}))

    options = ds.FileSystemFactoryOptions('subdir')
    options.partitioning = ds.DirectoryPartitioning(
        pa.schema([
            pa.field('group', pa.int32()),
            pa.field('key', pa.string())
        ]))
    assert options.partition_base_dir == 'subdir'
    assert options.ignore_prefixes == ['.', '_']
    assert options.exclude_invalid_files is False

    factory = ds.FileSystemDatasetFactory(mockfs, paths_or_selector, format,
                                          options)
    inspected_schema = factory.inspect()

    assert factory.inspect().equals(pa.schema([
        pa.field('i64', pa.int64()),
        pa.field('f64', pa.float64()),
        pa.field('str', pa.dictionary(pa.int32(), pa.string())),
        pa.field('const', pa.int64()),
        pa.field('group', pa.int32()),
        pa.field('key', pa.string()),
    ]), check_metadata=False)

    assert isinstance(factory.inspect_schemas(), list)
    assert isinstance(factory.finish(inspected_schema),
                      ds.FileSystemDataset)
    assert factory.root_partition.equals(ds.ScalarExpression(True))

    dataset = factory.finish()
    assert isinstance(dataset, ds.FileSystemDataset)
    assert len(list(dataset.scan())) == 2

    scanner = ds.Scanner(dataset)
    expected_i64 = pa.array([0, 1, 2, 3, 4], type=pa.int64())
    expected_f64 = pa.array([0, 1, 2, 3, 4], type=pa.float64())
    expected_str = pa.DictionaryArray.from_arrays(
        pa.array([0, 1, 2, 3, 4], type=pa.int32()),
        pa.array("0 1 2 3 4".split(), type=pa.string()))
    for task, group, key in zip(scanner.scan(), [1, 2], ['xxx', 'yyy']):
        expected_group = pa.array([group] * 5, type=pa.int32())
        expected_key = pa.array([key] * 5, type=pa.string())
        expected_const = pa.array([group - 1] * 5, type=pa.int64())
        for batch in task.execute():
            assert batch.num_columns == 6
            assert batch[0].equals(expected_i64)
            assert batch[1].equals(expected_f64)
            assert batch[2].equals(expected_str)
            assert batch[3].equals(expected_const)
            assert batch[4].equals(expected_group)
            assert batch[5].equals(expected_key)

    table = dataset.to_table()
    assert isinstance(table, pa.Table)
    assert len(table) == 10
    assert table.num_columns == 6

def test_dictionary_python():
    """Python -> Rust -> Python"""
    a = pa.array(["a", None, "b", None, "a"],
                 type=pa.dictionary(pa.int8(), pa.string()))
    b = rust.round_trip_array(a)
    assert a == b
    del a
    del b

def _dtype_to_arrow(cls, dtype):
    if dtype is None:
        return None
    tname = dtype if isinstance(dtype, str) else dtype.name
    if tname == "category":
        return pa.dictionary(index_type=pa.int32(), value_type=pa.string())
    elif tname == "string":
        return pa.string()
    else:
        return pa.from_numpy_dtype(tname)

def test_dictionary_type():
    ty0 = pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c']))
    assert ty0.index_type == pa.int32()
    assert isinstance(ty0.dictionary, pa.Array)
    assert ty0.dictionary.to_pylist() == ['a', 'b', 'c']
    assert ty0.ordered is False

    ty1 = pa.dictionary(pa.int8(), pa.array([1.0, 2.0]), ordered=True)
    assert ty1.index_type == pa.int8()
    assert isinstance(ty1.dictionary, pa.Array)
    assert ty1.dictionary.to_pylist() == [1.0, 2.0]
    assert ty1.ordered is True

    # construct from non-arrow objects
    ty2 = pa.dictionary('int8', ['a', 'b', 'c', 'd'])
    assert ty2.index_type == pa.int8()
    assert isinstance(ty2.dictionary, pa.Array)
    assert ty2.dictionary.to_pylist() == ['a', 'b', 'c', 'd']
    assert ty2.ordered is False

def test_text_dictionary_zero_chunks_is_valid(self):
    validate(
        pyarrow.Table.from_batches(
            [],
            pyarrow.schema([("A", pyarrow.dictionary(pyarrow.int32(),
                                                     pyarrow.string()))]),
        ),
        TableMetadata(0, [Text("A")]),
    )

def test_cat_int_types_3500() -> None:
    with pl.StringCache():
        # Create an enum / categorical / dictionary typed pyarrow array.
        # Most simply done by creating a pandas categorical series first.
        categorical_df = pd.Series(["a", "a", "b"], dtype="category")
        pyarrow_array = pa.Array.from_pandas(categorical_df)

        # The in-memory representation of each category can be either a
        # signed or an unsigned 8-bit integer. Pandas uses Int8...
        int_dict_type = pa.dictionary(index_type=pa.int8(),
                                      value_type=pa.utf8())
        # ... while DuckDB uses UInt8
        uint_dict_type = pa.dictionary(index_type=pa.uint8(),
                                       value_type=pa.utf8())

        for t in [int_dict_type, uint_dict_type]:
            s = pl.from_arrow(pyarrow_array.cast(t))
            assert s.series_equal(
                pl.Series(["a", "a", "b"]).cast(pl.Categorical))

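# For context, not part of the test above: a minimal sketch of re-indexing a
# dictionary array via cast, assuming a pyarrow version that implements
# dictionary-to-dictionary casts. The values stay identical; only the index
# width changes.
def example_cast_dictionary_index_type():
    import pyarrow as pa

    arr = pa.array(["a", "a", "b"]).dictionary_encode()  # int32 indices
    narrow = arr.cast(pa.dictionary(pa.int8(), pa.string()))
    assert narrow.type.index_type == pa.int8()
    assert narrow.to_pylist() == ["a", "a", "b"]
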
def test_empty_table():
    schema = pa.schema([
        pa.field('f0', pa.int64()),
        pa.field('f1', pa.dictionary(pa.int32(), pa.string())),
        pa.field('f2', pa.list_(pa.list_(pa.int64()))),
    ])
    table = schema.empty_table()
    assert isinstance(table, pa.Table)
    assert table.num_rows == 0
    assert table.schema == schema

def schema(cls):
    return pa.schema(
        {
            "instrument_id": pa.dictionary(pa.int8(), pa.string()),
            "ts_event": pa.int64(),
            "ts_init": pa.int64(),
            "last_traded_price": pa.string(),
            "traded_volume": pa.string(),
        },
        metadata={"type": "BetfairTicker"},
    )

def test_arrow():
    a = pl.Series("a", [1, 2, 3, None])
    out = a.to_arrow()
    assert out == pa.array([1, 2, 3, None])

    a = pa.array(["foo", "bar"], pa.dictionary(pa.int32(), pa.utf8()))
    s = pl.Series("a", a)
    assert s.dtype == pl.Categorical

    assert (pl.from_arrow(
        pa.array([["foo"], ["foo", "bar"]], pa.list_(pa.utf8()))).dtype
        == pl.List)

def test_schema_repr_with_dictionaries():
    fields = [
        pa.field('one', pa.dictionary(pa.int16(), pa.string())),
        pa.field('two', pa.int32())
    ]
    sch = pa.schema(fields)

    expected = ("""\
one: dictionary<values=string, indices=int16, ordered=0>
two: int32""")

    assert repr(sch) == expected

def test_dictionary_type():
    ty0 = pa.dictionary(pa.int32(), pa.string())
    assert ty0.index_type == pa.int32()
    assert ty0.value_type == pa.string()
    assert ty0.ordered is False

    ty1 = pa.dictionary(pa.int8(), pa.float64(), ordered=True)
    assert ty1.index_type == pa.int8()
    assert ty1.value_type == pa.float64()
    assert ty1.ordered is True

    # construct from non-arrow objects
    ty2 = pa.dictionary('int8', 'string')
    assert ty2.index_type == pa.int8()
    assert ty2.value_type == pa.string()
    assert ty2.ordered is False

    # allow unsigned integers for index type
    ty3 = pa.dictionary(pa.uint32(), pa.string())
    assert ty3.index_type == pa.uint32()
    assert ty3.value_type == pa.string()
    assert ty3.ordered is False

    # invalid index type raises
    with pytest.raises(TypeError):
        pa.dictionary(pa.string(), pa.int64())

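# For context, not part of the test above: a minimal sketch of how a
# pa.dictionary type shows up in practice. dictionary_encode() yields a
# DictionaryArray with int32 indices by default.
def example_dictionary_encode():
    import pyarrow as pa

    arr = pa.array(["a", "b", "a", None, "b"]).dictionary_encode()
    assert arr.type == pa.dictionary(pa.int32(), pa.string())
    assert arr.dictionary.to_pylist() == ["a", "b"]
    assert arr.indices.to_pylist() == [0, 1, 0, None, 1]
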
def test_dict(self):
    """Python -> Rust -> Python"""
    a = pyarrow.array(
        ["a", "a", "b", None, "c"],
        pyarrow.dictionary(pyarrow.int64(), pyarrow.utf8()),
    )
    b = arrow_pyarrow_integration_testing.round_trip_array(a)
    b.validate(full=True)
    assert a.to_pylist() == b.to_pylist()
    assert a.type == b.type

def test_build_table_infer_type():
    table = make_table(
        make_column("A", ["x"]),
        make_column("B", [datetime.date(2021, 4, 7)]),
        make_column("C", [datetime.datetime(2021, 4, 7, 19, 24, 1, 1)]),
        make_column("D", [1.0]),
        make_column("dict", ["x"], dictionary=True),
    )
    assert table["A"].type == pa.string()
    assert table["B"].type == pa.date32()
    assert table["C"].type == pa.timestamp("ns")
    assert table["D"].type == pa.float64()
    assert table["dict"].type == pa.dictionary(pa.int32(), pa.string())

def create_cems_schema():
    """Make an explicit Arrow schema for the EPA CEMS data.

    Make changes in the types of the generated parquet files by editing this
    function. Note that parquet's internal representation doesn't use unsigned
    numbers or 16-bit ints, so just keep things simple here and always use
    int32 and float32.

    Returns:
        pyarrow.schema: An Arrow schema for the EPA CEMS data.

    """
    int_nullable = partial(pa.field, type=pa.int32(), nullable=True)
    int_not_null = partial(pa.field, type=pa.int32(), nullable=False)
    str_not_null = partial(pa.field, type=pa.string(), nullable=False)
    # Timestamp resolution is hourly, but second is the largest allowed.
    timestamp = partial(pa.field, type=pa.timestamp("s", tz="UTC"),
                        nullable=False)
    float_nullable = partial(pa.field, type=pa.float32(), nullable=True)
    float_not_null = partial(pa.field, type=pa.float32(), nullable=False)
    # (float32 can accurately hold integers up to 16,777,216, so there is no
    # need for float64)
    dict_nullable = partial(
        pa.field,
        type=pa.dictionary(pa.int8(), pa.string(), ordered=False),
        nullable=True)
    return pa.schema([
        dict_nullable("state"),
        int_not_null("plant_id_eia"),
        str_not_null("unitid"),
        timestamp("operating_datetime_utc"),
        float_nullable("operating_time_hours"),
        float_not_null("gross_load_mw"),
        float_nullable("steam_load_1000_lbs"),
        float_nullable("so2_mass_lbs"),
        dict_nullable("so2_mass_measurement_code"),
        float_nullable("nox_rate_lbs_mmbtu"),
        dict_nullable("nox_rate_measurement_code"),
        float_nullable("nox_mass_lbs"),
        dict_nullable("nox_mass_measurement_code"),
        float_nullable("co2_mass_tons"),
        dict_nullable("co2_mass_measurement_code"),
        float_not_null("heat_content_mmbtu"),
        int_nullable("facility_id"),
        int_nullable("unit_id_epa"),
        int_not_null("year"),
    ])

def test_schema_repr_with_dictionaries():
    dct = pa.array(['foo', 'bar', 'baz'], type=pa.string())
    fields = [
        pa.field('one', pa.dictionary(pa.int16(), dct)),
        pa.field('two', pa.int32())
    ]
    sch = pa.schema(fields)

    expected = (
        """\
one: dictionary<values=string, indices=int16, ordered=0>
  dictionary: ["foo", "bar", "baz"]
two: int32""")

    assert repr(sch) == expected

def get_many_types():
    # Returning these from a function is required because the pa.dictionary
    # type holds a pyarrow array, and test_array.py::test_total_bytes_allocated
    # checks that the default memory pool has zero allocated bytes.
    return (
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())]),
        pa.struct([pa.field('a', pa.int32(), nullable=False),
                   pa.field('b', pa.int8(), nullable=False),
                   pa.field('c', pa.string())]),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_SPARSE),
        pa.union([pa.field('a', pa.binary(10), nullable=False),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_SPARSE),
        pa.dictionary(pa.int32(), pa.string())
    )

def test_is_dictionary():
    assert types.is_dictionary(pa.dictionary(pa.int32(), pa.string()))
    assert not types.is_dictionary(pa.int32())

def test_dictionary_type():
    ty = pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c']))
    assert ty.index_type == pa.int32()
    assert ty.dictionary.to_pylist() == ['a', 'b', 'c']

def test_is_dictionary():
    assert types.is_dictionary(
        pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])))
    assert not types.is_dictionary(pa.int32())