Esempio n. 1
0
def test_table_from_arrays_preserves_column_metadata():
    """Regression test for ARROW-3866: Table.from_arrays must keep the
    per-field metadata and nullability of its input columns."""
    values_a = pa.array([1, 2])
    values_b = pa.array([3, 4])
    annotated = pa.field('field1', pa.int64(), metadata=dict(a="A", b="B"))
    required = pa.field('field2', pa.int64(), nullable=False)
    table = pa.Table.from_arrays(
        [pa.column(annotated, values_a), pa.column(required, values_b)])
    assert b"a" in table.column(0).field.metadata
    assert table.column(1).field.nullable is False
Esempio n. 2
0
def test_cast_column():
    """Casting a column casts every chunk to the target type."""
    chunks = [pa.array([1, 2, 3]), pa.array([4, 5, 6])]
    source = pa.column('foo', chunks)
    result = source.cast(pa.float64())
    expected = pa.column('foo',
                         [chunk.cast(pa.float64()) for chunk in chunks])
    assert result.equals(expected)
Esempio n. 3
0
def test_cast_column():
    """Column.cast is equivalent to casting each chunk individually."""
    target_type = pa.float64()
    source_chunks = [pa.array([1, 2, 3]), pa.array([4, 5, 6])]
    original = pa.column('foo', source_chunks)

    converted_chunks = []
    for chunk in source_chunks:
        converted_chunks.append(chunk.cast(target_type))

    assert original.cast(target_type).equals(
        pa.column('foo', converted_chunks))
Esempio n. 4
0
def test_table_from_arrays_preserves_column_metadata():
    """ARROW-3866: field metadata and the nullable flag must survive a
    round-trip through Table.from_arrays."""
    first = pa.array([1, 2])
    second = pa.array([3, 4])
    meta_field = pa.field('field1', pa.int64(), metadata=dict(a="A", b="B"))
    strict_field = pa.field('field2', pa.int64(), nullable=False)
    built = pa.Table.from_arrays([pa.column(meta_field, first),
                                  pa.column(strict_field, second)])
    assert b"a" in built.column(0).field.metadata
    assert built.column(1).field.nullable is False
Esempio n. 5
0
def test_column_flatten():
    """Flattening a struct column yields one child column per struct
    field, named '<parent>.<child>'; this also works for empty data."""
    struct_type = pa.struct([pa.field('x', pa.int16()),
                             pa.field('y', pa.float32())])
    scenarios = [([(1, 2.5), (3, 4.5), (5, 6.5)],
                  [1, 3, 5], [2.5, 4.5, 6.5]),
                 ([], [], [])]
    for rows, xs, ys in scenarios:
        source = pa.Column.from_array('foo', pa.array(rows, type=struct_type))
        child_x, child_y = source.flatten()
        assert child_x == pa.column('foo.x', pa.array(xs, type=pa.int16()))
        assert child_y == pa.column('foo.y', pa.array(ys, type=pa.float32()))
Esempio n. 6
0
def test_column_flatten():
    """Column.flatten splits a struct column into its children."""
    struct_ty = pa.struct([pa.field('x', pa.int16()),
                           pa.field('y', pa.float32())])

    def check(rows, expected_x, expected_y):
        # Helper: flatten a struct column and compare both children.
        source = pa.Column.from_array('foo', pa.array(rows, type=struct_ty))
        got_x, got_y = source.flatten()
        assert got_x == pa.column('foo.x',
                                  pa.array(expected_x, type=pa.int16()))
        assert got_y == pa.column('foo.y',
                                  pa.array(expected_y, type=pa.float32()))

    check([(1, 2.5), (3, 4.5), (5, 6.5)], [1, 3, 5], [2.5, 4.5, 6.5])
    # Empty column
    check([], [], [])
Esempio n. 7
0
 def test_arrow_nonnullable_table(self):
     """add_column with a non-nullable field accepts plain numpy data
     and the result views back to the expected records."""
     if pyarrow is not None:
         ints = pyarrow.array([1, 2, 3])
         floats = pyarrow.array([1.1, 2.2, 3.3])
         base = pyarrow.Table.from_arrays([ints], ["x"])
         nonnull_field = pyarrow.field("y", floats.type, False)
         extended = base.add_column(
             1, pyarrow.column(nonnull_field, numpy.array([1.1, 2.2, 3.3])))
         assert awkward.arrow.view(extended).tolist() == [
             {"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}]
Esempio n. 8
0
 def __getitem__(self, item):
     # type (Any) -> Any
     """Select a subset of self.

     Parameters
     ----------
     item : int, slice, or ndarray
         * int: The position in 'self' to get.
         * slice: A slice object, where 'start', 'stop', and 'step' are
           integers or None
         * ndarray: A 1-d boolean NumPy ndarray the same length as 'self'

     Returns
     -------
     item : scalar or ExtensionArray

     Notes
     -----
     For scalar ``item``, return a scalar value suitable for the array's
     type. This should be an instance of ``self.dtype.type``.
     For slice ``key``, return an instance of ``ExtensionArray``, even
     if the slice is length 0 or 1.
     For a boolean mask, return an instance of ``ExtensionArray``, filtered
     to the values where ``item`` is True.
     """
     # Workaround for Arrow bug that segfaults on empty slice.
     # This is fixed in Arrow master, will be released in 0.10
     if isinstance(item, slice):
         # None and 0 both normalize to 0 here; negative starts are left
         # as-is, so the empty-slice detection below only covers the
         # non-negative case (sufficient for the segfaulting input).
         start = item.start or 0
         stop = item.stop if item.stop is not None else len(self.data)
         if stop - start == 0:
             # Build an empty array of the same Arrow type and wrap it in
             # this extension-array class, bypassing the buggy slice path.
             return type(self)(pa.column("dummy",
                                         pa.array([], type=self.data.type)))
     value = self.data[item]
     if isinstance(value, pa.ChunkedArray):
         # Non-scalar selection: re-wrap the chunked result in this class.
         return type(self)(value)
     else:
         # Scalar selection: return the raw value unchanged.
         return value
Esempio n. 9
0
    def _write_buffer(self):
        """Flush the in-memory column buffer to the Parquet writer.

        Lazily normalizes ``self.schema`` to a ``ParquetSchema`` and
        opens the ``ParquetWriter`` on first use, writes every buffered
        column as one Arrow table, then resets the buffer in place.
        """
        buffer_table = []
        # Normalize the schema once; subsequent calls skip this branch.
        if not isinstance(self.schema, ParquetSchema):
            self.schema = ParquetSchema.convert(self.schema)
        # Create the writer lazily so the output file is only opened when
        # there is actually something to write.
        if not self.pq_writer:
            self.pq_writer = pq.ParquetWriter(
                self.options.outfile, self.schema.to_arrow()
            )

        for col_name, col_data in self.column_buffer.items():
            # type_py drives value coercion, type_pa fixes the Arrow array
            # type; presumably the two correspond per column — defined by
            # ParquetSchema (not visible here).
            col_type = self.schema.columns[col_name].type_py
            pa_type = self.schema.columns[col_name].type_pa
            col = pa.column(
                col_name,
                pa.array(
                    self.coerce_column(col_name, col_data, col_type),
                    type=pa_type,
                ),
            )
            buffer_table.append(col)

        self.pq_writer.write_table(
            pa.Table.from_arrays(buffer_table, schema=self.schema.to_arrow())
        )

        # Reset the buffer (keeping the column keys) and the line counter.
        for col in self.column_buffer.keys():
            self.column_buffer[col] = []
        self.buffer_line = 0
Esempio n. 10
0
def read_table(doc) -> pyarrow.Table:
    """Decode a BSON document into a pyarrow Table, one column per key."""
    if isinstance(doc, bytes):
        # Wrap raw bytes so items() can be iterated lazily.
        doc = bson.raw_bson.RawBSONDocument(doc)

    columns = []
    for key, raw_value in doc.items():
        columns.append(pyarrow.column(key, read_array(raw_value)))

    return pyarrow.Table.from_arrays(columns)
Esempio n. 11
0
def test_column_pickle():
    """A pickle round-trip preserves a column's chunks and its field,
    including field metadata."""
    chunked = pa.chunked_array([[1, 2], [5, 6, 7]], type=pa.int16())
    tagged_field = pa.field("ints", pa.int16()).add_metadata({b"foo": b"bar"})
    original = pa.column(tagged_field, chunked)

    restored = pickle.loads(pickle.dumps(original))
    assert restored.equals(original)
    assert restored.data.num_chunks == 2
    assert restored.field == tagged_field
Esempio n. 12
0
def test_column_pickle():
    """Pickling and unpickling a column keeps data and field intact."""
    data = pa.chunked_array([[1, 2], [5, 6, 7]], type=pa.int16())
    meta_field = pa.field("ints", pa.int16()).add_metadata({b"foo": b"bar"})
    source_col = pa.column(meta_field, data)

    round_tripped = pickle.loads(pickle.dumps(source_col))
    assert round_tripped.equals(source_col)
    assert round_tripped.data.num_chunks == 2
    assert round_tripped.field == meta_field
Esempio n. 13
0
def test_unique_simple():
    """unique() behaves identically on Array, Column and ChunkedArray
    (nulls are dropped from the result)."""
    cases = [(pa.array([1, 2, 3, 1, 2, 3]), pa.array([1, 2, 3])),
             (pa.array(['foo', None, 'bar', 'foo']), pa.array(['foo', 'bar']))]
    for values, expected in cases:
        for container in (values,
                          pa.column("column", values),
                          pa.chunked_array([values])):
            assert container.unique().equals(expected)
Esempio n. 14
0
def test_cast_none():
    """ARROW-3735: cast(None) must raise TypeError, not segfault."""
    values = pa.array([1, 2, 3])
    wrapped = pa.column('foo', [values])

    for target in (values, wrapped):
        with pytest.raises(TypeError):
            target.cast(None)
Esempio n. 15
0
def test_cast_none():
    """ARROW-3735: casting an array or column to None raises TypeError
    instead of crashing."""
    int_values = pa.array([1, 2, 3])
    int_column = pa.column('foo', [int_values])

    with pytest.raises(TypeError):
        int_values.cast(None)
    with pytest.raises(TypeError):
        int_column.cast(None)
Esempio n. 16
0
def test_table_from_batches_and_schema():
    """Table.from_batches honors an explicit schema and rejects batches
    or schemas whose columns do not line up."""
    target_schema = pa.schema([pa.field('a', pa.int64()),
                               pa.field('b', pa.float64())])
    good_batch = pa.RecordBatch.from_arrays(
        [pa.array([1]), pa.array([3.14])], names=['a', 'b'])
    result = pa.Table.from_batches([good_batch], target_schema)
    assert result.schema.equals(target_schema)
    assert result.column(0) == pa.column('a', pa.array([1]))
    assert result.column(1) == pa.column('b', pa.array([3.14]))

    # A schema with fewer fields than the batch must be rejected.
    narrow_schema = pa.schema([pa.field('a', pa.int64())])
    with pytest.raises(pa.ArrowInvalid):
        pa.Table.from_batches([good_batch], narrow_schema)

    # A batch with fewer columns than the schema must be rejected.
    narrow_batch = pa.RecordBatch.from_arrays([pa.array([1])], ['a'])
    with pytest.raises(pa.ArrowInvalid):
        pa.Table.from_batches([narrow_batch], target_schema)
Esempio n. 17
0
def test_table_from_batches_and_schema():
    """from_batches with a schema validates batch/schema compatibility."""
    schema = pa.schema([
        pa.field('a', pa.int64()),
        pa.field('b', pa.float64()),
    ])
    batch = pa.RecordBatch.from_arrays(
        [pa.array([1]), pa.array([3.14])], names=['a', 'b'])
    built = pa.Table.from_batches([batch], schema)
    assert built.schema.equals(schema)
    for position, (name, values) in enumerate(
            [('a', pa.array([1])), ('b', pa.array([3.14]))]):
        assert built.column(position) == pa.column(name, values)

    # Both mismatch directions (narrow schema, narrow batch) must fail.
    mismatches = [
        ([batch], pa.schema([pa.field('a', pa.int64())])),
        ([pa.RecordBatch.from_arrays([pa.array([1])], ['a'])], schema),
    ]
    for bad_batches, bad_schema in mismatches:
        with pytest.raises(pa.ArrowInvalid):
            pa.Table.from_batches(bad_batches, bad_schema)
Esempio n. 18
0
def test_column_factory_function():
    """ARROW-1575: pa.column / pa.Column.from_array accept several input
    forms (name or field; array, list, chunked array, pandas series) and
    produce equal columns; a mismatched field type raises ValueError."""
    base = pa.array([0, 1, 2, 3, 4])
    extra = pa.array([5, 6, 7, 8])

    from_name = pa.Column.from_array('foo', base)
    from_field = pa.Column.from_array(pa.field('foo', base.type), base)
    assert from_name.equals(from_field)

    from_list = pa.column('foo', [base, extra])
    from_chunked = pa.column('foo', pa.chunked_array([base, extra]))
    assert from_list.equals(from_chunked)

    from_pandas = pa.column('foo', base.to_pandas())
    assert from_pandas.equals(pa.column('foo', base))

    # Type mismatch
    with pytest.raises(ValueError):
        pa.Column.from_array(pa.field('foo', pa.string()), base)
Esempio n. 19
0
def test_column_factory_function():
    """ARROW-1575: the column constructors agree regardless of input
    form, and reject a field whose type does not match the data."""
    primary = pa.array([0, 1, 2, 3, 4])
    secondary = pa.array([5, 6, 7, 8])

    named = pa.Column.from_array('foo', primary)
    fielded = pa.Column.from_array(pa.field('foo', primary.type), primary)
    assert named.equals(fielded)

    chunk_list = [primary, secondary]
    assert pa.column('foo', chunk_list).equals(
        pa.column('foo', pa.chunked_array(chunk_list)))

    assert pa.column('foo', primary.to_pandas()).equals(
        pa.column('foo', primary))

    # Type mismatch
    with pytest.raises(ValueError):
        pa.Column.from_array(pa.field('foo', pa.string()), primary)
Esempio n. 20
0
def test_unique_simple():
    """Array.unique, Column.unique and ChunkedArray.unique all drop
    duplicates (and nulls) the same way."""
    int_case = (pa.array([1, 2, 3, 1, 2, 3]), pa.array([1, 2, 3]))
    str_case = (pa.array(['foo', None, 'bar', 'foo']),
                pa.array(['foo', 'bar']))
    for values, expected in (int_case, str_case):
        assert values.unique().equals(expected)
        assert pa.column("column", values).unique().equals(expected)
        assert pa.chunked_array([values]).unique().equals(expected)
Esempio n. 21
0
def test_dictionary_encode_simple():
    """dictionary_encode is consistent across Array, Column and
    ChunkedArray (nulls map to null indices)."""
    int_expected = pa.DictionaryArray.from_arrays(
        pa.array([0, 1, 2, None, 0, 1, 2], type='int32'), [1, 2, 3])
    str_expected = pa.DictionaryArray.from_arrays(
        pa.array([0, None, 1, 0], type='int32'), ['foo', 'bar'])
    cases = [(pa.array([1, 2, 3, None, 1, 2, 3]), int_expected),
             (pa.array(['foo', None, 'bar', 'foo']), str_expected)]
    for values, expected in cases:
        assert values.dictionary_encode().equals(expected)
        encoded_column = pa.column("column", values).dictionary_encode()
        assert encoded_column.data.chunk(0).equals(expected)
        encoded_chunked = pa.chunked_array([values]).dictionary_encode()
        assert encoded_chunked.chunk(0).equals(expected)
Esempio n. 22
0
def test_dictionary_encode_simple():
    """Dictionary encoding gives the same result through every
    container type."""
    def make_expected(indices, dictionary):
        # Build the reference DictionaryArray for one case.
        return pa.DictionaryArray.from_arrays(
            pa.array(indices, type='int32'), dictionary)

    cases = [
        (pa.array([1, 2, 3, None, 1, 2, 3]),
         make_expected([0, 1, 2, None, 0, 1, 2], [1, 2, 3])),
        (pa.array(['foo', None, 'bar', 'foo']),
         make_expected([0, None, 1, 0], ['foo', 'bar'])),
    ]
    for raw, expected in cases:
        assert raw.dictionary_encode().equals(expected)
        via_column = pa.column("column", raw).dictionary_encode()
        assert via_column.data.chunk(0).equals(expected)
        via_chunked = pa.chunked_array([raw]).dictionary_encode()
        assert via_chunked.chunk(0).equals(expected)
Esempio n. 23
0
def test_column_getitem():
    """Column indexing: scalar access (including negative indices),
    out-of-range errors, and slicing (including an empty slice)."""
    source = pa.column('ints', pa.array([1, 2, 3, 4, 5, 6]))

    for index, expected in ((1, 2), (-1, 6), (-6, 1)):
        assert source[index].as_py() == expected
    for bad_index in (6, -7):
        with pytest.raises(IndexError):
            source[bad_index]

    assert source[2:4].to_pylist() == [3, 4]
    assert source[4:-1].to_pylist() == [5]

    empty = source[99:99]
    assert empty.type == source.type
    assert empty.to_pylist() == []
Esempio n. 24
0
def test_column_getitem():
    """Integer and slice indexing on a Column behave like a sequence."""
    values = pa.array([1, 2, 3, 4, 5, 6])
    ints_col = pa.column('ints', values)

    assert ints_col[1].as_py() == 2
    assert ints_col[-1].as_py() == 6
    assert ints_col[-6].as_py() == 1
    with pytest.raises(IndexError):
        ints_col[6]
    with pytest.raises(IndexError):
        ints_col[-7]

    # Plain and negative-stop slices.
    middle = ints_col[2:4]
    assert middle.to_pylist() == [3, 4]
    tail = ints_col[4:-1]
    assert tail.to_pylist() == [5]

    # An out-of-range slice yields an empty column of the same type.
    nothing = ints_col[99:99]
    assert nothing.type == ints_col.type
    assert nothing.to_pylist() == []
Esempio n. 25
0
 def factorize(self, na_sentinel=-1):
     # type: (int) -> Tuple[np.ndarray, ExtensionArray]
     """Encode the extension array as an enumerated type.

     Parameters
     ----------
     na_sentinel : int, default -1
         Value to use in the `labels` array to indicate missing values.

     Returns
     -------
     labels : ndarray
         An integer NumPy array that's an indexer into the original
         ExtensionArray.
     uniques : ExtensionArray
         An ExtensionArray containing the unique values of `self`.

         .. note::
            uniques will *not* contain an entry for the NA value of
            the ExtensionArray if there are any missing values present
            in `self`.

     See Also
     --------
     pandas.factorize : Top-level factorize method that dispatches here.

     Notes
     -----
     :meth:`pandas.factorize` offers a `sort` keyword as well.
     """
     if pa.types.is_dictionary(self.data.type):
         # Already dictionary-encoded data is not handled by this path.
         raise NotImplementedError()
     elif self.data.num_chunks == 1:
         # Single chunk: let Arrow do the dictionary encoding, then
         # translate its indices/dictionary into pandas' labels/uniques.
         encoded = self.data.chunk(0).dictionary_encode()
         indices = encoded.indices.to_pandas()
         if indices.dtype.kind == "f":
             # Nulls surface as NaN (float dtype): replace them with the
             # sentinel before converting to an integer dtype.
             indices[np.isnan(indices)] = na_sentinel
             indices = indices.astype(int)
         if not is_int64_dtype(indices):
             # pandas expects int64 labels.
             indices = indices.astype(np.int64)
         return indices, type(self)(encoded.dictionary)
     else:
         # Multiple chunks: materialize to a numpy array and fall back
         # to pandas' own factorize.
         np_array = pa.column("dummy", self.data).to_pandas().values
         return pd.factorize(np_array, na_sentinel=na_sentinel)
Esempio n. 26
0
 def __array__(self):
     """Support `np.asarray()` by materializing the chunked data
     through a pandas conversion."""
     materialized = pa.column("dummy", self.data).to_pandas()
     return materialized.values