def test_table_from_arrays_preserves_column_metadata():
    # Added to test https://issues.apache.org/jira/browse/ARROW-3866
    arr0 = pa.array([1, 2])
    arr1 = pa.array([3, 4])
    field0 = pa.field('field1', pa.int64(), metadata=dict(a="A", b="B"))
    field1 = pa.field('field2', pa.int64(), nullable=False)
    columns = [
        pa.column(field0, arr0),
        pa.column(field1, arr1)
    ]
    table = pa.Table.from_arrays(columns)
    assert b"a" in table.column(0).field.metadata
    assert table.column(1).field.nullable is False

def test_cast_column():
    arrays = [pa.array([1, 2, 3]), pa.array([4, 5, 6])]
    col = pa.column('foo', arrays)
    target = pa.float64()
    casted = col.cast(target)
    expected = pa.column('foo', [x.cast(target) for x in arrays])
    assert casted.equals(expected)

def test_column_flatten():
    ty = pa.struct([pa.field('x', pa.int16()),
                    pa.field('y', pa.float32())])
    a = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty)
    col = pa.Column.from_array('foo', a)
    x, y = col.flatten()
    assert x == pa.column('foo.x', pa.array([1, 3, 5], type=pa.int16()))
    assert y == pa.column('foo.y', pa.array([2.5, 4.5, 6.5],
                                            type=pa.float32()))
    # Empty column
    a = pa.array([], type=ty)
    col = pa.Column.from_array('foo', a)
    x, y = col.flatten()
    assert x == pa.column('foo.x', pa.array([], type=pa.int16()))
    assert y == pa.column('foo.y', pa.array([], type=pa.float32()))

def test_arrow_nonnullable_table(self):
    if pyarrow is not None:
        x = pyarrow.array([1, 2, 3])
        y = pyarrow.array([1.1, 2.2, 3.3])
        table = pyarrow.Table.from_arrays([x], ["x"])
        table2 = table.add_column(
            1,
            pyarrow.column(
                pyarrow.field("y", y.type, False),
                numpy.array([1.1, 2.2, 3.3])
            )
        )
        assert awkward.arrow.view(table2).tolist() == [
            {"x": 1, "y": 1.1},
            {"x": 2, "y": 2.2},
            {"x": 3, "y": 3.3}
        ]

def __getitem__(self, item):
    # type: (Any) -> Any
    """Select a subset of self.

    Parameters
    ----------
    item : int, slice, or ndarray
        * int: The position in 'self' to get.
        * slice: A slice object, where 'start', 'stop', and 'step' are
          integers or None
        * ndarray: A 1-d boolean NumPy ndarray the same length as 'self'

    Returns
    -------
    item : scalar or ExtensionArray

    Notes
    -----
    For scalar ``item``, return a scalar value suitable for the array's
    type. This should be an instance of ``self.dtype.type``.

    For slice ``item``, return an instance of ``ExtensionArray``, even
    if the slice is length 0 or 1.

    For a boolean mask, return an instance of ``ExtensionArray``,
    filtered to the values where ``item`` is True.
    """
    # Workaround for Arrow bug that segfaults on empty slice.
    # This is fixed in Arrow master, will be released in 0.10
    if isinstance(item, slice):
        start = item.start or 0
        stop = item.stop if item.stop is not None else len(self.data)
        if stop - start == 0:
            return type(self)(
                pa.column("dummy", pa.array([], type=self.data.type)))
    value = self.data[item]
    if isinstance(value, pa.ChunkedArray):
        return type(self)(value)
    else:
        return value

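# A minimal, hypothetical usage sketch for the __getitem__ above, assuming a
# legacy pyarrow (pre-1.0) where pa.column() and Column still exist. It
# exercises the empty-slice workaround: normalize start/stop, then
# short-circuit to an empty column of the same type instead of slicing.
import pyarrow as pa

data = pa.chunked_array([pa.array([1, 2, 3])])
item = slice(99, 99)
start = item.start or 0
stop = item.stop if item.stop is not None else len(data)
if stop - start == 0:
    # Same dtype as the source, zero rows: the shape the workaround returns.
    empty = pa.column("dummy", pa.array([], type=data.type))
    assert len(empty) == 0
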
def _write_buffer(self):
    buffer_table = []
    if not isinstance(self.schema, ParquetSchema):
        self.schema = ParquetSchema.convert(self.schema)
    if not self.pq_writer:
        self.pq_writer = pq.ParquetWriter(
            self.options.outfile, self.schema.to_arrow()
        )
    for col_name, col_data in self.column_buffer.items():
        col_type = self.schema.columns[col_name].type_py
        pa_type = self.schema.columns[col_name].type_pa
        col = pa.column(
            col_name,
            pa.array(
                self.coerce_column(col_name, col_data, col_type),
                type=pa_type,
            ),
        )
        buffer_table.append(col)
    self.pq_writer.write_table(
        pa.Table.from_arrays(buffer_table, schema=self.schema.to_arrow())
    )
    for col in self.column_buffer.keys():
        self.column_buffer[col] = []
    self.buffer_line = 0

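# A condensed sketch of the buffered-write pattern in _write_buffer above,
# under the same legacy-pyarrow assumption (pa.column, Table.from_arrays on
# columns). The schema, file name, and data are illustrative stand-ins.
import pyarrow as pa
import pyarrow.parquet as pq

schema = pa.schema([pa.field("x", pa.int64())])
writer = pq.ParquetWriter("buffer_demo.parquet", schema)
column_buffer = {"x": [1, 2, 3]}
cols = [pa.column(name, pa.array(data, type=pa.int64()))
        for name, data in column_buffer.items()]
writer.write_table(pa.Table.from_arrays(cols, schema=schema))
writer.close()
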
def read_table(doc) -> pyarrow.Table:
    if isinstance(doc, bytes):
        doc = bson.raw_bson.RawBSONDocument(doc)
    data = [pyarrow.column(k, read_array(v)) for k, v in doc.items()]
    return pyarrow.Table.from_arrays(data)

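# Hypothetical usage of read_table above. read_array is defined elsewhere in
# the source; a trivial stand-in decoder is used here so the pattern (one
# column per top-level BSON key) can run under a legacy pyarrow.
import pyarrow

def demo_read_array(value):
    # Stand-in for the real read_array, which decodes BSON values.
    return pyarrow.array(value)

doc = {"x": [1, 2], "y": [3.0, 4.0]}
data = [pyarrow.column(k, demo_read_array(v)) for k, v in doc.items()]
table = pyarrow.Table.from_arrays(data)
assert table.num_columns == 2
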
def test_column_pickle():
    arr = pa.chunked_array([[1, 2], [5, 6, 7]], type=pa.int16())
    field = pa.field("ints", pa.int16()).add_metadata({b"foo": b"bar"})
    col = pa.column(field, arr)
    result = pickle.loads(pickle.dumps(col))
    assert result.equals(col)
    assert result.data.num_chunks == 2
    assert result.field == field

def test_unique_simple():
    cases = [
        (pa.array([1, 2, 3, 1, 2, 3]), pa.array([1, 2, 3])),
        (pa.array(['foo', None, 'bar', 'foo']), pa.array(['foo', 'bar']))
    ]
    for arr, expected in cases:
        result = arr.unique()
        assert result.equals(expected)
        result = pa.column("column", arr).unique()
        assert result.equals(expected)
        result = pa.chunked_array([arr]).unique()
        assert result.equals(expected)

def test_cast_none():
    # ARROW-3735: Ensure that calling cast(None) doesn't segfault.
    arr = pa.array([1, 2, 3])
    col = pa.column('foo', [arr])
    with pytest.raises(TypeError):
        arr.cast(None)
    with pytest.raises(TypeError):
        col.cast(None)

def test_table_from_batches_and_schema():
    schema = pa.schema([
        pa.field('a', pa.int64()),
        pa.field('b', pa.float64()),
    ])
    batch = pa.RecordBatch.from_arrays([pa.array([1]), pa.array([3.14])],
                                       names=['a', 'b'])
    table = pa.Table.from_batches([batch], schema)
    assert table.schema.equals(schema)
    assert table.column(0) == pa.column('a', pa.array([1]))
    assert table.column(1) == pa.column('b', pa.array([3.14]))

    incompatible_schema = pa.schema([pa.field('a', pa.int64())])
    with pytest.raises(pa.ArrowInvalid):
        pa.Table.from_batches([batch], incompatible_schema)

    incompatible_batch = pa.RecordBatch.from_arrays([pa.array([1])], ['a'])
    with pytest.raises(pa.ArrowInvalid):
        pa.Table.from_batches([incompatible_batch], schema)

def test_column_factory_function():
    # ARROW-1575
    arr = pa.array([0, 1, 2, 3, 4])
    arr2 = pa.array([5, 6, 7, 8])

    col1 = pa.Column.from_array('foo', arr)
    col2 = pa.Column.from_array(pa.field('foo', arr.type), arr)
    assert col1.equals(col2)

    col3 = pa.column('foo', [arr, arr2])
    chunked_arr = pa.chunked_array([arr, arr2])
    col4 = pa.column('foo', chunked_arr)
    assert col3.equals(col4)

    col5 = pa.column('foo', arr.to_pandas())
    assert col5.equals(pa.column('foo', arr))

    # Type mismatch
    with pytest.raises(ValueError):
        pa.Column.from_array(pa.field('foo', pa.string()), arr)

def test_dictionary_encode_simple():
    cases = [
        (pa.array([1, 2, 3, None, 1, 2, 3]),
         pa.DictionaryArray.from_arrays(
             pa.array([0, 1, 2, None, 0, 1, 2], type='int32'),
             [1, 2, 3])),
        (pa.array(['foo', None, 'bar', 'foo']),
         pa.DictionaryArray.from_arrays(
             pa.array([0, None, 1, 0], type='int32'),
             ['foo', 'bar']))
    ]
    for arr, expected in cases:
        result = arr.dictionary_encode()
        assert result.equals(expected)
        result = pa.column("column", arr).dictionary_encode()
        assert result.data.chunk(0).equals(expected)
        result = pa.chunked_array([arr]).dictionary_encode()
        assert result.chunk(0).equals(expected)

def test_column_getitem():
    arr = pa.array([1, 2, 3, 4, 5, 6])
    col = pa.column('ints', arr)

    assert col[1].as_py() == 2
    assert col[-1].as_py() == 6
    assert col[-6].as_py() == 1
    with pytest.raises(IndexError):
        col[6]
    with pytest.raises(IndexError):
        col[-7]

    data_slice = col[2:4]
    assert data_slice.to_pylist() == [3, 4]

    data_slice = col[4:-1]
    assert data_slice.to_pylist() == [5]

    data_slice = col[99:99]
    assert data_slice.type == col.type
    assert data_slice.to_pylist() == []

def factorize(self, na_sentinel=-1):
    # type: (int) -> Tuple[np.ndarray, ExtensionArray]
    """Encode the extension array as an enumerated type.

    Parameters
    ----------
    na_sentinel : int, default -1
        Value to use in the `labels` array to indicate missing values.

    Returns
    -------
    labels : ndarray
        An integer NumPy array that's an indexer into the original
        ExtensionArray.
    uniques : ExtensionArray
        An ExtensionArray containing the unique values of `self`.

        .. note::

           uniques will *not* contain an entry for the NA value of
           the ExtensionArray if there are any missing values present
           in `self`.

    See Also
    --------
    pandas.factorize : Top-level factorize method that dispatches here.

    Notes
    -----
    :meth:`pandas.factorize` offers a `sort` keyword as well.
    """
    if pa.types.is_dictionary(self.data.type):
        raise NotImplementedError()
    elif self.data.num_chunks == 1:
        # Dictionary-encode the single chunk and translate its indices
        # into the (labels, uniques) pair pandas expects.
        encoded = self.data.chunk(0).dictionary_encode()
        indices = encoded.indices.to_pandas()
        if indices.dtype.kind == "f":
            indices[np.isnan(indices)] = na_sentinel
            indices = indices.astype(int)
        if not is_int64_dtype(indices):
            indices = indices.astype(np.int64)
        return indices, type(self)(encoded.dictionary)
    else:
        np_array = pa.column("dummy", self.data).to_pandas().values
        return pd.factorize(np_array, na_sentinel=na_sentinel)

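# A small illustration of the factorize() contract above, assuming a legacy
# pyarrow: dictionary-encoding splits the values into indices (the labels)
# and a dictionary (the uniques), with nulls later mapped to na_sentinel.
import pyarrow as pa

arr = pa.array(["a", "b", "a", None])
encoded = arr.dictionary_encode()
assert encoded.indices.to_pylist() == [0, 1, 0, None]  # None -> -1 later
assert encoded.dictionary.to_pylist() == ["a", "b"]    # no NA entry
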
def __array__(self):
    """
    Correctly construct numpy arrays when passed to `np.asarray()`.
    """
    return pa.column("dummy", self.data).to_pandas().values

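# Sketch of the conversion chain in __array__ above, under the same
# legacy-pyarrow assumption: wrap the ChunkedArray in a throwaway Column,
# convert through pandas, and take the backing ndarray.
import numpy as np
import pyarrow as pa

data = pa.chunked_array([pa.array([1, 2]), pa.array([3])])
values = pa.column("dummy", data).to_pandas().values
assert isinstance(values, np.ndarray)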