def test_iterator_without_size():
    """An array can be built from a plain iterator of unknown length."""
    reference = pa.array((0, 1, 2))

    # Type inferred from the values
    inferred = pa.array(iter(range(3)))
    assert inferred.equals(reference)

    # Same with explicit type
    typed = pa.array(iter(range(3)), type=pa.int64())
    assert typed.equals(reference)
def make_recordbatch(length):
    """Return a record batch with two random int16 columns of ``length`` rows."""
    schema = pa.schema([pa.field(name, pa.int16()) for name in ('f0', 'f1')])
    # The two columns are drawn one after the other from NumPy's global RNG.
    columns = [
        pa.array(np.random.randint(0, 255, size=length, dtype=np.int16))
        for _ in range(2)
    ]
    return pa.RecordBatch.from_arrays(columns, schema)
def test_file_reader_writer():
    """Deprecated IPC reader/writer names warn and yield the new classes."""
    columns = [
        pa.array([1, 2, 3, 4]),
        pa.array(['foo', 'bar', 'baz', None]),
        pa.array([True, None, False, True]),
    ]
    batch = pa.RecordBatch.from_arrays(columns, ['f0', 'f1', 'f2'])

    stream_sink = pa.BufferOutputStream()
    with pytest.warns(FutureWarning):
        stream_writer = pa.StreamWriter(stream_sink, batch.schema)
    assert isinstance(stream_writer, pa.RecordBatchStreamWriter)

    file_sink = pa.BufferOutputStream()
    with pytest.warns(FutureWarning):
        file_writer = pa.FileWriter(file_sink, batch.schema)
    assert isinstance(file_writer, pa.RecordBatchFileWriter)

    # Write to both sinks, then close both writers (file first each time).
    writers = (file_writer, stream_writer)
    for writer in writers:
        writer.write_batch(batch)
    for writer in writers:
        writer.close()

    stream_buf = stream_sink.get_result()
    file_buf = file_sink.get_result()

    with pytest.warns(FutureWarning):
        stream_reader = pa.StreamReader(stream_buf)
    assert isinstance(stream_reader, pa.RecordBatchStreamReader)

    with pytest.warns(FutureWarning):
        file_reader = pa.FileReader(file_buf)
    assert isinstance(file_reader, pa.RecordBatchFileReader)
def test_sequence_nesting_levels():
    """Nested sequences infer list types; mixed nesting depths are rejected."""
    int_list = pa.list_(pa.int64())
    accepted = [
        ([1, 2, None], pa.int64()),
        ([[1], [2], None], int_list),
        ([[1], [2, 3, 4], [None]], int_list),
        ([None, [[None, 1]], [[2, 3, 4], None], [None]], pa.list_(int_list)),
    ]
    for values, expected_type in accepted:
        converted = pa.array(values)
        assert converted.type == expected_type
        assert converted.to_pylist() == values

    # Mixed nesting levels are rejected
    rejection_errors = (pa.ArrowInvalid, pa.ArrowTypeError)
    for values in ([1, 2, [1]], [1, 2, []], [[1], [2], [None, [1]]]):
        with pytest.raises(rejection_errors):
            pa.array(values)
def test_struct_from_tuples():
    """Struct arrays can be built from tuples whose arity matches the type."""
    fields = [
        pa.field('a', pa.int32()),
        pa.field('b', pa.string()),
        pa.field('c', pa.bool_()),
    ]
    ty = pa.struct(fields)

    rows = [(5, 'foo', True), (6, 'bar', False)]
    rows_as_dicts = [
        {'a': 5, 'b': 'foo', 'c': True},
        {'a': 6, 'b': 'bar', 'c': False},
    ]
    from_list = pa.array(rows, type=ty)

    # The same tuples held in a NumPy object array
    object_ndarray = np.empty(len(rows), dtype=object)
    object_ndarray[:] = rows
    from_ndarray = pa.array(object_ndarray, type=ty)

    assert from_list.to_pylist() == rows_as_dicts
    assert from_list.equals(from_ndarray)

    # With omitted values
    rows = [(5, 'foo', None), None, (6, None, False)]
    rows_as_dicts = [
        {'a': 5, 'b': 'foo', 'c': None},
        None,
        {'a': 6, 'b': None, 'c': False},
    ]
    assert pa.array(rows, type=ty).to_pylist() == rows_as_dicts

    # Invalid tuple size
    wrong_arity = [(5, 'foo'), (), ('5', 'foo', True, None)]
    for bad_row in wrong_arity:
        with pytest.raises(ValueError, match="(?i)tuple size"):
            pa.array([bad_row], type=ty)
def test_unsigned_integer_overflow(bits):
    """Values outside the range of a ``uint<bits>`` type are rejected."""
    ty = getattr(pa, "uint%d" % bits)()
    # XXX ideally would raise OverflowError
    accepted_errors = (ValueError, pa.ArrowException)
    # One past the maximum, then one below the minimum.
    for out_of_range in (2 ** bits, -1):
        with pytest.raises(accepted_errors):
            pa.array([out_of_range], ty)
def test_table_basics():
    """Check size accessors, dict conversion and column iteration of a Table."""
    arrays = [pa.array(range(5)), pa.array([-10, -5, 0, 5, 10])]
    table = pa.Table.from_arrays(arrays, names=('a', 'b'))
    table._validate()

    assert len(table) == 5
    assert table.num_rows == 5
    assert table.num_columns == 2
    assert table.shape == (5, 2)

    as_dict = OrderedDict([
        ('a', [0, 1, 2, 3, 4]),
        ('b', [-10, -5, 0, 5, 10]),
    ])
    assert table.to_pydict() == as_dict

    seen = []
    for column in table.itercolumns():
        seen.append(column)
        assert all(chunk is not None for chunk in column.data.iterchunks())

        # Chunk access is bounds-checked at both ends.
        with pytest.raises(IndexError):
            column.data.chunk(-1)
        with pytest.raises(IndexError):
            column.data.chunk(column.data.num_chunks)

    assert table.columns == seen
def test_empty_cast():
    """Casting empty arrays between common types must not crash."""
    types = [
        pa.null(), pa.bool_(),
        pa.int8(), pa.int16(), pa.int32(), pa.int64(),
        pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64(),
        pa.float16(), pa.float32(), pa.float64(),
        pa.date32(), pa.date64(),
        pa.binary(), pa.binary(length=4), pa.string(),
    ]

    for source_type, target_type in itertools.product(types, repeat=2):
        try:
            # ARROW-4766: Ensure that supported types conversion don't segfault
            # on empty arrays of common types
            pa.array([], type=source_type).cast(target_type)
        except pa.lib.ArrowNotImplementedError:
            # Unsupported pairs are fine; only crashes matter here.
            continue
def test_struct_array_field():
    """StructArray.field accepts positive/negative indices and field names."""
    ty = pa.struct([pa.field('x', pa.int16()), pa.field('y', pa.float32())])
    a = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty)

    x_pos, y_pos = a.field(0), a.field(1)
    x_neg, y_neg = a.field(-2), a.field(-1)
    x_name, y_name = a.field('x'), a.field('y')

    assert isinstance(x_pos, pa.lib.Int16Array)
    assert isinstance(y_neg, pa.lib.FloatArray)
    assert x_pos.equals(pa.array([1, 3, 5], type=pa.int16()))
    assert y_pos.equals(pa.array([2.5, 4.5, 6.5], type=pa.float32()))

    # Every way of addressing a field yields the same child array.
    assert x_pos.equals(x_neg)
    assert x_pos.equals(x_name)
    assert y_pos.equals(y_neg)
    assert y_pos.equals(y_name)

    rejected = [
        (TypeError, [None, pa.int16()]),
        (IndexError, [3, -3]),
        (KeyError, ['z', '']),
    ]
    for error, selectors in rejected:
        for selector in selectors:
            with pytest.raises(error):
                a.field(selector)
def test_infinite_iterator():
    """``size`` bounds how many items are consumed from an endless iterator."""
    reference = pa.array((0, 1, 2))

    inferred = pa.array(itertools.count(0), size=3)
    assert inferred.equals(reference)

    # Same with explicit type
    typed = pa.array(itertools.count(0), type=pa.int64(), size=3)
    assert typed.equals(reference)
def test_asarray():
    """``np.asarray`` on an Arrow array picks a NumPy dtype from type and nulls."""
    arr = pa.array(range(4))
    plain_values = [0, 1, 2, 3]

    # The iterator interface gives back an array of Int64Value's
    boxed = np.asarray(list(arr))
    assert boxed.tolist() == plain_values
    assert boxed.dtype == np.dtype('O')
    assert type(boxed[0]) == pa.lib.Int64Value

    # Calling with the arrow array gives back an array with 'int64' dtype
    native = np.asarray(arr)
    assert native.tolist() == plain_values
    assert native.dtype == np.dtype('int64')

    # An optional type can be specified when calling np.asarray
    as_text = np.asarray(arr, dtype='str')
    assert as_text.tolist() == ['0', '1', '2', '3']

    # If PyArrow array has null values, numpy type will be changed as needed
    # to support nulls.
    arr = pa.array([0, 1, 2, None])
    assert arr.type == pa.int64()
    with_nan = np.asarray(arr)
    items = with_nan.tolist()
    assert items[:3] == [0., 1., 2.]
    assert np.isnan(items[3])
    assert with_nan.dtype == np.dtype('float64')
def test_recordbatch_basics():
    """Check size accessors, dict conversion and explicit schemas of a batch."""
    arrays = [pa.array(range(5)), pa.array([-10, -5, 0, 5, 10])]

    batch = pa.RecordBatch.from_arrays(arrays, ['c0', 'c1'])
    assert not batch.schema.metadata

    assert len(batch) == 5
    assert batch.num_rows == 5
    assert batch.num_columns == len(arrays)

    as_dict = OrderedDict([
        ('c0', [0, 1, 2, 3, 4]),
        ('c1', [-10, -5, 0, 5, 10]),
    ])
    assert batch.to_pydict() == as_dict

    with pytest.raises(IndexError):
        # bounds checking
        batch[2]

    # Schema passed explicitly
    fields = [pa.field('c0', pa.int16()), pa.field('c1', pa.int32())]
    schema = pa.schema(fields, metadata={b'foo': b'bar'})
    batch = pa.RecordBatch.from_arrays(arrays, schema)
    assert batch.schema == schema
def test_invalid_table_construct():
    """A table cannot be built from columns of different lengths."""
    values = np.array([0, 1], dtype=np.uint8)
    u8 = pa.uint8()
    # Second column is one element shorter than the first.
    unequal_columns = [pa.array(values, type=u8), pa.array(values[1:], type=u8)]

    with pytest.raises(pa.lib.ArrowInvalid):
        pa.Table.from_arrays(unequal_columns, names=["a1", "a2"])
def test_cast_time32_to_int():
    """Casting time32[s] to int32 exposes the underlying integer values."""
    raw = np.array([0, 1, 2], dtype='int32')
    times = pa.array(raw, type=pa.time32('s'))

    as_ints = times.cast('i4')

    assert as_ints.equals(pa.array([0, 1, 2], type='i4'))
def test_cast_timestamp_unit():
    """Timestamp units/timezones survive from_pandas; lossy casts need safe=False."""
    # ARROW-1680
    now = datetime.datetime.now()
    naive_series = pd.Series([now])
    localized = naive_series.dt.tz_localize('tzlocal()')
    nyc_series = localized.dt.tz_convert('America/New_York')

    us_with_tz = pa.timestamp('us', tz='America/New_York')
    tz_arr = pa.Array.from_pandas(nyc_series, type=us_with_tz)

    # ARROW-1906
    assert tz_arr.type == us_with_tz

    naive_arr = pa.Array.from_pandas(naive_series, type=pa.timestamp('us'))

    assert tz_arr[0].as_py() == nyc_series[0]
    assert naive_arr[0].as_py() == naive_series[0]

    # Disallow truncation
    millis = pa.array([123123], type='int64').cast(pa.timestamp('ms'))
    seconds = pa.array([123], type='int64').cast(pa.timestamp('s'))

    target = pa.timestamp('s')
    with pytest.raises(ValueError):
        millis.cast(target)

    truncated = millis.cast(target, safe=False)
    assert truncated.equals(seconds)
def test_buffers_primitive():
    """Check the raw buffers exposed by primitive and binary arrays.

    Covers an int16 array with a null, a slice of it (which shares the
    buffers and only changes the offset), a NumPy-backed int8 array without
    nulls, and a binary array with offsets.
    """
    a = pa.array([1, 2, None, 4], type=pa.int16())
    buffers = a.buffers()
    assert len(buffers) == 2
    null_bitmap = buffers[0].to_pybytes()
    assert 1 <= len(null_bitmap) <= 64  # XXX this is varying
    assert bytearray(null_bitmap)[0] == 0b00001011

    # Slicing does not affect the buffers but the offset
    a_sliced = a[1:]
    buffers = a_sliced.buffers()
    # This comparison used to be a bare expression, so it was never checked.
    assert a_sliced.offset == 1
    assert len(buffers) == 2
    null_bitmap = buffers[0].to_pybytes()
    assert 1 <= len(null_bitmap) <= 64  # XXX this is varying
    assert bytearray(null_bitmap)[0] == 0b00001011

    # 'xx' skips the two bytes of the null slot
    assert struct.unpack('hhxxh', buffers[1].to_pybytes()) == (1, 2, 4)

    a = pa.array(np.int8([4, 5, 6]))
    buffers = a.buffers()
    assert len(buffers) == 2
    # No null bitmap from Numpy int array
    assert buffers[0] is None
    assert struct.unpack('3b', buffers[1].to_pybytes()) == (4, 5, 6)

    a = pa.array([b'foo!', None, b'bar!!'])
    buffers = a.buffers()
    assert len(buffers) == 3
    null_bitmap = buffers[0].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000101
    offsets = buffers[1].to_pybytes()
    assert struct.unpack('4i', offsets) == (0, 4, 4, 9)
    values = buffers[2].to_pybytes()
    assert values == b'foo!bar!!'
def test_cast_timestamp_to_int():
    """Casting timestamp[us] to int64 exposes the underlying integer values."""
    raw = np.array([0, 1, 2], dtype='int64')
    stamps = pa.array(raw, type=pa.timestamp('us'))

    as_ints = stamps.cast('i8')

    assert as_ints.equals(pa.array([0, 1, 2], type='i8'))
def test_buffers_nested():
    """Check the flattened buffer list of list and struct arrays."""

    def first_byte(buf):
        # First byte of a buffer, as an int (used for validity bitmaps).
        return bytearray(buf.to_pybytes())[0]

    a = pa.array([[1, 2], None, [3, None, 4, 5]], type=pa.list_(pa.int64()))
    buffers = a.buffers()
    assert len(buffers) == 4
    parent_validity, parent_offsets, child_validity, child_values = buffers
    # The parent buffers
    assert first_byte(parent_validity) == 0b00000101
    assert struct.unpack('4i', parent_offsets.to_pybytes()) == (0, 2, 2, 6)
    # The child buffers
    assert first_byte(child_validity) == 0b00110111
    unpacked = struct.unpack('qqq8xqq', child_values.to_pybytes())
    assert unpacked == (1, 2, 3, 4, 5)

    struct_type = pa.struct([pa.field('a', pa.int8()),
                             pa.field('b', pa.int16())])
    a = pa.array([(42, None), None, (None, 43)], type=struct_type)
    buffers = a.buffers()
    assert len(buffers) == 5
    parent_validity, a_validity, a_values, b_validity, b_values = buffers
    # The parent buffer
    assert first_byte(parent_validity) == 0b00000101
    # The child buffers: 'a'
    assert first_byte(a_validity) == 0b00000001
    assert struct.unpack('bxx', a_values.to_pybytes()) == (42,)
    # The child buffers: 'b'
    assert first_byte(b_validity) == 0b00000100
    assert struct.unpack('4xh', b_values.to_pybytes()) == (43,)
def test_to_pandas_zero_copy():
    """Check reference counts around ``Array.to_pandas`` results.

    The assertions compare ``sys.getrefcount`` values, so the exact shape of
    each statement matters; do not fold temporaries into the asserts.
    """
    import gc

    arr = pa.array(range(10))

    # Repeated conversions must not accumulate references to the result
    # or to the source array.
    for i in range(10):
        np_arr = arr.to_pandas()
        assert sys.getrefcount(np_arr) == 2
        np_arr = None  # noqa

    assert sys.getrefcount(arr) == 2

    # Drop the Arrow array and collect, then keep using the converted result.
    for i in range(10):
        arr = pa.array(range(10))
        np_arr = arr.to_pandas()
        arr = None
        gc.collect()

        # Ensure base is still valid

        # Because of py.test's assert inspection magic, if you put getrefcount
        # on the line being examined, it will be 1 higher than you expect
        base_refcount = sys.getrefcount(np_arr.base)
        assert base_refcount == 2
        np_arr.sum()
def test_recordbatch_slice():
    """Slicing a batch matches slicing each of its columns."""
    columns = [pa.array(range(5)), pa.array([-10, -5, 0, 5, 10])]
    names = ['c0', 'c1']
    batch = pa.RecordBatch.from_arrays(columns, names)

    # Offset only
    tail = batch.slice(2)
    assert tail.num_rows == 3
    tail_columns = [column.slice(2) for column in columns]
    assert tail.equals(pa.RecordBatch.from_arrays(tail_columns, names))

    # Offset and length
    window = batch.slice(2, 2)
    window_columns = [column.slice(2, 2) for column in columns]
    assert window.equals(pa.RecordBatch.from_arrays(window_columns, names))

    # 0 offset
    assert batch.slice(0).equals(batch)

    # Slice past end of array
    past_end = batch.slice(len(batch))
    assert len(past_end) == 0

    with pytest.raises(IndexError):
        batch.slice(-1)
def test_chunked_array_str(): data = [ pa.array([1, 2, 3]), pa.array([4, 5, 6]) ] data = pa.chunked_array(data) assert str(data) == """[
def test_list_from_arrays():
    """ListArray.from_arrays builds lists from offsets, including null offsets."""
    raw_offsets = np.array([0, 2, 5, 8], dtype='i4')
    offsets = pa.array(raw_offsets, type='int32')
    items = [b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h']
    flat_values = pa.array(items, type='binary')

    built = pa.ListArray.from_arrays(offsets, flat_values)
    reference = pa.array([items[:2], items[2:5], items[5:8]])
    assert built.equals(reference)

    # With nulls
    values = ['a', 'b', 'c', 'd', 'e', 'f']
    built = pa.ListArray.from_arrays([0, None, 2, 6], values)
    reference = pa.array([values[:2], None, values[2:]])
    assert built.equals(reference)

    # Another edge case
    built = pa.ListArray.from_arrays([0, 2, None, 6], values)
    reference = pa.array([values[:2], values[2:], None])
    assert built.equals(reference)
def test_recordbatch_from_arrays_validate_lengths():
    """Columns of different lengths are rejected when building a batch."""
    # ARROW-2820
    ids = pa.array([1])
    tags = pa.array(["tokyo", "like", "happy"])
    owners = pa.array(["derek"])

    with pytest.raises(ValueError):
        pa.RecordBatch.from_arrays([ids, tags, owners], ['id', 'tags', 'name'])
def test_array_slice():
    """``Array.slice`` and ``[]`` slicing agree with each other and with lists."""
    arr = pa.array(range(10))

    # Offset only
    assert arr.slice(2).equals(pa.array(range(2, 10)))

    # Offset and length
    assert arr.slice(2, 4).equals(pa.array(range(2, 6)))

    # 0 offset
    assert arr.slice(0).equals(arr)

    # Slice past end of array
    assert len(arr.slice(len(arr))) == 0

    with pytest.raises(IndexError):
        arr.slice(-1)

    # Test slice notation
    assert arr[2:].equals(arr.slice(2))
    assert arr[2:5].equals(arr.slice(2, 3))
    assert arr[-5:].equals(arr.slice(len(arr) - 5))

    # Strided and reversed slices are not supported.
    with pytest.raises(IndexError):
        arr[::-1]
    with pytest.raises(IndexError):
        arr[::2]

    # Exhaustively compare against list slicing, including out-of-range
    # and negative bounds.
    as_list = arr.to_pylist()
    n = len(arr)
    bounds = range(-n * 2, n * 2)
    for lower in bounds:
        for upper in bounds:
            assert arr[lower:upper].to_pylist() == as_list[lower:upper]
def test_chunked_array_asarray():
    """``np.asarray`` on a chunked array concatenates chunks and handles nulls."""
    chunked = pa.chunked_array([pa.array([0]), pa.array([1, 2, 3])])

    native = np.asarray(chunked)
    assert native.tolist() == [0, 1, 2, 3]
    assert native.dtype == np.dtype('int64')

    # An optional type can be specified when calling np.asarray
    as_text = np.asarray(chunked, dtype='str')
    assert as_text.tolist() == ['0', '1', '2', '3']

    # Types are modified when there are nulls
    chunked = pa.chunked_array([pa.array([1, None]), pa.array([1, 2, 3])])

    with_nan = np.asarray(chunked)
    items = with_nan.tolist()
    assert items[0] == 1.
    assert np.isnan(items[1])
    assert items[2:] == [1., 2., 3.]
    assert with_nan.dtype == np.dtype('float64')
def dataframe_to_types(df, preserve_index, columns=None):
    """Determine the Arrow type of each converted column of ``df``.

    Returns the tuple ``(all_names, types, metadata)`` where ``types`` holds
    one Arrow type per column to convert and ``metadata`` is what
    ``construct_metadata`` produces for those types.
    """
    (all_names,
     column_names,
     index_descriptors,
     index_columns,
     columns_to_convert,
     _) = _get_columns_to_convert(df, None, preserve_index, columns)

    def arrow_type_of(column):
        # Type obtained by actually converting the column.
        def converted_type():
            return pa.array(column, from_pandas=True).type

        values = column.values
        if _pandas_api.is_categorical(values):
            return converted_type()

        values, type_ = get_datetimetz_type(values, column.dtype, None)
        type_ = pa.lib._ndarray_to_arrow_type(values, type_)
        # Fall back to a real conversion when no type could be derived.
        return converted_type() if type_ is None else type_

    # If pandas knows type, skip conversion
    types = [arrow_type_of(c) for c in columns_to_convert]

    metadata = construct_metadata(df, column_names, index_columns,
                                  index_descriptors, preserve_index, types)

    return all_names, types, metadata
def test_sequence_timestamp_from_int_with_unit():
    """Integers convert to timestamps of every unit; unknown objects fail.

    The second half checks that an arbitrary Python object is rejected for
    timestamp, date32 and date64 target types.
    """
    data = [1]

    s = pa.timestamp('s')
    ms = pa.timestamp('ms')
    us = pa.timestamp('us')
    ns = pa.timestamp('ns')

    arr_s = pa.array(data, type=s)
    assert len(arr_s) == 1
    assert arr_s.type == s
    assert str(arr_s[0]) == "Timestamp('1970-01-01 00:00:01')"

    arr_ms = pa.array(data, type=ms)
    assert len(arr_ms) == 1
    assert arr_ms.type == ms
    assert str(arr_ms[0]) == "Timestamp('1970-01-01 00:00:00.001000')"

    arr_us = pa.array(data, type=us)
    assert len(arr_us) == 1
    assert arr_us.type == us
    assert str(arr_us[0]) == "Timestamp('1970-01-01 00:00:00.000001')"

    arr_ns = pa.array(data, type=ns)
    assert len(arr_ns) == 1
    assert arr_ns.type == ns
    assert str(arr_ns[0]) == "Timestamp('1970-01-01 00:00:00.000000001')"

    class CustomClass():
        pass

    # Each conversion needs its own ``pytest.raises`` block: the block exits
    # at the first exception, so sharing one would leave the date32/date64
    # cases unexecuted.
    for ty in (ns, pa.date32(), pa.date64()):
        with pytest.raises(pa.ArrowException):
            pa.array([1, CustomClass()], type=ty)
def test_sequence_timestamp_with_unit():
    """A datetime converts to each timestamp unit, truncated to that unit."""
    data = [
        datetime.datetime(2007, 7, 13, 1, 23, 34, 123456),
    ]

    # (unit, microseconds expected to survive the round trip)
    cases = [
        ('s', 0),
        ('ms', 123000),
        ('us', 123456),
        ('ns', 123456),
    ]
    for unit, microsecond in cases:
        ty = pa.timestamp(unit)
        arr = pa.array(data, type=ty)
        assert len(arr) == 1
        assert arr.type == ty
        assert arr[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                   23, 34, microsecond)
def test_decimal_array_with_none_and_nan():
    """None, float NaN and Decimal NaN all become nulls in decimal arrays."""
    values = [decimal.Decimal('1.234'), None, np.nan, decimal.Decimal('nan')]

    # Inferred precision/scale
    inferred = pa.array(values)
    assert inferred.type == pa.decimal128(4, 3)
    assert inferred.to_pylist() == values[:2] + [None, None]

    # Explicit, wider type
    widened = pa.array(values, type=pa.decimal128(10, 4))
    expected = [decimal.Decimal('1.2340'), None, None, None]
    assert widened.to_pylist() == expected
def test_array_from_pandas_type_cast(self):
    """An explicit ``type`` converts int64 NumPy input to the requested int8."""
    source = np.arange(10, dtype='int64')

    converted = pa.array(source, type=pa.int8())
    reference = pa.array(source.astype('int8'))

    assert converted.equals(reference)
def test_chunked_array_str(): data = [pa.array([1, 2, 3]), pa.array([4, 5, 6])] data = pa.chunked_array(data) assert str(data) == """[
def _to_arrow_array(cls, scalars):
    """Convert ``scalars`` to an Arrow array, letting pyarrow infer the type."""
    arrow_array = pa.array(scalars)
    return arrow_array
def test_in_memory_table_filter(in_memory_pa_table):
    """``InMemoryTable.filter`` mirrors pyarrow's filter and keeps the wrapper type."""
    row_count = len(in_memory_pa_table)
    # Keep every even-indexed row.
    mask = pa.array([index % 2 == 0 for index in range(row_count)])

    filtered = InMemoryTable(in_memory_pa_table).filter(mask)

    assert filtered.table == in_memory_pa_table.filter(mask)
    assert isinstance(filtered, InMemoryTable)
def test_chunked_array_mismatch_types():
    """Chunks of different types cannot be combined into one chunked array."""
    int_chunk = pa.array([1, 2])
    str_chunk = pa.array(['foo', 'bar'])

    with pytest.raises(pa.ArrowInvalid):
        pa.chunked_array([int_chunk, str_chunk])
def test_recordbatch_from_struct_array_invalid():
    """``from_struct_array`` rejects an array that is not a struct array."""
    not_a_struct = pa.array(range(5))

    with pytest.raises(TypeError):
        pa.RecordBatch.from_struct_array(not_a_struct)
def simple_ints_table():
    """Return a one-column table named ``some_ints`` holding five integers."""
    some_ints = pa.array([-10, -5, 0, 5, 10])
    return pa.Table.from_arrays([some_ints], names=['some_ints'])
def _to_record_batch(df):
    """Convert every column of ``df`` to an Arrow array and bundle them in a batch."""
    names = list(df.columns.values)
    arrays = [pa.array(df[name]) for name in names]
    return pa.RecordBatch.from_arrays(arrays, names)
def convert_column(col, ty):
    """Convert ``col`` to an Arrow array of type ``ty`` using pandas semantics."""
    converted = pa.array(col, type=ty, from_pandas=True)
    return converted
def setUp(self):
    """Build the input tables and schema shared by the tests of this case.

    Sets ``self.tables`` to nine single-row Arrow tables with the columns
    ``fa``, ``fb``, ``fc``, ``fd`` and ``label_key``, and ``self.schema`` to
    the matching schema proto parsed from text format.
    """
    super(NonStreamingCustomStatsGeneratorTest, self).setUp()
    # Integration tests involving Beam and AMI are challenging to write
    # because Beam PCollections are unordered while the results of adjusted MI
    # depend on the order of the data for small datasets. This test case tests
    # MI with one label which will give a value of 0 regardless of
    # the ordering of elements in the PCollection. The purpose of this test is
    # to ensure that the Mutual Information pipeline is able to handle a
    # variety of input types. Unit tests ensuring correctness of the MI value
    # itself are included in sklearn_mutual_information_test.

    # fa is categorical, fb is numeric, fc is multivalent and fd has null values
    self.tables = [
        pa.Table.from_arrays([
            pa.array([['Red']]),
            pa.array([[1.0]]),
            pa.array([[1, 3, 1]]),
            pa.array([[0.4]]),
            pa.array([['Label']]),
        ], ['fa', 'fb', 'fc', 'fd', 'label_key']),
        pa.Table.from_arrays([
            pa.array([['Green']]),
            pa.array([[2.2]]),
            pa.array([[2, 6]]),
            pa.array([[0.4]]),
            pa.array([['Label']]),
        ], ['fa', 'fb', 'fc', 'fd', 'label_key']),
        pa.Table.from_arrays([
            pa.array([['Blue']]),
            pa.array([[3.3]]),
            pa.array([[4, 6]]),
            pa.array([[0.3]]),
            pa.array([['Label']]),
        ], ['fa', 'fb', 'fc', 'fd', 'label_key']),
        pa.Table.from_arrays([
            pa.array([['Green']]),
            pa.array([[1.3]]),
            pa.array([None]),
            pa.array([[0.2]]),
            pa.array([['Label']]),
        ], ['fa', 'fb', 'fc', 'fd', 'label_key']),
        pa.Table.from_arrays([
            pa.array([['Red']]),
            pa.array([[1.2]]),
            pa.array([[1]]),
            pa.array([[0.3]]),
            pa.array([['Label']]),
        ], ['fa', 'fb', 'fc', 'fd', 'label_key']),
        pa.Table.from_arrays([
            pa.array([['Blue']]),
            pa.array([[0.5]]),
            pa.array([[3, 2]]),
            pa.array([[0.4]]),
            pa.array([['Label']]),
        ], ['fa', 'fb', 'fc', 'fd', 'label_key']),
        pa.Table.from_arrays([
            pa.array([['Blue']]),
            pa.array([[1.3]]),
            pa.array([[1, 4]]),
            pa.array([[1.7]]),
            pa.array([['Label']]),
        ], ['fa', 'fb', 'fc', 'fd', 'label_key']),
        pa.Table.from_arrays([
            pa.array([['Green']]),
            pa.array([[2.3]]),
            pa.array([[0]]),
            # ``np.nan`` rather than the ``np.NaN`` alias, which NumPy 2.0
            # removed. The explicit list<float64> type is kept from the
            # original.
            pa.array([[np.nan]], type=pa.list_(pa.float64())),
            pa.array([['Label']]),
        ], ['fa', 'fb', 'fc', 'fd', 'label_key']),
        pa.Table.from_arrays([
            pa.array([['Green']]),
            pa.array([[0.3]]),
            pa.array([[3]]),
            pa.array([[4.4]]),
            pa.array([['Label']]),
        ], ['fa', 'fb', 'fc', 'fd', 'label_key']),
    ]

    # One text-format ``feature`` entry per column; adjacent literals are
    # concatenated into a single string.
    self.schema = text_format.Parse(
        ' feature { name: "fa" type: BYTES shape { dim { size: 1 } } }'
        ' feature { name: "fb" type: FLOAT shape { dim { size: 1 } } }'
        ' feature { name: "fc" type: INT value_count: { min: 0 max: 2 } }'
        ' feature { name: "fd" type: FLOAT shape { dim { size: 1 } } }'
        ' feature { name: "label_key" type: BYTES shape { dim { size: 1 } } }',
        schema_pb2.Schema())
def np2arrowArray(x):
    """Wrap a NumPy array as a single-element Arrow array of nested lists.

    Arrays with more than one dimension are first transposed with axes
    ``[2, 0, 1]``.
    """
    is_multidimensional = len(x.shape) > 1
    if is_multidimensional:
        x = np.transpose(x, [2, 0, 1])
    nested = x.tolist()
    return pa.array([nested])
def recurse(obj, mask):
    """Convert ``obj`` to a pyarrow array, dispatching on its awkward0 type.

    ``mask`` is either None or a boolean array that is combined with the
    masks of nested masked arrays (``boolmask(maskedwhen=True)``) and finally
    handed to ``pyarrow.array(..., mask=...)`` at the NumPy leaves.

    Raises TypeError for nested ChunkedArrays and unsupported types, and
    ValueError for string encodings other than None or utf-8.
    """
    if isinstance(obj, numpy.ndarray):
        # Leaf: plain NumPy data, with the accumulated mask applied.
        return pyarrow.array(obj, mask=mask)

    elif isinstance(obj, awkward0.array.chunked.ChunkedArray
                    ):  # includes AppendableArray
        raise TypeError(
            "only top-level ChunkedArrays can be converted to Arrow (as RecordBatches)"
        )

    elif isinstance(obj, awkward0.array.indexed.IndexedArray):
        if mask is None:
            # Without a mask the index can be kept as dictionary indices.
            return pyarrow.DictionaryArray.from_arrays(
                obj.index, recurse(obj.content, mask))
        else:
            # With a mask, materialize the indexed content instead.
            return recurse(obj.content[obj.index], mask)

    elif isinstance(obj, awkward0.array.indexed.SparseArray):
        return recurse(obj.dense, mask)

    elif isinstance(obj, awkward0.array.jagged.JaggedArray):
        obj = obj.compact()
        if mask is not None:
            # Spread the per-list mask over the flattened content.
            mask = obj.tojagged(mask).flatten()
        arrow_type = pyarrow.ListArray
        # 64bit offsets not yet completely golden in arrow
        # if hasattr(pyarrow, 'LargeListArray') and obj.starts.itemsize > 4:
        #     arrow_type = pyarrow.LargeListArray
        return arrow_type.from_arrays(obj.offsets, recurse(obj.content, mask))

    elif isinstance(obj, awkward0.array.masked.IndexedMaskedArray):
        thismask = obj.boolmask(maskedwhen=True)
        if mask is not None:
            thismask = mask | thismask
        if len(obj.content) == 0:
            # No content to index into: allocate a placeholder of mask length.
            content = obj.numpy.empty(len(obj.mask), dtype=obj.DEFAULTTYPE)
        else:
            content = obj.content[obj.mask]
        return recurse(content, thismask)

    elif isinstance(
            obj,
            awkward0.array.masked.MaskedArray):  # includes BitMaskedArray
        thismask = obj.boolmask(maskedwhen=True)
        if mask is not None:
            thismask = mask | thismask
        return recurse(obj.content, thismask)

    elif isinstance(obj, awkward0.array.objects.StringArray):
        # Pick the Arrow array class and a ``convert`` callable that builds
        # it from (length, offsets buffer, content buffer).
        if obj.encoding is None and hasattr(pyarrow.BinaryArray,
                                            'from_buffers'):
            arrow_type = pyarrow.BinaryArray
            arrow_offset_type = pyarrow.binary()
            # 64bit offsets not yet completely golden in arrow
            # if hasattr(pyarrow, 'LargeBinaryArray') and obj.starts.itemsize > 4:
            #     arrow_type = pyarrow.LargeBinaryArray
            #     arrow_offset_type = pyarrow.large_binary()
            convert = lambda length, offsets, content: arrow_type.from_buffers(
                arrow_offset_type, length, [None, offsets, content])
        # NOTE(review): codecs.lookup runs before the ``obj.encoding is None``
        # test, so a None encoding reaching this branch would likely fail
        # inside codecs.lookup — confirm.
        elif codecs.lookup(obj.encoding) is codecs.lookup(
                "utf-8") or obj.encoding is None:
            arrow_type = pyarrow.StringArray
            # if hasattr(pyarrow, 'LargeStringArray') and obj.starts.itemsize > 4:
            #     arrow_type = pyarrow.LargeStringArray
            convert = lambda length, offsets, content: arrow_type.from_buffers(
                length, offsets, content)
        else:
            raise ValueError(
                "only encoding=None or encoding='utf-8' can be converted to Arrow"
            )

        obj = obj.compact()
        offsets = obj.offsets
        # The buffers are built with 32-bit offsets.
        if offsets.dtype != numpy.dtype(numpy.int32):
            offsets = offsets.astype(numpy.int32)

        return convert(
            len(offsets) - 1, pyarrow.py_buffer(offsets),
            pyarrow.py_buffer(obj.content))

    elif isinstance(obj, awkward0.array.objects.ObjectArray):
        # throw away Python object interpretation, which Arrow can't handle while being multilingual
        return recurse(obj.content, mask)

    elif isinstance(obj, awkward0.array.table.Table):
        # One struct field per table column; the same mask goes to each.
        return pyarrow.StructArray.from_arrays(
            [recurse(x, mask) for x in obj.contents.values()],
            list(obj.contents))

    elif isinstance(obj, awkward0.array.union.UnionArray):
        contents = []
        for i, x in enumerate(obj.contents):
            if mask is None:
                thismask = None
            else:
                # Scatter the mask entries tagged for this content to the
                # positions given by obj.index.
                thistags = (obj.tags == i)
                thismask = obj.numpy.empty(len(x), dtype=obj.MASKTYPE)
                thismask[obj.index[thistags]] = mask[
                    thistags]  # hmm... obj.index could have repeats; the Arrow mask in that case would not be well-defined...
            contents.append(recurse(x, thismask))

        return pyarrow.UnionArray.from_dense(
            pyarrow.array(obj.tags.astype(numpy.int8)),
            pyarrow.array(obj.index.astype(numpy.int32)), contents)

    elif isinstance(obj, awkward0.array.virtual.VirtualArray):
        return recurse(obj.array, mask)

    else:
        raise TypeError("cannot convert type {0} to Arrow".format(
            type(obj)))
def test_recordbatch_column_sets_private_name():
    """A column taken from a record batch carries its field name in ``_name``."""
    # ARROW-6429
    column = pa.array([1, 2, 3, 4])
    rb = pa.record_batch([column], names=['a0'])
    first = rb[0]
    assert first._name == 'a0'
def test_recordbatch_empty_metadata():
    """A batch built from arrays and names has no schema metadata."""
    c0 = pa.array(range(5))
    c1 = pa.array([-10, -5, 0, 5, 10])

    batch = pa.RecordBatch.from_arrays([c0, c1], ['c0', 'c1'])

    assert batch.schema.metadata is None
def do_get(self, context, ticket):
    """Check the ticket, then stream a one-column int32 table."""
    assert self.expected_ticket == ticket.ticket
    column = pa.array([-10, -5, 0, 5, 10], type=pa.int32())
    table = pa.Table.from_arrays([column], names=['a'])
    return flight.RecordBatchStream(table)
def as_column(arbitrary):
    """Create a Column from an arbitrary object

    Currently supported inputs are:

    * ``Column``
    * ``Buffer``
    * numba device array
    * numpy array
    * pyarrow array (strings are rejected with ``NotImplementedError``)
    * pandas.Series and pandas.Categorical
    * scalars and any other object accepted by ``pyarrow.array``

    Returns
    -------
    result : subclass of TypedColumnBase
        - CategoricalColumn for pandas.Categorical and pyarrow dictionary
          input.
        - DatetimeColumn for datetime-like input.
        - NumericalColumn for all other inputs.
    """
    from . import numerical, categorical, datetime

    if isinstance(arbitrary, Column):
        if not isinstance(arbitrary, TypedColumnBase):
            # interpret as numeric
            data = arbitrary.view(numerical.NumericalColumn,
                                  dtype=arbitrary.dtype)
        else:
            data = arbitrary

    elif isinstance(arbitrary, Buffer):
        data = numerical.NumericalColumn(data=arbitrary, dtype=arbitrary.dtype)

    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary))
        # Non-empty floating-point device arrays also get a mask derived
        # from the array itself.
        if (data.dtype in [np.float16, np.float32, np.float64] and
                arbitrary.size > 0):
            mask = cudautils.mask_from_devary(arbitrary)
            data = data.set_mask(mask)

    elif isinstance(arbitrary, np.ndarray):
        if arbitrary.dtype.kind == 'M':
            data = datetime.DatetimeColumn.from_numpy(arbitrary)
        else:
            data = as_column(rmm.to_device(arbitrary))

    elif isinstance(arbitrary, pa.Array):
        # For Arrow input, buffers()[0] is the validity bitmap (may be None)
        # and buffers()[1] holds the values.
        if isinstance(arbitrary, pa.StringArray):
            raise NotImplementedError("Strings are not yet supported")
        elif isinstance(arbitrary, pa.NullArray):
            pamask = Buffer(np.empty(0, dtype='int8'))
            padata = Buffer(np.empty(0,
                                     dtype=arbitrary.type.to_pandas_dtype()))
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=0,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()))
        elif isinstance(arbitrary, pa.DictionaryArray):
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(
                np.array(arbitrary.buffers()[1]).view(
                    arbitrary.indices.type.to_pandas_dtype()))
            data = categorical.CategoricalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                categories=arbitrary.dictionary.to_pylist(),
                ordered=arbitrary.type.ordered,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            # Normalize every timestamp unit to milliseconds.
            arbitrary = arbitrary.cast(pa.timestamp('ms'))
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(
                np.array(arbitrary.buffers()[1]).view(np.dtype('M8[ms]')))
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date64Array):
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(
                np.array(arbitrary.buffers()[1]).view(np.dtype('M8[ms]')))
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn(
                "Date32 values are not yet supported so this will "
                "be typecast to a Date64 value", UserWarning)
            arbitrary = arbitrary.cast(pa.date64())
            data = as_column(arbitrary)
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            # ``np.bool_`` instead of the ``np.bool`` alias, which was
            # deprecated in NumPy 1.20 and removed in 1.24; the resulting
            # dtype is the same.
            dtype = np.dtype(np.bool_)
            arbitrary = arbitrary.cast(pa.int8())
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(np.array(arbitrary.buffers()[1]).view(dtype))
            data = numerical.NumericalColumn(data=padata,
                                             mask=pamask,
                                             null_count=arbitrary.null_count,
                                             dtype=dtype)
        else:
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(
                np.array(arbitrary.buffers()[1]).view(
                    np.dtype(arbitrary.type.to_pandas_dtype())))
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()))

    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        data = as_column(pa.array(arbitrary, from_pandas=True))

    elif np.isscalar(arbitrary):
        if hasattr(arbitrary, 'dtype'):
            data_type = _gdf.np_to_pa_dtype(arbitrary.dtype)
            if data_type in (pa.date64(), pa.date32()):
                # PyArrow can't construct date64 or date32 arrays from np
                # datetime types
                arbitrary = arbitrary.astype('int64')
            data = as_column(pa.array([arbitrary], type=data_type))
        else:
            data = as_column(pa.array([arbitrary]))

    else:
        # Anything else: let pyarrow try to interpret it.
        data = as_column(pa.array(arbitrary))

    return data
def do_get(self, context, ticket):
    """Stream a one-column table through a generator of batches."""
    column = pa.array([-10, -5, 0, 5, 10])
    table = pa.Table.from_arrays([column], names=['a'])
    batches = self.number_batches(table)
    return flight.GeneratorStream(table.schema, batches)
def test_concat_tables_with_promotion_error():
    """Promotion cannot reconcile int64 and float32 columns of the same name."""
    int_column = pa.array([1, 2], type=pa.int64())
    float_column = pa.array([1, 2], type=pa.float32())
    t1 = pa.Table.from_arrays([int_column], ["f"])
    t2 = pa.Table.from_arrays([float_column], ["f"])

    with pytest.raises(pa.ArrowInvalid):
        pa.concat_tables([t1, t2], promote=True)
def test_is_dictionary():
    """``types.is_dictionary`` is true only for dictionary types."""
    dictionary_type = pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c']))
    assert types.is_dictionary(dictionary_type)

    plain_type = pa.int32()
    assert not types.is_dictionary(plain_type)
def test_table_column_sets_private_name():
    """A column taken from a table carries its field name in ``_name``."""
    # ARROW-6429
    column = pa.array([1, 2, 3, 4])
    t = pa.table([column], names=['a0'])
    first = t[0]
    assert first._name == 'a0'
def method(self, other):
    """Apply the binary operator ``op`` element-wise between self and ``other``.

    ``other`` may be a scalar or an array-like of the same length. Positions
    where either operand is NA are skipped and reported through the mask.

    Returns an ``ArrowStringArray`` for arithmetic operators (as listed in
    ``ops.ARITHMETIC_BINOPS``) and a ``pd.arrays.BooleanArray`` otherwise.
    """
    is_arithmetic = op.__name__ in ops.ARITHMETIC_BINOPS
    pandas_only = cls._pandas_only()

    is_other_array = False
    if not is_scalar(other):
        is_other_array = True
        other = np.asarray(other)

    self_is_na = self.isna()
    other_is_na = pd.isna(other)
    mask = self_is_na | other_is_na

    if pa is None or pandas_only:
        # No Arrow available/wanted: compute on one flat NumPy array.
        if is_arithmetic:
            ret = np.empty(self.shape, dtype=object)
        else:
            ret = np.zeros(self.shape, dtype=bool)
        valid = ~mask
        arr = self._arrow_array.to_pandas().to_numpy() \
            if self._use_arrow else self._ndarray
        o = other[valid] if is_other_array else other
        ret[valid] = op(arr[valid], o)
        if is_arithmetic:
            return ArrowStringArray(ret)
        else:
            return pd.arrays.BooleanArray(ret, mask)

    # Arrow path: process chunk by chunk, slicing ``mask`` (and ``other``
    # when it is an array) with a running [start, end) window.
    chunks = []
    mask_chunks = []
    start = 0
    for chunk_array in self._arrow_array.chunks:
        chunk_array = np.asarray(chunk_array.to_pandas())
        end = start + len(chunk_array)
        chunk_mask = mask[start:end]
        chunk_valid = ~chunk_mask

        if is_arithmetic:
            result = np.empty(chunk_array.shape, dtype=object)
        else:
            result = np.zeros(chunk_array.shape, dtype=bool)

        chunk_other = other
        if is_other_array:
            chunk_other = other[start:end]
            chunk_other = chunk_other[chunk_valid]

        # calculate only for both not None
        result[chunk_valid] = op(chunk_array[chunk_valid], chunk_other)

        if is_arithmetic:
            chunks.append(
                pa.array(result, type=pa.string(), from_pandas=True))
        else:
            chunks.append(result)
        mask_chunks.append(chunk_mask)

        # Advance the window. Without this, every chunk after the first
        # would reuse the mask/other slice that starts at index 0.
        start = end

    if is_arithmetic:
        return ArrowStringArray(pa.chunked_array(chunks))
    else:
        return pd.arrays.BooleanArray(np.concatenate(chunks),
                                      np.concatenate(mask_chunks))
def test_recordbatch_from_arrays_validate_schema():
    """An int array against a list-of-utf8 field must be rejected (ARROW-6263)."""
    # ARROW-6263
    list_of_strings = pa.list_(pa.utf8())
    schema = pa.schema([pa.field('f0', list_of_strings)])
    int_values = pa.array([1, 2])

    with pytest.raises(NotImplementedError):
        pa.record_batch([int_values], schema=schema)
def _to_arrow_array(cls, scalars):
    """Build a pyarrow array from ``scalars`` and cast it to the string type.

    The array is first created with pyarrow's inferred type and only then
    cast, so the return value is whatever ``cast(pa.string())`` produces.
    """
    inferred = pa.array(scalars)
    string_type = pa.string()
    return inferred.cast(string_type)
def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None:
    """Set one or more values inplace.

    Parameters
    ----------
    key : int, ndarray, or slice
        When called from, e.g. ``Series.__setitem__``, ``key`` will be
        one of

        * scalar int
        * ndarray of integers.
        * boolean ndarray
        * slice object

    value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
        value or values to be set of ``key``.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If a scalar position lies outside ``[-len, len)``.
    ValueError
        If a non-scalar value is passed with a scalar key, if a scalar
        value is neither NA nor ``str``, or if the indexer and the values
        differ in length.
    """
    key = check_array_indexer(self, key)

    if is_integer(key):
        if not is_scalar(value):
            raise ValueError("Must pass scalars with scalar indexer")
        elif isna(value):
            value = None
        elif not isinstance(value, str):
            raise ValueError("Scalar must be NA or str")

        # Normalise the position before splicing.  The slices below only
        # rebuild the array correctly for 0 <= key < size: with key == -1
        # the tail slice would be ``[0:]`` (the whole array) and with
        # key >= size the value would be appended, both growing the array.
        size = len(self._data)
        if key < -size or key >= size:
            raise IndexError(
                "index {} is out of bounds for array of length {}".format(
                    key, size))
        if key < 0:
            key += size

        # Slice data and insert in-between
        new_data = [
            *self._data[0:key].chunks,
            pa.array([value], type=pa.string()),
            *self._data[(key + 1):].chunks,
        ]
        self._data = pa.chunked_array(new_data)
    else:
        # Convert to integer indices and iteratively assign.
        # TODO: Make a faster variant of this in Arrow upstream.
        #       This is probably extremely slow.

        # Convert all possible input key types to an array of integers
        if is_bool_dtype(key):
            # TODO(ARROW-9430): Directly support setitem(booleans)
            key_array = np.argwhere(key).flatten()
        elif isinstance(key, slice):
            key_array = np.array(range(len(self))[key])
        else:
            # TODO(ARROW-9431): Directly support setitem(integers)
            key_array = np.asanyarray(key)

        if is_scalar(value):
            value = np.broadcast_to(value, len(key_array))
        else:
            value = np.asarray(value)

        if len(key_array) != len(value):
            raise ValueError("Length of indexer and values mismatch")

        # Each element goes through the scalar branch above, which is
        # where negative positions are normalised and bounds are checked.
        for k, v in zip(key_array, value):
            self[k] = v
def test_in_memory_table_append_column(in_memory_pa_table):
    """``InMemoryTable.append_column`` mirrors the pyarrow call and keeps its type."""
    new_name = "new_field"
    new_column = pa.array(list(range(len(in_memory_pa_table))))

    result = InMemoryTable(in_memory_pa_table).append_column(
        new_name, new_column)

    assert result.table == in_memory_pa_table.append_column(
        new_name, new_column)
    assert isinstance(result, InMemoryTable)
def test_mi_regression_with_float_label_and_numeric_features(self):
    # Float label paired with a feature that is the label itself
    # ("perfect_feature") and with an unrelated float feature
    # ("terrible_feature"); the expected custom stats are pinned below.
    labels = pa.array([
        [0.1], [0.2], [0.8], [0.7], [0.2], [0.3], [0.9],
        [0.4], [0.1], [0.0], [0.4], [0.6], [0.4], [0.8]])
    # Random floats that do not map onto the label
    unrelated = pa.array([
        [0.4], [0.1], [0.4], [0.4], [0.8], [0.7], [0.2],
        [0.1], [0.0], [0.4], [0.8], [0.2], [0.5], [0.1]])

    columns = [labels, labels, unrelated]
    column_names = ["label_key", "perfect_feature", "terrible_feature"]
    batch = pa.RecordBatch.from_arrays(columns, column_names)

    schema_text = """ feature { name: "perfect_feature" type: FLOAT shape { dim { size: 1 } } } feature { name: "terrible_feature" type: FLOAT shape { dim { size: 1 } } } feature { name: "label_key" type: FLOAT shape { dim { size: 1 } } } """
    schema = text_format.Parse(schema_text, schema_pb2.Schema())

    expected_text = """ features { path { step: "perfect_feature" } custom_stats { name: "sklearn_adjusted_mutual_information" num: 1.0096965 } custom_stats { name: "sklearn_mutual_information" num: 1.1622766 } } features { path { step: "terrible_feature" } custom_stats { name: "sklearn_adjusted_mutual_information" num: 0.0211485 } custom_stats { name: "sklearn_mutual_information" num: 0.0211485 } }"""
    expected = text_format.Parse(
        expected_text, statistics_pb2.DatasetFeatureStatistics())

    label_path = types.FeaturePath(["label_key"])
    self._assert_mi_output_equal(batch, expected, schema, label_path)
def take(self, indices: Sequence[int], allow_fill: bool = False,
         fill_value: Any = None) -> "ExtensionArray":
    """
    Take elements from an array.

    Parameters
    ----------
    indices : sequence of int
        Indices to be taken.
    allow_fill : bool, default False
        How to handle negative values in `indices`.

        * False: negative values in `indices` indicate positional indices
          from the right (the default). This is similar to
          :func:`numpy.take`.

        * True: negative values in `indices` indicate
          missing values. These values are set to `fill_value`. Any other
          negative values raise a ``ValueError``.

    fill_value : any, optional
        Fill value to use for NA-indices when `allow_fill` is True.
        This may be ``None``, in which case the default NA value for
        the type, ``self.dtype.na_value``, is used.

        For many ExtensionArrays, there will be two representations of
        `fill_value`: a user-facing "boxed" scalar, and a low-level
        physical NA value. `fill_value` should be the user-facing version,
        and the implementation should handle translating that to the
        physical version for processing the take if necessary.

    Returns
    -------
    ExtensionArray

    Raises
    ------
    IndexError
        When the indices are out of bounds for the array.
    ValueError
        When `indices` contains negative values other than ``-1``
        and `allow_fill` is True.

    See Also
    --------
    numpy.take
    api.extensions.take

    Notes
    -----
    ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
    ``iloc``, when `indices` is a sequence of values. Additionally,
    it's called by :meth:`Series.reindex`, or any other method
    that causes realignment, with a `fill_value`.
    """
    # TODO: Remove once we got rid of the (indices < 0) check
    positions = indices if is_array_like(indices) else np.asanyarray(indices)
    size = len(self._data)

    if size == 0 and (positions >= 0).any():
        raise IndexError("cannot do a non-empty take")
    if positions.size > 0 and positions.max() >= size:
        raise IndexError("out of bounds value in 'indices'.")

    if not allow_fill:
        # TODO(ARROW-9432): Treat negative indices as indices from the right.
        if (positions < 0).any():
            # Don't modify in-place
            positions = np.copy(positions)
            positions[positions < 0] += size
        return type(self)(self._data.take(positions))

    na_positions = positions < 0
    if not na_positions.any():
        # Nothing to fill
        return type(self)(self._data.take(indices))

    validate_indices(positions, size)
    # TODO(ARROW-9433): Treat negative indices as NULL
    masked_positions = pa.array(positions, mask=na_positions)
    taken = self._data.take(masked_positions)
    if isna(fill_value):
        return type(self)(taken)

    # TODO: ArrowNotImplementedError: Function fill_null has no
    # kernel matching input types (array[string], scalar[string]);
    # once it does, pc.fill_null(taken, pa.scalar(fill_value)) could
    # replace the masked assignment below.
    filled = type(self)(taken)
    filled[na_positions] = fill_value
    return filled
def test_date_time_types():
    """Round-trip date, time and timestamp columns through ``_check_roundtrip``.

    Three round trips are checked (default, INT96 timestamps, 'spark'
    flavor), followed by a write of a time64('ns') column that is expected
    to raise ``NotImplementedError``.
    """
    day_numbers = np.array([17259, 17260, 17261], dtype='int32')
    date32_arr = pa.array(day_numbers, type=pa.date32())

    millis = day_numbers.astype('int64') * 86400000
    date64_arr = pa.array(millis, type=pa.date64())

    us_start = pd.Timestamp('2000-01-01').value / 1000
    us_values = np.array([us_start, us_start + 1, us_start + 2],
                         dtype='int64')
    ts_us_arr = pa.array(us_values, type=pa.timestamp('us'))

    small_ints = np.arange(3, dtype='i4')
    time32_ms_arr = pa.array(small_ints, type=pa.time32('ms'))
    time64_us_arr = pa.array(small_ints.astype('int64'),
                             type=pa.time64('us'))
    time32_s_arr = pa.array(small_ints, type=pa.time32('s'))
    expected_time32_ms_arr = pa.array(small_ints * 1000,
                                      type=pa.time32('ms'))

    ns_start = pd.Timestamp('2001-01-01').value
    ns_values = np.array([ns_start, ns_start + 1000, ns_start + 2000],
                         dtype='int64')
    ts_ns_arr = pa.array(ns_values, type=pa.timestamp('ns'))
    ts_ns_as_us_arr = pa.array(ns_values // 1000, type=pa.timestamp('us'))

    names = [
        'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]',
        'time32_from64[s]', 'timestamp[ns]'
    ]

    table = pa.Table.from_arrays(
        [date32_arr, date64_arr, ts_us_arr, time32_ms_arr, time64_us_arr,
         time32_s_arr, ts_ns_arr],
        names)

    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' to 'timestamp[us]'
    expected = pa.Table.from_arrays(
        [date32_arr, date32_arr, ts_us_arr, time32_ms_arr, time64_us_arr,
         expected_time32_ms_arr, ts_ns_as_us_arr],
        names)
    _check_roundtrip(table, expected=expected, version='2.0')

    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' is saved as INT96 timestamp
    expected = pa.Table.from_arrays(
        [date32_arr, date32_arr, ts_us_arr, time32_ms_arr, time64_us_arr,
         expected_time32_ms_arr, ts_ns_arr],
        names)
    _check_roundtrip(table, expected=expected, version='2.0',
                     use_deprecated_int96_timestamps=True)

    # Check that setting flavor to 'spark' uses int96 timestamps
    _check_roundtrip(table, expected=expected, version='2.0', flavor='spark')

    # Unsupported stuff
    def _assert_unsupported(array):
        single_column = pa.Table.from_arrays([array], ['unsupported'])
        sink = io.BytesIO()

        with pytest.raises(NotImplementedError):
            _write_table(single_column, sink, version="2.0")

    time64_ns_arr = pa.array(small_ints.astype('int64'),
                             type=pa.time64('ns'))
    _assert_unsupported(time64_ns_arr)
def _from_sequence(cls, scalars, dtype=None, copy=False):
    """Construct an instance of ``cls`` from a sequence of scalars.

    ``dtype`` and ``copy`` are accepted but not consulted by this body;
    ``lib.ensure_string_array`` is always called with ``copy=False``.
    """
    cls._chk_pyarrow_available()
    # convert non-na-likes to str, and nan-likes to ArrowStringDtype.na_value
    as_strings = lib.ensure_string_array(scalars, copy=False)
    arrow_values = pa.array(as_strings, type=pa.string(), from_pandas=True)
    return cls(arrow_values)
def test_sum(arrow_type):
    """Summing ``[1, 2, 3, 4]`` stored as ``arrow_type`` compares equal to 10."""
    values = pa.array([1, 2, 3, 4], type=arrow_type)
    total = values.sum()
    assert total == 10
def test_mi_classif_with_categorical_all_unique_labels(self):
    # Integer label paired with a categorical feature that tracks it
    # exactly and with a categorical feature whose values are all distinct;
    # only "perfect_feature" appears in the expected statistics.
    labels = pa.array([[0], [2], [0], [1], [2], [1], [1], [0], [2], [1],
                       [0]])
    # A categorical feature that maps directly on to the label.
    perfect = pa.array([["Red"], ["Blue"], ["Red"], ["Green"], ["Blue"],
                        ["Green"], ["Green"], ["Red"], ["Blue"], ["Green"],
                        ["Red"]])
    # A categorical feature that has all values unique.
    all_unique = pa.array([["Red1"], ["Red2"], ["Red3"], ["Red4"], ["Red5"],
                           ["Red6"], ["Red7"], ["Red8"], ["Red9"],
                           ["Red10"], ["Red11"]])

    columns = [labels, perfect, all_unique]
    column_names = ["label_key", "perfect_feature", "unique_feat_array"]
    batch = pa.RecordBatch.from_arrays(columns, column_names)

    schema_text = """ feature { name: "label_key" type: INT int_domain { is_categorical: false } shape { dim { size: 1 } } } feature { name: "perfect_feature" type: BYTES shape { dim { size: 1 } } } feature { name: "unique_feat_array" type: BYTES shape { dim { size: 1 } } } """
    schema = text_format.Parse(schema_text, schema_pb2.Schema())

    # Note: the label is declared as a continuous (non-categorical)
    # feature in the schema, which is reflected in the expected MI values.
    expected_text = """ features { path { step: "perfect_feature" } custom_stats { name: 'sklearn_adjusted_mutual_information' num: 1.7319986 } custom_stats { name: 'sklearn_mutual_information' num: 1.7319986 } }"""
    expected = text_format.Parse(
        expected_text, statistics_pb2.DatasetFeatureStatistics())

    label_path = types.FeaturePath(["label_key"])
    self._assert_mi_output_equal(batch, expected, schema, label_path)