def test_array_slice():
    """Check Array.slice and slice notation against explicitly built arrays."""
    arr = pyarrow.from_pylist(range(10))

    # Offset only.
    tail = arr.slice(2)
    tail_expected = pyarrow.from_pylist(range(2, 10))
    assert tail.equals(tail_expected)

    # Offset plus length.
    window = arr.slice(2, 4)
    window_expected = pyarrow.from_pylist(range(2, 6))
    assert window.equals(window_expected)

    # 0 offset
    assert arr.slice(0).equals(arr)

    # Slice past end of array
    assert len(arr.slice(len(arr))) == 0

    with pytest.raises(IndexError):
        arr.slice(-1)

    # Test slice notation
    assert arr[2:].equals(arr.slice(2))
    assert arr[2:5].equals(arr.slice(2, 3))
    assert arr[-5:].equals(arr.slice(len(arr) - 5))

    # Slices with a step (reversed or strided) are expected to be rejected.
    for stepped in (slice(None, None, -1), slice(None, None, 2)):
        with pytest.raises(IndexError):
            arr[stepped]
def test_recordbatch_slice():
    """Slice a two-column RecordBatch and compare with per-column slices."""
    columns = [pa.from_pylist(range(5)),
               pa.from_pylist([-10, -5, 0, 5, 10])]
    names = ['c0', 'c1']
    batch = pa.RecordBatch.from_arrays(columns, names)

    def batch_of_slices(*slice_args):
        # Reference batch built by slicing every column the same way.
        return pa.RecordBatch.from_arrays(
            [col.slice(*slice_args) for col in columns], names)

    from_offset = batch.slice(2)
    assert from_offset.num_rows == 3
    assert from_offset.equals(batch_of_slices(2))

    offset_and_length = batch.slice(2, 2)
    assert offset_and_length.equals(batch_of_slices(2, 2))

    # 0 offset
    assert batch.slice(0).equals(batch)

    # Slice past end of array
    assert len(batch.slice(len(batch))) == 0

    with pytest.raises(IndexError):
        batch.slice(-1)
def test_concat_tables():
    """Concatenate two named tables and compare with a table built in one go."""
    first_half = [list(range(5)), [-10., -5., 0., 5., 10.]]
    second_half = [list(range(5, 10)), [1., 2., 3., 4., 5.]]
    column_names = ('a', 'b')

    def make_table(columns, table_name):
        arrays = [pa.from_pylist(values) for values in columns]
        return pa.Table.from_arrays(arrays, names=column_names,
                                    name=table_name)

    t1 = make_table(first_half, 'table_name')
    t2 = make_table(second_half, 'table_name')

    result = pa.concat_tables([t1, t2], output_name='foo')
    assert result.name == 'foo'
    assert len(result) == 10

    # Column-wise concatenation of the raw Python lists gives the reference.
    joined = [head + tail for head, tail in zip(first_half, second_half)]
    expected = make_table(joined, 'foo')
    assert result.equals(expected)
def test_recordbatch_slice():
    """RecordBatch.slice must agree with slicing each column individually."""
    arrays = [
        pa.from_pylist(range(5)),
        pa.from_pylist([-10, -5, 0, 5, 10]),
    ]
    field_names = ['c0', 'c1']
    batch = pa.RecordBatch.from_arrays(arrays, field_names)

    # Offset only: rows 2..4 remain.
    tail = batch.slice(2)
    assert tail.num_rows == 3
    tail_columns = [array.slice(2) for array in arrays]
    assert tail.equals(pa.RecordBatch.from_arrays(tail_columns, field_names))

    # Offset and length.
    middle = batch.slice(2, 2)
    middle_columns = [array.slice(2, 2) for array in arrays]
    assert middle.equals(
        pa.RecordBatch.from_arrays(middle_columns, field_names))

    # 0 offset
    assert batch.slice(0).equals(batch)

    # Slice past end of array
    assert len(batch.slice(len(batch))) == 0

    with pytest.raises(IndexError):
        batch.slice(-1)
def test_to_pandas_zero_copy():
    """Check reference counts of arrays and their to_pandas() results."""
    import gc

    source = pyarrow.from_pylist(range(10))

    for _ in range(10):
        converted = source.to_pandas()
        assert sys.getrefcount(converted) == 2
        converted = None  # noqa

    assert sys.getrefcount(source) == 2

    for _ in range(10):
        source = pyarrow.from_pylist(range(10))
        converted = source.to_pandas()
        source = None
        gc.collect()

        # Ensure base is still valid

        # Because of py.test's assert inspection magic, if you put getrefcount
        # on the line being examined, it will be 1 higher than you expect
        base_refs = sys.getrefcount(converted.base)
        assert base_refs == 2
        converted.sum()
def test_recordbatch_basics():
    """A RecordBatch built from two arrays reports 5 rows and 2 columns."""
    columns = [pa.from_pylist(range(5)),
               pa.from_pylist([-10, -5, 0, 5, 10])]
    field_names = ['c0', 'c1']

    # NOTE(review): names are passed before the arrays here, while other
    # tests in this file pass the arrays first -- confirm against the
    # pyarrow version under test.
    batch = pa.RecordBatch.from_arrays(field_names, columns)

    assert len(batch) == 5
    assert batch.num_rows == 5
    assert batch.num_columns == len(columns)
def test_garbage_collection(self):
    """Allocated bytes return to the baseline once a temporary array is gone."""
    import gc

    # Force the cyclic garbage collector to run
    gc.collect()
    baseline = pyarrow.total_allocated_bytes()

    # The result is deliberately not bound to a name.
    pyarrow.from_pylist([1, None, 3, None])
    gc.collect()

    assert pyarrow.total_allocated_bytes() == baseline
def test_garbage_collection(self):
    """Creating and dropping an array must not change total allocated bytes."""
    import gc

    # Force the cyclic garbage collector to run
    gc.collect()
    allocated_at_start = pa.total_allocated_bytes()

    # Intentionally discarded so nothing keeps the array alive.
    pa.from_pylist([1, None, 3, None])
    gc.collect()

    allocated_at_end = pa.total_allocated_bytes()
    assert allocated_at_end == allocated_at_start
def test_recordbatch_basics():
    """Check the dimensions and to_pydict() output of a two-column batch."""
    arrays = [pa.from_pylist(range(5)),
              pa.from_pylist([-10, -5, 0, 5, 10])]

    # NOTE(review): names are passed before the arrays here, while other
    # tests in this file pass the arrays first -- confirm against the
    # pyarrow version under test.
    batch = pa.RecordBatch.from_arrays(['c0', 'c1'], arrays)

    assert len(batch) == 5
    assert batch.num_rows == 5
    assert batch.num_columns == len(arrays)

    as_dict = OrderedDict([
        ('c0', [0, 1, 2, 3, 4]),
        ('c1', [-10, -5, 0, 5, 10]),
    ])
    assert batch.to_pydict() == as_dict
def test_table_remove_column():
    """Removing column 0 leaves a table equal to one built from the rest."""
    arrays = [
        pa.from_pylist(range(5)),
        pa.from_pylist([-10, -5, 0, 5, 10]),
        pa.from_pylist(range(5, 10)),
    ]
    full_table = pa.Table.from_arrays(arrays, names=('a', 'b', 'c'))

    without_first = full_table.remove_column(0)

    remaining = pa.Table.from_arrays(arrays[1:], names=('b', 'c'))
    assert without_first.equals(remaining)
def test_recordbatch_basics():
    """Row and column counts of a small RecordBatch."""
    int_column = pa.from_pylist(range(5))
    signed_column = pa.from_pylist([-10, -5, 0, 5, 10])
    data = [int_column, signed_column]

    # NOTE(review): names are passed before the arrays here, while other
    # tests in this file pass the arrays first -- confirm against the
    # pyarrow version under test.
    batch = pa.RecordBatch.from_arrays(['c0', 'c1'], data)

    assert len(batch) == 5
    assert batch.num_rows == 5
    assert batch.num_columns == len(data)
def test_basics(self):
    """Name, dimensions and chunk iteration of a two-column table."""
    arrays = [A.from_pylist(range(5)),
              A.from_pylist([-10, -5, 0, 5, 10])]
    table = A.Table.from_arrays(('a', 'b'), arrays, 'table_name')

    assert table.name == 'table_name'
    assert len(table) == 5
    assert table.num_rows == 5
    assert table.num_columns == 2
    assert table.shape == (5, 2)

    # Every chunk of every column must be a real object.
    for column in table.itercolumns():
        for piece in column.data.iterchunks():
            assert piece is not None
def test_table_basics():
    """Dimensions, to_pydict() output and chunk iteration of a small table."""
    arrays = [pa.from_pylist(range(5)),
              pa.from_pylist([-10, -5, 0, 5, 10])]
    table = pa.Table.from_arrays(arrays, names=('a', 'b'))

    assert len(table) == 5
    assert table.num_rows == 5
    assert table.num_columns == 2
    assert table.shape == (5, 2)

    as_dict = OrderedDict([
        ('a', [0, 1, 2, 3, 4]),
        ('b', [-10, -5, 0, 5, 10]),
    ])
    assert table.to_pydict() == as_dict

    # Every chunk of every column must be a real object.
    for column in table.itercolumns():
        for piece in column.data.iterchunks():
            assert piece is not None
def test_pandas(self):
    """Convert a two-column table to a DataFrame and spot-check the result."""
    data = [A.from_pylist(range(5)),
            A.from_pylist([-10, -5, 0, 5, 10])]
    table = A.Table.from_arrays(('a', 'b'), data, 'table_name')

    # TODO: Use this part once from_pandas is implemented
    # data = {'a': range(5), 'b': [-10, -5, 0, 5, 10]}
    # df = pd.DataFrame(data)
    # A.Table.from_pandas(df)

    df = table.to_pandas()
    assert set(df.columns) == set(('a', 'b'))
    assert df.shape == (5, 2)
    # Label-based lookup via .loc; the deprecated .ix indexer no longer
    # exists in current pandas releases.
    assert df.loc[0, 'b'] == -10
def test_basics(self):
    """Build a RowBatch from an explicit schema and check its dimensions."""
    columns = [A.from_pylist(range(5)),
               A.from_pylist([-10, -5, 0, 5, 10])]
    expected_rows = 5

    # One field per column, typed after the column it describes.
    fields = [
        A.field('c0', columns[0].type),
        A.field('c1', columns[1].type),
    ]
    schema = A.schema(fields)

    batch = A.RowBatch(schema, expected_rows, columns)

    assert len(batch) == expected_rows
    assert batch.num_rows == expected_rows
    assert batch.num_columns == len(columns)
def test_table_pandas():
    """Table.to_pandas() yields a 5x2 frame with columns 'a' and 'b'."""
    arrays = [pa.from_pylist(range(5)),
              pa.from_pylist([-10, -5, 0, 5, 10])]
    table = pa.Table.from_arrays(arrays, names=('a', 'b'))

    # TODO: Use this part once from_pandas is implemented
    # data = {'a': range(5), 'b': [-10, -5, 0, 5, 10]}
    # df = pd.DataFrame(data)
    # pa.Table.from_pandas(df)

    frame = table.to_pandas()

    assert set(frame.columns) == set(('a', 'b'))
    assert frame.shape == (5, 2)
    assert frame.loc[0, 'b'] == -10
def test_recordbatch_basics():
    """Dimensions and dict conversion of a batch built arrays-first."""
    c0 = pa.from_pylist(range(5))
    c1 = pa.from_pylist([-10, -5, 0, 5, 10])
    data = [c0, c1]

    batch = pa.RecordBatch.from_arrays(data, ['c0', 'c1'])

    assert len(batch) == 5
    assert batch.num_rows == 5
    assert batch.num_columns == len(data)

    expected_dict = OrderedDict([('c0', [0, 1, 2, 3, 4]),
                                 ('c1', [-10, -5, 0, 5, 10])])
    assert batch.to_pydict() == expected_dict
def test_table_basics():
    """Name, dimensions and chunk iteration of a named two-column table."""
    arrays = [pa.from_pylist(range(5)),
              pa.from_pylist([-10, -5, 0, 5, 10])]

    # NOTE(review): names are passed first and the table name third here,
    # while other tests in this file pass the arrays first with names= --
    # confirm against the pyarrow version under test.
    table = pa.Table.from_arrays(('a', 'b'), arrays, 'table_name')

    assert table.name == 'table_name'
    assert len(table) == 5
    assert table.num_rows == 5
    assert table.num_columns == 2
    assert table.shape == (5, 2)

    for column in table.itercolumns():
        for piece in column.data.iterchunks():
            assert piece is not None
def test_fixed_size_bytes(self):
    """Four-byte values convert to binary(4) and round-trip unchanged."""
    values = [b'foof', None, b'barb', b'2346']

    converted = pa.from_pylist(values, type=pa.binary(4))

    assert len(converted) == 4
    assert converted.null_count == 1
    assert converted.type == pa.binary(4)
    assert converted.to_pylist() == values
def test_concat_tables():
    """Concatenating two tables equals building one from the joined lists."""
    top_rows = [list(range(5)), [-10., -5., 0., 5., 10.]]
    bottom_rows = [list(range(5, 10)), [1., 2., 3., 4., 5.]]

    def as_table(columns):
        return pa.Table.from_arrays(
            [pa.from_pylist(values) for values in columns],
            names=('a', 'b'))

    t1 = as_table(top_rows)
    t2 = as_table(bottom_rows)

    combined = pa.concat_tables([t1, t2])
    assert len(combined) == 10

    # Join the raw lists column by column to get the reference table.
    reference = as_table(
        [upper + lower for upper, lower in zip(top_rows, bottom_rows)])
    assert combined.equals(reference)
def test_table_pandas():
    """Convert a named table to pandas and spot-check columns and shape."""
    columns = [
        pa.from_pylist(range(5)),
        pa.from_pylist([-10, -5, 0, 5, 10]),
    ]

    # NOTE(review): names are passed first and the table name third here,
    # while other tests in this file pass the arrays first with names= --
    # confirm against the pyarrow version under test.
    table = pa.Table.from_arrays(('a', 'b'), columns, 'table_name')

    # TODO: Use this part once from_pandas is implemented
    # data = {'a': range(5), 'b': [-10, -5, 0, 5, 10]}
    # df = pd.DataFrame(data)
    # pa.Table.from_pandas(df)

    frame = table.to_pandas()

    assert set(frame.columns) == set(('a', 'b'))
    assert frame.shape == (5, 2)
    assert frame.loc[0, 'b'] == -10
def test_from_array(self):
    """Column.from_array gives equal columns for a name or a field."""
    values = pa.from_pylist([0, 1, 2, 3, 4])

    from_name = pa.Column.from_array('foo', values)

    foo_field = pa.field('foo', values.type)
    from_field = pa.Column.from_array(foo_field, values)

    assert from_name.equals(from_field)
def test_unicode(self):
    """Unicode strings (one None) convert to a string array and round-trip."""
    words = [u'foo', u'bar', None, u'mañana']

    converted = pa.from_pylist(words)

    assert len(converted) == 4
    assert converted.null_count == 1
    assert converted.type == pa.string()
    assert converted.to_pylist() == words
def test_integer(self):
    """Integers mixed with None convert to int64 with two nulls."""
    values = [1, None, 3, None]

    converted = pa.from_pylist(values)

    assert len(converted) == 4
    assert converted.null_count == 2
    assert converted.type == pa.int64()
    assert converted.to_pylist() == values
def test_integer(self):
    """An int list containing None becomes an int64 array and round-trips."""
    source = [1, None, 3, None]
    result = pyarrow.from_pylist(source)

    # Size and null bookkeeping.
    assert len(result) == 4
    assert result.null_count == 2

    # Inferred type and round trip.
    assert result.type == pyarrow.int64()
    assert result.to_pylist() == source
def test_double(self):
    """Mixed float/int input with None values converts to float64."""
    values = [1.5, 1, None, 2.5, None, None]

    converted = pa.from_pylist(values)

    assert len(converted) == 6
    assert converted.null_count == 3
    assert converted.type == pa.float64()
    assert converted.to_pylist() == values
def test_unicode(self):
    """Unicode input, including a non-ASCII word, becomes a string array."""
    source = [u'foo', u'bar', None, u'mañana']
    result = pyarrow.from_pylist(source)

    # Size and null bookkeeping.
    assert len(result) == 4
    assert result.null_count == 1

    # Inferred type and round trip.
    assert result.type == pyarrow.string()
    assert result.to_pylist() == source
def test_list_of_int(self):
    """Nested int lists convert to list<int64>; empty and None entries kept."""
    source = [[1, 2, 3], [], None, [1, 2]]
    result = pyarrow.from_pylist(source)

    # Size and null bookkeeping.
    assert len(result) == 4
    assert result.null_count == 1

    # Inferred type and round trip.
    assert result.type == pyarrow.list_(pyarrow.int64())
    assert result.to_pylist() == source
def test_list_of_int(self):
    """Lists of ints (with an empty list and a None) convert to list<int64>."""
    nested = [[1, 2, 3], [], None, [1, 2]]

    converted = pa.from_pylist(nested)

    assert len(converted) == 4
    assert converted.null_count == 1
    assert converted.type == pa.list_(pa.int64())
    assert converted.to_pylist() == nested
def test_unicode(self):
    """Strings built with the u() helper convert to a string array."""
    source = [u("foo"), u("bar"), None, u("arrow")]
    result = pyarrow.from_pylist(source)

    assert len(result) == 4
    assert result.null_count == 1
    assert result.type == pyarrow.string()

    # Compare against a freshly built list rather than the input object.
    round_trip = [u("foo"), u("bar"), None, u("arrow")]
    assert result.to_pylist() == round_trip
def test_boolean(self):
    """Booleans mixed with None convert to a bool array with two nulls."""
    source = [True, None, False, None]
    result = pyarrow.from_pylist(source)

    # Size and null bookkeeping.
    assert len(result) == 4
    assert result.null_count == 2

    # Inferred type and round trip.
    assert result.type == pyarrow.bool_()
    assert result.to_pylist() == source
def test_unicode(self):
    """u()-wrapped strings with one None become a string array."""
    values = [u('foo'), u('bar'), None, u('arrow')]

    converted = pyarrow.from_pylist(values)

    assert len(converted) == 4
    assert converted.null_count == 1
    assert converted.type == pyarrow.string()

    # The expected list is rebuilt instead of reusing the input object.
    rebuilt = [u('foo'), u('bar'), None, u('arrow')]
    assert converted.to_pylist() == rebuilt
def test_boolean(self):
    """A bool list containing None converts to bool_ and round-trips."""
    flags = [True, None, False, None]

    converted = pa.from_pylist(flags)

    assert len(converted) == 4
    assert converted.null_count == 2
    assert converted.type == pa.bool_()
    assert converted.to_pylist() == flags
def test_double(self):
    """Float/int input with three None values converts to a double array."""
    source = [1.5, 1, None, 2.5, None, None]
    result = pyarrow.from_pylist(source)

    # Size and null bookkeeping.
    assert len(result) == 6
    assert result.null_count == 3

    # Inferred type and round trip.
    assert result.type == pyarrow.double()
    assert result.to_pylist() == source
def test_pandas(self):
    """Column.to_pandas() yields a Series carrying the column's name."""
    arrays = [pa.from_pylist([-10, -5, 0, 5, 10])]
    table = pa.Table.from_arrays(arrays, names=['a'])

    first_column = table.column(0)
    as_series = first_column.to_pandas()

    assert as_series.name == 'a'
    assert as_series.shape == (5, )
    assert as_series.iloc[0] == -10
def test_decimal_different_precisions(self):
    """Decimals with differing digit counts round-trip through decimal(13, 3)."""
    values = [
        decimal.Decimal('1234234983.183'),
        decimal.Decimal('80943244.234'),
    ]
    # Named so that the builtin `type` is not shadowed.
    decimal_type = pa.decimal(precision=13, scale=3)

    converted = pa.from_pylist(values, type=decimal_type)

    assert converted.to_pylist() == values
def test_bytes(self):
    """Mixed bytes/unicode input converts to binary; text comes back as bytes."""
    utf8_bytes = b"ma\xc3\xb1ana"
    as_text = utf8_bytes.decode("utf-8")

    # unicode gets encoded,
    source = [b"foo", as_text, None]
    result = pyarrow.from_pylist(source)

    assert len(result) == 3
    assert result.null_count == 1
    assert result.type == pyarrow.binary()
    assert result.to_pylist() == [b"foo", utf8_bytes, None]
def test_decimal_large_integer(self):
    """Decimals with large integer parts round-trip through decimal(23, 5)."""
    values = [
        decimal.Decimal('-394029506937548693.42983'),
        decimal.Decimal('32358695912932.01033'),
    ]
    # Named so that the builtin `type` is not shadowed.
    wide_decimal = pa.decimal(precision=23, scale=5)

    converted = pa.from_pylist(values, type=wide_decimal)

    assert converted.to_pylist() == values
def test_table_basics():
    """Shape, dict conversion and chunk iteration of a two-column table."""
    col_a = pa.from_pylist(range(5))
    col_b = pa.from_pylist([-10, -5, 0, 5, 10])
    table = pa.Table.from_arrays([col_a, col_b], names=('a', 'b'))

    assert len(table) == 5
    assert table.num_rows == 5
    assert table.num_columns == 2
    assert table.shape == (5, 2)

    expected_dict = OrderedDict([('a', [0, 1, 2, 3, 4]),
                                 ('b', [-10, -5, 0, 5, 10])])
    assert table.to_pydict() == expected_dict

    # Walk every chunk of every column lazily; none may be None.
    assert all(
        piece is not None
        for column in table.itercolumns()
        for piece in column.data.iterchunks()
    )
def test_basics(self):
    """Name, length, shape and values of a table's only column."""
    arrays = [pa.from_pylist([-10, -5, 0, 5, 10])]
    table = pa.Table.from_arrays(arrays, names=['a'])

    only_column = table.column(0)

    assert only_column.name == 'a'
    assert only_column.length() == 5
    assert len(only_column) == 5
    assert only_column.shape == (5, )
    assert only_column.to_pylist() == [-10, -5, 0, 5, 10]
def test_bool(self):
    """Indexing a bool array yields a BooleanValue, or NA for null slots."""
    flags = A.from_pylist([True, None, False, None])

    first = flags[0]
    assert isinstance(first, A.BooleanValue)
    assert repr(first) == "True"
    assert first.as_py() is True

    # Null slots come back as the NA singleton.
    assert flags[1] is A.NA
def test_bool(self):
    """Indexing a bool array yields a BooleanValue, or NA for null slots."""
    arr = A.from_pylist([True, None, False, None])

    v = arr[0]
    assert isinstance(v, A.BooleanValue)
    assert repr(v) == "True"
    # Identity check: `== True` would also accept 1 or 1.0, so it could not
    # detect as_py() returning a non-bool truthy value.
    assert v.as_py() is True

    # Null slots come back as the NA singleton.
    assert arr[1] is A.NA
def test_int64(self):
    """Indexing an int array yields an Int64Value, or NA for null slots."""
    numbers = A.from_pylist([1, 2, None])

    first = numbers[0]
    assert isinstance(first, A.Int64Value)
    assert repr(first) == "1"
    assert first.as_py() == 1

    # Null slots come back as the NA singleton.
    assert numbers[2] is A.NA
def test_string_format(self):
    """fmt.array_format output for a string array with an empty and a null."""
    arr = pyarrow.from_pylist(['', None, 'foo'])
    result = fmt.array_format(arr)

    # NOTE(review): this literal was reconstructed from a whitespace-
    # collapsed source; the line breaks and the two-space item indent are
    # assumed -- verify against fmt.array_format's actual output.
    expected = """\
[
  '',
  NA,
  'foo'
]"""
    assert result == expected
def test_bytes(self):
    """Bytes and unicode mixed together convert to a binary array."""
    encoded = b'ma\xc3\xb1ana'
    values = [
        b'foo',
        encoded.decode('utf-8'),  # unicode gets encoded,
        None,
    ]

    converted = pyarrow.from_pylist(values)

    assert len(converted) == 3
    assert converted.null_count == 1
    assert converted.type == pyarrow.binary()
    assert converted.to_pylist() == [b'foo', encoded, None]
def test_date(self):
    """datetime.date values (one None) convert to a date array element-wise."""
    dates = [
        datetime.date(2000, 1, 1),
        None,
        datetime.date(1970, 1, 1),
        datetime.date(2040, 2, 26),
    ]

    converted = pyarrow.from_pylist(dates)

    assert len(converted) == 4
    assert converted.type == pyarrow.date()
    assert converted.null_count == 1

    # Check each slot individually through its scalar's as_py().
    assert converted[0].as_py() == datetime.date(2000, 1, 1)
    assert converted[1].as_py() is None
    assert converted[2].as_py() == datetime.date(1970, 1, 1)
    assert converted[3].as_py() == datetime.date(2040, 2, 26)
def test_pandas(self):
    """A column of a named table converts to a Series named after it."""
    values = pa.from_pylist([-10, -5, 0, 5, 10])
    table = pa.Table.from_arrays([values], names=['a'], name='table_name')

    as_series = table.column(0).to_pandas()

    assert as_series.name == 'a'
    assert as_series.shape == (5,)
    assert as_series.iloc[0] == -10
def test_string_format():
    """fmt.array_format output for a string array with an empty and a null."""
    arr = pyarrow.from_pylist(['', None, 'foo'])
    result = fmt.array_format(arr)

    # NOTE(review): this literal was reconstructed from a whitespace-
    # collapsed source; the line breaks and the two-space item indent are
    # assumed -- verify against fmt.array_format's actual output.
    expected = """\
[
  '',
  NA,
  'foo'
]"""
    assert result == expected
def test_basics(self):
    """Name, length, shape and values of the single column of a named table."""
    values = pa.from_pylist([-10, -5, 0, 5, 10])
    table = pa.Table.from_arrays([values], names=['a'], name='table_name')

    col = table.column(0)

    assert col.name == 'a'
    assert col.length() == 5
    assert len(col) == 5
    assert col.shape == (5,)
    assert col.to_pylist() == [-10, -5, 0, 5, 10]
def test_list_format(self):
    """fmt.array_format output for a list array containing a null entry."""
    arr = pyarrow.from_pylist([[1], None, [2, 3]])
    result = fmt.array_format(arr)

    # NOTE(review): this literal was reconstructed from a whitespace-
    # collapsed source. The outer line breaks and two-space indent are
    # assumed, and the nested list is assumed to be broken one item per
    # line -- verify against fmt.array_format's actual output.
    expected = """\
[
  [1],
  NA,
  [2,
   3]
]"""
    assert result == expected
def test_bytes(self):
    """Bytes plus a unicode string convert to binary; text returns as bytes."""
    raw = b'ma\xc3\xb1ana'
    decoded = raw.decode('utf-8')

    # unicode gets encoded,
    source = [b'foo', decoded, None]
    result = pa.from_pylist(source)

    assert len(result) == 3
    assert result.null_count == 1
    assert result.type == pa.binary()
    assert result.to_pylist() == [b'foo', raw, None]
def test_bytes(self):
    """Scalars of a bytes/unicode array are BinaryValue and yield bytes."""
    arr = A.from_pylist([b'foo', None, u('bar')])

    first = arr[0]
    assert isinstance(first, A.BinaryValue)
    assert first.as_py() == b'foo'

    # Null slots come back as the NA singleton.
    assert arr[1] is A.NA

    # The unicode input must come back as bytes.
    last_py = arr[2].as_py()
    assert last_py == b'bar'
    assert isinstance(last_py, bytes)
def test_long_array_format(self):
    """fmt.array_format with window=2 elides the middle of a 100-item array."""
    arr = pyarrow.from_pylist(range(100))
    result = fmt.array_format(arr, window=2)

    # NOTE(review): this literal was reconstructed from a whitespace-
    # collapsed source; the line breaks and the two-space item indent are
    # assumed -- verify against fmt.array_format's actual output.
    expected = """\
[
  0,
  1,
  ...
  98,
  99
]"""
    assert result == expected
def test_double(self):
    """Indexing a float array yields DoubleValue; an int input reads as 3.0."""
    numbers = A.from_pylist([1.5, None, 3])

    first = numbers[0]
    assert isinstance(first, A.DoubleValue)
    assert repr(first) == "1.5"
    assert first.as_py() == 1.5

    # Null slots come back as the NA singleton.
    assert numbers[1] is A.NA

    third = numbers[2]
    assert third.as_py() == 3.0