def test_arrow_string_array_functions():
    """Compare ``ArrowStringArray`` against ``pd.arrays.StringArray``.

    Each operation is applied to an ``ArrowStringArray`` and to a pandas
    ``StringArray`` built from the same values, and the results are
    asserted to agree.  The whole sequence runs twice: once with the
    ``dataframe.arrow_array.pandas_only`` option set to ``False`` and once
    with it set to ``True``.
    """
    raw = np.array(['abc', 'de', 'eee', '中文'], dtype=object)
    # pandas' StringArray supplies the expected answers
    reference = pd.arrays.StringArray(raw)
    na_arrow = ArrowStringArray(['abc', None, 'eee', '中文'])
    na_reference = pd.arrays.StringArray(
        np.array(['abc', pd.NA, 'eee', '中文'], dtype=object))

    for pandas_only in (False, True):
        options = {'dataframe.arrow_array.pandas_only': pandas_only}
        with option_context(options):
            arrow = ArrowStringArray(raw)

            # --- getitem: scalar positions
            for pos in (1, -1):
                assert arrow[pos] == reference[pos]

            # --- getitem: slices
            for window in (slice(None, 2), slice(1, -1),
                           slice(None, None, 2)):
                assert list(arrow[window]) == list(reference[window])

            # --- getitem: boolean mask
            mask = np.array([len(item) > 2 for item in raw])
            assert list(arrow[mask]) == list(reference[mask])

            # --- getitem: fancy indexing (list and ndarray, with negatives)
            for picks in ([3, 1, 2],
                          [3, -1, 2, -4],
                          np.array([3, -1, 2, -4])):
                assert list(arrow[picks]) == list(reference[picks])

            # --- setitem: mutate copies so the originals stay untouched
            arrow_mut = arrow.copy()
            reference_mut = reference.copy()

            arrow_mut[0] = 'ss'
            reference_mut[0] = 'ss'
            assert list(arrow_mut) == list(reference_mut)

            arrow_mut[1:3] = ['ss1', 'ss2']
            reference_mut[1:3] = ['ss1', 'ss2']
            assert list(arrow_mut) == list(reference_mut)

            # assign from a slice of the same array
            arrow_mut[1:3] = arrow_mut[2:4]
            reference_mut[1:3] = reference_mut[2:4]
            assert list(arrow_mut) == list(reference_mut)

            arrow_mut[2:] = pd.Series(['ss3', 'ss4'])
            reference_mut[2:] = pd.Series(['ss3', 'ss4'])
            assert list(arrow_mut) == list(reference_mut)

            # a list assigned to a scalar position is rejected
            with pytest.raises(ValueError):
                arrow_mut[0] = ['a', 'b']

            arrow_mut[-1] = None
            reference_mut[-1] = None
            # the last slot is now missing, so compare it via isna instead
            assert list(arrow_mut)[:-1] == list(reference_mut)[:-1]
            assert pd.isna(list(arrow_mut)[-1]) is True

            # non-string values are rejected
            with pytest.raises(ValueError):
                arrow_mut[0] = 2

            with pytest.raises(ValueError):
                arrow_mut[:2] = [1, 2]

            # --- to_numpy
            np.testing.assert_array_equal(arrow.to_numpy(),
                                          reference.to_numpy())
            np.testing.assert_array_equal(arrow.to_numpy(copy=True),
                                          reference.to_numpy(copy=True))
            np.testing.assert_array_equal(
                na_arrow.to_numpy(copy=True, na_value='ss'),
                na_reference.to_numpy(copy=True, na_value='ss'))

            # --- fillna
            arrow_filled = na_arrow.fillna('filled')
            reference_filled = na_reference.fillna('filled')
            assert list(arrow_filled) == list(reference_filled)

            # --- astype
            arrow_digits = ArrowStringArray(['1', '10', '100'])
            # pandas' StringArray supplies the expected answers
            reference_digits = pd.arrays.StringArray(
                np.array(['1', '10', '100'], dtype=object))
            np.testing.assert_array_equal(
                arrow_digits.astype(np.int64),
                reference_digits.astype(np.int64))
            np.testing.assert_almost_equal(
                arrow_digits.astype(float),
                reference_digits.astype(float))
            for make_copy in (False, True):
                arrow_cast = arrow_digits.astype(
                    ArrowStringDtype(), copy=make_copy)
                reference_cast = reference_digits.astype(
                    pd.StringDtype(), copy=make_copy)
                assert list(arrow_cast) == list(reference_cast)

            # --- factorize
            arrow_codes, arrow_uniques = arrow.factorize()
            reference_codes, reference_uniques = reference.factorize()
            assert list(arrow_codes) == list(reference_codes)
            assert list(arrow_uniques) == list(reference_uniques)

            # --- nbytes
            object_series = pd.Series(reference.astype(object))
            assert arrow.nbytes < object_series.memory_usage(
                deep=True, index=False)

            # --- memory_usage
            if pandas_only:
                assert arrow.memory_usage(deep=False) == \
                    pd.Series(reference).memory_usage(index=False)
            else:
                assert arrow.memory_usage(deep=True) == arrow.nbytes

            # --- isna, including a copy flagged with _force_use_pandas
            np.testing.assert_array_equal(na_arrow.isna(),
                                          na_reference.isna())
            na_arrow_forced = na_arrow.copy()
            na_arrow_forced._force_use_pandas = True
            np.testing.assert_array_equal(na_arrow_forced.isna(),
                                          na_reference.isna())

            # --- take
            assert list(arrow.take([1, 2, -1])) == \
                list(reference.take([1, 2, -1]))
            assert list(
                arrow.take([1, 2, -1], allow_fill=True).fillna('aa')
            ) == list(
                reference.take([1, 2, -1], allow_fill=True).fillna('aa')
            )
            assert list(
                arrow.take([1, 2, -1], allow_fill=True, fill_value='aa')
            ) == list(
                reference.take([1, 2, -1], allow_fill=True, fill_value='aa')
            )

            # --- shift
            assert list(arrow.shift(2, fill_value='aa')) == \
                list(reference.shift(2, fill_value='aa'))

            # --- value_counts
            assert list(arrow.value_counts()) == \
                list(reference.value_counts())
            assert list(na_arrow.value_counts(dropna=True)) == \
                list(na_reference.value_counts(dropna=True))

            # --- all / any
            assert arrow.all() == reference.all()
            assert arrow.any() == reference.any()

            # --- arithmetic
            assert list(arrow + 's') == list(reference + 's')
            assert list((arrow + na_arrow).fillna('ss')) == \
                list((reference + na_reference).fillna('ss'))

            # --- comparison
            np.testing.assert_array_equal(arrow < 's', reference < 's')
            pd.testing.assert_series_equal(
                pd.Series(arrow < na_arrow),
                pd.Series(reference < na_reference))

            # --- repr
            assert 'ArrowStringArray' in repr(arrow)

            # --- concatenating empty arrays
            empty_arrow = ArrowStringArray(
                pa.chunked_array([], type=pa.string()))
            joined = ArrowStringArray._concat_same_type(
                [empty_arrow, empty_arrow])
            if not pandas_only:
                # chunk layout is only inspected when pandas_only is off
                assert len(joined._arrow_array.chunks) == 1
            pd.testing.assert_series_equal(pd.Series(empty_arrow),
                                           pd.Series(joined))
def testArrowStringArrayFunctions(self):
    """Compare ``ArrowStringArray`` against ``pd.arrays.StringArray``.

    Each operation is applied to an ``ArrowStringArray`` and to a pandas
    ``StringArray`` built from the same values, and the results are
    asserted to agree.
    """

    def assert_same_items(left, right):
        # Compare two array-likes element by element as plain lists.
        self.assertListEqual(list(left), list(right))

    raw = np.array(['abc', 'de', 'eee', '中文'], dtype=object)
    arrow = ArrowStringArray(raw)
    # pandas' StringArray supplies the expected answers
    reference = pd.arrays.StringArray(raw)
    na_arrow = ArrowStringArray(['abc', None, 'eee', '中文'])
    na_reference = pd.arrays.StringArray(
        np.array(['abc', pd.NA, 'eee', '中文'], dtype=object))

    # --- getitem: scalar positions
    for pos in (1, -1):
        self.assertEqual(arrow[pos], reference[pos])

    # --- getitem: slices
    for window in (slice(None, 2), slice(1, -1), slice(None, None, 2)):
        assert_same_items(arrow[window], reference[window])

    # --- getitem: boolean mask
    mask = np.array([len(item) > 2 for item in raw])
    assert_same_items(arrow[mask], reference[mask])

    # --- getitem: fancy indexing (list and ndarray, with negatives)
    for picks in ([3, 1, 2],
                  [3, -1, 2, -4],
                  np.array([3, -1, 2, -4])):
        assert_same_items(arrow[picks], reference[picks])

    # --- setitem: mutate copies so the originals stay untouched
    arrow_mut = arrow.copy()
    reference_mut = reference.copy()

    arrow_mut[0] = 'ss'
    reference_mut[0] = 'ss'
    assert_same_items(arrow_mut, reference_mut)

    arrow_mut[1:3] = ['ss1', 'ss2']
    reference_mut[1:3] = ['ss1', 'ss2']
    assert_same_items(arrow_mut, reference_mut)

    # assign from a slice of the same array
    arrow_mut[1:3] = arrow_mut[2:4]
    reference_mut[1:3] = reference_mut[2:4]
    assert_same_items(arrow_mut, reference_mut)

    arrow_mut[2:] = pd.Series(['ss3', 'ss4'])
    reference_mut[2:] = pd.Series(['ss3', 'ss4'])
    assert_same_items(arrow_mut, reference_mut)

    # a list assigned to a scalar position is rejected
    with self.assertRaises(ValueError):
        arrow_mut[0] = ['a', 'b']

    arrow_mut[-1] = None
    reference_mut[-1] = None
    # the last slot is now missing, so compare it via isna instead
    self.assertListEqual(list(arrow_mut)[:-1], list(reference_mut)[:-1])
    self.assertTrue(pd.isna(list(arrow_mut)[-1]))

    # non-string values are rejected
    with self.assertRaises(ValueError):
        arrow_mut[0] = 2

    with self.assertRaises(ValueError):
        arrow_mut[:2] = [1, 2]

    # --- to_numpy
    np.testing.assert_array_equal(arrow.to_numpy(), reference.to_numpy())
    np.testing.assert_array_equal(arrow.to_numpy(copy=True),
                                  reference.to_numpy(copy=True))
    np.testing.assert_array_equal(
        na_arrow.to_numpy(copy=True, na_value='ss'),
        na_reference.to_numpy(copy=True, na_value='ss'))

    # --- fillna
    arrow_filled = na_arrow.fillna('filled')
    reference_filled = na_reference.fillna('filled')
    assert_same_items(arrow_filled, reference_filled)

    # --- astype
    arrow_digits = ArrowStringArray(['1', '10', '100'])
    # pandas' StringArray supplies the expected answers
    reference_digits = pd.arrays.StringArray(
        np.array(['1', '10', '100'], dtype=object))
    np.testing.assert_array_equal(arrow_digits.astype(np.int64),
                                  reference_digits.astype(np.int64))
    np.testing.assert_almost_equal(arrow_digits.astype(float),
                                   reference_digits.astype(float))
    for make_copy in (False, True):
        assert_same_items(
            arrow_digits.astype(ArrowStringDtype(), copy=make_copy),
            reference_digits.astype(pd.StringDtype(), copy=make_copy))

    # --- factorize
    arrow_codes, arrow_uniques = arrow.factorize()
    reference_codes, reference_uniques = reference.factorize()
    assert_same_items(arrow_codes, reference_codes)
    assert_same_items(arrow_uniques, reference_uniques)

    # --- nbytes
    reference_usage = pd.Series(reference).memory_usage(deep=True)
    self.assertLess(arrow.nbytes, reference_usage)

    # --- memory_usage
    self.assertEqual(arrow.memory_usage(deep=True), arrow.nbytes)

    # --- isna, including a copy flagged with _force_use_pandas
    np.testing.assert_array_equal(na_arrow.isna(), na_reference.isna())
    na_arrow_forced = na_arrow.copy()
    na_arrow_forced._force_use_pandas = True
    np.testing.assert_array_equal(na_arrow_forced.isna(),
                                  na_reference.isna())

    # --- take
    assert_same_items(arrow.take([1, 2, -1]),
                      reference.take([1, 2, -1]))
    assert_same_items(
        arrow.take([1, 2, -1], allow_fill=True).fillna('aa'),
        reference.take([1, 2, -1], allow_fill=True).fillna('aa'))
    assert_same_items(
        arrow.take([1, 2, -1], allow_fill=True, fill_value='aa'),
        reference.take([1, 2, -1], allow_fill=True, fill_value='aa'))

    # --- shift
    assert_same_items(arrow.shift(2, fill_value='aa'),
                      reference.shift(2, fill_value='aa'))

    # --- value_counts
    assert_same_items(arrow.value_counts(), reference.value_counts())
    assert_same_items(na_arrow.value_counts(dropna=True),
                      na_reference.value_counts(dropna=True))

    # --- all / any
    self.assertEqual(arrow.all(), reference.all())
    self.assertEqual(arrow.any(), reference.any())

    # --- arithmetic
    assert_same_items(arrow + 's', reference + 's')
    assert_same_items((arrow + na_arrow).fillna('ss'),
                      (reference + na_reference).fillna('ss'))

    # --- comparison
    np.testing.assert_array_equal(arrow < 's', reference < 's')
    pd.testing.assert_series_equal(
        pd.Series(arrow < na_arrow),
        pd.Series(reference < na_reference))

    # --- repr
    self.assertIn('ArrowStringArray', repr(arrow))

    # --- concatenating empty arrays
    empty_arrow = ArrowStringArray(pa.chunked_array([], type=pa.string()))
    joined = ArrowStringArray._concat_same_type([empty_arrow, empty_arrow])
    self.assertEqual(len(joined._arrow_array.chunks), 1)
    pd.testing.assert_series_equal(pd.Series(empty_arrow),
                                   pd.Series(joined))