def test_setitem_scalar():
    """A scalar assignment by position is reflected in the fletcher series."""
    ser = pd.Series(fr.FletcherChunkedArray(TEST_ARRAY))
    ser[1] = "other_string"
    expected = pd.Series(
        fr.FletcherChunkedArray(pa.array(["Test", "other_string", None]))
    )
    tm.assert_series_equal(ser, expected)
def setup(self):
    """Create integer and string series in pandas-native and fletcher forms.

    Fix: the second chunk of each chunked array was sliced as
    ``[half:-1]``, silently dropping the last element so the chunked
    series were one element shorter than their continuous / pandas
    counterparts and the benchmarks did not compare equal-sized data.
    The slices now run to the end of the array.
    """
    np.random.seed(93487)
    # TODO: Is it maybe faster to separate each type into its own Take* class?
    # It seems like the data is regenerated for each benchmark and thus
    # is quite some overhead here.
    self.data = np.random.randint(0, 2 ** 20, size=2 ** 24)
    self.pd_int = pd.Series(self.data)
    self.fr_cont_int = pd.Series(fr.FletcherContinuousArray(self.data))
    half = len(self.data) // 2
    chunked_data = pa.chunked_array(
        [pa.array(self.data[:half]), pa.array(self.data[half:])]
    )
    self.fr_chunked_int = pd.Series(fr.FletcherChunkedArray(chunked_data))
    mask = np.random.rand(2 ** 24) > 0.8
    self.pd_int_na = pd.Series(pd.arrays.IntegerArray(self.data, mask))
    self.fr_cont_int_na = pd.Series(
        fr.FletcherContinuousArray(pa.array(self.data, mask=mask))
    )
    self.fr_chunked_int_na = pd.Series(
        fr.FletcherChunkedArray(pa.array(self.data, mask=mask))
    )
    self.data_small = np.random.randint(0, 2 ** 16, size=2 ** 18)
    self.data_small_missing = self.data_small.copy()
    # Mark every other element as "missing" via the -1 sentinel.
    self.data_small_missing[0:-1:2] = -1
    data_small_str = self.data_small.astype(str)
    self.pd_str = pd.Series(data_small_str)
    self.fr_cont_str = pd.Series(fr.FletcherContinuousArray(data_small_str))
    half_str = len(data_small_str) // 2
    data_small_str_chunked = pa.chunked_array(
        [
            pa.array(data_small_str[:half_str]),
            pa.array(data_small_str[half_str:]),
        ]
    )
    self.fr_chunked_str = pd.Series(
        fr.FletcherChunkedArray(data_small_str_chunked)
    )
def test_concatenate_blocks():
    """Concatenating two fletcher-backed series yields a fletcher result."""
    chunked = fr.FletcherChunkedArray(TEST_ARRAY)
    ser = pd.Series(chunked, index=pd.RangeIndex(3), fastpath=True)
    result = pd.concat([ser, ser], ignore_index=True)
    expected = pd.Series(
        fr.FletcherChunkedArray(
            pa.array(["Test", "string", None, "Test", "string", None])
        )
    )
    tm.assert_series_equal(result, expected)
def setup(self):
    """Build boolean benchmark series, with and without a missing value."""
    raw = np.zeros(2 ** 24).astype(bool)
    self.fr_data = pd.Series(fr.FletcherChunkedArray(pa.array(raw)))
    self.np_data = pd.Series(raw.astype(np.float32))
    raw_withna = np.zeros(2 ** 24).astype(bool).astype(object)
    raw_withna[-1] = None
    self.fr_data_withna = pd.Series(
        fr.FletcherChunkedArray(pa.array(raw_withna))
    )
    self.np_data_withna = pd.Series(raw_withna.astype(np.float32))
def test_dataframe_from_series_no_dict():
    """DataFrame(series) uses the series name (or 0) as the column label."""
    ser = pd.Series(fr.FletcherChunkedArray(TEST_ARRAY))
    tm.assert_frame_equal(pd.DataFrame(ser), pd.DataFrame({0: ser}))
    named = pd.Series(fr.FletcherChunkedArray(TEST_ARRAY), name="A")
    tm.assert_frame_equal(pd.DataFrame(named), pd.DataFrame({"A": named}))
def test_fillna_chunked(test_array_chunked):
    """fillna replaces missing values in every chunk of a chunked array."""
    result = pd.Series(fr.FletcherChunkedArray(test_array_chunked)).fillna(
        "filled"
    )
    filled_list = TEST_LIST[:2] + ["filled"]
    chunked_exp = pa.chunked_array(
        [pa.array(filled_list) for _ in range(10)]
    )
    expected = pd.Series(fr.FletcherChunkedArray(chunked_exp))
    tm.assert_series_equal(result, expected)
def setup(self):
    """Prepare integer series (with and without NAs) for benchmarking."""
    values = np.random.randint(0, 2 ** 20, size=2 ** 24)
    self.pd_int = pd.Series(values)
    self.fr_cont_int = pd.Series(fr.FletcherContinuousArray(values))
    self.fr_chunked_int = pd.Series(fr.FletcherChunkedArray(values))
    mask = np.random.rand(2 ** 24) > 0.8
    self.pd_int_na = pd.Series(pd.arrays.IntegerArray(values, mask))
    self.fr_cont_int_na = pd.Series(
        fr.FletcherContinuousArray(pa.array(values, mask=mask))
    )
    self.fr_chunked_int_na = pd.Series(
        fr.FletcherChunkedArray(pa.array(values, mask=mask))
    )
def test_factorize(test_array_chunked):
    """factorize matches pandas' object-dtype result and accepts multi-chunk input."""
    arr = fr.FletcherChunkedArray(TEST_ARRAY)
    labels, uniques = arr.factorize()
    expected_labels, expected_uniques = pd.factorize(arr.astype(object))
    assert isinstance(uniques, fr.FletcherChunkedArray)
    npt.assert_array_equal(labels, expected_labels)
    npt.assert_array_equal(uniques.astype(object), expected_uniques)
    # Check that it works with chunked (n_chunks > 1) arrays
    fr.FletcherChunkedArray(test_array_chunked).factorize()
def test_bool_np_any():
    """np.any over fletcher boolean arrays behaves like numpy's any."""
    assert np.any(fr.FletcherChunkedArray([True, False, None]))
    assert np.any(fr.FletcherChunkedArray([True, False, True]))
    # TODO(pandas-0.26): Uncomment this when BooleanArray landed.
    # Then we change the behaviour.
    # arr = fr.FletcherChunkedArray([False, False, None])
    # assert np.any(arr) is pd.NA
    assert not np.any(fr.FletcherChunkedArray([False, False, False]))
def test_groupby():
    """Grouping by a fletcher string column aggregates like pandas (NA key dropped)."""
    keys = fr.FletcherChunkedArray(["a", "a", "b", None])
    df = pd.DataFrame({"str": keys, "int": [10, 5, 24, 6]})
    result = df.groupby("str").sum()
    expected = pd.DataFrame(
        {"int": [15, 24]}, index=pd.Index(["a", "b"], name="str")
    )
    tm.assert_frame_equal(result, expected)
def test_isna_empty():
    """isna skips empty chunks and yields one flag per stored value."""
    arr = fr.FletcherChunkedArray(
        pa.chunked_array([[], [None], [1]], type=pa.int32())
    )
    np.testing.assert_array_equal(arr.isna(), np.array([True, False]))
def _do_test_text_strip(
    str_accessor, fletcher_variant, fletcher_slice_offset, data, strip_method="strip"
):
    """Compare a fletcher strip-style method against the pandas str equivalent."""
    if any("\x00" in x for x in data if x):
        # pytest.skip("pandas cannot handle \\x00 characters in tests")
        # Skip is not working properly with hypothesis
        return
    ser_pd = pd.Series(data, dtype=str)
    # Prefix with NULLs so the slice below starts mid-array.
    arrow_data = pa.array(
        [None for _ in range(fletcher_slice_offset)] + data, type=pa.string()
    )
    if fletcher_variant == "chunked":
        fletcher_arr = fr.FletcherChunkedArray(arrow_data)
    else:
        fletcher_arr = fr.FletcherContinuousArray(arrow_data)
    ser_fr = pd.Series(fletcher_arr[fletcher_slice_offset:])
    result_pd = getattr(ser_pd.str, strip_method)()
    result_fr = getattr(getattr(ser_fr, str_accessor), strip_method)()
    result_fr = result_fr.astype(object)
    # Pandas returns np.nan for NA values in cat, keep this in line
    result_fr[result_fr.isna()] = np.nan
    result_pd[result_pd.isna()] = np.nan
    tm.assert_series_equal(result_fr, result_pd)
def test_setitem_chunked(test_array_chunked):
    """A positional assignment lands in the correct chunk."""
    ser = pd.Series(fr.FletcherChunkedArray(test_array_chunked))
    replacement = "new_value"
    assert replacement != ser[15]
    ser[15] = replacement
    assert ser[15] == replacement
def test_nbytes():
    """nbytes accounts for the validity bitmap, offsets and string payload."""
    arr = fr.FletcherChunkedArray(pa.array(["A", None, "CC"]))
    # Minimal storage usage:
    # 1 byte for the valid bitmap
    # 4 bytes for the offset array
    # 3 bytes for the actual string content
    assert arr.nbytes >= 8
def _fr_series_from_data(data, fletcher_variant, dtype=pa.string()):
    """Wrap *data* in a fletcher-backed Series of the requested variant."""
    arrow_data = pa.array(data, type=dtype)
    if fletcher_variant == "chunked":
        return pd.Series(fr.FletcherChunkedArray(arrow_data))
    return pd.Series(fr.FletcherContinuousArray(arrow_data))
def setup(self, chunked, value, indices):
    """Prepare indexer, value and target frames for __setitem__ benchmarks."""
    # assert np.isscalar(values) or len(values) == len(indices)
    array = generate_test_array(self.n)
    if indices == "int":
        if value == "array_value":
            raise NotImplementedError()
        self.indexer = 50
    elif indices == "int_array":
        self.indexer = list(range(0, self.n, 5))
    elif indices == "bool_array":
        self.indexer = np.zeros(self.n, dtype=bool)
        self.indexer[list(range(0, self.n, 5))] = True
    elif indices == "slice":
        self.indexer = slice(0, self.n, 5)
    if value == "scalar_value":
        self.value = "setitem"
    elif value == "array_value":
        # Select the values matching the indexer; collapse a 1-element
        # selection to a scalar.
        self.value = [str(x) for x in range(self.n)]
        self.value = np.array(self.value)[self.indexer]
        if len(self.value) == 1:
            self.value = self.value[0]
    self.df = pd.DataFrame({"str": array})
    if chunked:
        chunks = np.array_split(array, 1000)
    else:
        chunks = [array]
    self.df_ext = pd.DataFrame(
        {
            "str": fr.FletcherChunkedArray(
                pa.chunked_array(
                    [pa.array(chunk, pa.string()) for chunk in chunks]
                )
            )
        }
    )
def setup(self):
    """Create string data where alternating entries contain the search pattern."""
    array = [
        ("a" * 50 + "b" if i % 2 == 0 else "c") * 5 + str(i)
        for i in range(2 ** 16)
    ]
    self.pattern = "a" * 30 + "b"
    self.df = pd.DataFrame({"str": array})
    self.df_ext = pd.DataFrame(
        {"str": fr.FletcherChunkedArray(pa.array(array, pa.string()))}
    )
def test_dataframe_constructor():
    """A fletcher array can be used directly as a DataFrame column."""
    df = pd.DataFrame({"A": fr.FletcherChunkedArray(TEST_ARRAY)})
    assert isinstance(df.dtypes["A"], fr.FletcherChunkedDtype)
    assert df.shape == (3, 1)
    # Test some calls to typical DataFrame functions
    str(df)
    df.info()
def test_unique():
    """unique() returns a fletcher array matching pandas' object-dtype result."""
    arr = fr.FletcherChunkedArray(TEST_ARRAY)
    expected_uniques = pd.unique(arr.astype(object))
    uniques = arr.unique()
    assert isinstance(uniques, fr.FletcherChunkedArray)
    npt.assert_array_equal(uniques.astype(object), expected_uniques)
def test_flatten():
    """flatten() concatenates list values, across chunks for chunked arrays."""
    list_array = pa.array([[1, 2], [3, 4]])
    npt.assert_array_equal(
        fr.FletcherContinuousArray(list_array).flatten(), [1, 2, 3, 4]
    )
    chunked = pa.chunked_array([list_array, list_array])
    npt.assert_array_equal(
        fr.FletcherChunkedArray(chunked).flatten(), [1, 2, 3, 4, 1, 2, 3, 4]
    )
def test_factorize():
    """factorize yields the same labels and uniques as pandas on object dtype."""
    arr = fr.FletcherChunkedArray(TEST_ARRAY)
    expected_labels, expected_uniques = pd.factorize(arr.astype(object))
    labels, uniques = arr.factorize()
    assert isinstance(uniques, fr.FletcherChunkedArray)
    npt.assert_array_equal(labels, expected_labels)
    npt.assert_array_equal(uniques.astype(object), expected_uniques)
def test_pandas_from_arrow():
    """pandas_from_arrow converts Array, RecordBatch and Table in both variants."""
    arr = pa.array(["a", "b", "c"], pa.string())
    expected_ser = pd.Series(fr.FletcherChunkedArray(arr))
    pdt.assert_series_equal(expected_ser, fr.pandas_from_arrow(arr))
    expected_ser = pd.Series(fr.FletcherContinuousArray(arr))
    pdt.assert_series_equal(
        expected_ser, fr.pandas_from_arrow(arr, continuous=True)
    )
    rb = pa.RecordBatch.from_arrays([arr], ["column"])
    table = pa.Table.from_arrays([arr], ["column"])
    expected_df = pd.DataFrame({"column": fr.FletcherChunkedArray(arr)})
    pdt.assert_frame_equal(expected_df, fr.pandas_from_arrow(rb))
    pdt.assert_frame_equal(expected_df, fr.pandas_from_arrow(table))
    expected_df = pd.DataFrame({"column": fr.FletcherContinuousArray(arr)})
    pdt.assert_frame_equal(
        expected_df, fr.pandas_from_arrow(rb, continuous=True)
    )
    pdt.assert_frame_equal(
        expected_df, fr.pandas_from_arrow(table, continuous=True)
    )
def test_text_cat(data, fletcher_variant, fletcher_variant_2):
    """fr_text.cat matches pandas str.cat for every variant pairing."""
    if any("\x00" in x for x in data if x):
        # pytest.skip("pandas cannot handle \\x00 characters in tests")
        # Skip is not working properly with hypothesis
        return
    ser_pd = pd.Series(data, dtype=str)
    arrow_data = pa.array(data, type=pa.string())
    if fletcher_variant == "chunked":
        ser_fr = pd.Series(fr.FletcherChunkedArray(arrow_data))
    else:
        ser_fr = pd.Series(fr.FletcherContinuousArray(arrow_data))
    if fletcher_variant_2 == "chunked":
        ser_fr_other = pd.Series(fr.FletcherChunkedArray(arrow_data))
    else:
        ser_fr_other = pd.Series(fr.FletcherContinuousArray(arrow_data))
    result_pd = ser_pd.str.cat(ser_pd)
    result_fr = ser_fr.fr_text.cat(ser_fr_other).astype(object)
    # Pandas returns np.nan for NA values in cat, keep this in line
    result_fr[result_fr.isna()] = np.nan
    tm.assert_series_equal(result_fr, result_pd)
def test_text_zfill(data, str_accessor, fletcher_variant):
    """zfill pads to the same width as the pandas implementation."""
    if any("\x00" in x for x in data if x):
        # pytest.skip("pandas cannot handle \\x00 characters in tests")
        # Skip is not working properly with hypothesis
        return
    ser_pd = pd.Series(data, dtype=str)
    max_str_len = ser_pd.map(_optional_len).max()
    if pd.isna(max_str_len):
        max_str_len = 0
    arrow_data = pa.array(data, type=pa.string())
    if fletcher_variant == "chunked":
        ser_fr = pd.Series(fr.FletcherChunkedArray(arrow_data))
    else:
        ser_fr = pd.Series(fr.FletcherContinuousArray(arrow_data))
    result_pd = ser_pd.str.zfill(max_str_len + 1)
    result_fr = getattr(ser_fr, str_accessor).zfill(max_str_len + 1)
    result_fr = result_fr.astype(object)
    # Pandas returns np.nan for NA values in cat, keep this in line
    result_fr[result_fr.isna()] = np.nan
    tm.assert_series_equal(result_fr, result_pd)
def test_setitem_chunked_int_index(indices, test_array_chunked):
    """Assignment via an integer-array indexer works across chunk boundaries."""
    ser = pd.Series(fr.FletcherChunkedArray(test_array_chunked))
    ser[indices] = ["int", "index"]
    assert ser[indices[0]] == "int"
    assert ser[indices[1]] == "index"
def setup(self):
    """Create a string column in both pandas-native and fletcher form."""
    array = generate_test_array(2 ** 17)
    self.df = pd.DataFrame({"str": array})
    self.df_ext = pd.DataFrame(
        {"str": fr.FletcherChunkedArray(pa.array(array, pa.string()))}
    )
def test_series_constructor():
    """A fletcher array round-trips through pd.Series as an ExtensionBlock."""
    arr = fr.FletcherChunkedArray(TEST_ARRAY)
    ser = pd.Series(arr)
    assert ser.dtype == arr.dtype
    assert isinstance(ser._data.blocks[0], ExtensionBlock)
def test_dataframe_from_series():
    """Mixing a fletcher column with another extension dtype keeps its dtype."""
    fletcher_col = pd.Series(fr.FletcherChunkedArray(TEST_ARRAY))
    cat_col = pd.Series(pd.Categorical(["a", "b"]))
    df = pd.DataFrame({"A": fletcher_col, "B": cat_col})
    assert isinstance(df.dtypes["A"], fr.FletcherChunkedDtype)
def test_fletcherarray_constructor():
    """Constructing from None is rejected with a ValueError."""
    with pytest.raises(ValueError):
        fr.FletcherChunkedArray(None)
def array_inhom_chunks():
    """Return a FletcherChunkedArray built from three chunks of different sizes."""
    chunks = [
        pa.array(list("abc"), pa.string()),
        pa.array(list("12345"), pa.string()),
        pa.array(list("Z"), pa.string()),
    ]
    return fr.FletcherChunkedArray(pa.chunked_array(chunks))