Example #1
0
def test_setitem_scalar():
    """Assigning a scalar at a single position of a fletcher-backed Series."""
    series = pd.Series(fr.FletcherChunkedArray(TEST_ARRAY))
    series[1] = "other_string"
    expected_values = pa.array(["Test", "other_string", None])
    expected = pd.Series(fr.FletcherChunkedArray(expected_values))
    tm.assert_series_equal(series, expected)
Example #2
0
    def setup(self):
        """Create integer and string Series in pandas, fletcher-continuous and
        fletcher-chunked variants (with and without missing values) for the
        benchmarks of this class.
        """
        # Fixed seed so every benchmark run sees identical data.
        np.random.seed(93487)
        # TODO: Is it maybe faster to separate each type into its own Take* class?
        #       It seems like the data is regenerated for each benchmark and thus
        #       is quite some overhead here.
        self.data = np.random.randint(0, 2**20, size=2**24)
        self.pd_int = pd.Series(self.data)
        self.fr_cont_int = pd.Series(fr.FletcherContinuousArray(self.data))
        # NOTE(review): the second chunk slice ends at -1, so the chunked
        # Series is one element shorter than the continuous one — confirm
        # whether this was intended or should be a full split.
        chunked_data = pa.chunked_array([
            pa.array(self.data[0:len(self.data) // 2]),
            pa.array(self.data[len(self.data) // 2:-1]),
        ])
        self.fr_chunked_int = pd.Series(fr.FletcherChunkedArray(chunked_data))

        # Roughly 20% of the entries are marked missing via the mask.
        mask = np.random.rand(2**24) > 0.8
        self.pd_int_na = pd.Series(pd.arrays.IntegerArray(self.data, mask))
        self.fr_cont_int_na = pd.Series(
            fr.FletcherContinuousArray(pa.array(self.data, mask=mask)))
        self.fr_chunked_int_na = pd.Series(
            fr.FletcherChunkedArray(pa.array(self.data, mask=mask)))

        # Smaller dataset for the string variants.
        self.data_small = np.random.randint(0, 2**16, size=2**18)
        self.data_small_missing = self.data_small.copy()
        # Every second element becomes -1 (sentinel used as "missing").
        self.data_small_missing[0:-1:2] = -1
        data_small_str = self.data_small.astype(str)
        self.pd_str = pd.Series(data_small_str)
        self.fr_cont_str = pd.Series(
            fr.FletcherContinuousArray(data_small_str))
        # Same off-by-one as above: second chunk drops the final element.
        data_small_str_chunked = pa.chunked_array([
            pa.array(data_small_str[0:len(data_small_str) // 2]),
            pa.array(data_small_str[len(data_small_str) // 2:-1]),
        ])
        self.fr_chunked_str = pd.Series(
            fr.FletcherChunkedArray(data_small_str_chunked))
def test_concatenate_blocks():
    """pd.concat of two fletcher-backed Series keeps the extension dtype."""
    values = fr.FletcherChunkedArray(TEST_ARRAY)
    s = pd.Series(values, index=pd.RangeIndex(3), fastpath=True)
    result = pd.concat([s, s], ignore_index=True)
    doubled = pa.array(["Test", "string", None, "Test", "string", None])
    expected = pd.Series(fr.FletcherChunkedArray(doubled))
    tm.assert_series_equal(result, expected)
Example #4
0
 def setup(self):
     """Build boolean benchmark Series, fletcher- and numpy-backed."""
     raw = np.zeros(2 ** 24).astype(bool)
     self.fr_data = pd.Series(fr.FletcherChunkedArray(pa.array(raw)))
     self.np_data = pd.Series(raw.astype(np.float32))
     # Object dtype so the final slot can hold an actual None.
     raw_na = np.zeros(2 ** 24).astype(bool).astype(object)
     raw_na[-1] = None
     self.fr_data_withna = pd.Series(fr.FletcherChunkedArray(pa.array(raw_na)))
     self.np_data_withna = pd.Series(raw_na.astype(np.float32))
Example #5
0
def test_dataframe_from_series_no_dict():
    """DataFrame(series) keeps a fletcher column, both unnamed and named."""
    unnamed = pd.Series(fr.FletcherChunkedArray(TEST_ARRAY))
    tm.assert_frame_equal(pd.DataFrame(unnamed), pd.DataFrame({0: unnamed}))

    named = pd.Series(fr.FletcherChunkedArray(TEST_ARRAY), name="A")
    tm.assert_frame_equal(pd.DataFrame(named), pd.DataFrame({"A": named}))
Example #6
0
def test_fillna_chunked(test_array_chunked):
    """fillna replaces the missing entry inside every chunk."""
    ser = pd.Series(fr.FletcherChunkedArray(test_array_chunked))
    ser = ser.fillna("filled")

    filled_list = TEST_LIST[:2] + ["filled"]
    chunked_exp = pa.chunked_array(
        [pa.array(filled_list) for _ in range(10)])
    expected = pd.Series(fr.FletcherChunkedArray(chunked_exp))

    tm.assert_series_equal(ser, expected)
Example #7
0
    def setup(self):
        """Build integer benchmark Series: plain pandas, fletcher-continuous
        and fletcher-chunked, each with and without missing values.
        """
        # No fixed seed here, so the data differs between benchmark runs.
        data = np.random.randint(0, 2**20, size=2**24)
        self.pd_int = pd.Series(data)
        self.fr_cont_int = pd.Series(fr.FletcherContinuousArray(data))
        self.fr_chunked_int = pd.Series(fr.FletcherChunkedArray(data))

        # Roughly 20% of the entries are marked missing via the mask.
        mask = np.random.rand(2**24) > 0.8
        self.pd_int_na = pd.Series(pd.arrays.IntegerArray(data, mask))
        self.fr_cont_int_na = pd.Series(
            fr.FletcherContinuousArray(pa.array(data, mask=mask)))
        self.fr_chunked_int_na = pd.Series(
            fr.FletcherChunkedArray(pa.array(data, mask=mask)))
def test_factorize(test_array_chunked):
    """factorize matches pandas' factorization of the object-dtype cast."""
    arr = fr.FletcherChunkedArray(TEST_ARRAY)
    labels, uniques = arr.factorize()
    exp_labels, exp_uniques = pd.factorize(arr.astype(object))

    assert isinstance(uniques, fr.FletcherChunkedArray)

    npt.assert_array_equal(labels, exp_labels)
    npt.assert_array_equal(uniques.astype(object), exp_uniques)

    # Must also run when the backing ChunkedArray has more than one chunk.
    fr.FletcherChunkedArray(test_array_chunked).factorize()
Example #9
0
def test_bool_np_any():
    """np.any over fletcher boolean arrays mirrors plain any() semantics."""
    assert np.any(fr.FletcherChunkedArray([True, False, None]))
    assert np.any(fr.FletcherChunkedArray([True, False, True]))

    # TODO(pandas-0.26): Uncomment this when BooleanArray landed.
    #   Then we change the behaviour.
    # arr = fr.FletcherChunkedArray([False, False, None])
    # assert np.any(arr) is pd.NA

    assert not np.any(fr.FletcherChunkedArray([False, False, False]))
Example #10
0
def test_groupby():
    """Grouping by a fletcher string column aggregates like object dtype."""
    keys = fr.FletcherChunkedArray(["a", "a", "b", None])
    frame = pd.DataFrame({"str": keys, "int": [10, 5, 24, 6]})
    summed = frame.groupby("str").sum()

    index = pd.Index(["a", "b"], name="str")
    tm.assert_frame_equal(summed, pd.DataFrame({"int": [15, 24]}, index=index))
Example #11
0
def test_isna_empty():
    """isna skips empty chunks and only flags the null slot."""
    chunked = pa.chunked_array([[], [None], [1]], type=pa.int32())
    mask = fr.FletcherChunkedArray(chunked).isna()
    np.testing.assert_array_equal(mask, np.array([True, False]))
Example #12
0
def _do_test_text_strip(str_accessor,
                        fletcher_variant,
                        fletcher_slice_offset,
                        data,
                        strip_method="strip"):
    """Compare a fletcher strip-family method with pandas' str equivalent.

    ``fletcher_slice_offset`` leading nulls are prepended and sliced off again
    so the method is also exercised on an offset array.
    """
    # pytest.skip does not work properly under hypothesis, so return instead
    # (pandas cannot handle \x00 characters in tests).
    if any("\x00" in x for x in data if x):
        return
    ser_pd = pd.Series(data, dtype=str)
    padding = [None for _ in range(fletcher_slice_offset)]
    arrow_data = pa.array(padding + data, type=pa.string())
    make = (fr.FletcherChunkedArray
            if fletcher_variant == "chunked"
            else fr.FletcherContinuousArray)
    ser_fr = pd.Series(make(arrow_data)[fletcher_slice_offset:])

    result_pd = getattr(ser_pd.str, strip_method)()
    result_fr = getattr(getattr(ser_fr, str_accessor), strip_method)()
    result_fr = result_fr.astype(object)
    # Pandas returns np.nan for NA values, keep the fletcher result in line.
    result_fr[result_fr.isna()] = np.nan
    result_pd[result_pd.isna()] = np.nan
    tm.assert_series_equal(result_fr, result_pd)
Example #13
0
def test_setitem_chunked(test_array_chunked):
    """A scalar assignment into a multi-chunk array is visible on read-back."""
    ser = pd.Series(fr.FletcherChunkedArray(test_array_chunked))
    replacement = "new_value"
    assert ser[15] != replacement
    ser[15] = replacement
    assert ser[15] == replacement
Example #14
0
def test_nbytes():
    """nbytes reports at least the minimal Arrow storage footprint."""
    array = fr.FletcherChunkedArray(pa.array(["A", None, "CC"]))
    # Minimal storage usage:
    #   1 byte for the valid bitmap
    #   4 bytes for the offset array
    #   3 bytes for the actual string content
    minimal_storage = 8
    assert array.nbytes >= minimal_storage
Example #15
0
def _fr_series_from_data(data, fletcher_variant, dtype=pa.string()):
    """Wrap *data* in the requested fletcher backend and return a Series."""
    arrow_data = pa.array(data, type=dtype)
    if fletcher_variant == "chunked":
        return pd.Series(fr.FletcherChunkedArray(arrow_data))
    return pd.Series(fr.FletcherContinuousArray(arrow_data))
Example #16
0
    def setup(self, chunked, value, indices):
        """Prepare pandas and fletcher DataFrames plus an indexer and a value
        for __setitem__ benchmarks.

        ``indices`` selects the indexer kind (int / int_array / bool_array /
        slice), ``value`` picks a scalar or array right-hand side, and
        ``chunked`` controls whether the fletcher column is split into 1000
        chunks.
        """
        # assert np.isscalar(values) or len(values) == len(indices)
        array = generate_test_array(self.n)
        if indices == "int":
            if value == "array_value":
                # Assigning an array at a single integer position is undefined.
                raise NotImplementedError()
            self.indexer = 50
        elif indices == "int_array":
            # Every fifth position.
            self.indexer = list(range(0, self.n, 5))
        elif indices == "bool_array":
            # Boolean mask selecting the same every-fifth positions.
            self.indexer = np.zeros(self.n, dtype=bool)
            self.indexer[list(range(0, self.n, 5))] = True
        elif indices == "slice":
            self.indexer = slice(0, self.n, 5)

        if value == "scalar_value":
            self.value = "setitem"
        elif value == "array_value":
            self.value = [str(x) for x in range(self.n)]
            self.value = np.array(self.value)[self.indexer]
            # A length-1 selection degenerates into a scalar assignment.
            if len(self.value) == 1:
                self.value = self.value[0]

        self.df = pd.DataFrame({"str": array})
        if chunked:
            array = np.array_split(array, 1000)
        else:
            array = [array]
        self.df_ext = pd.DataFrame(
            {
                "str": fr.FletcherChunkedArray(
                    pa.chunked_array([pa.array(chunk, pa.string()) for chunk in array])
                )
            }
        )
Example #17
0
    def setup(self):
        """Create string frames with a rare long pattern for match benchmarks."""
        strings = []
        for i in range(2**16):
            base = "a" * 50 + "b" if i % 2 == 0 else "c"
            strings.append(base * 5 + str(i))
        self.pattern = "a" * 30 + "b"

        self.df = pd.DataFrame({"str": strings})
        self.df_ext = pd.DataFrame(
            {"str": fr.FletcherChunkedArray(pa.array(strings, pa.string()))})
Example #18
0
def test_dataframe_constructor():
    """A fletcher array survives DataFrame construction with its dtype."""
    column = fr.FletcherChunkedArray(TEST_ARRAY)
    df = pd.DataFrame({"A": column})
    assert isinstance(df.dtypes["A"], fr.FletcherChunkedDtype)
    assert df.shape == (3, 1)

    # Smoke-test typical DataFrame entry points.
    str(df)
    df.info()
Example #19
0
def test_unique():
    """unique() yields the same values as pandas on the object cast."""
    arr = fr.FletcherChunkedArray(TEST_ARRAY)
    uniques = arr.unique()

    assert isinstance(uniques, fr.FletcherChunkedArray)

    npt.assert_array_equal(
        uniques.astype(object), pd.unique(arr.astype(object)))
Example #20
0
def test_flatten():
    """flatten() concatenates list-array elements, chunk by chunk."""
    list_array = pa.array([[1, 2], [3, 4]])
    flat = fr.FletcherContinuousArray(list_array).flatten()
    npt.assert_array_equal(flat, [1, 2, 3, 4])

    chunked = pa.chunked_array([list_array, list_array])
    flat_chunked = fr.FletcherChunkedArray(chunked).flatten()
    npt.assert_array_equal(flat_chunked, [1, 2, 3, 4, 1, 2, 3, 4])
Example #21
0
def test_factorize():
    """Labels and uniques agree with pandas' object-dtype factorization."""
    arr = fr.FletcherChunkedArray(TEST_ARRAY)
    labels, uniques = arr.factorize()
    exp_labels, exp_uniques = pd.factorize(arr.astype(object))

    assert isinstance(uniques, fr.FletcherChunkedArray)

    npt.assert_array_equal(labels, exp_labels)
    npt.assert_array_equal(uniques.astype(object), exp_uniques)
Example #22
0
def test_pandas_from_arrow():
    """pandas_from_arrow converts Array, RecordBatch and Table to pandas.

    Both backends are checked: the default produces FletcherChunkedArray
    columns, ``continuous=True`` produces FletcherContinuousArray columns.
    """
    arr = pa.array(["a", "b", "c"], pa.string())

    expected_series_woutname = pd.Series(fr.FletcherChunkedArray(arr))
    pdt.assert_series_equal(expected_series_woutname, fr.pandas_from_arrow(arr))

    expected_series_woutname = pd.Series(fr.FletcherContinuousArray(arr))
    pdt.assert_series_equal(
        expected_series_woutname, fr.pandas_from_arrow(arr, continuous=True)
    )

    # Build the RecordBatch and Table once and reuse them for both backends
    # (the original rebuilt an identical Table before the continuous checks).
    rb = pa.RecordBatch.from_arrays([arr], ["column"])
    table = pa.Table.from_arrays([arr], ["column"])

    expected_df = pd.DataFrame({"column": fr.FletcherChunkedArray(arr)})
    pdt.assert_frame_equal(expected_df, fr.pandas_from_arrow(rb))
    pdt.assert_frame_equal(expected_df, fr.pandas_from_arrow(table))

    expected_df = pd.DataFrame({"column": fr.FletcherContinuousArray(arr)})
    pdt.assert_frame_equal(expected_df, fr.pandas_from_arrow(rb, continuous=True))
    pdt.assert_frame_equal(expected_df, fr.pandas_from_arrow(table, continuous=True))
def test_text_cat(data, fletcher_variant, fletcher_variant_2):
    """fr_text.cat between two fletcher backends matches pandas str.cat."""
    # pytest.skip does not work properly under hypothesis, so return instead
    # (pandas cannot handle \x00 characters in tests).
    if any("\x00" in x for x in data if x):
        return
    ser_pd = pd.Series(data, dtype=str)
    arrow_data = pa.array(data, type=pa.string())

    chunked = fr.FletcherChunkedArray
    continuous = fr.FletcherContinuousArray
    make = chunked if fletcher_variant == "chunked" else continuous
    make_other = chunked if fletcher_variant_2 == "chunked" else continuous
    ser_fr = pd.Series(make(arrow_data))
    ser_fr_other = pd.Series(make_other(arrow_data))

    result_pd = ser_pd.str.cat(ser_pd)
    result_fr = ser_fr.fr_text.cat(ser_fr_other)
    result_fr = result_fr.astype(object)
    # Pandas returns np.nan for NA values in cat, keep this in line
    result_fr[result_fr.isna()] = np.nan
    tm.assert_series_equal(result_fr, result_pd)
Example #24
0
def test_text_zfill(data, str_accessor, fletcher_variant):
    """zfill pads fletcher strings to the same width as pandas' str.zfill."""
    # pytest.skip does not work properly under hypothesis, so return instead
    # (pandas cannot handle \x00 characters in tests).
    if any("\x00" in x for x in data if x):
        return
    ser_pd = pd.Series(data, dtype=str)
    longest = ser_pd.map(_optional_len).max()
    if pd.isna(longest):
        longest = 0
    arrow_data = pa.array(data, type=pa.string())
    make = (fr.FletcherChunkedArray
            if fletcher_variant == "chunked"
            else fr.FletcherContinuousArray)
    ser_fr = pd.Series(make(arrow_data))

    width = longest + 1
    result_pd = ser_pd.str.zfill(width)
    result_fr = getattr(ser_fr, str_accessor).zfill(width)
    result_fr = result_fr.astype(object)
    # Pandas returns np.nan for NA values, keep the fletcher result in line.
    result_fr[result_fr.isna()] = np.nan
    tm.assert_series_equal(result_fr, result_pd)
Example #25
0
def test_setitem_chunked_int_index(indices, test_array_chunked):
    """Assignment through an integer-array indexer lands at both positions."""
    ser = pd.Series(fr.FletcherChunkedArray(test_array_chunked))
    ser[indices] = ["int", "index"]
    assert ser[indices[0]] == "int"
    assert ser[indices[1]] == "index"
Example #26
0
 def setup(self):
     """Create matching pandas and fletcher string frames for benchmarking."""
     strings = generate_test_array(2 ** 17)
     self.df = pd.DataFrame({"str": strings})
     fletcher_col = fr.FletcherChunkedArray(pa.array(strings, pa.string()))
     self.df_ext = pd.DataFrame({"str": fletcher_col})
Example #27
0
def test_series_constructor():
    """A Series built from a fletcher array keeps dtype and ExtensionBlock."""
    values = fr.FletcherChunkedArray(TEST_ARRAY)
    series = pd.Series(values)
    assert series.dtype == values.dtype
    assert isinstance(series._data.blocks[0], ExtensionBlock)
Example #28
0
def test_dataframe_from_series():
    """Mixing a fletcher and a categorical column preserves the fletcher dtype."""
    strings = pd.Series(fr.FletcherChunkedArray(TEST_ARRAY))
    cats = pd.Series(pd.Categorical(["a", "b"]))
    frame = pd.DataFrame({"A": strings, "B": cats})
    assert isinstance(frame.dtypes["A"], fr.FletcherChunkedDtype)
Example #29
0
def test_fletcherarray_constructor():
    """Constructing a FletcherChunkedArray from None raises ValueError."""
    with pytest.raises(ValueError):
        fr.FletcherChunkedArray(None)
Example #30
0
def array_inhom_chunks():
    """Return a FletcherChunkedArray built from three unevenly sized chunks."""
    pieces = [list("abc"), list("12345"), list("Z")]
    chunks = [pa.array(piece, pa.string()) for piece in pieces]
    return fr.FletcherChunkedArray(pa.chunked_array(chunks))