Beispiel #1
0
def test_reduce_max_min_no_nulls():
    """Check ``max``/``min`` reductions on arrays built without a validity bitmap.

    Each array is rebuilt through ``pa.Array.from_buffers`` with ``None`` in
    the first buffer slot, reusing only the data buffer of a regular
    ``FletcherArray`` holding the same values.
    """
    values = [1, 2, 3, -23, 75]

    def without_validity_buffer(arrow_type):
        # Rebuild the array from its second buffer only; the first slot,
        # which the source array fills itself, is passed as None.
        source = fr.FletcherArray(values, dtype=arrow_type)
        data_buffer = source.data.chunk(0).buffers()[1]
        rebuilt = pa.Array.from_buffers(
            type=arrow_type,
            length=len(source),
            buffers=[None, data_buffer],
        )
        return fr.FletcherArray(rebuilt)

    int_array = without_validity_buffer(pa.int64())
    float_array = without_validity_buffer(pa.float64())

    int_max, int_min = [int_array._reduce(op) for op in ("max", "min")]
    float_max, float_min = [float_array._reduce(op) for op in ("max", "min")]

    assert int_max == 75
    assert int_min == -23
    assert float_max == 75.0
    assert float_min == -23.0
Beispiel #2
0
def test_take_list_arrays():
    """Check ``take`` on list-typed arrays, with nulls and across chunks.

    Covers three inputs: a single-chunk list array containing a null element,
    a two-chunk ``ListArray`` and a two-chunk ``LargeListArray``. The chunked
    inputs hold ``[[0, 1], [4], [3, 5]]`` twice, so taking positions
    ``[0, 5, 1]`` must give ``[[0, 1], [3, 5], [4]]``.
    """
    values = [0, 1, 4, 3, 5]
    offsets = [0, 2, 3, 5]
    list_array = pa.ListArray.from_arrays(offsets, values)
    large_list_array = pa.LargeListArray.from_arrays(offsets, values)

    test_with_null = fr.FletcherArray(pa.array([[1, 2], [None, 3], [4, 5]]))
    taken_with_null = pa.array(test_with_null.take([1, 2, 1])).to_pylist()
    assert taken_with_null == [[None, 3], [4, 5], [None, 3]]

    take_indices = [0, 5, 1]
    test = fr.FletcherArray(
        pa.chunked_array([list_array, list_array])).take(take_indices)
    test_large = fr.FletcherArray(
        pa.chunked_array([large_list_array,
                          large_list_array])).take(take_indices)

    expected = [[0, 1], [3, 5], [4]]

    for taken in (test, test_large):
        # Guard against a result that is shorter than expected; comparing
        # only the rows that exist would otherwise pass silently.
        assert len(taken) == len(expected)
        for row, expected_row in enumerate(expected):
            # Rows have different lengths, so each row is converted on its
            # own: ``np.array(expected)`` on the ragged nested list raises
            # ValueError on recent NumPy versions.
            assert np.all(np.array(taken[row]) == np.array(expected_row))
Beispiel #3
0
def test_take_on_chunks_with_many_chunks():
    """Check ``_take_on_chunks`` on a 100-chunk array for two index layouts.

    Both cases pass precomputed ``limits_idx`` and ``cum_lengths`` arrays;
    the second case additionally passes a ``sort_idx`` and uses indices that
    were reordered with it. The expected values are read from the
    concatenation of all chunks.
    """
    chunks = [[1, 2, 3] for _ in range(100)]
    fr_test = fr.FletcherArray(pa.chunked_array(chunks))
    flat_values = np.concatenate(chunks)
    chunk_starts = np.arange(0, 300, 3)

    # Case 1: bins will be already sorted, so no sort index is supplied.
    presorted_indices = np.array([(30 * k + (k % 3)) for k in range(0, 10)])
    presorted_limits = np.array([0] + [k // 10 for k in range(10, 110)])

    # Case 2: bins will have to be sorted; all even positions of the
    # repeated [2, 5] pattern come first, then all odd positions.
    reorder = np.concatenate([np.arange(0, 200, 2), np.arange(1, 200, 2)])
    reordered_indices = np.array([2, 5] * 100)[reorder]
    reordered_limits = np.array([0] + [100] + [200] * 99)

    cases = (
        (presorted_indices, presorted_limits, None),
        (reordered_indices, reordered_limits, reorder),
    )
    for indices, limits_idx, sort_idx in cases:
        expected_result = fr.FletcherArray(
            [flat_values[position] for position in indices])
        result = fr_test._take_on_chunks(
            indices,
            limits_idx=limits_idx,
            cum_lengths=chunk_starts,
            sort_idx=sort_idx,
        )
        npt.assert_array_equal(expected_result, result)
Beispiel #4
0
def test_concatenate_blocks():
    """Concatenating two FletcherArray-backed Series keeps values and nulls.

    The Series built from ``TEST_ARRAY`` is concatenated with itself and the
    result must equal a Series holding ``["Test", "string", None]`` twice.
    """
    v1 = fr.FletcherArray(TEST_ARRAY)
    # The internal ``fastpath`` keyword of the Series constructor is
    # deprecated in pandas; the regular constructor builds the same Series.
    s = pd.Series(v1, index=pd.RangeIndex(3))
    result = pd.concat([s, s], ignore_index=True)
    expected = pd.Series(
        fr.FletcherArray(
            pa.array(["Test", "string", None, "Test", "string", None])))
    tm.assert_series_equal(result, expected)
Beispiel #5
0
def test_take():
    """Check ``take`` with a repeated index pattern on a three-chunk array.

    Positions 4, 2 and 8 of the flattened data hold 4, 8 and 7, so repeating
    the index triple 100 times must yield ``[4, 8, 7]`` 100 times.
    """
    repeats = 100
    chunks = [[1, 2, 8, 3], [4, 1, 5, 6], [7, 8, 9]]
    fr_test = fr.FletcherArray(pa.chunked_array(chunks))

    result = fr_test.take([4, 2, 8] * repeats)

    expected_chunks = [[4, 8, 7]] * repeats
    expected_result = fr.FletcherArray(pa.chunked_array(expected_chunks))
    npt.assert_array_equal(expected_result, result)
Beispiel #6
0
def test_take_on_concatenated_chunks():
    """Check ``_take_on_concatenated_chunks`` against plain NumPy indexing."""
    chunks = [[1, 2, 8, 3], [4, 1, 5, 6], [7, 8, 9]]
    positions = np.array([4, 2, 8])

    fr_test = fr.FletcherArray(pa.chunked_array(chunks))
    result = fr_test._take_on_concatenated_chunks(positions)

    # The reference values come from the flattened chunk contents.
    flat_values = np.concatenate(chunks)
    expected_result = fr.FletcherArray([flat_values[p] for p in positions])
    npt.assert_array_equal(expected_result, result)
Beispiel #7
0
 def setup(self):
     """Build the boolean Series used by this benchmark.

     Creates 2**24 ``False`` values as a FletcherArray-backed Series and as
     a float32 NumPy-backed Series, plus a second pair whose last element
     is ``None``.
     """
     size = 2 ** 24
     flags = np.zeros(size, dtype=bool)
     self.fr_data = pd.Series(fr.FletcherArray(pa.array(flags)))
     self.np_data = pd.Series(flags.astype(np.float32))

     # Object dtype is required so that the last slot can hold None.
     flags_with_null = flags.astype(object)
     flags_with_null[-1] = None
     self.fr_data_withna = pd.Series(
         fr.FletcherArray(pa.array(flags_with_null)))
     self.np_data_withna = pd.Series(flags_with_null.astype(np.float32))
Beispiel #8
0
def test_indices_dtype():
    """Check that ``_indices_dtype`` switches from int32 to int64 by length.

    An array of ``int32 max + 1`` elements must report int32, one of
    ``int32 max + 2`` elements int64, and in both cases the last valid
    position must survive a round trip through that dtype.
    """
    int32_max = np.iinfo(np.int32).max
    arr1 = fr.FletcherArray(np.zeros(int32_max + 1))
    arr2 = fr.FletcherArray(np.zeros(int32_max + 2))

    for arr in (arr1, arr2):
        last_position = len(arr) - 1
        round_tripped = np.array([last_position], dtype=arr._indices_dtype)[0]
        npt.assert_equal(last_position, round_tripped)

    npt.assert_equal(arr1._indices_dtype, np.dtype(np.int32))
    npt.assert_equal(arr2._indices_dtype, np.dtype(np.int64))
Beispiel #9
0
def test_pandas_from_arrow():
    """Check ``fr.pandas_from_arrow`` for an Arrow array and a record batch.

    An array must convert to a FletcherArray-backed Series, a record batch
    to a DataFrame whose column is FletcherArray-backed.
    """
    arr = pa.array(["a", "b", "c"], pa.string())

    series_result = fr.pandas_from_arrow(arr)
    pdt.assert_series_equal(pd.Series(fr.FletcherArray(arr)), series_result)

    batch = pa.RecordBatch.from_arrays([arr], ["column"])
    frame_result = fr.pandas_from_arrow(batch)
    pdt.assert_frame_equal(
        pd.DataFrame({"column": fr.FletcherArray(arr)}), frame_result)
Beispiel #10
0
def test_reduce_mean():
    """Check the ``mean`` reduction on chunked int64 and float64 data.

    The data holds five non-null values summing to 9 plus one null, so the
    expected mean is ``9 / 5``.
    """
    chunks = [[1, 2, 3], [1, 2, None]]
    expected_mean = 9 / 5

    arrays = [
        fr.FletcherArray(pa.chunked_array(chunks), dtype=arrow_type)
        for arrow_type in (pa.int64(), pa.float64())
    ]
    means = [array._reduce("mean") for array in arrays]

    for mean in means:
        assert mean == expected_mean
Beispiel #11
0
def test_take_on_chunks():
    """Check ``_take_on_chunks`` on three chunks with one index per chunk.

    ``limits_idx`` and ``cum_lengths`` are passed precomputed; the expected
    values are read from the concatenation of all chunks.
    """
    chunks = [[1, 2, 8, 3], [4, 1, 5, 6], [7, 8, 9]]
    positions = np.array([2, 4, 8])

    fr_test = fr.FletcherArray(pa.chunked_array(chunks))
    result = fr_test._take_on_chunks(
        positions,
        limits_idx=np.array([0, 1, 2, 3]),
        cum_lengths=np.array([0, 4, 8]),
    )

    flat_values = np.concatenate(chunks)
    expected_result = fr.FletcherArray([flat_values[p] for p in positions])
    npt.assert_array_equal(expected_result, result)
Beispiel #12
0
def test_take_on_concatenated_chunks_with_many_chunks():
    """Check ``_take_on_concatenated_chunks`` on a 100-chunk array.

    Two index sets are used: ten indices spread over the array, and the
    pair ``[2, 5]`` repeated 100 times.
    """
    chunks = [[1, 2, 3] for _ in range(100)]
    fr_test = fr.FletcherArray(pa.chunked_array(chunks))
    flat_values = np.concatenate(chunks)

    index_sets = (
        np.array([(30 * k + (k % 3)) for k in range(0, 10)]),
        np.array([2, 5] * 100),
    )
    for positions in index_sets:
        expected_result = fr.FletcherArray(
            [flat_values[p] for p in positions])
        result = fr_test._take_on_concatenated_chunks(positions)
        npt.assert_array_equal(expected_result, result)
Beispiel #13
0
def test_dataframe_from_series_no_dict():
    """A DataFrame built directly from a Series matches the dict form.

    An unnamed Series must end up in column ``0``; a Series named ``"A"``
    must end up in column ``"A"``.
    """
    unnamed = pd.Series(fr.FletcherArray(TEST_ARRAY))
    tm.assert_frame_equal(pd.DataFrame(unnamed), pd.DataFrame({0: unnamed}))

    named = pd.Series(fr.FletcherArray(TEST_ARRAY), name="A")
    tm.assert_frame_equal(pd.DataFrame(named), pd.DataFrame({"A": named}))
Beispiel #14
0
def test_flatten():
    """Check ``flatten`` on a nested list array and on all-empty lists."""
    nested = fr.FletcherArray([[1, 2], [], [3, 4]])

    only_empty_lists = [[], [], [], []]
    chunked_empty = fr.FletcherArray(
        pa.chunked_array([only_empty_lists, only_empty_lists]))

    # Empty inner lists contribute nothing to the flattened result.
    npt.assert_array_equal(nested.flatten(), fr.FletcherArray([1, 2, 3, 4]))
    npt.assert_array_equal(chunked_empty.flatten(),
                           np.array([], dtype=np.int64))
Beispiel #15
0
    def test_arrow_array_types(self):  # noqa: F811
        """Check ``__arrow_array__`` with an explicit target type.

        Covers float to int64 and string to float64. After each conversion
        the first chunk of the wrapped data must still equal the input.
        """
        # Float input requested as int64 (non-safe casting).
        fr_arr = fr.FletcherArray(pa.array([3, None, 4.4]))
        as_int = fr_arr.__arrow_array__(type=pa.int64())
        assert as_int.equals(pa.array([3, None, 4]))
        assert fr_arr.data.chunk(0).equals(pa.array([3, None, 4.4]))

        # String input requested as float64 (non-safe casting).
        fr_arr = fr.FletcherArray(pa.array(["3", "-2", "4.4"]))
        as_float = fr_arr.__arrow_array__(type=pa.float64())
        assert as_float.equals(pa.array([3, -2, 4.4]))
        assert fr_arr.data.chunk(0).equals(pa.array(["3", "-2", "4.4"]))
Beispiel #16
0
def test_fillna_chunked(test_array_chunked):
    """``fillna`` on a chunked FletcherArray Series fills every chunk.

    The expected Series consists of ten chunks, each holding the first two
    entries of ``TEST_LIST`` followed by the fill value.
    """
    filled = pd.Series(fr.FletcherArray(test_array_chunked)).fillna("filled")

    expected_chunk = TEST_LIST[:2] + ["filled"]
    expected_chunks = [pa.array(expected_chunk) for _ in range(10)]
    expected = pd.Series(fr.FletcherArray(pa.chunked_array(expected_chunks)))

    tm.assert_series_equal(filled, expected)
def test_groupby():
    """Grouping by a FletcherArray column sums per key and drops the null key."""
    frame = pd.DataFrame({
        "str": fr.FletcherArray(["a", "a", "b", None]),
        "int": [10, 5, 24, 6],
    })
    summed = frame.groupby("str").sum()

    group_index = pd.Index(["a", "b"], name="str")
    expected = pd.DataFrame({"int": [15, 24]}, index=group_index)
    tm.assert_frame_equal(summed, expected)
Beispiel #18
0
    def setup(self, chunked, value, indices):
        """Prepare the indexer, the value to assign and both data frames.

        ``indices`` selects the indexer kind (``"int"``, ``"int_array"``,
        ``"bool_array"`` or ``"slice"``); ``value`` selects a scalar string
        or an array of strings picked with that indexer. ``chunked`` decides
        whether the FletcherArray column is split into 1000 chunks or kept
        as one.

        Raises:
            NotImplementedError: for an ``"int"`` indexer combined with
                ``"array_value"``.
        """
        strings = generate_test_array(self.n)

        # Every non-scalar indexer selects each fifth position.
        if indices == "int":
            if value == "array_value":
                # This combination is not implemented.
                raise NotImplementedError()
            self.indexer = 50
        elif indices == "int_array":
            self.indexer = list(range(0, self.n, 5))
        elif indices == "bool_array":
            mask = np.zeros(self.n, dtype=bool)
            mask[::5] = True
            self.indexer = mask
        elif indices == "slice":
            self.indexer = slice(0, self.n, 5)

        if value == "scalar_value":
            self.value = "setitem"
        elif value == "array_value":
            labels = np.array([str(number) for number in range(self.n)])
            picked = labels[self.indexer]
            # A one-element selection is reduced to its single element.
            self.value = picked[0] if len(picked) == 1 else picked

        self.df = pd.DataFrame({"str": strings})

        pieces = np.array_split(strings, 1000) if chunked else [strings]
        arrow_chunks = [pa.array(piece, pa.string()) for piece in pieces]
        self.df_ext = pd.DataFrame(
            {"str": fr.FletcherArray(pa.chunked_array(arrow_chunks))})
Beispiel #19
0
def test_setitem_chunked(test_array_chunked):
    """Assigning to one position of a chunked Series is visible on read-back."""
    position = 15
    replacement = "new_value"
    ser = pd.Series(fr.FletcherArray(test_array_chunked))

    # The slot must hold something else first, otherwise the write proves
    # nothing.
    previous = ser[position]
    assert replacement != previous

    ser[position] = replacement
    assert replacement == ser[position]
Beispiel #20
0
def test_max_min_with_offset():
    """Check ``max``/``min`` on a sliced, chunked array that contains nulls."""
    # pyarrow fills the buffer with value zero when there is a null, so the
    # test uses only negative values.
    chunks = [[-30, None, -1, None], [-2, -15, -6]]
    fr_test = fr.FletcherArray(pa.chunked_array(chunks))

    for operation, expected in (("max", -1), ("min", -15)):
        assert fr_test[1:]._reduce(operation) == expected
Beispiel #21
0
def test_nbytes():
    """``nbytes`` must be at least the minimal storage assumed by this test."""
    # Minimal storage usage assumed here:
    validity_bitmap_bytes = 1
    offset_array_bytes = 4
    string_content_bytes = 3
    minimal_bytes = (validity_bitmap_bytes + offset_array_bytes +
                     string_content_bytes)

    array = fr.FletcherArray(pa.array(["A", None, "CC"]))
    assert array.nbytes >= minimal_bytes
Beispiel #22
0
def test_reduce_sum():
    """Check the ``sum`` reduction on chunked data and on an empty array.

    The null in the second chunk must not contribute to the sum, and an
    empty int32 array must sum to 0.
    """
    chunks = [[1, 2, 3], [1, 2, None]]

    int_array = fr.FletcherArray(pa.chunked_array(chunks), dtype=pa.int64())
    float_array = fr.FletcherArray(pa.chunked_array(chunks),
                                   dtype=pa.float64())

    int_sum = int_array._reduce("sum")
    float_sum = float_array._reduce("sum")

    assert int_sum == 9
    assert float_sum == 9.0

    empty = fr.FletcherArray([], dtype=pa.int32())
    assert empty._reduce("sum") == 0
Beispiel #23
0
def test_series_attributes():
    """Check basic Series attributes on a FletcherArray-backed Series."""
    series = pd.Series(fr.FletcherArray(TEST_ARRAY))

    assert series.ndim == 1
    assert series.size == 3
    assert series.values is not None

    # This comparison currently fails with pandas master:
    # https://github.com/pandas-dev/pandas/issues/22414
    transposed = series.T
    assert (transposed == series).all()

    assert series.memory_usage() > 8
Beispiel #24
0
def test_dataframe_constructor():
    """A DataFrame built from a FletcherArray keeps the Fletcher dtype."""
    frame = pd.DataFrame({"A": fr.FletcherArray(TEST_ARRAY)})

    column_dtype = frame.dtypes["A"]
    assert isinstance(column_dtype, fr.FletcherDtype)
    assert frame.shape == (3, 1)

    # Typical DataFrame calls; they only need to run without raising.
    str(frame)
    frame.info()
Beispiel #25
0
def test_text_cat(data):
    """Compare ``fr_text.cat`` with pandas' ``str.cat`` on the same data.

    Each Series is concatenated with a Series built from the same input;
    the Fletcher result is converted to object dtype with ``np.nan`` in the
    missing slots before the comparison.
    """
    contains_nul = any("\x00" in item for item in data if item)
    if contains_nul:
        # pandas cannot handle \x00 characters in tests. pytest.skip does
        # not work properly with hypothesis, so return instead.
        return

    arrow_data = pa.array(data, type=pa.string())
    ser_fr = pd.Series(fr.FletcherArray(arrow_data))
    ser_fr_other = pd.Series(fr.FletcherArray(arrow_data))
    ser_pd = pd.Series(data, dtype=str)

    result_pd = ser_pd.str.cat(ser_pd)
    result_fr = ser_fr.fr_text.cat(ser_fr_other).astype(object)

    # Pandas returns np.nan for NA values in cat, keep this in line.
    missing = result_fr.isna()
    result_fr[missing] = np.nan

    tm.assert_series_equal(result_fr, result_pd)
Beispiel #26
0
def test_factorize():
    """``factorize`` must agree with ``pd.factorize`` on the object view.

    The uniques must come back as a ``FletcherArray``; they are compared
    after conversion to object dtype.
    """
    arr = fr.FletcherArray(TEST_ARRAY)

    expected_labels, expected_uniques = pd.factorize(arr.astype(object))
    labels, uniques = arr.factorize()

    assert isinstance(uniques, fr.FletcherArray)

    npt.assert_array_equal(labels, expected_labels)
    npt.assert_array_equal(uniques.astype(object), expected_uniques)
Beispiel #27
0
def test_access_element_with_np_integers():
    """Scalar indexing must accept NumPy integers of every width."""
    values = [1, 2, 5, 7]
    inputs = fr.FletcherArray(values)

    # One position per integer width, from the widest to the narrowest.
    positions = [np.int64(0), np.int32(1), np.int16(2), np.int8(3)]
    fetched = [inputs[position] for position in positions]

    for actual, expected in zip(fetched, values):
        assert actual == expected
Beispiel #28
0
def test_factorize_with_offset(test, test_with_nulls):
    """Check ``factorize`` on arrays sliced with ``[1:]``.

    For ``test`` the sliced array must factorize to three zero codes and the
    single unique value ``"b"``. For the two-chunk array built from
    ``test_with_nulls``, taking the uniques by the returned indices (with
    ``allow_fill=True``) must reproduce the sliced array.
    """
    sliced = fr.FletcherArray(test)[1:]
    codes, uniques = sliced.factorize()

    npt.assert_array_equal(codes, [0, 0, 0])
    npt.assert_array_equal(uniques, fr.FletcherArray(["b"]))

    doubled = pa.chunked_array([test_with_nulls, test_with_nulls])
    test_with_chunks_and_nulls = fr.FletcherArray(doubled)[1:]
    indices, unique = test_with_chunks_and_nulls.factorize()

    rebuilt = unique.take(indices, allow_fill=True)
    npt.assert_array_equal(test_with_chunks_and_nulls, rebuilt)
Beispiel #29
0
def test_eq():
    """Check ``==`` against a scalar, the array itself and an ndarray.

    The comparison is exercised on the array directly and on a DataFrame
    column holding the same data. The trailing null compares as ``False``.
    A list of a different length must raise ``ValueError``.
    """
    chunks = [[1, 2, 3], [4, 5, 1, None]]
    fr_test = fr.FletcherArray(pa.chunked_array(chunks))
    frame = pd.DataFrame({"a": fr.FletcherArray(pa.chunked_array(chunks))})
    df_test = frame["a"]

    equals_one = fr_test == 1
    npt.assert_array_equal(
        equals_one,
        np.array([True, False, False, False, False, True, False]))

    npt.assert_array_equal(
        fr_test == fr_test,
        np.array([True, True, True, True, True, True, False]))

    npt.assert_array_equal(
        df_test == 2,
        np.array([False, True, False, False, False, False, False]))

    npt.assert_array_equal(
        df_test == np.array([1, 3, 2, 4, 5, 6, 7]),
        np.array([True, False, False, True, True, False, False]),
    )

    with pytest.raises(ValueError) as error_length:
        df_test == [1, 2]

    assert "Lengths must match to compare" == str(error_length.value)
Beispiel #30
0
def test_reduce_max_min():
    """Check ``max``/``min`` reductions on chunked data containing a null."""
    chunks = [[1, 2, 3], [-23, 75, None]]

    int_array = fr.FletcherArray(pa.chunked_array(chunks), dtype=pa.int64())
    float_array = fr.FletcherArray(pa.chunked_array(chunks),
                                   dtype=pa.float64())

    observed = [
        int_array._reduce("max"),
        int_array._reduce("min"),
        float_array._reduce("max"),
        float_array._reduce("min"),
    ]
    expected = [75, -23, 75.0, -23.0]

    for actual, wanted in zip(observed, expected):
        assert actual == wanted