def test_factorize_int() -> None:
    """Check codes and uniques returned by ``factorize`` on an int32 RLE array."""
    rle = RLEArray._from_sequence([42, -10, -10], dtype=RLEDtype(np.int32))
    codes, uniques = rle.factorize()

    # Codes are compared against an int64 array: one code per element.
    want_codes = np.array([0, 1, 1], dtype=np.int64)
    assert codes.dtype == want_codes.dtype
    npt.assert_array_equal(codes, want_codes)

    # Uniques are compared against an RLE array built with the int32 dtype.
    want_uniques = RLEArray._from_sequence([42, -10], dtype=np.int32)
    assert uniques.dtype == want_uniques.dtype
    npt.assert_array_equal(uniques, want_uniques)
def test_add_unhandled(array_orig: np.ndarray, array_rle: RLEArray, t: type) -> None:
    """``__array_ufunc__`` must return ``NotImplemented`` for ``np.add`` with ``t(array_orig)``."""
    wrapped = t(array_orig)

    # the pandas docs say we should not handle these
    outcome = array_rle.__array_ufunc__(np.add, "__call__", array_rle, wrapped)
    assert outcome is NotImplemented
def test_fail_two_dim_indexing() -> None:
    """Indexing with two comma-separated indices raises ``NotImplementedError``."""
    rle = RLEArray._from_sequence(range(10))
    expected_message = "__getitem__ does currently only work w/ a single parameter"

    with pytest.raises(NotImplementedError, match=expected_message):
        rle[1, 2]
def test_bool_ensure_int_or_float() -> None:
    """``ensure_int_or_float`` on a bool RLE array must give an int64 ``[0, 1]``."""
    rle = RLEArray._from_sequence([False, True], dtype=np.bool_)
    converted = ensure_int_or_float(rle)

    want = np.array([0, 1], dtype=np.int64)
    assert converted.dtype == want.dtype
    npt.assert_array_equal(converted, want)
def test_groupby_bool_first() -> None:
    """``groupby(...).first()`` on a bool RLE column must keep the RLE bool dtype."""
    column = pd.Series([True, True], dtype=RLEDtype(bool))
    frame = pd.DataFrame({"x": column, "g": 1})

    firsts = frame.groupby("g")["x"].first()

    assert firsts.dtype == RLEDtype(bool)
    want = RLEArray._from_sequence([True])
    npt.assert_array_equal(firsts.array, want)
def test_different_lengths() -> None:
    """Three data values with two positions must raise ``ValueError``."""
    values = np.asarray([1.0, 2.0, 3.0])
    positions = np.asarray([10, 20], dtype=POSITIONS_DTYPE)
    expected_message = "data and positions must have same length but have 3 and 2"

    with pytest.raises(ValueError, match=expected_message):
        RLEArray(data=values, positions=positions)
def data_for_grouping():
    """Provide data for factorization, grouping, and unique tests.

    The array is expected to look like ``[B, B, NA, NA, A, A, B, C]``,
    where ``A < B < C`` and ``NA`` is missing.
    """
    values = np.asarray([2.0, np.nan, 1.0, 2.0, 3.0], dtype=np.float32)
    positions = np.asarray([2, 4, 6, 7, 8], dtype=POSITIONS_DTYPE)
    return RLEArray(data=values, positions=positions)
def data():
    """Provide a length-100 array for this type.

    Requirements on the result:

    * ``data[0]`` and ``data[1]`` are both non-missing
    * ``data[0]`` and ``data[1]`` are not equal
    """
    values = np.asarray([13, -1, -2, 42], dtype=np.float32)
    positions = np.asarray([1, 2, 4, 100], dtype=POSITIONS_DTYPE)
    return RLEArray(data=values, positions=positions)
def data_missing_for_sorting():
    """Provide a length-3 array with a known sort order.

    The three items are ``[B, NA, A]`` with ``A < B`` and ``NA`` missing.
    """
    values = np.asarray([2.0, np.nan, 1.0], dtype=np.float32)
    positions = np.asarray([1, 2, 3], dtype=POSITIONS_DTYPE)
    return RLEArray(data=values, positions=positions)
def test_inplace_update() -> None:
    """Assigning through a boolean mask updates the value and keeps int64 dtypes."""
    rle = RLEArray._from_sequence([1], dtype=np.int64)
    rle[[True]] = 2

    want = np.array([2], dtype=np.int64)
    npt.assert_array_equal(rle, want)

    # Both the dtype wrapper and the underlying data buffer are checked.
    wrapped_dtype = rle._dtype._dtype
    assert wrapped_dtype == np.int64
    buffer_dtype = rle._data.dtype
    assert buffer_dtype == np.int64
def test_data_invalid_dims() -> None:
    """Passing a two-dimensional ``data`` array must raise ``ValueError``."""
    values_2d = np.asarray([[1.0, 2.0], [3.0, 4.0]])
    positions = np.asarray([10, 20], dtype=POSITIONS_DTYPE)
    expected_message = "data must be an 1-dimensional ndarray but has 2 dimensions"

    with pytest.raises(ValueError, match=expected_message):
        RLEArray(data=values_2d, positions=positions)
def data_for_sorting() -> RLEArray:
    """Provide a length-3 array with a known sort order.

    The three items are ``[B, C, A]`` with ``A < B < C``.
    """
    values = np.asarray([2.0, 3.0, 1.0], dtype=np.float32)
    positions = np.asarray([1, 2, 3], dtype=POSITIONS_DTYPE)
    return RLEArray(data=values, positions=positions)
def test_square_out(array_orig, array_rle, out_is_rle):
    """``np.square`` with ``out=`` must write the same result for plain and RLE input.

    ``out_is_rle`` selects whether the output buffer handed to the RLE call
    is itself an ``RLEArray`` or a plain copy of the zero-filled array.
    """
    length = len(array_orig)
    reference_out = np.array([0] * length, dtype=array_orig.dtype)

    # The RLE-side buffer is built before the reference buffer is written to.
    rle_side_out = (
        RLEArray(*compress(reference_out)) if out_is_rle else reference_out.copy()
    )

    np.square(array_orig, out=reference_out)
    np.square(array_rle, out=rle_side_out)

    npt.assert_array_equal(reference_out, rle_side_out)
def test_square_out(array_orig: np.ndarray, array_rle: RLEArray, out_is_rle: bool) -> None:
    """``np.square`` with ``out=`` must write the same result for plain and RLE input.

    ``out_is_rle`` selects whether the output buffer handed to the RLE call
    is itself an ``RLEArray`` or a plain copy of the zero-filled array.
    """
    length = len(array_orig)
    reference_out = np.array([0] * length, dtype=array_orig.dtype)

    # The RLE-side buffer is built before the reference buffer is written to.
    rle_side_out = (
        RLEArray._from_sequence(reference_out) if out_is_rle else reference_out.copy()
    )

    np.square(array_orig, out=reference_out)
    np.square(array_rle, out=rle_side_out)

    npt.assert_array_equal(reference_out, rle_side_out)
def test_pickle() -> None:
    """Pickle round trip keeps the contents and yields an independent array."""
    source = RLEArray._from_sequence([1])

    # roundtrip
    restored = pickle.loads(pickle.dumps(source))
    npt.assert_array_equal(source, restored)

    # views must not be linked (A): writing to the source leaves the copy alone
    restored_before = restored.copy()
    source[:] = 2
    npt.assert_array_equal(restored, restored_before)

    # views must not be linked (B): writing to the copy leaves the source alone
    source_before = source.copy()
    restored[:] = 3
    npt.assert_array_equal(source, source_before)
def test_from_sequence_bool() -> None:
    """Check which numeric inputs ``_from_sequence`` accepts for a bool RLE dtype."""
    # Inputs holding only 0/1 (integer or float) convert to [False, True].
    accepted_inputs = (
        np.array([0, 1], dtype=np.int64),
        np.array([0.0, 1.0], dtype=np.float64),
    )
    for raw in accepted_inputs:
        converted = RLEArray._from_sequence(raw, dtype=RLEDtype(bool))
        npt.assert_array_equal(converted, np.array([False, True]))

    # Anything else is rejected with a TypeError carrying a specific message.
    rejected_inputs = (
        (np.array([1, 2], dtype=np.int64), "Need to pass bool-like values"),
        (np.array([-1, 1], dtype=np.int64), "Need to pass bool-like values"),
        (np.array([np.nan, 1.0], dtype=np.float64), "Masked booleans are not supported"),
    )
    for raw, expected_message in rejected_inputs:
        with pytest.raises(TypeError, match=expected_message):
            RLEArray._from_sequence(raw, dtype=RLEDtype(bool))
def data_for_twos():
    """Provide a length-100 array whose elements are all two."""
    values = np.asarray([2.0], dtype=np.float32)
    positions = np.asarray([100], dtype=POSITIONS_DTYPE)
    return RLEArray(data=values, positions=positions)
def test_positions_invalid_type() -> None:
    """Passing an ``int`` as ``positions`` must raise ``TypeError``."""
    values = np.asarray([1.0, 2.0])
    expected_message = "positions must be an ndarray but is int"

    with pytest.raises(TypeError, match=expected_message):
        RLEArray(data=values, positions=1)
def test_data_invalid_type() -> None:
    """Passing an ``int`` as ``data`` must raise ``TypeError``."""
    positions = np.asarray([10, 20], dtype=POSITIONS_DTYPE)
    expected_message = "data must be an ndarray but is int"

    with pytest.raises(TypeError, match=expected_message):
        RLEArray(data=1, positions=positions)
def test_valid() -> None:
    """Constructing from matching ``data`` and ``positions`` arrays must not raise."""
    values = np.asarray([1.0, 2.0])
    positions = np.asarray([10, 20], dtype=POSITIONS_DTYPE)
    RLEArray(data=values, positions=positions)
def test_positions_invalid_dtype() -> None:
    """Passing ``positions`` with a uint64 dtype must raise ``ValueError``."""
    values = np.asarray([1.0, 2.0])
    unsigned_positions = np.asarray([10, 20], dtype=np.uint64)
    expected_message = "positions must have dtype int64 but has uint64"

    with pytest.raises(ValueError, match=expected_message):
        RLEArray(data=values, positions=unsigned_positions)
def test_object_isna() -> None:
    """``isna`` on an object-dtype array flags ``None`` and not the string."""
    rle = RLEArray._from_sequence(["foo", None], dtype=object)
    npt.assert_equal(rle.isna(), np.asarray([False, True]))
def test_mean_divisor_overflow() -> None:
    """The mean of 256 uint8 ones must be 1.

    See https://github.com/JDASoftwareGroup/rle-array/issues/22
    """
    # 256 elements is one more than the largest value a uint8 can hold.
    ones = [1] * 256
    rle = RLEArray._from_sequence(ones, dtype=np.uint8)
    assert rle.mean() == 1
def rle_bool_series2(bool_values: np.ndarray) -> pd.Series:
    """Wrap the reversed ``bool_values`` in an RLE-backed series (default index)."""
    # TODO: Use `index=np.arange(len(bool_values)) + 1`.
    #       For some reason, pandas casts us back to dtype=bool in that case.
    reversed_values = bool_values[::-1]
    rle = RLEArray._from_sequence(reversed_values)
    return pd.Series(rle)
def test_not_sorted_2() -> None:
    """Two equal positions must be rejected as not strictly sorted."""
    values = np.asarray([1.0, 2.0])
    repeated_positions = np.asarray([10, 10], dtype=POSITIONS_DTYPE)

    with pytest.raises(ValueError, match="positions must be strictly sorted"):
        RLEArray(data=values, positions=repeated_positions)
def rle_bool_series(bool_values: np.ndarray) -> pd.Series:
    """Wrap ``bool_values`` in an RLE-backed series (default index)."""
    rle = RLEArray._from_sequence(bool_values)
    return pd.Series(rle)
def rle_series2(values: np.ndarray) -> pd.Series:
    """Wrap the reversed ``values`` in an RLE-backed series indexed from 1."""
    rle = RLEArray._from_sequence(values[::-1])
    # Index is shifted by one relative to the default 0-based range.
    shifted_index = np.arange(len(values)) + 1
    return pd.Series(rle, index=shifted_index)
def test_different_length_raises(values: np.ndarray) -> None:
    """Adding two RLE arrays of different lengths must raise ``ValueError``."""
    full = RLEArray._from_sequence(values)
    shortened = RLEArray._from_sequence(values[:-1])  # one element fewer

    with pytest.raises(ValueError, match="arrays have different lengths"):
        full + shortened
def array_rle(array_orig: np.ndarray) -> RLEArray:
    """Build an ``RLEArray`` from ``array_orig`` via ``_from_sequence``."""
    rle = RLEArray._from_sequence(array_orig)
    return rle
def data_missing():
    """Provide a length-2 array of the form ``[NA, Valid]``."""
    values = np.asarray([np.nan, 42], dtype=np.float32)
    positions = np.asarray([1, 2], dtype=POSITIONS_DTYPE)
    return RLEArray(data=values, positions=positions)