def test_cut_small():
    """Check `cut()` with per-column `nbins` on a small mixed-type frame.

    The bin counts are supplied both as a list and as a tuple, and the
    `right_closed=False` variant is compared with its own reference frame.
    """
    nbins = [4, 2, 5, 4, 10, 3, 2, 5]
    colnames = [
        "bool", "int_pos", "int_neg", "int",
        "float", "inf_max", "inf_min", "inf",
    ]
    DT = dt.Frame(
        [
            [True, None, False, False, True, None],
            [3, None, 4, 1, 5, 4],
            [-5, -1, -1, -1, None, 0],
            [None, -5, -314, 0, 5, 314],
            [None, 1.4, 4.1, 1.5, 5.9, 1.4],
            [math.inf, 1.4, 4.1, 1.5, 5.9, 1.4],
            [-math.inf, 1.4, 4.1, 1.5, 5.9, 1.4],
            [-math.inf, 1.4, 4.1, math.inf, 5.9, 1.4],
        ],
        names=colnames,
    )
    int32_stypes = [stype.int32] * DT.ncols

    # The last three reference columns (the ones with infinities in the
    # input) consist of missing values only.
    expected_right = dt.Frame(
        [
            [3, None, 0, 0, 3, None],
            [0, None, 1, 0, 1, 1],
            [0, 3, 3, 3, None, 4],
            [None, 1, 0, 1, 2, 3],
            [None, 0, 5, 0, 9, 0],
            [None] * DT.nrows,
            [None] * DT.nrows,
            [None] * DT.nrows,
        ],
        names=colnames,
        stypes=int32_stypes,
    )
    expected_left = dt.Frame(
        [
            [3, None, 0, 0, 3, None],
            [1, None, 1, 0, 1, 1],
            [0, 4, 4, 4, None, 4],
            [None, 1, 0, 2, 2, 3],
            [None, 0, 6, 0, 9, 0],
            [None] * DT.nrows,
            [None] * DT.nrows,
            [None] * DT.nrows,
        ],
        names=colnames,
        stypes=int32_stypes,
    )

    from_list = DT[:, cut(DT, nbins=nbins)]
    from_tuple = DT[:, cut(DT, nbins=tuple(nbins))]
    from_list_left = DT[:, cut(DT, nbins=nbins, right_closed=False)]

    checks = (
        (expected_right, from_list),
        (expected_right, from_tuple),
        (expected_left, from_list_left),
    )
    for expected, actual in checks:
        assert_equals(expected, actual)
def test_cut_one_row():
    """Check `cut()` with `nbins` on a one-row frame for both closures."""
    bin_counts = [1, 2, 3, 4]
    frame = dt.Frame([[True], [404], [3.1415926], [None]])

    right = frame[:, cut(frame, nbins=bin_counts)]
    left = frame[:, cut(frame, nbins=bin_counts, right_closed=False)]

    assert right.to_list() == [[0], [0], [1], [None]]
    assert left.to_list() == [[0], [1], [1], [None]]
def test_cut_error_inconsistent_bins():
    """A one-element `nbins` list for a two-column frame raises ValueError."""
    frame = dt.Frame([[3, 1, 4], [1, 5, 9]])
    expected_msg = (
        "When nbins is a list or a tuple, its length must be the same as "
        "the number of columns in the frame/expression, i.e. 2, instead got: 1"
    )
    # The error is expected from the `cut()` call itself, without
    # evaluating it inside a frame selector.
    with pytest.raises(ValueError, match=expected_msg):
        cut(frame, nbins=[10])
def test_cut_trivial_bins():
    """Cutting 0..9 with edges -1..9 reproduces the input frame.

    The same pre-built expression is evaluated five times in a row.
    """
    data = dt.Frame({"data": range(10)})
    edges = dt.Frame({"bins": range(-1, 10)})
    reusable_expr = cut(f[:], bins=[edges])

    for _ in range(5):
        result = data[:, reusable_expr]
        fresh_expr = cut(data, bins=[edges])
        assert isinstance(fresh_expr, FExpr)
        assert_equals(data, result)
def test_cut_small_bins():
    """Check `cut()` with explicit bin edges on a small mixed-type frame.

    The edge frames are supplied both as a list and as a tuple, and the
    `right_closed=False` variant is compared with its own reference frame.
    """
    colnames = [
        "bool", "int_pos", "int_neg", "int",
        "float", "inf_max", "inf_min", "inf",
    ]
    # One frame of bin edges per data column.
    edge_frames = [
        dt.Frame([-1, 0, 1, 2]),
        dt.Frame(range(10)),
        dt.Frame(range(-10, 0)),
        dt.Frame([-1000, 0, 314]),
        dt.Frame(range(10)),
        dt.Frame([0, 1.4, 2.8, 4.2, 5.6]),
        dt.Frame([0, 1.4, 2.8, 4.2, 5.6, 7.0]),
        dt.Frame([-5, 0, 15]),
    ]
    DT = dt.Frame(
        [
            [True, None, False, False, True, None],
            [3, None, 4, 1, 5, 4],
            [-5, -1, -1, -1, None, 0],
            [None, -5, -314, 0, 5, 314],
            [None, 1.4, 4.1, 1.5, 5.9, 1.4],
            [math.inf, 1.4, 4.1, 1.5, 5.9, 1.4],
            [-math.inf, 1.4, -4.1, 1.5, 5.9, 1.4],
            [-math.inf, 1.4, 4.1, math.inf, 5.9, 1.4],
        ],
        names=colnames,
    )
    int32_stypes = [stype.int32] * DT.ncols

    expected_right = dt.Frame(
        [
            [1, None, 0, 0, 1, None],
            [2, None, 3, 0, 4, 3],
            [4, 8, 8, 8, None, None],
            [None, 0, 0, 0, 1, 1],
            [None, 1, 4, 1, 5, 1],
            [None, 0, 2, 1, None, 0],
            [None, 0, None, 1, 4, 0],
            [None, 1, 1, None, 1, 1],
        ],
        names=colnames,
        stypes=int32_stypes,
    )
    expected_left = dt.Frame(
        [
            [2, None, 1, 1, 2, None],
            [3, None, 4, 1, 5, 4],
            [5, None, None, None, None, None],
            [None, 0, 0, 1, 1, None],
            [None, 1, 4, 1, 5, 1],
            [None, 1, 2, 1, None, 1],
            [None, 1, None, 1, 4, 1],
            [None, 1, 1, None, 1, 1],
        ],
        names=colnames,
        stypes=int32_stypes,
    )

    from_list = DT[:, cut(DT, bins=edge_frames)]
    from_tuple = DT[:, cut(DT, bins=tuple(edge_frames))]
    from_list_left = DT[:, cut(DT, bins=edge_frames, right_closed=False)]

    checks = (
        (expected_right, from_list),
        (expected_right, from_tuple),
        (expected_left, from_list_left),
    )
    for expected, actual in checks:
        assert_equals(expected, actual)
def test_cut_one_row_bins():
    """Check `cut()` with explicit edges on a one-row frame, both closures."""
    edge_frames = [
        dt.Frame([0, 1]),
        dt.Frame(range(1000)),
        dt.Frame([-100, 3.1415926, 100]),
        dt.Frame(range(5)),
    ]
    frame = dt.Frame([[True], [404], [3.1415926], [None]])

    right = frame[:, cut(frame, bins=edge_frames)]
    left = frame[:, cut(frame, bins=edge_frames, right_closed=False)]

    assert right.to_list() == [[0], [403], [0], [None]]
    assert left.to_list() == [[None], [404], [1], [None]]
def test_cut_pandas_issue_35126(pandas):
    """Pin `cut()` output for the data referenced by pandas issue 35126.

    The pandas output for the same input is pinned as well.
    """
    values = [-97, 0, 97]
    bin_count = 42
    frame = dt.Frame(values)

    right = frame[:, cut(frame, nbins=bin_count)]
    left = frame[:, cut(frame, nbins=bin_count, right_closed=False)]
    assert right.to_list() == [[0, 20, 41]]
    assert left.to_list() == [[0, 21, 41]]

    # Testing that Pandas results are inconsistent: with its default
    # arguments pandas returns the values pinned below.
    pandas_bins = pandas.cut(values, bin_count, labels=False)
    assert list(pandas_bins) == [0, 21, 41]
def test_cut_vs_pandas_random_bins(pandas, seed):
    """Compare `cut()` with `pandas.cut` on random data and random edges.

    Three columns (bool, int32, float64) are generated from `seed`, each
    with its own sorted list of unique integer bin edges.
    """
    random.seed(seed)
    ncols = 3
    max_bins = 20
    max_elements = 20
    max_value = 100
    n_elements = random.randint(1, max_elements)
    right_closed = bool(random.getrandbits(1))

    # Per-column edges: a sorted sample without repetition.
    edges = []
    edge_frames = []
    for _ in range(ncols):
        n_edges = random.randint(2, max_bins)
        column_edges = random.sample(range(-max_value, max_value), n_edges)
        column_edges.sort()
        edges.append(column_edges)
        edge_frames.append(dt.Frame(column_edges))

    bools, ints, floats = [], [], []
    for _ in range(n_elements):
        bools.append(random.randint(0, 1))
        ints.append(random.randint(-max_value, max_value))
        floats.append(random.random() * 2 * max_value - max_value)
    data = [bools, ints, floats]

    DT = dt.Frame(data, stypes=[stype.bool8, stype.int32, stype.float64])
    DT_cut = DT[:, cut(DT, bins=edge_frames, right_closed=right_closed)]

    pandas_cuts = [
        pandas.cut(column, column_edges, labels=False, right=right_closed)
        for column, column_edges in zip(data, edges)
    ]
    # Replace `nan`s with `None` for pandas
    expected = [
        [None if math.isnan(value) else value for value in list(binned)]
        for binned in pandas_cuts
    ]
    assert expected == DT_cut.to_list()
def test_cut_vs_pandas_random(pandas, seed):
    """Compare `cut()` with `pandas.cut` on random data and bin counts.

    Three columns (bool, int32, float64) are generated from `seed`, each
    with its own random number of bins.
    """
    random.seed(seed)
    ncols = 3
    max_size = 20
    max_value = 100
    n = random.randint(1, max_size)
    nbins = [random.randint(1, max_size) for _ in range(ncols)]
    right_closed = bool(random.getrandbits(1))

    bools, ints, floats = [], [], []
    for _ in range(n):
        bools.append(random.randint(0, 1))
        ints.append(random.randint(-max_value, max_value))
        floats.append(random.random() * 2 * max_value - max_value)
    data = [bools, ints, floats]

    DT = dt.Frame(data, stypes=[stype.bool8, stype.int32, stype.float64])
    DT_cut = DT[:, cut(DT, nbins=nbins, right_closed=right_closed)]

    pandas_cuts = [
        pandas.cut(column, count, labels=False, right=right_closed)
        for column, count in zip(data, nbins)
    ]
    expected = [list(binned) for binned in pandas_cuts]
    assert expected == DT_cut.to_list()
def test_cut_error_inconsistent_nbins():
    """Three `nbins` values for a two-column frame raise ValueError."""
    frame = dt.Frame([[3, 1, 4], [1, 5, 9]])
    expected_msg = (
        "When nbins has more than one element, its length must be the same as "
        "the number of columns in the frame/expression, i.e. 2, instead got: 3"
    )
    with pytest.raises(ValueError, match=expected_msg):
        frame[:, cut(frame, nbins=[10, 11, 12])]
def test_cut_error_noniterable_bins():
    """Passing a float as `bins` raises TypeError."""
    frame = dt.Frame(range(10))
    expected_msg = (
        "bins parameter must be a list or a tuple, instead got <class 'float'>"
    )
    with pytest.raises(TypeError, match=expected_msg):
        frame[:, cut(frame, bins=1.5)]
def test_cut_expr():
    """Nested `cut()` over a column difference yields the values 0..9."""
    frame = dt.Frame([range(0, 30, 3), range(0, 20, 2)])
    difference = f[0] - f[1]
    result = frame[:, cut(cut(difference))]
    assert_equals(dt.Frame(range(10)), result)
def test_cut_trivial():
    """Default `cut()` on 0..9 reproduces the input; `cut(Frame)` is an FExpr."""
    frame = dt.Frame({"trivial": range(10)})
    result = frame[:, cut(f[:])]
    expr = cut(frame)
    assert isinstance(expr, FExpr)
    assert_equals(frame, result)
def test_cut_empty_frame():
    """`cut()` applied to an empty frame returns an FExpr."""
    empty = dt.Frame()
    expr = cut(empty)
    assert isinstance(expr, FExpr)
    # Selecting all columns of the empty frame gives the empty frame back.
    assert_equals(empty[:, f[:]], empty)
def test_cut_error_wrong_right():
    """An integer `right_closed` raises TypeError from the `cut()` call."""
    frame = dt.Frame(range(10))
    expected_msg = "Expected a boolean, instead got <class 'int'>"
    with pytest.raises(TypeError, match=expected_msg):
        cut(frame, right_closed=1492)
def test_cut_error_negative_nbins_list():
    """A non-positive entry in the `nbins` list raises ValueError.

    The expected message reports the first offending element, `nbins[0]`.
    """
    frame = dt.Frame([[3, 1, 4], [1, 5, 9]])
    expected_msg = r"All elements in nbins must be positive, got nbins\[0\]: 0"
    with pytest.raises(ValueError, match=expected_msg):
        frame[:, cut(frame, nbins=[0, -1])]
def test_cut_error_float_nbins():
    """A float `nbins` raises TypeError."""
    frame = dt.Frame(range(10))
    expected_msg = "Expected an integer, instead got <class 'float'>"
    with pytest.raises(TypeError, match=expected_msg):
        frame[:, cut(frame, nbins=1.5)]
def test_cut_error_wrong_column_type():
    """Applying `cut()` to a frame with a string column raises TypeError."""
    expected_msg = (
        r"cut\(\) can only be applied to numeric columns, instead column 1 "
        "has an stype: str32"
    )
    frame = dt.Frame([[1, 0], ["1", "0"]])
    with pytest.raises(TypeError, match=expected_msg):
        frame[:, cut(frame)]
def test_cut_empty_frame():
    """The result of `cut()` on an empty frame equals an empty frame."""
    empty = dt.Frame()
    result = cut(empty)
    assert_equals(result, dt.Frame())
def test_cut_error_inconsistent_bins():
    """One edge frame for a two-column frame raises ValueError."""
    frame = dt.Frame([[3, 1, 4], [1, 5, 9]])
    expected_msg = (
        "Number of elements in bins must be equal to the number of columns "
        "in the frame/expression, i.e. 2, instead got: 1"
    )
    single_edges = [dt.Frame([1, 2])]
    with pytest.raises(ValueError, match=expected_msg):
        frame[:, cut(frame, bins=single_edges)]
def test_cut_error_string_bins():
    """Passing a string as `bins` raises TypeError."""
    frame = dt.Frame(range(10))
    expected_msg = (
        "bins parameter must be a list or a tuple, instead got <class 'str'>"
    )
    with pytest.raises(TypeError, match=expected_msg):
        frame[:, cut(frame, bins="bin1")]
def test_cut_error_one_bin_edge():
    """An edge frame containing a single edge raises ValueError."""
    frame = dt.Frame(range(10))
    expected_msg = (
        "To bin data at least two edges are required, "
        "instead for the frame 0 got: 1"
    )
    with pytest.raises(ValueError, match=expected_msg):
        frame[:, cut(frame, bins=[dt.Frame([1])])]
def test_cut_error_noargs():
    """Calling `cut()` without arguments raises TypeError."""
    expected_msg = (
        r"Function datatable\.cut\(\) requires exactly 1 positional "
        r"argument, but none were given"
    )
    with pytest.raises(TypeError, match=expected_msg):
        cut()
def test_cut_error_none_bin_edge():
    """A missing value among the bin edges raises ValueError."""
    frame = dt.Frame(range(10))
    expected_msg = (
        "Bin edges must be numeric values only, "
        "instead for the frame 0 got None at row 2"
    )
    edges_with_na = [dt.Frame([1, 2, None, 3])]
    with pytest.raises(ValueError, match=expected_msg):
        frame[:, cut(frame, bins=edges_with_na)]
def test_cut_error_wrong_column_type_zero_rows():
    """`cut()` on a zero-row str32 column still raises TypeError."""
    expected_msg = (
        r"cut\(\) can only be applied to numeric columns, instead column 0 "
        "has an stype: str32"
    )
    frame = dt.Frame(str=[] / dt.str32)
    with pytest.raises(TypeError, match=expected_msg):
        frame[:, cut(frame)]
def test_cut_error_bin_edges_not_increasing():
    """Bin edges that decrease (4.0 followed by 3.99) raise ValueError."""
    frame = dt.Frame(range(10))
    expected_msg = (
        "Bin edges must be strictly increasing, instead for the frame 0 "
        "at rows 2 and 3 the values are 4 and 3.99"
    )
    unsorted_edges = [dt.Frame([1, 2, 4.0, 3.99])]
    with pytest.raises(ValueError, match=expected_msg):
        frame[:, cut(frame, bins=unsorted_edges)]
def test_cut_error_negative_nbins():
    """A negative scalar `nbins` raises ValueError."""
    frame = dt.Frame(range(10))
    expected_msg = "Number of bins must be positive, instead got: -10"
    with pytest.raises(ValueError, match=expected_msg):
        frame[:, cut(frame, nbins=-10)]
def test_cut_error_groupby():
    """Using `cut()` together with a grouping column is not implemented."""
    frame = dt.Frame(range(10))
    expected_msg = r"cut\(\) cannot be used in a groupby context"
    with pytest.raises(NotImplementedError, match=expected_msg):
        # The third selector element is what puts `cut()` into the
        # groupby context named by the expected message.
        frame[:, cut(f[0]), f[0]]
def test_cut_error_wrong_right():
    """An integer `right_closed` raises TypeError."""
    frame = dt.Frame(range(10))
    expected_msg = (
        r"Argument right_closed in function datatable\.cut\(\) should "
        r"be a boolean, instead got <class 'int'>"
    )
    with pytest.raises(TypeError, match=expected_msg):
        frame[:, cut(frame, right_closed=1492)]
def test_cut_error_zero_bins():
    """`nbins=0` raises ValueError from the `cut()` call itself."""
    frame = dt.Frame(range(10))
    expected_msg = "Number of bins must be positive, instead got: 0"
    with pytest.raises(ValueError, match=expected_msg):
        cut(frame, nbins=0)