Beispiel #1
0
def test_qcut_small():
    """Check qcut() on a small hand-written frame against known bin ids.

    Each column is binned with its own number of quantiles. The same
    reference frame is expected both when the columns are selected with
    `f[:]` and when the frame itself is passed to `qcut()`.
    """
    # One record per column: (name, nquantiles, input values, expected bins).
    cases = [
        ("bool", 4,
         [True, None, False, False, True, None],
         [3, None, 0, 0, 3, None]),
        ("one_group_odd", 5,
         [None, 10, None, 10, 10, 10],
         [None, 2, None, 2, 2, 2]),
        ("one_group_even", 4,
         [None, 10, None, 10, 10, 10],
         [None, 1, None, 1, 1, 1]),
        ("int_pos", 2,
         [3, None, 4, 1, 5, 4],
         [0, None, 1, 0, 1, 1]),
        ("int_neg", 5,
         [-5, -1, -1, -1, None, 0],
         [0, 2, 2, 2, None, 4]),
        ("int", 4,
         [None, -5, -314, 0, 5, 314],
         [None, 0, 0, 1, 2, 3]),
        ("float", 10,
         [None, 1.4, 4.1, 1.5, 5.9, 1.4],
         [None, 0, 6, 3, 9, 0]),
        ("inf_max", 3,
         [math.inf, 1.4, 4.1, 1.5, 5.9, 1.4],
         [2, 0, 1, 0, 2, 0]),
        ("inf_min", 2,
         [-math.inf, 1.4, 4.1, 1.5, 5.9, 1.4],
         [0, 0, 1, 0, 1, 0]),
        ("inf", 5,
         [-math.inf, 1.4, 4.1, math.inf, 5.9, 1.4],
         [0, 1, 2, 4, 3, 1]),
    ]
    colnames = [case[0] for case in cases]
    nquantiles = [case[1] for case in cases]

    DT = dt.Frame([case[2] for case in cases], names=colnames)
    DT_ref = dt.Frame(
        [case[3] for case in cases],
        names=colnames,
        stypes=[stype.int32] * DT.ncols,
    )

    binned_via_fexpr = DT[:, qcut(f[:], nquantiles=nquantiles)]
    binned_via_frame = DT[:, qcut(DT, nquantiles=nquantiles)]
    assert_equals(DT_ref, binned_via_fexpr)
    assert_equals(DT_ref, binned_via_frame)
Beispiel #2
0
def test_qcut_str():
    """Check the string representation of several qcut() expressions."""
    # Pairs of (expression, expected `str()` output).
    expectations = [
        (qcut(f.A), "FExpr<qcut(f.A)>"),
        (qcut(f.A) + 1, "FExpr<qcut(f.A) + 1>"),
        (qcut(f.A + f.B), "FExpr<qcut(f.A + f.B)>"),
        (qcut(f.B, nquantiles=3), "FExpr<qcut(f.B, nquantiles=3)>"),
        (qcut(f[:2], nquantiles=[3, 4]),
         "FExpr<qcut(f[:2], nquantiles=[3, 4])>"),
    ]
    for expr, text in expectations:
        assert str(expr) == text
Beispiel #3
0
def test_qcut_error_inconsistent_nquantiles():
    """A 1-element nquantiles list on a 2-column frame raises ValueError."""
    frame = dt.Frame([[3, 1, 4], [1, 5, 9]])
    expected_msg = (
        "When nquantiles is a list or a tuple, its length must be "
        "the same as the number of input columns, "
        "i.e. 2, instead got: 1"
    )
    with pytest.raises(ValueError, match=expected_msg):
        frame[:, qcut(f[:], nquantiles=[10])]
Beispiel #4
0
def test_qcut_random(pandas, seed):
    """Check structural invariants of qcut() on randomly generated columns.

    Builds a frame with a bool, an int, a float and a mostly-NA float
    column (1 to 20 rows), bins every column with its own random number of
    quantiles and verifies:

    * the result passes the frame integrity check, keeps the column names
      and has int32 columns only;
    * a column with exactly one distinct value maps every non-NA row to
      the bin `(nquantiles - 1) // 2`, and NA rows stay NA;
    * a column whose result is entirely NA has no min and no max;
    * any other column uses bins from 0 through `nquantiles - 1`.

    NOTE(review): the `pandas` fixture is requested but never referenced in
    the body -- confirm whether it is still needed.
    """
    random.seed(seed)
    max_size = 20
    max_value = 100
    nrows = random.randint(1, max_size)
    ncols = 4
    stypes = (stype.bool8, stype.int32, stype.float64, stype.float64)
    names = ("bool", "int", "float", "nafloat")
    nquantiles = [random.randint(1, max_size) for _ in range(ncols)]
    data = [[] for _ in range(ncols)]

    # In each conditional expression the NA draw (the condition) is evaluated
    # before the value draw; keep that order so a given seed reproduces the
    # same data.
    for _ in range(nrows):
        data[0].append(random.randint(0, 1) if random.random() > 0.1 else None)
        data[1].append(
            random.randint(-max_value, max_value)
            if random.random() > 0.05 else None
        )
        data[2].append(
            random.random() * 2 * max_value - max_value
            if random.random() > 0.2 else None
        )
        # "nafloat": a value is kept only when the draw is below 0.1.
        data[3].append(
            random.random() * 2 * max_value - max_value
            if random.random() < 0.1 else None
        )

    DT = dt.Frame(data, stypes=stypes, names=names)
    DT_qcut = DT[:, qcut(f[:], nquantiles=nquantiles)]

    DT_nunique = DT.nunique()

    frame_integrity_check(DT_qcut)
    assert DT_qcut.names == names
    assert DT_qcut.stypes == (stype.int32,) * ncols

    for j in range(ncols):
        if DT_nunique[0, j] == 1:
            # Single distinct value: every non-NA row is expected in the
            # middle bin.
            middle_bin = (nquantiles[j] - 1) // 2
            expected = [
                None if DT[i, j] is None else middle_bin
                for i in range(nrows)
            ]
            assert DT_qcut[j].to_list() == [expected]
        elif DT_qcut[j].countna1() == nrows:
            # All-NA result: there is neither a min nor a max.
            assert DT_qcut[j].min1() is None
            assert DT_qcut[j].max1() is None
        else:
            assert DT_qcut[j].min1() == 0
            assert DT_qcut[j].max1() == nquantiles[j] - 1
Beispiel #5
0
def test_qcut_vs_pandas_random(pandas, seed):
    """Compare datatable's qcut() with `pandas.qcut(..., labels=False)`.

    Uses one random int32 column and one random float64 column (1 to 20
    rows, no missing values), each with its own random number of quantiles.
    """
    random.seed(seed)
    max_size = 20
    max_value = 100

    n = random.randint(1, max_size)

    ncols = 2
    nquantiles = [random.randint(1, max_size) for _ in range(ncols)]

    # Each row draws the int first and the float second, then the rows are
    # transposed into two column lists.
    rows = [
        (random.randint(-max_value, max_value),
         random.random() * 2 * max_value - max_value)
        for _ in range(n)
    ]
    data = [list(column) for column in zip(*rows)]

    DT = dt.Frame(data, stypes=[stype.int32, stype.float64])
    DT_qcut = DT[:, qcut(f[:], nquantiles=nquantiles)]

    expected = [
        list(pandas.qcut(column, nbins, labels=False))
        for column, nbins in zip(data, nquantiles)
    ]
    assert expected == DT_qcut.to_list()
Beispiel #6
0
def test_qcut_error_negative_nquantiles_list():
    """A negative element in the nquantiles list raises ValueError."""
    frame = dt.Frame([[3, 1, 4], [1, 5, 9]])
    expected_msg = (
        r"All elements in nquantiles must be positive, got nquantiles\[1\]: -1"
    )
    with pytest.raises(ValueError, match=expected_msg):
        frame[:, qcut(f[:], nquantiles=[10, -1])]
Beispiel #7
0
def test_qcut_error_negative_nquantiles():
    """A negative scalar nquantiles raises ValueError."""
    frame = dt.Frame(range(10))
    expected_msg = "Number of quantiles must be positive, instead got: -10"
    with pytest.raises(ValueError, match=expected_msg):
        frame[:, qcut(f[:], nquantiles=-10)]
Beispiel #8
0
def test_qcut_error_float_nquantiles():
    """A float nquantiles raises TypeError."""
    frame = dt.Frame(range(10))
    expected_msg = "Expected an integer, instead got <class 'float'>"
    with pytest.raises(TypeError, match=expected_msg):
        frame[:, qcut(f[:], nquantiles=1.5)]
Beispiel #9
0
def test_qcut_error_wrong_column_type_zero_rows():
    """qcut() rejects an obj64 column even when the frame has no rows."""
    expected_msg = (
        r"qcut\(\) cannot be applied to string or object columns, instead "
        "column 0 has an stype: obj64"
    )
    frame = dt.Frame(obj=[] / dt.obj64)
    with pytest.raises(TypeError, match=expected_msg):
        frame[:, qcut(f[:])]
Beispiel #10
0
def test_qcut_error_noargs():
    """Calling qcut() with no arguments raises TypeError."""
    expected_msg = (
        r"Function datatable\.qcut\(\) requires exactly 1 positional "
        "argument, but none were given"
    )
    with pytest.raises(TypeError, match=expected_msg):
        qcut()
Beispiel #11
0
def test_qcut_one_row():
    """Check qcut() bins for a one-row frame with 1 to 4 quantiles."""
    frame = dt.Frame([[True], [404], [3.1415926], [None]])
    binned = frame[:, qcut(f[:], nquantiles=[1, 2, 3, 4])]
    assert binned.to_list() == [[0], [0], [1], [None]]
Beispiel #12
0
def test_qcut_expr_simple():
    """qcut() of the difference of two columns equals `range(10)` here."""
    frame = dt.Frame([range(0, 30, 3), range(0, 20, 2)])
    binned = frame[:, qcut(f[0] - f[1])]
    assert_equals(dt.Frame(range(10)), binned)
Beispiel #13
0
def test_qcut_trivial():
    """qcut() without nquantiles on `range(10)` reproduces the input frame.

    Also checks that calling `qcut()` on a Frame yields an `FExpr`.
    """
    source = dt.Frame({"trivial": range(10)})
    binned = source[:, qcut(f[:])]
    assert isinstance(qcut(source), FExpr)
    assert_equals(source, binned)
Beispiel #14
0
def test_qcut_zerorow_frame():
    """qcut() on a two-column, zero-row frame gives two empty int32 columns.

    Also checks that calling `qcut()` on a Frame yields an `FExpr`.
    """
    source = dt.Frame([[], []])
    binned = source[:, qcut(f[:])]
    assert isinstance(qcut(source), FExpr)
    assert_equals(binned, dt.Frame([[] / dt.int32, [] / dt.int32]))
Beispiel #15
0
def test_qcut_empty_frame():
    """Calling qcut() on an empty Frame yields an `FExpr`.

    NOTE(review): the final assertion selects `f[:]` without applying
    `qcut`, so the qcut expression is never evaluated here -- confirm
    whether `DT[:, qcut(f[:])]` was intended.
    """
    empty = dt.Frame()
    assert isinstance(qcut(empty), FExpr)
    assert_equals(empty[:, f[:]], empty)
Beispiel #16
0
def test_qcut_error_groupby():
    """Using qcut() together with a third selector raises NotImplementedError."""
    frame = dt.Frame(range(10))
    expected_msg = r"qcut\(\) cannot be used in a groupby context"
    with pytest.raises(NotImplementedError, match=expected_msg):
        frame[:, qcut(f[0]), f[0]]
Beispiel #17
0
def test_qcut_error_wrong_column_types():
    """qcut() rejects a frame whose second column holds a Python object."""
    expected_msg = (
        r"qcut\(\) cannot be applied to string or object columns, instead "
        "column 1 has an stype: obj64"
    )
    frame = dt.Frame([[0], [dt]])
    with pytest.raises(TypeError, match=expected_msg):
        frame[:, qcut(f[:])]