Example no. 1
0
def test_rowsd():
    """Method-style ``.rowsd()`` must be equivalent to functional ``dt.rowsd(...)``."""
    # Both spellings must build the same expression...
    for expr in (f.A, f[:]):
        assert str(dt.rowsd(expr)) == str(expr.rowsd())
    # ...and evaluate to the same frame.
    frame = dt.Frame({"C": [2, 5, 30, 20, 10],
                      "D": [10, 8, 20, 20, 1]})
    assert_equals(frame[:, f[:].rowsd()], frame[:, dt.rowsd(f[:])])
Example no. 2
0
def test_rowsd_floats():
    """Row-wise stdev over float rows containing inf, nan and None."""
    rows = [(1.5, 6.4, 0.0, None, 7.22),
            (2.0, -1.1, math.inf, 4.0, 3.2),
            (1.5, 9.9, None, None, math.nan),
            (math.inf, -math.inf, None, 0.0, math.nan)]
    frame = dt.Frame(rows)
    result = frame[:, rowsd(f[:])]
    # Rows with infinities produce an undefined (None) stdev.
    expected = dt.Frame([3.5676696409094086, None,
                         5.939696961966999, None])
    assert_equals(result, expected)
Example no. 3
0
def test_reprs():
    """Every row-function expression must repr() without raising."""
    row_functions = (rowall, rowany, rowsum, rowcount, rowmin,
                     rowmax, rowfirst, rowlast, rowmean, rowsd)
    for row_fn in row_functions:
        # repr must succeed and be non-empty (truthy).
        assert repr(row_fn())
Example no. 4
0
def test_rowmean_wrong_types():
    # NOTE(review): the name says "rowmean" but the body exercises rowsd;
    # consider renaming to test_rowsd_wrong_types (kept as-is to preserve
    # the collected test id).
    DT = dt.Frame(A=[3, 5, 6], B=["a", "d", "e"])
    err = ("Function rowsd expects a sequence "
           "of numeric columns")
    with pytest.raises(TypeError, match=err):
        rowsd(DT)
Example no. 5
0
def test_rowsd_same_columns():
    """Ten identical columns -> stdev of exactly 0.0 in every row."""
    nrows = 5
    frame = dt.Frame([range(nrows)] * 10)
    assert_equals(rowsd(frame), dt.Frame([0.0] * nrows))
Example no. 6
0
def test_rowsd_single_column():
    """Stdev of a single value per row is undefined, hence NaN."""
    frame = dt.Frame(A=range(5))
    expected = dt.Frame([math.nan] * 5)
    assert_equals(rowsd(frame), expected)
Example no. 7
0
    # NOTE(review): this span appears to be a mangled extract of a larger
    # ``create_data`` function — the enclosing ``def`` is not visible and the
    # indentation drops to column 0 below, leaving a bare ``return`` at module
    # level.  Left byte-identical; confirm against the original source.
    # remove black listed columns or column groups that smaller than minimal size
    # NOTE(review): with an empty blacklist, ``not key in []`` is always true,
    # so this ``or`` keeps every group and the size filter never applies —
    # possibly ``and`` was intended; confirm against the recipe's intent.
    col_groups = {
        key: val
        for key, val in all_col_groups.items()
        if not key in black_listed_columns or len(val) >= min_col_group_size
    }

    # list of column prefixes
    columns = list(col_groups.keys())
    # list of column ranges
    ranges = [(min(idx), max(idx)) for idx in col_groups.values()]

# produce tuple for column slices
# NOTE(review): "desde"/"hasta" are Spanish for "from"/"to" — the pair names
# the first and last column of each group, e.g. ("PAY_AMT1", "PAY_AMT6").
col_slices = [((col + "%d") % (desde), (col + "%d") % (hasta))
              for (col, (desde, hasta)) in zip(columns, ranges)]

# For each column group, append row-wise aggregate feature columns in place.
for c, r, s in zip(columns, ranges, col_slices):
    update_map = {
        c + "_sum": rowsum(f[s[0]:s[1]]),
        c + "_mean": rowmean(f[s[0]:s[1]]),
        c + "_sd": rowsd(f[s[0]:s[1]]),
        c + "_max": rowmax(f[s[0]:s[1]]),
        c + "_min": rowmin(f[s[0]:s[1]]),
        c + "_range": rowmax(f[s[0]:s[1]]) - rowmin(f[s[0]:s[1]]),
        c + "_first": rowfirst(f[s[0]:s[1]]),
        c + "_last": rowlast(f[s[0]:s[1]]),
        # expected count of columns in the group minus observed values
        c + "_missing": (r[1] - r[0] + 1) - rowcount(f[s[0]:s[1]])
    }
    X[:, update(**update_map)]

return {"CreditCard-train-aug.csv": X}
Example no. 8
0
def test_rowsd_same_columns():
    """All columns equal -> row stdev is 0.0 everywhere."""
    frame = dt.Frame([range(5)] * 10)
    result = frame[:, rowsd(f[:])]
    assert_equals(result, dt.Frame([0.0] * 5))
Example no. 9
0
def test_rowsd_single_column():
    """One column -> stdev is undefined (None), column typed float."""
    frame = dt.Frame(A=range(5))
    out = frame[:, rowsd(f[:])]
    assert_equals(out, dt.Frame([None] * 5, type=float))
Example no. 10
0
def test_rowsd_single_column():
    """One value per row -> stdev evaluates to NaN."""
    frame = dt.Frame(A=range(5))
    assert_equals(frame[:, rowsd(f[:])],
                  dt.Frame([math.nan] * 5))
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame], Dict[
                   str, str],  # {data set names : paths}
               Dict[str, dt.Frame],  # {data set names : dt frames}
               Dict[str, np.ndarray],  # {data set names : np arrays}
               Dict[str, pd.DataFrame],  # {data set names : pd frames}
               ]:
        """Augment ``X`` in place with per-row aggregate features.

        Columns whose names share an alphabetic prefix and an integer
        suffix (e.g. ``PAY_AMT1``..``PAY_AMT6``) are treated as a group;
        for each group, sum/mean/sd/min/max/range/first/last/missing
        columns are appended.

        Returns the augmented frame, or ``[]`` when ``X`` is None.
        """
        if X is None:
            return []

        columns = None  # e.g. columns = ["PAY_AMT", "BILL_AMT", "PAY_"]
        ranges = None  # e.g. [(1, 6), (1, 6), (2, 6)]
        black_listed_columns = []
        min_col_group_size = 2

        # Auto-detect time-series column groups when none are configured.
        if columns is None or columns == [] or \
                ranges is None or ranges == []:
            # Match names made of an alphabetic prefix + integer suffix.
            pattern = re.compile(r"^([a-zA-Z_]+)(\d+)$")
            all_col_groups = defaultdict(list)
            for name in X.names:
                m = pattern.match(name)
                if m is not None:
                    all_col_groups[m.group(1)].append(int(m.group(2)))

            # Drop blacklisted groups and groups below the minimum size.
            # BUG FIX: the original used ``or``, which (with an empty
            # blacklist) kept every group and never applied the size
            # filter; a group must be both non-blacklisted AND large
            # enough to survive.
            col_groups = {
                key: val
                for key, val in all_col_groups.items()
                if key not in black_listed_columns
                and len(val) >= min_col_group_size
            }

            # Prefixes and the (min, max) suffix range of each group.
            columns = list(col_groups.keys())
            ranges = [(min(idx), max(idx)) for idx in col_groups.values()]

        # (first_column_name, last_column_name) for each group slice.
        col_slices = [((prefix + "%d") % start, (prefix + "%d") % end)
                      for (prefix, (start, end)) in zip(columns, ranges)]

        for prefix, (lo, hi), (first, last) in zip(columns, ranges,
                                                   col_slices):
            update_map = {
                prefix + "_sum": rowsum(f[first:last]),
                prefix + "_mean": rowmean(f[first:last]),
                prefix + "_sd": rowsd(f[first:last]),
                prefix + "_max": rowmax(f[first:last]),
                prefix + "_min": rowmin(f[first:last]),
                prefix + "_range":
                    rowmax(f[first:last]) - rowmin(f[first:last]),
                prefix + "_first": rowfirst(f[first:last]),
                prefix + "_last": rowlast(f[first:last]),
                # expected column count minus observed values = missing
                prefix + "_missing":
                    (hi - lo + 1) - rowcount(f[first:last])
            }
            X[:, update(**update_map)]

        return X