def test_rowsd():
    """rowsd() must behave identically as dt.rowsd(...) and as a method."""
    assert str(dt.rowsd(f.A)) == str(f.A.rowsd())
    assert str(dt.rowsd(f[:])) == str(f[:].rowsd())
    frame = dt.Frame({"C": [2, 5, 30, 20, 10],
                      "D": [10, 8, 20, 20, 1]})
    assert_equals(frame[:, f[:].rowsd()], frame[:, dt.rowsd(f[:])])
def test_rowsd_floats():
    """Row-wise sd over float columns; rows containing inf yield None."""
    source_rows = [(1.5, 6.4, 0.0, None, 7.22),
                   (2.0, -1.1, math.inf, 4.0, 3.2),
                   (1.5, 9.9, None, None, math.nan),
                   (math.inf, -math.inf, None, 0.0, math.nan)]
    frame = dt.Frame(source_rows)
    observed = frame[:, rowsd(f[:])]
    # Reference values computed independently for rows 0 and 2.
    expected_row0 = 3.5676696409094086
    expected_row2 = 5.939696961966999
    assert_equals(observed, dt.Frame([expected_row0, None, expected_row2, None]))
def test_reprs():
    """Every row-expression must produce a truthy repr without raising."""
    row_exprs = (rowall, rowany, rowsum, rowcount, rowmin,
                 rowmax, rowfirst, rowlast, rowmean, rowsd)
    for expr in row_exprs:
        assert repr(expr())
def test_rowmean_wrong_types():
    # NOTE(review): despite the "rowmean" name, this test exercises rowsd —
    # looks like a copy-paste misnomer; consider renaming to
    # test_rowsd_wrong_types (name kept here so test IDs stay stable).
    """rowsd() over a mix of numeric and string columns raises TypeError."""
    DT = dt.Frame(A=[3, 5, 6], B=["a", "d", "e"])
    with pytest.raises(TypeError, match="Function rowsd expects a sequence "
                                        "of numeric columns"):
        # The call itself raises; the original wrapped it in a redundant
        # `assert` which was never meaningfully evaluated.
        rowsd(DT)
def test_rowsd_same_columns():
    """Identical values across each row give a standard deviation of 0."""
    frame = dt.Frame([range(5)] * 10)
    expected = dt.Frame([0.0] * 5)
    assert_equals(rowsd(frame), expected)
def test_rowsd_single_column():
    """The sd of a single observation is undefined, hence NaN per row."""
    frame = dt.Frame(A=range(5))
    expected = dt.Frame([math.nan] * 5)
    assert_equals(rowsd(frame), expected)
# remove black listed columns or column groups that smaller than minimal size col_groups = { key: val for key, val in all_col_groups.items() if not key in black_listed_columns or len(val) >= min_col_group_size } # list of column prefixes columns = list(col_groups.keys()) # list of column ranges ranges = [(min(idx), max(idx)) for idx in col_groups.values()] # produce tuple for column slices col_slices = [((col + "%d") % (desde), (col + "%d") % (hasta)) for (col, (desde, hasta)) in zip(columns, ranges)] for c, r, s in zip(columns, ranges, col_slices): update_map = { c + "_sum": rowsum(f[s[0]:s[1]]), c + "_mean": rowmean(f[s[0]:s[1]]), c + "_sd": rowsd(f[s[0]:s[1]]), c + "_max": rowmax(f[s[0]:s[1]]), c + "_min": rowmin(f[s[0]:s[1]]), c + "_range": rowmax(f[s[0]:s[1]]) - rowmin(f[s[0]:s[1]]), c + "_first": rowfirst(f[s[0]:s[1]]), c + "_last": rowlast(f[s[0]:s[1]]), c + "_missing": (r[1] - r[0] + 1) - rowcount(f[s[0]:s[1]]) } X[:, update(**update_map)] return {"CreditCard-train-aug.csv": X}
def test_rowsd_same_columns():
    """Rows whose entries are all equal must have zero spread."""
    frame = dt.Frame([range(5)] * 10)
    observed = frame[:, rowsd(f[:])]
    assert_equals(observed, dt.Frame([0.0] * 5))
def test_rowsd_single_column():
    """One value per row -> sd is NA; result is an all-missing float column."""
    frame = dt.Frame(A=range(5))
    observed = frame[:, rowsd(f[:])]
    assert_equals(observed, dt.Frame([None] * 5, type=float))
def test_rowsd_single_column():
    """A single column gives an undefined per-row sd (NaN for every row)."""
    frame = dt.Frame(A=range(5))
    observed = frame[:, rowsd(f[:])]
    assert_equals(observed, dt.Frame([math.nan] * 5))
def create_data(
        X: dt.Frame = None
) -> Union[str, List[str],
           dt.Frame, List[dt.Frame],
           np.ndarray, List[np.ndarray],
           pd.DataFrame, List[pd.DataFrame],
           Dict[str, str],  # {data set names : paths}
           Dict[str, dt.Frame],  # {data set names : dt frames}
           Dict[str, np.ndarray],  # {data set names : np arrays}
           Dict[str, pd.DataFrame],  # {data set names : pd frames}
           ]:
    """Augment ``X`` with per-row aggregate features over indexed column groups.

    Column groups are auto-detected as names of the form ``<prefix><int>``
    (e.g. ``PAY_AMT1`` .. ``PAY_AMT6``).  For every detected group this adds
    ``<prefix>_sum``, ``_mean``, ``_sd``, ``_max``, ``_min``, ``_range``,
    ``_first``, ``_last`` and ``_missing`` columns computed row-wise in place.

    :param X: input frame to augment; when None an empty list is returned.
    :return: the augmented frame ``X`` (mutated in place), or ``[]``.
    """
    if X is None:
        return []

    columns = None  # explicit prefixes, e.g. ["PAY_AMT", "BILL_AMT", "PAY_"]
    ranges = None  # explicit suffix ranges, e.g. [(1, 6), (1, 6), (2, 6)]
    black_listed_columns = []
    min_col_group_size = 2

    # parse column names for time series column groups (only when no explicit
    # prefixes/ranges were configured above)
    if columns is None or columns == [] or \
            ranges is None or ranges == []:
        # match any column names that consist of alpha name (prefix) followed
        # by integer index (suffix)
        p = re.compile(r"^([a-zA-Z_]+)(\d+)$")
        matches = [p.match(c) for c in X.names]
        all_col_groups = defaultdict(list)
        for m in matches:
            if m is not None:
                all_col_groups[m.group(1)].append(int(m.group(2)))

        # Keep only groups that are NOT black-listed AND reach the minimum
        # size.  FIX: the original condition used `or`, which kept every
        # non-black-listed group regardless of size (and every large group
        # even if black-listed), defeating both filters.
        col_groups = {
            key: val for key, val in all_col_groups.items()
            if key not in black_listed_columns and len(val) >= min_col_group_size
        }
        # list of column prefixes
        columns = list(col_groups.keys())
        # list of (min, max) suffix ranges per prefix
        ranges = [(min(idx), max(idx)) for idx in col_groups.values()]

    # (first, last) column-name tuples used for f[first:last] slicing
    # (desde/hasta = "from"/"to")
    col_slices = [((col + "%d") % (desde), (col + "%d") % (hasta))
                  for (col, (desde, hasta)) in zip(columns, ranges)]

    for c, r, s in zip(columns, ranges, col_slices):
        update_map = {
            c + "_sum": rowsum(f[s[0]:s[1]]),
            c + "_mean": rowmean(f[s[0]:s[1]]),
            c + "_sd": rowsd(f[s[0]:s[1]]),
            c + "_max": rowmax(f[s[0]:s[1]]),
            c + "_min": rowmin(f[s[0]:s[1]]),
            c + "_range": rowmax(f[s[0]:s[1]]) - rowmin(f[s[0]:s[1]]),
            c + "_first": rowfirst(f[s[0]:s[1]]),
            c + "_last": rowlast(f[s[0]:s[1]]),
            # expected group width minus the observed non-missing count
            c + "_missing": (r[1] - r[0] + 1) - rowcount(f[s[0]:s[1]])
        }
        X[:, update(**update_map)]
    return X