def test_rowmean():
    """dt.rowmean(expr) and expr.rowmean() must build the same expression."""
    assert str(dt.rowmean(f.A)) == str(f.A.rowmean())
    assert str(dt.rowmean(f[:])) == str(f[:].rowmean())
    # Both spellings must also evaluate to the same frame.
    frame = dt.Frame({"C": [2, 5, 30, 20, 10],
                      "D": [10, 8, 20, 20, 1]})
    via_method = frame[:, f[:].rowmean()]
    via_function = frame[:, dt.rowmean(f[:])]
    assert_equals(via_method, via_function)
def test_rowmean_floats():
    """rowmean over float columns: NAs are skipped, infinities propagate."""
    source = dt.Frame([(1.5, 6.4, 0.0, None, 7.22),
                       (2.0, -1.1, math.inf, 4.0, 3.2),
                       (1.5, 9.9, None, None, math.nan),
                       (math.inf, -math.inf, None, 0.0, math.nan)])
    computed = source[:, rowmean(f[:])]
    # Row 0 has four non-missing values (0.0 contributes nothing to the sum).
    row0_mean = (1.5 + 6.4 + 7.22) / 4
    expected = dt.Frame([row0_mean, math.inf, 5.7, None])
    assert_equals(computed, expected)
def test_reprs():
    # Every row-expression must produce a non-empty repr without raising.
    row_functions = (rowall, rowany, rowsum, rowcount, rowmin,
                     rowmax, rowfirst, rowlast, rowmean, rowsd)
    for make_expr in row_functions:
        assert repr(make_expr())
def test_rowmean_wrong_types():
    """rowmean() must raise TypeError when a column is non-numeric."""
    DT = dt.Frame(A=[3, 5, 6], B=["a", "d", "e"])
    with pytest.raises(TypeError, match="Function rowmean expects a sequence "
                                        "of numeric columns"):
        # The call itself must raise; the `assert` that previously wrapped it
        # was dead code (never reached when the expected exception fires).
        rowmean(DT)
def test_rowmean_simple():
    """The row-mean of a single int column is that column cast to float64."""
    source = dt.Frame(A=range(5))
    expected = dt.Frame(range(5), stype=dt.float64)
    assert_equals(rowmean(source), expected)
# remove black listed columns or column groups that smaller than minimal size col_groups = { key: val for key, val in all_col_groups.items() if not key in black_listed_columns or len(val) >= min_col_group_size } # list of column prefixes columns = list(col_groups.keys()) # list of column ranges ranges = [(min(idx), max(idx)) for idx in col_groups.values()] # produce tuple for column slices col_slices = [((col + "%d") % (desde), (col + "%d") % (hasta)) for (col, (desde, hasta)) in zip(columns, ranges)] for c, r, s in zip(columns, ranges, col_slices): update_map = { c + "_sum": rowsum(f[s[0]:s[1]]), c + "_mean": rowmean(f[s[0]:s[1]]), c + "_sd": rowsd(f[s[0]:s[1]]), c + "_max": rowmax(f[s[0]:s[1]]), c + "_min": rowmin(f[s[0]:s[1]]), c + "_range": rowmax(f[s[0]:s[1]]) - rowmin(f[s[0]:s[1]]), c + "_first": rowfirst(f[s[0]:s[1]]), c + "_last": rowlast(f[s[0]:s[1]]), c + "_missing": (r[1] - r[0] + 1) - rowcount(f[s[0]:s[1]]) } X[:, update(**update_map)] return {"CreditCard-train-aug.csv": X}
def create_data(
        X: dt.Frame = None
) -> Union[str, List[str],
           dt.Frame, List[dt.Frame],
           np.ndarray, List[np.ndarray],
           pd.DataFrame, List[pd.DataFrame],
           Dict[str, str],  # {data set names : paths}
           Dict[str, dt.Frame],  # {data set names : dt frames}
           Dict[str, np.ndarray],  # {data set names : np arrays}
           Dict[str, pd.DataFrame],  # {data set names : pd frames}
           ]:
    """Augment `X` in place with per-row aggregates over indexed column groups.

    Column groups are families like PAY_AMT1..PAY_AMT6: an alphabetic prefix
    followed by an integer suffix.  For each detected group, this adds
    <prefix>_sum / _mean / _sd / _max / _min / _range / _first / _last /
    _missing columns computed row-wise over the group's column slice.

    :param X: input frame; when None (recipe probing) an empty list is returned.
    :return: the augmented frame `X`.
    """
    if X is None:
        return []

    # Explicit configuration; leave as None/empty to auto-detect from X.names.
    columns = None  # e.g. ["PAY_AMT", "BILL_AMT", "PAY_"]
    ranges = None   # e.g. [(1, 6), (1, 6), (2, 6)]
    black_listed_columns = []
    min_col_group_size = 2

    # Parse column names for time-series column groups when not configured.
    if not columns or not ranges:
        # Match names consisting of an alpha prefix followed by an integer suffix.
        pattern = re.compile(r"^([a-zA-Z_]+)(\d+)$")
        all_col_groups = defaultdict(list)
        for m in (pattern.match(name) for name in X.names):
            if m is not None:
                all_col_groups[m.group(1)].append(int(m.group(2)))

        # Keep only groups that are not black-listed AND reach the minimal size.
        # BUGFIX: the original used `or`, which (with an empty blacklist) kept
        # every group and never enforced min_col_group_size.
        col_groups = {
            key: val for key, val in all_col_groups.items()
            if key not in black_listed_columns and len(val) >= min_col_group_size
        }

        columns = list(col_groups.keys())  # column prefixes
        ranges = [(min(idx), max(idx)) for idx in col_groups.values()]

    # (first, last) column names per group,
    # e.g. prefix "PAY_AMT" with range (1, 6) -> ("PAY_AMT1", "PAY_AMT6").
    col_slices = [(f"{col}{lo}", f"{col}{hi}")
                  for col, (lo, hi) in zip(columns, ranges)]

    for prefix, (lo, hi), (first, last) in zip(columns, ranges, col_slices):
        # datatable name-slices include both endpoints.
        span = f[first:last]
        update_map = {
            prefix + "_sum": rowsum(span),
            prefix + "_mean": rowmean(span),
            prefix + "_sd": rowsd(span),
            prefix + "_max": rowmax(span),
            prefix + "_min": rowmin(span),
            prefix + "_range": rowmax(span) - rowmin(span),
            prefix + "_first": rowfirst(span),
            prefix + "_last": rowlast(span),
            # group width minus the count of non-missing values in the row
            prefix + "_missing": (hi - lo + 1) - rowcount(span),
        }
        # In-place update: appends the aggregate columns to X.
        X[:, update(**update_map)]
    return X