def test_assign_with_groupby2():
    """Grouped assignment: a new column built from a group-relative
    expression (value minus the group mean) is computed per group."""
    frame = dt.Frame(A=range(5), B=[1, 1, 2, 2, 2])
    frame[:, "C", by(f.B)] = f.A - dt.mean(f.A)
    expected = dt.Frame(A=range(5),
                        B=[1, 1, 2, 2, 2],
                        C=[-0.5, 0.5, -1.0, 0, 1.0])
    assert_equals(frame, expected)
def test_groupby():
    """Reducers over a datetime column produce correct per-group results
    under dt.by(), including a group containing a missing value."""
    frame = dt.Frame(A=[1, 1, 1, 2, 2, 2],
                     B=[d(2001, 7, 12, 0, 0, 0),
                        d(2005, 3, 14, 15, 9, 26),
                        None,
                        d(2007, 11, 2, 19, 7, 38),
                        d(1965, 6, 19, 2, 17, 7),
                        d(2004, 4, 18, 12, 3, 31)])
    result = frame[:, {"count": dt.count(f.B),
                       "min": dt.min(f.B),
                       "max": dt.max(f.B),
                       "mean": dt.mean(f.B),
                       "first": dt.first(f.B),
                       "last": dt.last(f.B)},
                   dt.by(f.A)]
    assert_equals(
        result,
        dt.Frame(A=[1, 2],
                 # count skips NAs: group 1 has one missing entry
                 count=[2, 3] / dt.int64,
                 min=[d(2001, 7, 12, 0, 0, 0), d(1965, 6, 19, 2, 17, 7)],
                 max=[d(2005, 3, 14, 15, 9, 26), d(2007, 11, 2, 19, 7, 38)],
                 mean=[d(2003, 5, 13, 19, 34, 43),
                       d(1992, 7, 13, 19, 9, 25, 333333)],
                 first=[d(2001, 7, 12, 0, 0, 0), d(2007, 11, 2, 19, 7, 38)],
                 # last keeps the NA in group 1 (no NA-skipping for last)
                 last=[None, d(2004, 4, 18, 12, 3, 31)]))
def test_dt_nunique_with_by_for_ungroupped():
    """nunique() within dt.by(): applied both directly to a column and to
    a reducer expression (mean); NAs do not count as distinct values."""
    frame = dt.Frame(G=[1, 1, 1, 2, 2, 2],
                     V=[None, None, None, None, 3, 5])
    expected = dt.Frame(G=[1, 2],
                        V1=[0, 2] / dt.int64,
                        V2=[0, 1] / dt.int64)
    result = frame[:, {"V1": dt.nunique(f.V),
                       "V2": dt.nunique(dt.mean(f.V))},
                   dt.by(f.G)]
    assert_equals(expected, result)
def test_update_with_groupby():
    """update() combined with by(): a scalar, a group reducer, and an
    element-wise expression are all applied in one grouped update."""
    frame = dt.Frame(A=range(5), B=[1, 1, 2, 2, 2])
    frame[:, update(C=7, D=dt.mean(f.A), E=f.A + 1), by(f.B)]
    expected = dt.Frame(A=range(5),
                        B=[1, 1, 2, 2, 2],
                        C=[7] * 5,                      # scalar broadcast
                        D=[0.5, 0.5, 3.0, 3.0, 3.0],    # group means, expanded
                        E=range(1, 6))                  # element-wise
    assert_equals(frame, expected)
def test_reducers():
    """Whole-frame (ungrouped) reducers over a datetime column that
    contains one missing value."""
    frame = dt.Frame(TIME=[d(2001, 7, 12, 0, 0, 0),
                           d(2005, 3, 14, 15, 9, 26),
                           None,
                           d(2007, 11, 2, 19, 7, 38),
                           d(1965, 6, 19, 2, 17, 7),
                           d(2004, 4, 18, 12, 3, 31)])
    result = frame[:, {"count": dt.count(f.TIME),
                       "min": dt.min(f.TIME),
                       "max": dt.max(f.TIME),
                       "mean": dt.mean(f.TIME),
                       "first": dt.first(f.TIME),
                       "last": dt.last(f.TIME)}]
    assert_equals(
        result,
        dt.Frame(count=[5] / dt.int64,   # the NA is excluded from the count
                 min=[d(1965, 6, 19, 2, 17, 7)],
                 max=[d(2007, 11, 2, 19, 7, 38)],
                 mean=[d(1996, 11, 12, 4, 55, 32, 400000)],
                 first=[d(2001, 7, 12, 0, 0, 0)],
                 last=[d(2004, 4, 18, 12, 3, 31)]))
# #### Number of unique observations per column
#
# In[3]:
#
#   DT[, lapply(.SD, uniqueN)]   --> Rdatatable
#   DT.nunique()
#
# #### Mean of all columns by `species`
#
# In[4]:
#
#   DT[, lapply(.SD, mean), by = species]   --> Rdatatable
#   DT[:, dt.mean(f[:]), by('species')]
#
# ### __Filtering__
#
# #### First two observations by species
#
# In[5]:
#
#   DT[, .SD[1:2], by = species]
#   DT[:2, :, by('species')]
#
# In [datatable](https://datatable.readthedocs.io/en/latest/index.html), rows
# are selected in the `i` section after the grouping, unlike in R's
# [data.table](https://github.com/Rdatatable/data.table), where rows are
# selected in `i` before grouping, and rows are selected in `.SD` after
# grouping.
#
# #### Last two observations by `species`
def test_dt_count_na2():
    """countna() within dt.by(): applied to a raw column and to a reducer
    expression (mean of an all-NA group is itself NA)."""
    frame = dt.Frame(G=[1, 1, 1, 2, 2, 2],
                     V=[None, None, None, None, 3, 5])
    expected = dt.Frame(G=[1, 2], V1=[3, 1], V2=[3, 0])
    result = frame[:, [dt.countna(f.V), dt.countna(dt.mean(f.V))],
                   dt.by(f.G)]
    # NOTE(review): compares values only via to_list(), unlike the sibling
    # tests that use assert_equals — presumably because the result stypes
    # differ from the literal frame above; confirm before unifying.
    assert expected.to_list() == result.to_list()