def _infer_calculate(DT, stat):
    # Group by the first column and aggregate the second with the requested statistic.
    if stat == 'mean':
        return DT[:, {'mean_val': dt.mean(f[1])}, by(f[0])]
    elif stat == 'median':
        return DT[:, {'median_val': dt.median(f[1])}, by(f[0])]
    else:
        raise ValueError("unsupported stat: %r" % stat)

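# A minimal usage sketch for _infer_calculate (sample frame and column names are
# illustrative only; assumes `import datatable as dt` and `from datatable import f, by`):
DT_demo = dt.Frame(grp=[1, 1, 2], val=[3.0, 5.0, 4.0])
mean_by_grp = _infer_calculate(DT_demo, 'mean')      # columns: grp, mean_val
median_by_grp = _infer_calculate(DT_demo, 'median')  # columns: grp, median_val
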
def test_group_reverse_flag():
    DT = dt.Frame({"A": [1, 2, 1, 2, 2, 3, 3],
                   "B": [2, 2, 4, 4, 23, 5, 30]})
    EXPECTED = DT[:, :, dt.by(dt.f.A), dt.sort(-dt.f.B)]
    RES1 = DT[:, :, dt.by("A"), dt.sort("B", reverse=True)]
    RES2 = DT[:, :, dt.by(dt.f.A), dt.sort(dt.f.B, reverse=True)]
    assert_equals(EXPECTED, RES1)
    assert_equals(RES1, RES2)

def test_grouped_slice_simple():
    DT = dt.Frame(A=[1, 2, 3, 1, 2, 3], B=[3, 4, 3, 4, 3, 4])
    res1 = DT[1:, :, by("B")]
    res2 = DT[:2, :, by("B")]
    res3 = DT[::-1, :, by("B")]
    assert_equals(res1, dt.Frame(B=[3, 3, 4, 4], A=[3, 2, 1, 3]))
    assert_equals(res2, dt.Frame(B=[3, 3, 4, 4], A=[1, 3, 2, 1]))
    assert_equals(res3, dt.Frame(B=[3, 3, 3, 4, 4, 4], A=[2, 3, 1, 3, 1, 2]))

def test_group_slice_all():
    DT = dt.Frame([[1, 2, 3, 4, 5, 6],
                   [3, 0, 3, 3, 1, 0],
                   list("abcdef")],
                  names=["A", "B", "C"])
    RES = dt.Frame(B=[0, 0, 1, 3, 3, 3],
                   A=[2, 6, 5, 1, 3, 4],
                   C=["b", "f", "e", "a", "c", "d"])
    assert_equals(DT[:, :, by(f.B)], RES)
    assert_equals(DT[:, f[:], by(f.B)], RES)

def py_dt_two_group_proportions_summary(DT, por1, por2):
    # Count rows per (por1, por2) pair, attach the per-por1 group total,
    # turn each count into a within-group proportion, then drop the second column.
    DT_summary = DT[:, dt.count(), by(f[por1], f[por2])
                    ][:, f[:].extend({'group_tot': dt.sum(f.count)}), by(f[por1])
                    ][:, f[:].extend({'prop': f.count / f.group_tot})
                    ][:, f[:].remove(f[1])
                    ]
    return DT_summary

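# A minimal usage sketch for py_dt_two_group_proportions_summary (hypothetical
# sample data; assumes `import datatable as dt` and `from datatable import f, by`):
DT_demo = dt.Frame(sex=["m", "m", "f", "f", "f"],
                   smoker=["yes", "no", "no", "no", "yes"])
summary = py_dt_two_group_proportions_summary(DT_demo, "sex", "smoker")
# Counts each (sex, smoker) pair and reports each count as a proportion of its sex group.
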
def test_groupby_with_sort():
    DT = dt.Frame(A=[1, 2, 3] * 4, B=[1, 2] * 6, C=range(12))
    R1 = DT[:, count(), by(f.A, f.B)]
    R2 = DT[:, count(), by(f.A, f.B), sort(f.C)]
    R0 = dt.Frame(A=[1, 1, 2, 2, 3, 3],
                  B=[1, 2, 1, 2, 1, 2],
                  count=[2] * 6,
                  stypes={"count": dt.int64})
    assert_equals(R1, R0)
    assert_equals(R2, R0)

def test_ifelse_with_groupby():
    DT = dt.Frame(A=[2, 5, 2, 5, 2, 2], B=range(6))
    R1 = DT[:, ifelse(f.A == 2, dt.min(f.B), dt.max(f.B)), by(f.A)]
    R2 = DT[:, ifelse(f.A == 2, f.B, dt.max(f.B)), by(f.A)]
    R3 = DT[:, ifelse(f.A == 2, dt.min(f.B), f.B), by(f.A)]
    R4 = DT[:, ifelse(f.B > 2, dt.min(f.B), f.B), by(f.A)]
    assert_equals(R1, dt.Frame(A=[2, 5], C0=[0, 3]))
    assert_equals(R2, dt.Frame(A=[2, 2, 2, 2, 5, 5], C0=[0, 2, 4, 5, 3, 3]))
    assert_equals(R3, dt.Frame(A=[2, 2, 2, 2, 5, 5], C0=[0, 0, 0, 0, 1, 3]))
    assert_equals(R4, dt.Frame(A=[2, 2, 2, 2, 5, 5], C0=[0, 2, 0, 0, 1, 1]))

def test_group_negate_column():
    DT = dt.Frame({"A": [1, 2, 1, 2, 2, 3, 3],
                   "B": [2, 2, 4, 4, 23, 5, 30]})
    EXPECTED = dt.Frame({"A": [3, 3, 2, 2, 2, 1, 1],
                         "B": [30, 5, 23, 4, 2, 4, 2]})
    RES1 = DT[:, :, dt.by(-dt.f.A), dt.sort(-dt.f.B)]
    RES2 = DT[:, :, dt.by(-dt.f.A), dt.sort(dt.f.B, reverse=True)]
    assert_equals(EXPECTED, RES1)
    assert_equals(RES1, RES2)

def test_issue2348():
    DT = dt.Frame(A=[1, 2, 3, 1, 2, 3],
                  B=list('akdfnv'),
                  C=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
                  D=[11] * 6,
                  E=[2] * 6)
    # Check that these expressions do not crash
    DT[:, :, by(f.A), sort(f.A, f.E)]
    DT[:, :, by(f.A, f.B), sort(f.A, f.B)]
    assert_equals(DT[:, dt.count(), by(f.D), sort(f.E, f.A)],
                  dt.Frame([[11], [6]], names=["D", "count"],
                           stypes=[dt.int32, dt.int64]))

def fit_transform(self, X: dt.Frame, y: np.array = None):
    target = '__internal_target__'
    X[:, target] = dt.Frame(y)
    target_is_numeric = X[:, target][:, [bool, int, float]].shape[1] > 0
    if target_is_numeric:
        # Numeric target: aggregate with the per-group mean.
        self._group_means = X[:, dt.mean(dt.f[target]), dt.by(*self.input_feature_names)]
    else:
        # Categorical target: label-encode first, then aggregate with the per-group median.
        X[:, target] = dt.Frame(
            LabelEncoder().fit_transform(
                X[:, target].to_pandas().iloc[:, 0].values).ravel())
        self._group_means = X[:, dt.median(dt.f[target]), dt.by(*self.input_feature_names)]
    del X[:, target]
    self._group_means.key = self.input_feature_names
    return self.transform(X)

def test_groups1b():
    DT = dt.Frame([[1, 5, 3, 2, 1, 3, 1, 1, None],
                   ["a", "b", "c", "a", None, "f", "b", "h", "d"]],
                  names=["A", "B"])
    d1 = DT[:, :, by("A")]
    assert_equals(
        d1,
        dt.Frame(A=[None, 1, 1, 1, 1, 2, 3, 3, 5],
                 B=["d", "a", None, "b", "h", "a", "c", "f", "b"]))
    d2 = DT[:, :, by("B")]
    assert_equals(
        d2,
        dt.Frame(B=[None, "a", "a", "b", "b", "c", "d", "f", "h"],
                 A=[1, 1, 2, 5, 1, 3, None, 3, 1]))

def timeSeries(fullTable, fromDay, toDay, byCriteria, nameColumn, Altersgruppen, Geschlechter):
    regions = fullTable[:, [dt.first(nameColumn)], dt.by(byCriteria)]
    #regions = regions[:5,:]
    print("Creating time series for regions:")
    print(regions)
    dailysByCriteria = {}
    start = time.perf_counter()
    for i, lk in enumerate(regions[:, byCriteria].to_list()[0]):
        print("Processing Region '{}'".format(regions[i, nameColumn][0, 0]))
        start_region = time.perf_counter()
        pmu.printMemoryUsage("pre analyzeDailyAltersgruppenGeschlechter")
        dailysByCriteria[lk] = analyzeDailyAltersgruppenGeschlechter(
            fullTable,
            filterByDayAndCriteria(fromDay, toDay, (byCriteria == lk)),
            Altersgruppen, Geschlechter)
        finish = time.perf_counter()
        duration = finish - start
        print("Region took {:.2f} seconds, elapsed {:.2f} minutes, "
              "time to completion: {:.2f} minutes".format(
                  finish - start_region, duration / 60,
                  duration / (i + 1) * (regions.nrows - i) / 60))
        pmu.printMemoryUsage("post analyzeDailyAltersgruppenGeschlechter")
        print("Done {} of {}, key = {} name = {}".format(
            i + 1, regions.nrows, lk, regions[i, nameColumn][0, 0]))
        #if lk >= 0:
        #    break
    return regions, dailysByCriteria

def __call__(self, rows=None, select=None, verbose=False, timeit=False,
             groupby=None, join=None, sort=None, engine=None):
    """DEPRECATED, use DT[i, j, ...] instead."""
    warnings.warn(
        "`DT(rows, select, ...)` is deprecated and will be removed in "
        "version 0.9.0. Please use `DT[i, j, ...]` instead",
        category=FutureWarning)
    time0 = time.time() if timeit else 0
    function = type(lambda: None)
    if isinstance(rows, function):
        rows = rows(datatable.f)
    if isinstance(select, function):
        select = select(datatable.f)
    res = self[rows, select,
               datatable.join(join),
               datatable.by(groupby),
               datatable.sort(sort)]
    if timeit:
        print("Time taken: %d ms" % (1000 * (time.time() - time0)))
    return res

def extend_step(self, op, *, data_map, eval_env):
    if not isinstance(op, data_algebra.data_ops.ExtendNode):
        raise TypeError(
            "op was supposed to be a data_algebra.data_ops.ExtendNode")
    window_situation = (len(op.partition_by) > 0) or (len(op.order_by) > 0)
    if window_situation:
        self.check_extend_window_fns(op)
    if window_situation:
        raise RuntimeError("windowed extend not implemented yet")  # TODO: implement
    # datatable doesn't seem to have per-group transform yet (other than the whole dataframe)
    res = op.sources[0].eval_implementation(
        data_map=data_map, eval_env=eval_env, data_model=self)
    if len(op.order_by) > 0:
        ascending = [
            False if ci in set(op.reverse) else True for ci in op.order_by
        ]
        if not all(ascending):
            raise RuntimeError("reverse isn't implemented for datatable yet")  # TODO: implement
        syms = [datatable.f[c] for c in op.order_by]
        res = res.sort(*syms)
    if len(op.partition_by) > 0:
        for (col, expr) in op.ops.items():
            dt_expr = expr_to_dt_expr(expr)
            res[col] = res[:, {col: dt_expr}, datatable.by(*op.partition_by)][col]
    else:
        for (col, expr) in op.ops.items():
            dt_expr = expr_to_dt_expr(expr)
            res[col] = res[:, {col: dt_expr}][col]
    return res

def test_groupby_select_all_columns():
    # Check that when selecting all columns, the result has the same number
    # of columns as the original.
    DT = dt.Frame(id2=[1, 2] * 3, id4=[1] * 6, v3=[1, 3, 2, 3, 3, 3])
    res = DT[:, :, by(f.id2, f.id4)]
    assert_equals(res, dt.Frame(id2=[1, 1, 1, 2, 2, 2],
                                id4=[1] * 6,
                                v3=[1, 2, 3, 3, 3, 3]))

def test_groups_small1():
    DT0 = dt.Frame({"A": [1, 2, 1, 2, 1, 3, 1, 1],
                    "B": [0, 1, 2, 3, 4, 5, 6, 7]})
    DT1 = DT0[:, mean(f.B), by(f.A)]
    assert_equals(DT1, dt.Frame(A=[1, 2, 3], B=[3.8, 2.0, 5.0]))
    DT2 = DT0[:, mean(f.B), "A"]
    assert_equals(DT2, DT1)

def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    self.tgc = self.params_base['tgc']
    self.time_column = self.params_base['time_column']
    self.encoder = self.params_base.get('encoder')
    self.nan_value = y.mean()
    self.means = {}
    if not all([x in X.names for x in self.tgc]):
        raise RuntimeError(
            "Internal error: need all time group cols (%s) in X, but only got %s"
            % (self.tgc, X.names))
    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    # Datatable code
    if len(tgc_wo_time) > 0:
        self.nan_value = np.mean(y)
        self.ntrain = X.shape[0]
        X_dt = X.copy()
        X_dt.cbind(dt.Frame({"y": y}))
        self.group_means = X_dt[:, dt.mean(dt.f.y), dt.by(*tgc_wo_time)]
        # Have meaningful column names
        self.group_means.names = tgc_wo_time + ["yhat"]
    else:
        self.group_means = np.mean(y)

def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    self.tgc = self.params_base.get('tgc')
    self.time_column = self.params_base.get('time_column')
    self.encoder = self.params_base.get('encoder')
    self.nan_value = y.mean()
    self.means = {}
    if self.tgc is None or not all([x in X.names for x in self.tgc]):
        return
    if self.time_column is None:
        self.time_column = self.tgc[0]
    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    # Datatable code
    if len(tgc_wo_time) > 0:
        self.nan_value = np.mean(y)
        self.ntrain = X.shape[0]
        X_dt = X.copy()
        X_dt.cbind(dt.Frame({"y": y}))
        self.group_means = X_dt[:, dt.mean(dt.f.y), dt.by(*tgc_wo_time)]
        # Have meaningful column names
        self.group_means.names = tgc_wo_time + ["yhat"]
    else:
        self.group_means = np.mean(y)

def project_step(self, op, *, data_map, eval_env):
    if not isinstance(op, data_algebra.data_ops.ProjectNode):
        raise TypeError(
            "op was supposed to be a data_algebra.data_ops.ProjectNode")
    # check these are forms we are prepared to work with, and build an aggregation dictionary
    # build an agg list: https://www.shanelynn.ie/summarising-aggregation-and-grouping-data-in-python-pandas/
    # https://stackoverflow.com/questions/44635626/rename-result-columns-from-pandas-aggregation-futurewarning-using-a-dict-with
    for (k, opk) in op.ops.items():
        if len(opk.args) != 1:
            raise ValueError("non-trivial aggregation expression: "
                             + str(k) + ": " + str(opk))
        if not isinstance(opk.args[0], data_algebra.expr_rep.ColumnReference):
            raise ValueError(
                "windows expression argument must be a column: "
                + str(k) + ": " + str(opk))
    res = op.sources[0].eval_implementation(
        data_map=data_map, eval_env=eval_env, data_model=self)
    cols = []
    if len(op.group_by) > 0:
        for (col, expr) in op.ops.items():
            dt_expr = expr_to_dt_expr(expr)
            cols.append(res[:, {col: dt_expr}, datatable.by(*op.group_by)][col])
    else:
        for (col, expr) in op.ops.items():
            dt_expr = expr_to_dt_expr(expr)
            cols.append(res[:, {col: dt_expr}][col])
    res = self.columns_to_frame(cols)
    return res

def fit_transform(self, X: dt.Frame, y: np.array = None):
    target = '__target__'
    X[:, target] = dt.Frame(y)
    target_is_numeric = X[:, target][:, [bool, int, float]].shape[1] > 0
    if not target_is_numeric:
        X[:, target] = dt.Frame(
            LabelEncoder().fit_transform(
                X[:, target].to_pandas().iloc[:, 0].values).ravel())
    self._group_means = X[:, dt.mean(dt.f[target]), dt.by(*self.input_feature_names)]
    self._group_means.key = self.input_feature_names
    self.dataset_mean = X[target].mean().to_numpy().ravel()[0]

    # Expanding mean transform
    X_ = X.to_pandas()[self.input_feature_names + [target]]
    X_["index"] = X_.index
    X_shuffled = X_.sample(n=len(X_), replace=False)
    X_shuffled["cnt"] = 1
    X_shuffled["cumsum"] = (X_shuffled
                            .groupby(self.input_feature_names, sort=False)['__target__']
                            .apply(lambda x: x.shift().cumsum()))
    X_shuffled["cumcnt"] = (X_shuffled
                            .groupby(self.input_feature_names, sort=False)['cnt']
                            .apply(lambda x: x.shift().cumsum()))
    X_shuffled["encoded"] = X_shuffled["cumsum"] / X_shuffled["cumcnt"]
    X_shuffled["encoded"] = X_shuffled["encoded"].fillna(self.dataset_mean)
    X_transformed = X_shuffled.sort_values("index")["encoded"].values
    return dt.Frame(X_transformed)

def test_groups_internal3():
    f0 = dt.Frame(A=[1, 2, 1, 3, 2, 2, 2, 1, 3, 1], B=range(10))
    f1 = f0[:, [f.B, f.A + f.B], by(f.A)]
    frame_integrity_check(f1)
    assert f1.to_list() == [[1, 1, 1, 1, 2, 2, 2, 2, 3, 3],
                            [0, 2, 7, 9, 1, 4, 5, 6, 3, 8],
                            [1, 3, 8, 10, 3, 6, 7, 8, 6, 11]]

def test_groups_internal5_strs(seed):
    random.seed(seed)
    n = 1000
    src = ["%x" % random.getrandbits(8) for _ in range(n)]
    f0 = dt.Frame({"A": src})
    f1 = f0[:, :, by("A")]
    frame_integrity_check(f1)

def test_groupby_multi_large(seed):
    random.seed(seed)
    letters = "abcdefghijklmn"
    n = 100 + int(random.expovariate(0.0001))
    col0 = [random.choice([True, False]) for _ in range(n)]
    col1 = [random.randint(-10, 10) for _ in range(n)]
    col2 = [random.choice(letters) for _ in range(n)]
    col3 = [random.random() for _ in range(n)]
    rows = [(col0[i], col1[i], col2[i], col3[i]) for i in range(n)]
    rows.sort()
    grouped = []
    lastkey = rows[0][:3]
    sumval = 0
    for i in range(n):
        ikey = rows[i][:3]
        if ikey != lastkey:
            grouped.append(lastkey + (sumval,))
            lastkey = ikey
            sumval = 0
        sumval += rows[i][3]
    grouped.append(lastkey + (sumval,))
    DT0 = dt.Frame([col0, col1, col2, col3], names=["A", "B", "C", "D"])
    DT1 = DT0[:, sum(f.D), by(f.A, f.B, f.C)]
    DT2 = dt.Frame(grouped)
    assert same_iterables(DT1.to_list(), DT2.to_list())

def test_assign_with_groupby2():
    DT = dt.Frame(A=range(5), B=[1, 1, 2, 2, 2])
    DT[:, "C", by(f.B)] = f.A - dt.mean(f.A)
    assert_equals(
        DT,
        dt.Frame(A=range(5), B=[1, 1, 2, 2, 2], C=[-0.5, 0.5, -1.0, 0, 1.0]))

def test_groups_internal4(seed):
    random.seed(seed)
    n = 100000
    src = [random.getrandbits(10) for _ in range(n)]
    f0 = dt.Frame({"A": src})
    f1 = f0[:, :, by("A")]
    f1.internal.check()

def test_key_after_group():
    n = 1000
    DT = dt.Frame(A=[random.choice("abcd") for _ in range(n)])
    tmp = DT[:, dt.count(), dt.by(0)]
    frame_integrity_check(tmp)
    tmp.key = "A"
    assert tmp.to_list()[0] == ["a", "b", "c", "d"]
    assert sum(tmp.to_list()[1]) == n

def test_median_grouped():
    DT = dt.Frame(A=[0, 0, 0, 0, 1, 1, 1, 1, 1],
                  B=[2, 6, 1, 0, -3, 4, None, None, -1],
                  stypes={"A": dt.int16, "B": dt.int32})
    RES = DT[:, median(f.B), by(f.A)]
    assert RES.shape == (2, 2)
    assert RES.stypes == (dt.int16, dt.float64)
    assert RES.to_list() == [[0, 1], [1.5, -1.0]]

def test_groups2a():
    DT0 = dt.Frame(A=[1, 2, 1], B=[3, 4, 5])
    DT1 = DT0[:, [f.A, f.B, f.A + f.B], by("A")]
    assert_equals(DT0, dt.Frame(A=[1, 2, 1], B=[3, 4, 5]))
    assert_equals(
        DT1,
        dt.Frame([[1, 1, 2], [1, 1, 2], [3, 5, 4], [4, 6, 6]],
                 names=["A", "A.0", "B", "C0"]))

def test_groups2b():
    DT0 = dt.Frame(A=[1, 2, 1, 3, 2, 2, 2, 1, 3, 1], B=range(10))
    DT1 = DT0[:, [f.B, f.A + f.B], by(f.A)]
    assert_equals(
        DT1,
        dt.Frame(A=[1, 1, 1, 1, 2, 2, 2, 2, 3, 3],
                 B=[0, 2, 7, 9, 1, 4, 5, 6, 3, 8],
                 C0=[1, 3, 8, 10, 3, 6, 7, 8, 6, 11]))

def test_sort_expr():
    df = dt.Frame(A=[1, 2, 1, 2], B=[3.9, 2.7, 0.1, 4.5])
    assert_equals(df[:, :, sort("A")],
                  dt.Frame(A=[1, 1, 2, 2], B=[3.9, 0.1, 2.7, 4.5]))
    assert_equals(df[:, :, sort(f.B)],
                  dt.Frame(A=[1, 2, 1, 2], B=[0.1, 2.7, 3.9, 4.5]))
    assert_equals(df[:, 'B', by("A"), sort("B")],
                  dt.Frame(A=[1, 1, 2, 2], B=[0.1, 3.9, 2.7, 4.5]))

fun = "[.datatable" cache = "TRUE" data_name = os.environ['SRC_GRP_LOCAL'] src_grp = os.path.join("data", data_name+".csv") print("loading dataset %s" % data_name, flush=True) x = dt.fread(src_grp) print(x.nrows, flush=True) print("grouping...", flush=True) question = "sum v1 by id1" # q1 gc.collect() t_start = timeit.default_timer() ans = x[:, {"v1": sum(f.v1)}, by(f.id1)] print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() chk = ans[:, sum(f.v1)] chkt = timeit.default_timer() - t_start write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt) del ans gc.collect() t_start = timeit.default_timer() ans = x[:, {"v1": sum(f.v1)}, by(f.id1)] print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer()