Example #1
import datatable as dt
from datatable import f, by

def _infer_calculate(DT, stat):
    # Group by the first column and aggregate the second.
    if stat == 'mean':
        return DT[:, {'mean_val': dt.mean(f[1])}, by(f[0])]
    elif stat == 'median':
        return DT[:, {'median_val': dt.median(f[1])}, by(f[0])]
    else:
        raise ValueError("unsupported stat: %r" % stat)
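A minimal usage sketch for the helper above (the toy frame is invented for illustration):

DT = dt.Frame(g=["a", "a", "b"], x=[1, 3, 10])
print(_infer_calculate(DT, 'mean'))    # one row per group, column 'mean_val'
print(_infer_calculate(DT, 'median'))  # one row per group, column 'median_val'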
Example #2
def test_group_reverse_flag():
    DT = dt.Frame({"A": [1, 2, 1, 2, 2, 3, 3], "B": [2, 2, 4, 4, 23, 5, 30]})
    EXPECTED = DT[:, :, dt.by(dt.f.A), dt.sort(-dt.f.B)]
    RES1 = DT[:, :, dt.by("A"), dt.sort("B", reverse=True)]
    RES2 = DT[:, :, dt.by(dt.f.A), dt.sort(dt.f.B, reverse=True)]
    assert_equals(EXPECTED, RES1)
    assert_equals(RES1, RES2)
Example #3
def test_grouped_slice_simple():
    DT = dt.Frame(A=[1,2,3,1,2,3], B=[3,4,3,4,3,4])
    res1 = DT[1:, :, by("B")]
    res2 = DT[:2, :, by("B")]
    res3 = DT[::-1, :, by("B")]
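    # Within by(), the i-slice applies to each group separately: res1 drops
    # each group's first row, res2 keeps each group's first two rows, and
    # res3 reverses the rows inside each group.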
    assert_equals(res1, dt.Frame(B=[3, 3, 4, 4], A=[3, 2, 1, 3]))
    assert_equals(res2, dt.Frame(B=[3, 3, 4, 4], A=[1, 3, 2, 1]))
    assert_equals(res3, dt.Frame(B=[3, 3, 3, 4, 4, 4], A=[2, 3, 1, 3, 1, 2]))
Example #4
def test_group_slice_all():
    DT = dt.Frame([[1, 2, 3, 4, 5, 6], [3, 0, 3, 3, 1, 0],
                   list("abcdef")],
                  names=["A", "B", "C"])
    RES = dt.Frame(B=[0, 0, 1, 3, 3, 3],
                   A=[2, 6, 5, 1, 3, 4],
                   C=["b", "f", "e", "a", "c", "d"])
    assert_equals(DT[:, :, by(f.B)], RES)
    assert_equals(DT[:, f[:], by(f.B)], RES)
Example #5
def py_dt_two_group_proportions_summary(DT, por1, por2):
    # Count rows per (por1, por2) pair, attach each por1 group's total,
    # compute every pair's share of that total, then drop the column
    # at position 1.
    DT_summary = DT[:, dt.count(), by(f[por1], f[por2])
                   ][:, f[:].extend({'group_tot': dt.sum(f.count)}), by(f[por1])
                    ][:, f[:].extend({'prop': f.count / f.group_tot})
                     ][:, f[:].remove(f[1])]
    return DT_summary
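A hedged usage sketch for the helper above (column names invented; assumes the same dt/f/by imports as the surrounding examples):

DT = dt.Frame(sex=["m", "m", "f", "f", "f"],
              smoker=["y", "n", "y", "n", "n"])
print(py_dt_two_group_proportions_summary(DT, "sex", "smoker"))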
Example #6
def test_groupby_with_sort():
    DT = dt.Frame(A=[1,2,3]*4, B=[1,2]*6, C=range(12))
    R1 = DT[:, count(), by(f.A, f.B)]
    R2 = DT[:, count(), by(f.A, f.B), sort(f.C)]
    R0 = dt.Frame(A=[1, 1, 2, 2, 3, 3],
                  B=[1, 2, 1, 2, 1, 2],
                  count=[2] * 6, stypes={"count": dt.int64})
    assert_equals(R1, R0)
    assert_equals(R2, R0)
Example #7
def test_ifelse_with_groupby():
    DT = dt.Frame(A=[2, 5, 2, 5, 2, 2], B=range(6))
    R1 = DT[:, ifelse(f.A == 2, dt.min(f.B), dt.max(f.B)), by(f.A)]
    R2 = DT[:, ifelse(f.A == 2, f.B, dt.max(f.B)), by(f.A)]
    R3 = DT[:, ifelse(f.A == 2, dt.min(f.B), f.B), by(f.A)]
    R4 = DT[:, ifelse(f.B > 2, dt.min(f.B), f.B), by(f.A)]
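    # When every branch of ifelse is a per-group scalar (R1), the result
    # collapses to one row per group; when any branch is a full column
    # (R2-R4), each group's scalar is broadcast and all rows are kept.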
    assert_equals(R1, dt.Frame(A=[2, 5], C0=[0, 3]))
    assert_equals(R2, dt.Frame(A=[2, 2, 2, 2, 5, 5], C0=[0, 2, 4, 5, 3, 3]))
    assert_equals(R3, dt.Frame(A=[2, 2, 2, 2, 5, 5], C0=[0, 0, 0, 0, 1, 3]))
    assert_equals(R4, dt.Frame(A=[2, 2, 2, 2, 5, 5], C0=[0, 2, 0, 0, 1, 1]))
Example #8
def test_group_negate_column():
    DT = dt.Frame({"A": [1, 2, 1, 2, 2, 3, 3], "B": [2, 2, 4, 4, 23, 5, 30]})
    EXPECTED = dt.Frame({
        "A": [3, 3, 2, 2, 2, 1, 1],
        "B": [30, 5, 23, 4, 2, 4, 2]
    })
    RES1 = DT[:, :, dt.by(-dt.f.A), dt.sort(-dt.f.B)]
    RES2 = DT[:, :, dt.by(-dt.f.A), dt.sort(dt.f.B, reverse=True)]
    assert_equals(EXPECTED, RES1)
    assert_equals(RES1, RES2)
Example #9
def test_issue2348():
    DT = dt.Frame(A=[1, 2, 3, 1, 2, 3], B=list('akdfnv'),
                  C=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
                  D=[11]*6, E=[2]*6)
    # Check that these expressions do not crash
    DT[:, :, by(f.A), sort(f.A, f.E)]
    DT[:, :, by(f.A, f.B), sort(f.A, f.B)]
    assert_equals(DT[:, dt.count(), by(f.D), sort(f.E, f.A)],
                  dt.Frame([[11], [6]],
                           names=["D", "count"],
                           stypes=[dt.int32, dt.int64]))
Example #10
    def fit_transform(self, X: dt.Frame, y: np.array = None):
        target = '__internal_target__'
        X[:, target] = dt.Frame(y)
        target_is_numeric = X[:, target][:, [bool, int, float]].shape[1] > 0
        if target_is_numeric:
            self._group_means = X[:, dt.mean(dt.f[target]),
                                  dt.by(*self.input_feature_names)]
        else:
            X[:, target] = dt.Frame(LabelEncoder().fit_transform(
                X[:, target].to_pandas().iloc[:, 0].values).ravel())
            self._group_means = X[:, dt.median(dt.f[target]),
                                  dt.by(*self.input_feature_names)]
        del X[:, target]
        self._group_means.key = self.input_feature_names
        return self.transform(X)
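Keying _group_means on the feature names is what makes the later lookup cheap; a minimal sketch of the keyed-join pattern (frame and names invented):

import datatable as dt

X = dt.Frame(g=["a", "b", "a", "b"], v=[1.0, 2.0, 3.0, 6.0])
means = X[:, dt.mean(dt.f.v), dt.by("g")]
means.names = ["g", "v_mean"]
means.key = "g"                       # keyed frames join naturally
print(X[:, :, dt.join(means)])        # every row gains its group's mean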
Example #11
def test_groups1b():
    DT = dt.Frame([[1,   5,   3,   2,   1,    3,   1,   1,   None],
                   ["a", "b", "c", "a", None, "f", "b", "h", "d"]],
                  names=["A", "B"])
    d1 = DT[:, :, by("A")]
    assert_equals(
        d1, dt.Frame(A=[None, 1, 1, 1, 1, 2, 3, 3, 5],
                     B=["d", "a", None, "b", "h", "a", "c", "f", "b"]))

    d2 = DT[:, :, by("B")]
    assert_equals(
        d2, dt.Frame(B=[None, "a", "a", "b", "b", "c", "d", "f", "h"],
                     A=[1, 1, 2, 5, 1, 3, None, 3, 1]))
Example #12
def timeSeries(fullTable, fromDay, toDay, byCriteria, nameColumn,
               Altersgruppen, Geschlechter):
    # Altersgruppen = age groups, Geschlechter = genders (German parameter
    # names kept as in the original code).
    regions = fullTable[:, [dt.first(nameColumn)], dt.by(byCriteria)]
    #regions = regions[:5,:]
    print("Creating time series for regions:")
    print(regions)
    dailysByCriteria = {}
    start = time.perf_counter()
    for i, lk in enumerate(regions[:, byCriteria].to_list()[0]):
        print("Processing Region '{}'".format(regions[i, nameColumn][0, 0]))
        start_region = time.perf_counter()

        pmu.printMemoryUsage("pre analyzeDailyAltersgruppenGeschlechter")
        dailysByCriteria[lk] = analyzeDailyAltersgruppenGeschlechter(
            fullTable,
            filterByDayAndCriteria(fromDay, toDay, (byCriteria == lk)),
            Altersgruppen, Geschlechter)
        finish = time.perf_counter()
        duration = finish - start
        print(
            "Region took {:.2f} seconds, elapsed {:.2f} minutes, time to completion: {:.2f} minutes"
            .format(finish - start_region, duration / 60,
                    duration / (i + 1) * (regions.nrows - i) / 60))

        pmu.printMemoryUsage("post analyzeDailyAltersgruppenGeschlechter")
        print("Done {} of {}, key = {} name = {}".format(
            i + 1, regions.nrows, lk, regions[i, nameColumn][0, 0]))
        #if lk >= 0:
        #    break
    return regions, dailysByCriteria
Example #13
    def __call__(self,
                 rows=None,
                 select=None,
                 verbose=False,
                 timeit=False,
                 groupby=None,
                 join=None,
                 sort=None,
                 engine=None):
        """DEPRECATED, use DT[i, j, ...] instead."""
        warnings.warn(
            "`DT(rows, select, ...)` is deprecated and will be removed in "
            "version 0.9.0. Please use `DT[i, j, ...]` instead",
            category=FutureWarning)
        time0 = time.time() if timeit else 0
        function = type(lambda: None)
        if isinstance(rows, function):
            rows = rows(datatable.f)
        if isinstance(select, function):
            select = select(datatable.f)

        res = self[rows, select,
                   datatable.join(join),
                   datatable.by(groupby),
                   datatable.sort(sort)]
        if timeit:
            print("Time taken: %d ms" % (1000 * (time.time() - time0)))
        return res
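For reference, a hedged sketch of the bracket form that replaces the deprecated call syntax above (toy frame invented):

import datatable as dt
from datatable import f, by

DT = dt.Frame(A=[1, 2, 2], B=[10, 20, 30])
# was: DT(rows=lambda f: f.A > 1, select="B", groupby="A")
res = DT[f.A > 1, "B", by("A")]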
Example #14
    def extend_step(self, op, *, data_map, eval_env):
        if not isinstance(op, data_algebra.data_ops.ExtendNode):
            raise TypeError(
                "op was supposed to be a data_algebra.data_ops.ExtendNode")
        window_situation = (len(op.partition_by) > 0) or (len(op.order_by) > 0)
        if window_situation:
            self.check_extend_window_fns(op)
            raise RuntimeError(
                "windowed extend not implemented yet")  # TODO: implement
        # datatable doesn't seem to have a per-group transform yet
        # (other than over the whole dataframe)
        res = op.sources[0].eval_implementation(data_map=data_map,
                                                eval_env=eval_env,
                                                data_model=self)
        if len(op.order_by) > 0:
            reversed_cols = set(op.reverse)
            ascending = [ci not in reversed_cols for ci in op.order_by]
            if not all(ascending):
                raise RuntimeError(
                    "reverse isn't implemented for datatable yet"
                )  # TODO: implement
            syms = [datatable.f[c] for c in op.order_by]
            res = res.sort(*syms)
        if len(op.partition_by) > 0:
            for (col, expr) in op.ops.items():
                dt_expr = expr_to_dt_expr(expr)
                res[col] = res[:, {col: dt_expr},
                               datatable.by(*op.partition_by)][col]
        else:
            for (col, expr) in op.ops.items():
                dt_expr = expr_to_dt_expr(expr)
                res[col] = res[:, {col: dt_expr}][col]
        return res
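The per-group transform that the comment above says datatable lacks is covered in later releases by a grouped update(); a minimal sketch, assuming a datatable version whose update() works together with by():

import datatable as dt
from datatable import f, by, update

DT = dt.Frame(g=["a", "a", "b"], x=[1, 2, 10])
# Broadcast each group's mean to all of its rows, preserving row order.
DT[:, update(x_mean=dt.mean(f.x)), by(f.g)]
print(DT)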
Example #15
def test_groupby_select_all_columns():
    # Check that when selecting all columns, the result has the same number
    # of columns as the original.
    DT = dt.Frame(id2=[1, 2] * 3, id4=[1] * 6, v3=[1, 3, 2, 3, 3, 3])
    res = DT[:, :, by(f.id2, f.id4)]
    assert_equals(res, dt.Frame(id2=[1, 1, 1, 2, 2, 2], id4=[1] * 6,
                                v3=[1, 2, 3, 3, 3, 3]))
Example #16
def test_groups_small1():
    DT0 = dt.Frame({"A": [1, 2, 1, 2, 1, 3, 1, 1],
                    "B": [0, 1, 2, 3, 4, 5, 6, 7]})
    DT1 = DT0[:, mean(f.B), by(f.A)]
    assert_equals(DT1, dt.Frame(A=[1, 2, 3], B=[3.8, 2.0, 5.0]))
    DT2 = DT0[:, mean(f.B), "A"]
    assert_equals(DT2, DT1)
Example #17
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        self.tgc = self.params_base['tgc']
        self.time_column = self.params_base['time_column']
        self.encoder = self.params_base.get('encoder')
        self.nan_value = y.mean()
        self.means = {}
        if not all([x in X.names for x in self.tgc]):
            raise RuntimeError(
                "Internal error: need all time group cols (%s) in X, but only got %s"
                % (self.tgc, X.names))

        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))

        # Datatable code

        if len(tgc_wo_time) > 0:
            self.nan_value = np.mean(y)
            self.ntrain = X.shape[0]
            X_dt = X.copy()
            X_dt.cbind(dt.Frame({"y": y}))
            self.group_means = X_dt[:, dt.mean(dt.f.y), dt.by(*tgc_wo_time)]
            # Have meaningful column names
            self.group_means.names = tgc_wo_time + ["yhat"]
        else:
            self.group_means = np.mean(y)
Example #18
    def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
        self.tgc = self.params_base.get('tgc')
        self.time_column = self.params_base.get('time_column')
        self.encoder = self.params_base.get('encoder')
        self.nan_value = y.mean()
        self.means = {}
        if self.tgc is None or not all([x in X.names for x in self.tgc]):
            return

        if self.time_column is None:
            self.time_column = self.tgc[0]

        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))

        # Datatable code

        if len(tgc_wo_time) > 0:
            self.nan_value = np.mean(y)
            self.ntrain = X.shape[0]
            X_dt = X.copy()
            X_dt.cbind(dt.Frame({"y": y}))
            self.group_means = X_dt[:, dt.mean(dt.f.y), dt.by(*tgc_wo_time)]
            # Have meaningful column names
            self.group_means.names = tgc_wo_time + ["yhat"]
        else:
            self.group_means = np.mean(y)
Example #19
    def project_step(self, op, *, data_map, eval_env):
        if not isinstance(op, data_algebra.data_ops.ProjectNode):
            raise TypeError(
                "op was supposed to be a data_algebra.data_ops.ProjectNode")
        # Check these are forms we are prepared to work with, and build an
        # aggregation dictionary.  Background on building an agg list:
        # https://www.shanelynn.ie/summarising-aggregation-and-grouping-data-in-python-pandas/
        # https://stackoverflow.com/questions/44635626/rename-result-columns-from-pandas-aggregation-futurewarning-using-a-dict-with
        for (k, opk) in op.ops.items():
            if len(opk.args) != 1:
                raise ValueError("non-trivial aggregation expression: " +
                                 str(k) + ": " + str(opk))
            if not isinstance(opk.args[0],
                              data_algebra.expr_rep.ColumnReference):
                raise ValueError(
                    "windows expression argument must be a column: " + str(k) +
                    ": " + str(opk))
        res = op.sources[0].eval_implementation(data_map=data_map,
                                                eval_env=eval_env,
                                                data_model=self)
        cols = []
        if len(op.group_by) > 0:
            for (col, expr) in op.ops.items():
                dt_expr = expr_to_dt_expr(expr)
                cols.append(res[:, {col: dt_expr},
                                datatable.by(*op.group_by)][col])
        else:
            for (col, expr) in op.ops.items():
                dt_expr = expr_to_dt_expr(expr)
                cols.append(res[:, {col: dt_expr}][col])
        res = self.columns_to_frame(cols)
        return res
Example #20
    def fit_transform(self, X: dt.Frame, y: np.array = None):
        target = '__target__'
        X[:, target] = dt.Frame(y)
        target_is_numeric = X[:, target][:, [bool, int, float]].shape[1] > 0
        if not target_is_numeric:
            X[:, target] = dt.Frame(LabelEncoder().fit_transform(X[:, target].to_pandas().iloc[:, 0].values).ravel())

        self._group_means = X[:, dt.mean(dt.f[target]), dt.by(*self.input_feature_names)]
        self._group_means.key = self.input_feature_names
        self.dataset_mean = X[target].mean().to_numpy().ravel()[0]

        # Expanding mean transform
        X_ = X.to_pandas()[self.input_feature_names + [target]]
        X_["index"] = X_.index
        X_shuffled = X_.sample(n=len(X_), replace=False)
        X_shuffled["cnt"] = 1
        X_shuffled["cumsum"] = (X_shuffled
                                .groupby(self.input_feature_names, sort=False)['__target__']
                                .apply(lambda x: x.shift().cumsum()))
        X_shuffled["cumcnt"] = (X_shuffled
                                .groupby(self.input_feature_names, sort=False)['cnt']
                                .apply(lambda x: x.shift().cumsum()))
        X_shuffled["encoded"] = X_shuffled["cumsum"] / X_shuffled["cumcnt"]
        X_shuffled["encoded"] = X_shuffled["encoded"].fillna(self.dataset_mean)
        X_transformed = X_shuffled.sort_values("index")["encoded"].values
        return dt.Frame(X_transformed)
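The shift-then-cumsum step above gives every row the mean of the earlier targets within its group, so a row never sees its own label; a tiny pandas illustration with invented data:

import pandas as pd

df = pd.DataFrame({"g": ["a", "a", "a", "b"], "y": [1.0, 3.0, 5.0, 7.0]})
grp = df.groupby("g")["y"]
prev_sum = grp.transform(lambda s: s.shift().cumsum())  # sum of earlier rows
prev_cnt = grp.cumcount()                               # count of earlier rows
df["encoded"] = prev_sum / prev_cnt   # NaN where a row has no history
print(df)  # group "a" encodes as NaN, 1.0, 2.0; NaNs get the dataset mean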
Example #21
def test_groups_internal3():
    f0 = dt.Frame(A=[1, 2, 1, 3, 2, 2, 2, 1, 3, 1], B=range(10))
    f1 = f0[:, [f.B, f.A + f.B], by(f.A)]
    frame_integrity_check(f1)
    assert f1.to_list() == [[1, 1, 1, 1, 2, 2, 2, 2, 3, 3],
                            [0, 2, 7, 9, 1, 4, 5, 6, 3, 8],
                            [1, 3, 8, 10, 3, 6, 7, 8, 6, 11]]
Example #22
def test_groups_internal5_strs(seed):
    random.seed(seed)
    n = 1000
    src = ["%x" % random.getrandbits(8) for _ in range(n)]
    f0 = dt.Frame({"A": src})
    f1 = f0[:, :, by("A")]
    frame_integrity_check(f1)
Example #23
def test_groupby_multi_large(seed):
    random.seed(seed)
    letters = "abcdefghijklmn"
    n = 100 + int(random.expovariate(0.0001))
    col0 = [random.choice([True, False]) for _ in range(n)]
    col1 = [random.randint(-10, 10) for _ in range(n)]
    col2 = [random.choice(letters) for _ in range(n)]
    col3 = [random.random() for _ in range(n)]
    rows = [(col0[i], col1[i], col2[i], col3[i]) for i in range(n)]
    rows.sort()
    grouped = []
    lastkey = rows[0][:3]
    sumval = 0
    for i in range(n):
        ikey = rows[i][:3]
        if ikey != lastkey:
            grouped.append(lastkey + (sumval, ))
            lastkey = ikey
            sumval = 0
        sumval += rows[i][3]
    grouped.append(lastkey + (sumval, ))
    DT0 = dt.Frame([col0, col1, col2, col3], names=["A", "B", "C", "D"])
    DT1 = DT0[:, sum(f.D), by(f.A, f.B, f.C)]
    DT2 = dt.Frame(grouped)
    assert same_iterables(DT1.to_list(), DT2.to_list())
Example #24
def test_assign_with_groupby2():
    DT = dt.Frame(A=range(5), B=[1, 1, 2, 2, 2])
    DT[:, "C", by(f.B)] = f.A - dt.mean(f.A)
    assert_equals(
        DT, dt.Frame(A=range(5),
                     B=[1, 1, 2, 2, 2],
                     C=[-0.5, 0.5, -1.0, 0, 1.0]))
Example #25
def test_groups_internal4(seed):
    random.seed(seed)
    n = 100000
    src = [random.getrandbits(10) for _ in range(n)]
    f0 = dt.Frame({"A": src})
    f1 = f0[:, :, by("A")]
    frame_integrity_check(f1)
Example #26
def test_key_after_group():
    n = 1000
    DT = dt.Frame(A=[random.choice("abcd") for _ in range(n)])
    tmp = DT[:, dt.count(), dt.by(0)]
    frame_integrity_check(tmp)
    tmp.key = "A"
    assert tmp.to_list()[0] == ["a", "b", "c", "d"]
    assert sum(tmp.to_list()[1]) == n
Example #27
def test_median_grouped():
    DT = dt.Frame(A=[0, 0, 0, 0, 1, 1, 1, 1, 1],
                  B=[2, 6, 1, 0, -3, 4, None, None, -1],
                  stypes={"A": dt.int16, "B": dt.int32})
    RES = DT[:, median(f.B), by(f.A)]
    assert RES.shape == (2, 2)
    assert RES.stypes == (dt.int16, dt.float64)
    assert RES.to_list() == [[0, 1], [1.5, -1.0]]
Example #28
def test_groups2a():
    DT0 = dt.Frame(A=[1, 2, 1], B=[3, 4, 5])
    DT1 = DT0[:, [f.A, f.B, f.A + f.B], by("A")]
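    # Selecting f.A alongside the group key duplicates it (auto-renamed
    # "A.0"), and the unnamed f.A + f.B expression is auto-named "C0".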
    assert_equals(DT0, dt.Frame(A=[1, 2, 1], B=[3, 4, 5]))
    assert_equals(
        DT1,
        dt.Frame([[1, 1, 2], [1, 1, 2], [3, 5, 4], [4, 6, 6]],
                 names=["A", "A.0", "B", "C0"]))
Example #29
def test_groups2b():
    DT0 = dt.Frame(A=[1, 2, 1, 3, 2, 2, 2, 1, 3, 1], B=range(10))
    DT1 = DT0[:, [f.B, f.A + f.B], by(f.A)]
    assert_equals(
        DT1,
        dt.Frame(A=[1, 1, 1, 1, 2, 2, 2, 2, 3, 3],
                 B=[0, 2, 7, 9, 1, 4, 5, 6, 3, 8],
                 C0=[1, 3, 8, 10, 3, 6, 7, 8, 6, 11]))
Example #30
def test_sort_expr():
    df = dt.Frame(A=[1, 2, 1, 2], B=[3.9, 2.7, 0.1, 4.5])
    assert_equals(df[:, :, sort("A")],
                  dt.Frame(A=[1, 1, 2, 2], B=[3.9, 0.1, 2.7, 4.5]))
    assert_equals(df[:, :, sort(f.B)],
                  dt.Frame(A=[1, 2, 1, 2], B=[0.1, 2.7, 3.9, 4.5]))
    assert_equals(df[:, 'B', by("A"), sort("B")],
                  dt.Frame(A=[1, 1, 2, 2], B=[0.1, 3.9, 2.7, 4.5]))
Example #31
fun = "[.datatable"
cache = "TRUE"

data_name = os.environ['SRC_GRP_LOCAL']
src_grp = os.path.join("data", data_name+".csv")
print("loading dataset %s" % data_name, flush=True)

x = dt.fread(src_grp)
print(x.nrows, flush=True)

print("grouping...", flush=True)

question = "sum v1 by id1" # q1
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"v1": sum(f.v1)}, by(f.id1)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.v1)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question,
          out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution,
          version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m,
          cache=cache, chk=make_chk(flatten(chk.to_list())),
          chk_time_sec=chkt)
del ans
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"v1": sum(f.v1)}, by(f.id1)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()