Example #1
0
def test_median_wrong_stype():
    """median() on a str32 or str64 column raises a descriptive TypeError."""
    frame = dt.Frame(A=["foo"], B=["moo"],
                     stypes={"A": dt.str32, "B": dt.str64})
    expectations = (
        (f.A, "Unable to apply reduce function median() to a column of "
              "type str32"),
        (f.B, "Unable to apply reduce function median() to a column of "
              "type str64"),
    )
    for column, message in expectations:
        with pytest.raises(TypeError) as excinfo:
            noop(frame[:, median(column)])
        assert message in str(excinfo.value)
Example #2
0
def test_median_int_odd_nrows(st):
    """Median of an odd number of integers is the middle value, as float64."""
    # sorted: -5, 0, 4, 4, 7, 8, 10, 11, 12, 23, 45 -> the middle element is 8
    values = [4, -5, 12, 11, 4, 7, 0, 23, 45, 8, 10]
    frame = dt.Frame(A=values, stype=st)
    result = frame[:, median(f.A)]
    assert result.shape == (1, 1)
    assert result.stypes == (dt.float64,)
    assert result[0, 0] == 8.0
Example #3
0
def _infer_caluclate(DT, stat):
    """Aggregate column 1 of ``DT`` grouped by column 0.

    ``stat`` selects the reducer: ``'mean'`` yields a ``mean_val`` column,
    ``'median'`` yields a ``median_val`` column. Any other value returns
    ``None``.
    """
    if stat == 'mean':
        column_name, reducer = 'mean_val', dt.mean
    elif stat == 'median':
        column_name, reducer = 'median_val', dt.median
    else:
        # Unknown statistic: nothing is computed.
        return None
    return DT[:, {column_name: reducer(f[1])}, by(f[0])]
Example #4
0
def test_median_int_even_nrows(st):
    """Median of an even number of integers averages the two middle values."""
    # sorted: -2, 0, 3, 3, 5, 7, 11, 12, 12, 91 -> middle pair is 5 and 7
    values = [7, 11, -2, 3, 0, 12, 12, 3, 5, 91]
    frame = dt.Frame(A=values, stype=st)
    result = frame[:, median(f.A)]
    assert result.shape == (1, 1)
    assert result.stypes == (dt.float64,)
    assert result[0, 0] == 6.0
Example #5
0
def test_median_grouped():
    """Grouped median of an int32 column yields one float64 value per group."""
    group_ids = [0, 0, 0, 0, 1, 1, 1, 1, 1]
    values = [2, 6, 1, 0, -3, 4, None, None, -1]
    frame = dt.Frame(A=group_ids, B=values,
                     stypes={"A": dt.int16, "B": dt.int32})
    grouped = frame[:, median(f.B), by(f.A)]
    assert grouped.shape == (2, 2)
    assert grouped.stypes == (dt.int16, dt.float64)
    # group 0: 0, 1, 2, 6 -> 1.5; group 1 (non-NA values): -3, -1, 4 -> -1.0
    assert grouped.to_list() == [[0, 1], [1.5, -1.0]]
Example #6
0
def test_issue1857(numpy):
    """Grouped median of a float32 column over two integer group columns."""
    row_count = 3620
    numpy.random.seed(364)
    # The draws below must stay in this order to reproduce the seeded data.
    n1 = numpy.random.rand(row_count).astype(numpy.float32)
    g1 = numpy.random.randint(0, 10, row_count)
    g2 = numpy.random.randint(0, 10, row_count)
    frame = dt.Frame(n1=n1, g1=g1, g2=g2)
    medians = frame[:, {"M": dt.median(f.n1)}, by(f.g1, f.g2)]
    assert medians.shape == (100, 3)
    assert medians.names == ("g1", "g2", "M")
    assert medians.stypes == (stype.int64, stype.int64, stype.float32)
    column_sums = medians.sum().to_tuples()[0]
    assert column_sums == (450, 450, 51.63409462571144)
Example #7
0
 def fit_transform(self, X: dt.Frame, y: np.array = None):
     """Fit per-group target statistics and return ``self.transform(X)``.

     ``y`` is attached to ``X`` as a temporary ``__internal_target__``
     column and aggregated by ``self.input_feature_names``:

     * if the target column is bool/int/float, the group mean is stored;
     * otherwise the target is label-encoded and the group median of the
       encoded labels is stored.

     The result is kept in ``self._group_means`` and keyed on
     ``self.input_feature_names``.

     The temporary column is removed in a ``finally`` clause, so the
     caller's frame is not left with a stray ``__internal_target__`` column
     when encoding or aggregation raises.
     """
     target = '__internal_target__'
     X[:, target] = dt.Frame(y)
     try:
         # Selecting by type keeps the column only when it is bool/int/float.
         target_is_numeric = X[:, target][:, [bool, int, float]].shape[1] > 0
         if target_is_numeric:
             self._group_means = X[:, dt.mean(dt.f[target]), dt.by(*self.input_feature_names)]
         else:
             X[:, target] = dt.Frame(LabelEncoder().fit_transform(X[:, target].to_pandas().iloc[:, 0].values).ravel())
             self._group_means = X[:, dt.median(dt.f[target]), dt.by(*self.input_feature_names)]
     finally:
         # Always restore X to its original set of columns.
         del X[:, target]
     self._group_means.key = self.input_feature_names
     return self.transform(X)
Example #8
0
def analyzeDaily(fullTable, filter, prefix, postfix, byDateColName):
    """Aggregate the filtered case table into one row per value of a date column.

    Parameters:
        fullTable: frame indexed with ``fullTable[filter, :]``; the code reads
            the columns NeuerFall, AnzahlFall, NeuerTodesfall, AnzahlTodesfall
            and, in the extended branch, MeldeDelay, DatenstandTag, NeuGenesen
            and AnzahlGenesen.
        filter: row selector applied to ``fullTable`` (the parameter name
            shadows the builtin ``filter``).
        prefix, postfix: strings placed around the generated column names.
        byDateColName: name of the column to group by.

    Behaviour:
        * Sums AnzahlFall and AnzahlTodesfall per group for rows whose
          NeuerFall / NeuerTodesfall flag is 0 or 1.
        * If ``byDateColName == "DatenstandTag"``, the "...Neu" columns are
          sums over rows whose flag is -1 or 1 and are joined in. Otherwise
          the sum column is renamed to "...Neu" and ``addRunningSumColumn``
          (defined elsewhere) is called to produce the un-suffixed column.
        * If ``prefix`` and ``postfix`` are both empty and the grouping column
          is "DatenstandTag", delay statistics over MeldeDelay and the
          AnzahlGenesen sums are joined in as well.

    Returns:
        The combined frame, keyed on ``byDateColName``.

    Progress information is printed to stdout.
    """

    print("analyzeDaily prefix='{}' postfix='{}' byDateColName='{}'".format(prefix, postfix, byDateColName))
    #print("analyzeDaily filter='{}' '".format(filter))
    byDate = dt.f[byDateColName]
    #print("----- analyzeDaily:"+postfix)
    #dayTable = fullTable[(dt.f.DatenstandTag >= fromDay) & (dt.f.DatenstandTag < toDay) & (dt.f.IdLandkreis == forIdLandkreis),:]

    dayTable = fullTable[filter,:]

    # Case totals: rows flagged NeuerFall == 0 or 1.
    cases_to_count = dayTable[(dt.f.NeuerFall == 0) | (dt.f.NeuerFall == 1),:]
    cases = cases_to_count[:, [dt.sum(dt.f.AnzahlFall)],dt.by(byDate)]
    cases.names = [byDateColName, prefix+"AnzahlFall"+postfix]
    cases.key = byDateColName
    print("cases rows = {}, cases_to_count = {}".format(cases.nrows, cases_to_count.nrows))
    #print(cases)
    # NOTE: byDayTable is an alias of `cases` here, not a copy.
    byDayTable = cases

    if byDateColName == "DatenstandTag":
        # New cases: rows flagged NeuerFall == -1 or 1.
        new_cases_to_count = dayTable[(dt.f.NeuerFall == -1) | (dt.f.NeuerFall == 1),:]
        new_cases = new_cases_to_count[:, [dt.sum(dt.f.AnzahlFall)],dt.by(byDate)]
        new_cases.names = [byDateColName, prefix+"AnzahlFallNeu"+postfix]
        new_cases.key = byDateColName
        print("new_cases rows = {}, new_cases_to_count = {}".format(new_cases.nrows, new_cases_to_count.nrows))
        #new_cases_to_count.to_csv("new_cases_to_count.csv")
        byDayTable = byDayTable[:,:,dt.join(new_cases)]
    else:
        # add days by MeldeTag
        # The per-group sum becomes the "...Neu" column; addRunningSumColumn
        # then adds the un-suffixed column (presumably a cumulative sum --
        # confirm against the helper's definition).
        byDayTable.names = {prefix+"AnzahlFall"+postfix: prefix+"AnzahlFallNeu"+postfix}
        byDayTable = addRunningSumColumn(byDayTable, prefix+"AnzahlFallNeu"+postfix, prefix+"AnzahlFall"+postfix)

    # Death totals: rows flagged NeuerTodesfall == 0 or 1.
    dead_to_count = dayTable[(dt.f.NeuerTodesfall == 0) | (dt.f.NeuerTodesfall == 1),:]
    dead = dead_to_count[:, [dt.sum(dt.f.AnzahlTodesfall)],dt.by(byDate)]
    dead.names = [byDateColName, prefix+"AnzahlTodesfall"+postfix]
    dead.key = byDateColName
    #print("dead rows = {}".format(dead.nrows))
    byDayTable = byDayTable[:,:,dt.join(dead)]

    if byDateColName == "DatenstandTag":
        # New deaths: rows flagged NeuerTodesfall == -1 or 1.
        new_dead_to_count = dayTable[(dt.f.NeuerTodesfall == -1) | (dt.f.NeuerTodesfall == 1),:]
        new_dead = new_dead_to_count[:, [dt.sum(dt.f.AnzahlTodesfall)],dt.by(byDate)]
        new_dead.names = [byDateColName, prefix+"AnzahlTodesfallNeu"+postfix]
        new_dead.key = byDateColName
        #print("new_dead rows = {}".format(new_dead.nrows))
        byDayTable = byDayTable[:,:,dt.join(new_dead)]
    else:
        # add days by MeldeTag
        # Same rename-then-running-sum treatment as for the case counts.
        byDayTable.names = {prefix+"AnzahlTodesfall"+postfix: prefix+"AnzahlTodesfallNeu"+postfix}
        byDayTable = addRunningSumColumn(byDayTable, prefix+"AnzahlTodesfallNeu"+postfix, prefix+"AnzahlTodesfall"+postfix)

    byDayTable.key = byDateColName

    # Extended statistics only for the unprefixed, unsuffixed DatenstandTag run.
    # new_cases_to_count exists here because this condition implies the
    # "DatenstandTag" branch above was taken.
    if postfix == "" and prefix == "" and byDateColName == "DatenstandTag":
        new_cases_to_count_delay = new_cases_to_count[(dt.f.AnzahlFall > 0), :]  # measure delay only for positive cases
        new_cases_to_count_delay.materialize()
        # min / max / mean / median / sd of MeldeDelay, the number of cases the
        # statistics are based on, and the maximum DatenstandTag per group.
        new_cases_delay = new_cases_to_count_delay[:, [dt.min(dt.f.MeldeDelay), dt.max(dt.f.MeldeDelay),
                                                       dt.mean(dt.f.MeldeDelay), dt.median(dt.f.MeldeDelay),
                                                       dt.sd(dt.f.MeldeDelay), dt.sum(dt.f.AnzahlFall),
                                                       dt.max(dt.f.DatenstandTag)], dt.by(byDate)]
        new_cases_delay.names = ["DatenstandTag",
                                 "PublikationsdauerFallNeu_Min" + postfix, "PublikationsdauerFallNeu_Max" + postfix,
                                 "PublikationsdauerFallNeu_Schnitt" + postfix, "PublikationsdauerFallNeu_Median" + postfix,
                                 "PublikationsdauerFallNeu_StdAbw" + postfix, "PublikationsdauerFallNeu_Fallbasis" + postfix,
                                 "DatenstandTag_Max" + postfix]
        new_cases_delay.key = "DatenstandTag"
        print("new_cases_delay rows = {}, new_cases_to_count_delay = {}".format(new_cases_delay.nrows,
                                                                                new_cases_to_count_delay.nrows))

        # Recovered totals: rows flagged NeuGenesen == 0 or 1.
        recovered_to_count = dayTable[(dt.f.NeuGenesen == 0) | (dt.f.NeuGenesen == 1),:]
        recovered = recovered_to_count[:, [dt.sum(dt.f.AnzahlGenesen)],dt.by(byDate)]
        recovered.names = ["DatenstandTag", "AnzahlGenesen"+postfix]
        recovered.key = "DatenstandTag"
        #print("recovered rows = {}".format(recovered.nrows))

        # Newly recovered: rows flagged NeuGenesen == -1 or 1.
        new_recovered_to_count = dayTable[(dt.f.NeuGenesen == -1) | (dt.f.NeuGenesen == 1),:]
        new_recovered = new_recovered_to_count[:, [dt.sum(dt.f.AnzahlGenesen)],dt.by(byDate)]
        new_recovered.names = ["DatenstandTag", "AnzahlGenesenNeu"+postfix]
        new_recovered.key = "DatenstandTag"
        #print("new_recovered rows = {}".format(new_recovered.nrows))

        byDayTable = byDayTable[:, :, dt.join(recovered)][:, :, dt.join(new_recovered)][:, :,dt.join(new_cases_delay)]
        #byDayTable = byDayTable[:,:,dt.join(recovered)][:,:,dt.join(new_recovered)]\
        #    [:,:,dt.join(new_cases_strict)][:,:,dt.join(new_cases_strict_14)][:,:,dt.join(new_cases_delay)]

    # The key is set again after the optional joins above.
    byDayTable.key = byDateColName
    #print("byDayTable rows = {}".format(byDayTable.nrows))
    #print(byDayTable)
    return byDayTable
Example #9
0
def test_median_issue2802_2():
    """median() over a frame built by selecting rows through an int64 frame."""
    row_selector = dt.Frame(list(range(13)), stype=dt.int64)
    selected = dt.Frame(A=range(13))[row_selector, :]
    result = selected[:, median(f.A)]
    # 0..12 has thirteen values; the middle one is 6.
    expected = dt.Frame(A=[6.0])
    assert_equals(result, expected)
Example #10
0
def analyzeDaily(fullTable, filter, postfix):
    """Aggregate the filtered case table into one row per ``DatenstandTag``.

    ``fullTable[filter, :]`` is reduced to per-day sums of cases, deaths and
    recoveries (totals use flag values 0/1, the "Neu" variants use -1/1),
    per-day sums of new cases restricted to a 7- and a 14-day reporting
    window, and per-day statistics of ``MeldeDelay``. All partial results are
    joined on ``DatenstandTag``. ``postfix`` is appended to every generated
    column name.

    Progress information and the final table are printed to stdout.
    Returns the joined frame, keyed on ``DatenstandTag``.
    """
    dayColumn = "DatenstandTag"
    dayTable = fullTable[filter, :]

    def rowsFlagged(flagColumn, flagValue):
        # Rows of the filtered table whose flag equals `flagValue` or 1.
        return dayTable[(dt.f[flagColumn] == flagValue) |
                        (dt.f[flagColumn] == 1), :]

    def sumPerDay(rows, valueColumn, baseName):
        # Sum `valueColumn` per DatenstandTag and key the result for joining.
        totals = rows[:, [dt.sum(dt.f[valueColumn])],
                      dt.by(dt.f.DatenstandTag)]
        totals.names = [dayColumn, baseName + postfix]
        totals.key = dayColumn
        return totals

    # --- cases -------------------------------------------------------------
    countedCases = rowsFlagged("NeuerFall", 0)
    caseTotals = sumPerDay(countedCases, "AnzahlFall", "AnzahlFall")
    print("cases rows = {}, cases_to_count = {}".format(
        caseTotals.nrows, countedCases.nrows))

    countedNewCases = rowsFlagged("NeuerFall", -1)
    newCaseTotals = sumPerDay(countedNewCases, "AnzahlFall", "AnzahlFallNeu")
    print("new_cases rows = {}, new_cases_to_count = {}".format(
        newCaseTotals.nrows, countedNewCases.nrows))

    # --- reporting delay (measured only for positive case counts) ----------
    delayRows = countedNewCases[dt.f.AnzahlFall > 0, :]
    delayRows.materialize()
    delayReducers = [
        dt.min(dt.f.MeldeDelay),
        dt.max(dt.f.MeldeDelay),
        dt.mean(dt.f.MeldeDelay),
        dt.median(dt.f.MeldeDelay),
        dt.sd(dt.f.MeldeDelay),
        dt.sum(dt.f.AnzahlFall),
        dt.max(dt.f.DatenstandTag),
    ]
    delayStats = delayRows[:, delayReducers, dt.by(dt.f.DatenstandTag)]
    delayLabels = (
        "MeldeDauerFallNeu-Min",
        "MeldeDauerFallNeu-Max",
        "MeldeDauerFallNeu-Schnitt",
        "MeldeDauerFallNeu-Median",
        "MeldeDauerFallNeu-StdAbw",
        "MeldeDauerFallNeu-Fallbasis",
        "DatenstandTag-Max",
    )
    delayStats.names = ["DatenstandTag"] + [label + postfix
                                            for label in delayLabels]
    delayStats.key = dayColumn
    print("new_cases_delay rows = {}, new_cases_to_count_delay = {}".format(
        delayStats.nrows, delayRows.nrows))

    # --- new cases within a reporting window (corrections always count) ----
    weekRows = countedNewCases[
        (dt.f.DatenstandTag - dt.f.MeldeTag < 7) | (dt.f.AnzahlFall < 0), :]
    weekTotals = sumPerDay(weekRows, "AnzahlFall",
                           "AnzahlFallNeu-Meldung-letze-7-Tage")
    print("new_cases_strict rows = {}, new_cases_to_count_strict = {}".format(
        weekTotals.nrows, weekRows.nrows))

    fortnightRows = countedNewCases[
        (dt.f.DatenstandTag - dt.f.MeldeTag < 14) | (dt.f.AnzahlFall < 0), :]
    fortnightTotals = sumPerDay(fortnightRows, "AnzahlFall",
                                "AnzahlFallNeu-Meldung-letze-14-Tage")
    print("new_cases_strict_14 rows = {}, new_cases_to_count_strict_14 = {}".
          format(fortnightTotals.nrows, fortnightRows.nrows))

    # --- deaths and recoveries ---------------------------------------------
    deathTotals = sumPerDay(rowsFlagged("NeuerTodesfall", 0),
                            "AnzahlTodesfall", "AnzahlTodesfall")
    newDeathTotals = sumPerDay(rowsFlagged("NeuerTodesfall", -1),
                               "AnzahlTodesfall", "AnzahlTodesfallNeu")
    recoveredTotals = sumPerDay(rowsFlagged("NeuGenesen", 0),
                                "AnzahlGenesen", "AnzahlGenesen")
    newRecoveredTotals = sumPerDay(rowsFlagged("NeuGenesen", -1),
                                   "AnzahlGenesen", "AnzahlGenesenNeu")

    # --- join every partial result onto the case totals --------------------
    byDayTable = caseTotals
    for partial in (newCaseTotals, deathTotals, newDeathTotals,
                    recoveredTotals, newRecoveredTotals,
                    weekTotals, fortnightTotals, delayStats):
        byDayTable = byDayTable[:, :, dt.join(partial)]
    byDayTable.key = dayColumn
    print(byDayTable)

    return byDayTable
Example #11
0
          fun=fun,
          run=2,
          time_sec=t,
          mem_gb=m,
          cache=cache,
          chk=make_chk(flatten(chk.to_list())),
          chk_time_sec=chkt,
          on_disk=on_disk)
# Show the first and last three rows of the current answer, then drop the
# reference to it before the next question runs.
print(ans.head(3), flush=True)
print(ans.tail(3), flush=True)
del ans

question = 'median v3 sd v3 by id4 id5'  # q6
gc.collect()
# Timed region: the grouped aggregation plus printing the result's shape.
t_start = timeit.default_timer()
ans = x[:, {'median_v3': median(f.v3), 'sd_v3': sd(f.v3)}, by(f.id4, f.id5)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
# Second timed region: per-column sums of the answer, used as a checksum.
# NOTE(review): `sum` is applied to column expressions here, so it is
# presumably datatable's `sum` imported at file level rather than the
# builtin -- confirm against the file's imports.
t_start = timeit.default_timer()
chk = ans[:, [sum(f.median_v3), sum(f.sd_v3)]]
chkt = timeit.default_timer() - t_start
write_log(task=task,
          data=data_name,
          in_rows=x.shape[0],
          question=question,
          out_rows=ans.shape[0],
          out_cols=ans.shape[1],
          solution=solution,
          version=ver,
          git=git,
Example #12
0
def test_median_some_nas():
    """Missing values are left out of the median computation."""
    values = [None, 5, None, 12, None, -3, None, None, None, 4]
    frame = dt.Frame(S=values)
    result = frame[:, median(f.S)]
    assert result.shape == (1, 1)
    assert result.stypes == (dt.float64,)
    # sorted non-missing values: -3, 4, 5, 12 -> (4 + 5) / 2
    assert result[0, 0] == 4.5
Example #13
0
def test_median_all_nas():
    """A column holding only NaN values has a missing (None) median."""
    frame = dt.Frame(N=[math.nan] * 8)
    result = frame[:, median(f.N)]
    assert result.shape == (1, 1)
    assert result.stypes == (dt.float64,)
    assert result[0, 0] is None
Example #14
0
def test_median_float(st):
    """Median of a float column keeps the column's stype; infinities sort normally."""
    values = [0.0, 5.5, 7.9, math.inf, -math.inf]
    frame = dt.Frame(W=values, stype=st)
    result = frame[:, median(f.W)]
    assert result.shape == (1, 1)
    assert result.stypes == (st,)
    # 5.5 is exactly representable in both float32 and float64.
    assert result[0, 0] == 5.5
Example #15
0
def test_median_int_no_overflow():
    """Averaging the two middle int8 values must not wrap around.

    111 + 112 does not fit into int8, so a naive sum in the column's own
    type would produce a negative result.
    """
    frame = dt.Frame(A=[111, 112], stype=dt.int8)
    result = frame[:, median(f.A)]
    assert result[0, 0] == 111.5
Example #16
0
def test_median():
    """``dt.median(expr)`` and ``expr.median()`` are interchangeable."""
    # Both spellings must produce the same expression string.
    for columns in (f.A, f[:]):
        assert str(dt.median(columns)) == str(columns.median())
    # ... and the same result when evaluated on a frame.
    frame = dt.Frame(A=[2, 3, 5, 5, 9, -1, 2.2])
    via_method = frame[:, f.A.median()]
    via_function = frame[:, dt.median(f.A)]
    assert_equals(via_method, via_function)
Example #17
0
def test_median_bygroup():
    """Grouped median over a float column with two groups of three rows."""
    values = [0.1, 0.2, 0.5, 0.4, 0.3, 0]
    groups = [1, 2, 1, 1, 2, 2]
    frame = dt.Frame(A=values, B=groups)
    grouped = frame[:, median(f.A), by(f.B)]
    # group 1 holds 0.1, 0.4, 0.5 -> 0.4; group 2 holds 0.0, 0.2, 0.3 -> 0.2
    expected = [[1, 2], [0.4, 0.2]]
    assert grouped.to_list() == expected
Example #18
0
def test_median_bool_odd_nrows():
    """Median of an odd-length boolean column is returned as float64."""
    frame = dt.Frame(B=[True, False, True])
    result = frame[:, median(f.B)]
    assert result.shape == (1, 1)
    assert result.stypes == (dt.float64,)
    # sorted: False, True, True -> the middle value is True, i.e. 1.0
    assert result[0, 0] == 1.0
Example #19
0
def test_median_bool_even_nrows():
    """Median of an even-length boolean column averages the middle pair."""
    frame = dt.Frame(A=[True, False, True, False])
    result = frame[:, median(f.A)]
    assert result.shape == (1, 1)
    assert result.stypes == (dt.float64,)
    # sorted: False, False, True, True -> (0 + 1) / 2
    assert result[0, 0] == 0.5
Example #20
0
def test_median_empty_frame():
    """Median of a zero-row column is a single missing value."""
    frame = dt.Frame(A=[])
    result = frame[:, median(f.A)]
    assert result.shape == (1, 1)
    assert result.to_list() == [[None]]