Code Example #1
def test_rows_min_max():
    import datatable as dt
    from datatable import f, min, max
    df0 = dt.Frame(range(10), names=["A"])
    # min = 0, max = 9
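    # NOTE: the df0(...) call form and engine="eager" are the older datatable
    # API; example #2 shows the modern DT[i, j] syntax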
    df1 = df0(f.A > (min(f.A) + max(f.A)) / 2, engine="eager")
    assert df1.internal.check()
    assert df1.topython() == [[5, 6, 7, 8, 9]]
Code Example #2
def test_rows_min_max():
    import datatable as dt
    from datatable import f, min, max
    from tests import frame_integrity_check  # helper from datatable's own test suite
    df0 = dt.Frame(A=range(10))
    # min = 0, max = 9
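    # min(f.A) and max(f.A) reduce over the whole column and broadcast back,
    # so the filter keeps the rows with A > (0 + 9) / 2 = 4.5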
    df1 = df0[f.A > (min(f.A) + max(f.A)) / 2, :]
    frame_integrity_check(df1)
    assert df1.to_list() == [[5, 6, 7, 8, 9]]
Code Example #3
File: test_groups.py Project: zzhuuh/datatable
def test_groups_multiple():
    import datatable as dt
    from datatable import f, min, max
    from tests import frame_integrity_check  # datatable test-suite helper
    f0 = dt.Frame({
        "color": ["red", "blue", "green", "red", "green"],
        "size": [5, 2, 7, 13, 0]
    })
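    # grouping by "color" yields one row per color with the per-group
    # min and max of "size"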
    f1 = f0[:, [min(f.size), max(f.size)], "color"]
    frame_integrity_check(f1)
    assert f1.to_list() == [["blue", "green", "red"], [2, 0, 5], [2, 7, 13]]
Code Example #4
File: test_groups.py Project: whmnoe4j/datatable
def test_groups_multiple():
    import datatable as dt
    from datatable import f, min, max
    f0 = dt.Frame({
        "color": ["red", "blue", "green", "red", "green"],
        "size": [5, 2, 7, 13, 0]
    })
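    # same query as example #3, written against the older
    # internal.check()/topython() API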
    f1 = f0[:, [min(f.size), max(f.size)], "color"]
    f1.internal.check()
    assert f1.topython() == [["blue", "green", "red"], [2, 0, 5], [2, 7, 13]]
Code Example #5
def test_groupby_on_view():
    import datatable as dt
    from datatable import f, max, by
    from tests import assert_equals, isview  # datatable test-suite helpers
    # See issue #1542
    DT = dt.Frame(A=[1, 2, 3, 1, 2, 3],
                  B=[3, 6, 2, 4, 3, 1],
                  C=['b', 'd', 'b', 'b', 'd', 'b'])
    V = DT[f.A != 1, :]
    assert isview(V)
    assert_equals(
        V, dt.Frame(A=[2, 3, 2, 3], B=[6, 2, 3, 1], C=['d', 'b', 'd', 'b']))
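    # group the view by C and take the per-group max of B:
    # 'b' -> max(2, 1) = 2, 'd' -> max(6, 3) = 6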
    RES = V[:, max(f.B), by(f.C)]
    assert_equals(RES, dt.Frame(C=['b', 'd'], B=[2, 6]))
Code Example #6
File: test-groups.py Project: Tom-Deng/datatable
def test_groupby_on_view():
    import datatable as dt
    from datatable import f, max, by
    from tests import isview  # datatable test-suite helper
    # See issue #1542
    DT = dt.Frame(A=[1, 2, 3, 1, 2, 3],
                  B=[3, 6, 2, 4, 3, 1],
                  C=['b', 'd', 'b', 'b', 'd', 'b'])
    V = DT[f.A != 1, :]
    assert isview(V)
    assert V.shape == (4, 3)
    assert V.to_dict() == {'A': [2, 3, 2, 3],
                           'B': [6, 2, 3, 1],
                           'C': ['d', 'b', 'd', 'b']}
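    # in this older datatable version the computed max(f.B) column is
    # auto-named "C0"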
    RES = V[:, max(f.B), by(f.C)]
    assert RES.shape == (2, 2)
    assert RES.to_dict() == {'C': ['b', 'd'],
                             'C0': [2, 6]}
Code Example #7
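# assumes: import datatable as dt; from datatable import f, by, count, min, max;
# from sklearn.utils import resample; X is the input datatable Frame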
# parameters
target_col = "Known_Fraud"
times = 5
random_seed = 123

new_dataset_name = "new_dataset_name_with_downsampled_majority"

# counts by target groups
g = X[:, {"count": count()}, by(target_col)]
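# g has one row per target value, holding that group's row count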
if not g.shape[1] == 2:
    raise ValueError(
        "Not a binary target - target column must contain exactly 2 values.")

# find sizes and target values for minority and majority class partitions
n_minority = g[:, min(f.count)][0, 0]
n_majority = g[:, max(f.count)][0, 0]
target_minority = g[f.count == n_minority, target_col][0, 0]
target_majority = g[f.count == n_majority, target_col][0, 0]

# validate that `times` actually downsamples the majority class
if times * n_minority >= n_majority:
    raise ValueError(
        "Downsampling coefficient `times` is too large: the downsampled "
        "majority class would be at least as large as the original."
    )

# downsample with pandas frame
df_majority = X[f[target_col] == target_majority, :].to_pandas()
df_majority_downsampled = resample(df_majority,
                                   replace=False,
                                   n_samples=n_minority * times,
                                   random_state=random_seed)
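
# (illustrative continuation, not in the original snippet) the downsampled
# majority rows would typically be recombined with the minority class and
# converted back into a datatable Frame, e.g. (assuming pandas as pd):
#   df_minority = X[f[target_col] == target_minority, :].to_pandas()
#   new_dataset = dt.Frame(pd.concat([df_minority, df_majority_downsampled]))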
Code Example #8
          fun=fun,
          run=2,
          time_sec=t,
          mem_gb=m,
          cache=cache,
          chk=make_chk(flatten(chk.to_list())),
          chk_time_sec=chkt,
          on_disk=on_disk)
print(ans.head(3), flush=True)
print(ans.tail(3), flush=True)
del ans

question = 'max v1 - min v2 by id3'  # q7
gc.collect()
t_start = timeit.default_timer()
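# q7 of the db-benchmark groupby suite: the per-group range max(v1) - min(v2)
# within each id3 group (write_log/make_chk/flatten/memory_usage are benchmark helpers)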
ans = x[:, {'range_v1_v2': max(f.v1) - min(f.v2)}, by(f.id3)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.range_v1_v2)]
chkt = timeit.default_timer() - t_start
write_log(task=task,
          data=data_name,
          in_rows=x.shape[0],
          question=question,
          out_rows=ans.shape[0],
          out_cols=ans.shape[1],
          solution=solution,
          version=ver,
          git=git,
Code Example #9
#ans = x[:, {"median_v3": median(f.v3), "sd_v3": sd(f.v3)}, by(f.id2, f.id4)]
#print(ans.shape, flush=True)
#t = timeit.default_timer() - t_start
#m = memory_usage()
#t_start = timeit.default_timer()
#chk = ans[:, [sum(f.median_v3), sum(f.sd_v3)]]
#chkt = timeit.default_timer() - t_start
#write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt)
#print(ans.head(3).to_pandas(), flush=True)
#print(ans.tail(3).to_pandas(), flush=True)
#del ans

question = "max v1 - min v2 by id2 id4"  # q7
gc.collect()
t_start = timeit.default_timer()
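# q7 variant: the same range expression, grouped by the column pair (id2, id4)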
ans = x[:, {"range_v1_v2": max(f.v1) - min(f.v2)}, by(f.id2, f.id4)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.range_v1_v2)]
chkt = timeit.default_timer() - t_start
write_log(task=task,
          data=data_name,
          in_rows=x.shape[0],
          question=question,
          out_rows=ans.shape[0],
          out_cols=ans.shape[1],
          solution=solution,
          version=ver,
          git=git,
Code Example #10
File: scratch.py Project: scharrenberg/Corona
def loadAndProcessData(dataFilename):
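    # assumes: import datatable as dt; import numpy as np; merge() is a
    # project-local helper that joins two frames on the given key column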
    print("Loading " + dataFilename)

    fullTable = dt.fread(dataFilename)
    print("Loading done loading table from ‘" + dataFilename + "‘, keys:")
    print(fullTable.keys())
    cases = fullTable[:, 'AnzahlFall'].sum()[0, 0]
    dead = fullTable[:, 'AnzahlTodesfall'].sum()[0, 0]

    lastDay = fullTable[:, 'MeldeDay'].max()[0, 0]
    lastnewCaseOnDay = fullTable[:, 'newCaseOnDay'].max()[0, 0]
    print("File stats: lastDay {} lastnewCaseOnDay {} cases {} dead {}".format(
        lastDay, lastnewCaseOnDay, cases, dead))

    newTable = fullTable[:, dt.f[:].
                         extend({"erkMeldeDelay": dt.f.MeldeDay -
                                 dt.f.RefDay})]
    #print(newTable.keys())

    #dt.by(dt.f.Bundesland)]
    alldays = fullTable[:, [
        dt.sum(dt.f.AnzahlFall),
        dt.sum(dt.f.FaellePro100k),
        dt.sum(dt.f.AnzahlTodesfall),
        dt.sum(dt.f.TodesfaellePro100k),
        dt.mean(dt.f.Bevoelkerung),
        dt.max(dt.f.MeldeDay),
        dt.first(dt.f.LandkreisTyp),
        dt.first(dt.f.Bundesland)
    ],
                        dt.by(dt.f.Landkreis)]

    last7days = fullTable[dt.f.newCaseOnDay > lastDay -
                          7, :][:, [
                              dt.sum(dt.f.AnzahlFall),
                              dt.sum(dt.f.FaellePro100k),
                              dt.sum(dt.f.AnzahlTodesfall),
                              dt.sum(dt.f.TodesfaellePro100k)
                          ],
                                dt.by(dt.f.Landkreis)]
    last7days.names = [
        "Landkreis", "AnzahlFallLetzte7Tage", "FaellePro100kLetzte7Tage",
        "AnzahlTodesfallLetzte7Tage", "TodesfaellePro100kLetzte7Tage"
    ]
    last7days[dt.f.AnzahlFallLetzte7Tage < 0, "AnzahlFallLetzte7Tage"] = 0
    last7days[dt.f.FaellePro100kLetzte7Tage < 0,
              "FaellePro100kLetzte7Tage"] = 0
    last7days[dt.f.AnzahlTodesfallLetzte7Tage < 0,
              "AnzahlTodesfallLetzte7Tage"] = 0
    last7days[dt.f.TodesfaellePro100kLetzte7Tage < 0,
              "TodesfaellePro100kLetzte7Tage"] = 0

    lastWeek7days = fullTable[(dt.f.newCaseOnDay > lastDay - 14) & (
        dt.f.newCaseOnDay <= lastDay - 7), :][:, [
            dt.sum(dt.f.AnzahlFall),
            dt.sum(dt.f.FaellePro100k),
            dt.sum(dt.f.AnzahlTodesfall),
            dt.sum(dt.f.TodesfaellePro100k)
        ],
                                              dt.by(dt.f.Landkreis)]
    #lastWeek7days[dt.f[1:] < 0, dt.f[1:]] = 0
    lastWeek7days.names = [
        "Landkreis", "AnzahlFallLetzte7TageDavor",
        "FaellePro100kLetzte7TageDavor", "AnzahlTodesfallLetzte7TageDavor",
        "TodesfaellePro100kLetzte7TageDavor"
    ]
    lastWeek7days[dt.f.AnzahlFallLetzte7TageDavor < 0,
                  "AnzahlFallLetzte7TageDavor"] = 0
    lastWeek7days[dt.f.FaellePro100kLetzte7TageDavor < 0,
                  "FaellePro100kLetzte7TageDavor"] = 0
    lastWeek7days[dt.f.AnzahlTodesfallLetzte7TageDavor < 0,
                  "AnzahlTodesfallLetzte7TageDavor"] = 0
    lastWeek7days[dt.f.TodesfaellePro100kLetzte7TageDavor < 0,
                  "TodesfaellePro100kLetzte7TageDavor"] = 0

    allDaysExt0 = merge(alldays, last7days, "Landkreis")
    allDaysExt1 = merge(allDaysExt0, lastWeek7days, "Landkreis")

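    # Rw: new cases in the last 7 days relative to the 7 days before
    # (week-over-week trend factor)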
    Rw = dt.f.AnzahlFallLetzte7Tage / dt.f.AnzahlFallLetzte7TageDavor

    allDaysExt2 = allDaysExt1[:, dt.f[:].extend({"AnzahlFallTrend": Rw})]
    allDaysExt3 = allDaysExt2[:, dt.f[:].extend({
        "FaellePro100kTrend":
        dt.f.FaellePro100kLetzte7Tage - dt.f.FaellePro100kLetzte7TageDavor
    })]
    allDaysExt4 = allDaysExt3[:, dt.f[:].extend({
        "TodesfaellePro100kTrend":
        dt.f.TodesfaellePro100kLetzte7Tage -
        dt.f.TodesfaellePro100kLetzte7TageDavor
    })]

    allDaysExt5 = allDaysExt4[:, dt.f[:].extend({
        "Kontaktrisiko":
        dt.f.Bevoelkerung / 6.25 /
        ((dt.f.AnzahlFallLetzte7Tage + dt.f.AnzahlFallLetzte7TageDavor) * Rw)
    })]
    allDaysExt6 = allDaysExt5[:, dt.f[:].extend(
        {"LetzteMeldung": lastDay - dt.f.MeldeDay})]

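    # x * 2 == x holds only for 0 and +/-inf, so this catches rows where the
    # Kontaktrisiko division degenerated, and caps them at 999999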
    allDaysExt6[dt.f.Kontaktrisiko * 2 == dt.f.Kontaktrisiko,
                "Kontaktrisiko"] = 999999

    sortedByRisk = allDaysExt6.sort(
        ["Kontaktrisiko", "LetzteMeldung", "FaellePro100k"])
    #print(sortedByRisk)
    allDaysExt = sortedByRisk[:, dt.f[:].extend({"Rang": 0})]
    allDaysExt[:, "Rang"] = np.arange(1, allDaysExt.nrows + 1)
    #print(allDaysExt)

    print("Column names frame order:", list(enumerate(allDaysExt.names)))

    data = allDaysExt.to_pandas()
    return data
Code Example #11
def analyzeDaily(fullTable, filter, postfix):
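    # assumes: import datatable as dt; `filter` is a datatable f-expression
    # row selector (it shadows the Python builtin of the same name)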

    #print("----- analyzeDaily:"+postfix)
    #dayTable = fullTable[(dt.f.DatenstandTag >= fromDay) & (dt.f.DatenstandTag < toDay) & (dt.f.IdLandkreis == forIdLandkreis),:]
    dayTable = fullTable[filter, :]

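    # RKI delta flags: NeuerFall == 0 -> case already in yesterday's publication,
    # 1 -> newly published today, -1 -> retracted today (NeuerTodesfall and
    # NeuGenesen follow the same scheme)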
    cases_to_count = dayTable[(dt.f.NeuerFall == 0) | (dt.f.NeuerFall == 1), :]
    cases = cases_to_count[:, [dt.sum(dt.f.AnzahlFall)],
                           dt.by(dt.f.DatenstandTag)]
    cases.names = ["DatenstandTag", "AnzahlFall" + postfix]
    cases.key = "DatenstandTag"
    print("cases rows = {}, cases_to_count = {}".format(
        cases.nrows, cases_to_count.nrows))
    #print(cases)

    new_cases_to_count = dayTable[(dt.f.NeuerFall == -1) |
                                  (dt.f.NeuerFall == 1), :]
    new_cases = new_cases_to_count[:, [dt.sum(dt.f.AnzahlFall)],
                                   dt.by(dt.f.DatenstandTag)]
    new_cases.names = ["DatenstandTag", "AnzahlFallNeu" + postfix]
    new_cases.key = "DatenstandTag"
    print("new_cases rows = {}, new_cases_to_count = {}".format(
        new_cases.nrows, new_cases_to_count.nrows))
    #new_cases_to_count.to_csv("new_cases_to_count.csv")

    new_cases_to_count_delay = new_cases_to_count[(
        dt.f.AnzahlFall > 0), :]  # measure delay only for positive cases
    new_cases_to_count_delay.materialize()
    new_cases_delay = new_cases_to_count_delay[:, [
        dt.min(dt.f.MeldeDelay),
        dt.max(dt.f.MeldeDelay),
        dt.mean(dt.f.MeldeDelay),
        dt.median(dt.f.MeldeDelay),
        dt.sd(dt.f.MeldeDelay),
        dt.sum(dt.f.AnzahlFall),
        dt.max(dt.f.DatenstandTag)
    ],
                                               dt.by(dt.f.DatenstandTag)]
    new_cases_delay.names = [
        "DatenstandTag", "MeldeDauerFallNeu-Min" + postfix,
        "MeldeDauerFallNeu-Max" + postfix,
        "MeldeDauerFallNeu-Schnitt" + postfix,
        "MeldeDauerFallNeu-Median" + postfix,
        "MeldeDauerFallNeu-StdAbw" + postfix,
        "MeldeDauerFallNeu-Fallbasis" + postfix, "DatenstandTag-Max" + postfix
    ]
    new_cases_delay.key = "DatenstandTag"
    print("new_cases_delay rows = {}, new_cases_to_count_delay = {}".format(
        new_cases_delay.nrows, new_cases_to_count_delay.nrows))
    #new_cases_delay = new_cases_to_count_delay[:, [dt.mean(dt.f.DatenstandTag-dt.f.MeldeTag)],dt.by(dt.f.DatenstandTag)]

    #     delays = delayRecs[:, [dt.mean(dt.f.MeldeDelay), dt.median(dt.f.MeldeDelay), dt.sd(dt.f.MeldeDelay), dt.sum(dt.f.AnzahlFall)], dt.by(dt.f.Landkreis)]

    # new_cases_stddev = new_cases_to_count_delay[:, [dt.mean(dt.f.DatenstandTag - dt.f.MeldeTag)],
    #                   dt.by(dt.f.DatenstandTag)]
    # new_cases_delay.names = ["DatenstandTag", "AnzahlFallNeu-MeldeDauer" + postfix]
    # new_cases_delay.key = "DatenstandTag"
    # print("new_cases_delay rows = {}, new_cases_to_count_delay = {}".format(new_cases_delay.nrows,
    #                                                                         new_cases_to_count_delay.nrows))

    new_cases_to_count_strict = new_cases_to_count[(
        dt.f.DatenstandTag - dt.f.MeldeTag < 7) | (dt.f.AnzahlFall < 0), :]
    new_cases_strict = new_cases_to_count_strict[:, [dt.sum(dt.f.AnzahlFall)],
                                                 dt.by(dt.f.DatenstandTag)]
    new_cases_strict.names = [
        "DatenstandTag", "AnzahlFallNeu-Meldung-letze-7-Tage" + postfix
    ]
    new_cases_strict.key = "DatenstandTag"
    print("new_cases_strict rows = {}, new_cases_to_count_strict = {}".format(
        new_cases_strict.nrows, new_cases_to_count_strict.nrows))
    #new_cases_to_count_strict.to_csv("new_cases_to_count_strict.csv")

    new_cases_to_count_strict_14 = new_cases_to_count[(
        dt.f.DatenstandTag - dt.f.MeldeTag < 14) | (dt.f.AnzahlFall < 0), :]
    new_cases_strict_14 = new_cases_to_count_strict_14[:, [
        dt.sum(dt.f.AnzahlFall)
    ], dt.by(dt.f.DatenstandTag)]
    new_cases_strict_14.names = [
        "DatenstandTag", "AnzahlFallNeu-Meldung-letze-14-Tage" + postfix
    ]
    new_cases_strict_14.key = "DatenstandTag"
    print("new_cases_strict_14 rows = {}, new_cases_to_count_strict_14 = {}".
          format(new_cases_strict_14.nrows,
                 new_cases_to_count_strict_14.nrows))

    dead_to_count = dayTable[(dt.f.NeuerTodesfall == 0) |
                             (dt.f.NeuerTodesfall == 1), :]
    dead = dead_to_count[:, [dt.sum(dt.f.AnzahlTodesfall)],
                         dt.by(dt.f.DatenstandTag)]
    dead.names = ["DatenstandTag", "AnzahlTodesfall" + postfix]
    dead.key = "DatenstandTag"
    #print("dead rows = {}".format(dead.nrows))

    new_dead_to_count = dayTable[(dt.f.NeuerTodesfall == -1) |
                                 (dt.f.NeuerTodesfall == 1), :]
    new_dead = new_dead_to_count[:, [dt.sum(dt.f.AnzahlTodesfall)],
                                 dt.by(dt.f.DatenstandTag)]
    new_dead.names = ["DatenstandTag", "AnzahlTodesfallNeu" + postfix]
    new_dead.key = "DatenstandTag"
    #print("new_dead rows = {}".format(new_dead.nrows))

    recovered_to_count = dayTable[(dt.f.NeuGenesen == 0) |
                                  (dt.f.NeuGenesen == 1), :]
    recovered = recovered_to_count[:, [dt.sum(dt.f.AnzahlGenesen)],
                                   dt.by(dt.f.DatenstandTag)]
    recovered.names = ["DatenstandTag", "AnzahlGenesen" + postfix]
    recovered.key = "DatenstandTag"
    #print("recovered rows = {}".format(recovered.nrows))

    new_recovered_to_count = dayTable[(dt.f.NeuGenesen == -1) |
                                      (dt.f.NeuGenesen == 1), :]
    new_recovered = new_recovered_to_count[:, [dt.sum(dt.f.AnzahlGenesen)],
                                           dt.by(dt.f.DatenstandTag)]
    new_recovered.names = ["DatenstandTag", "AnzahlGenesenNeu" + postfix]
    new_recovered.key = "DatenstandTag"
    #print("new_recovered rows = {}".format(new_recovered.nrows))

    byDayTable = cases[:,:,dt.join(new_cases)]\
                     [:,:,dt.join(dead)][:,:,dt.join(new_dead)][:,:,dt.join(recovered)][:,:,dt.join(new_recovered)]\
        [:,:,dt.join(new_cases_strict)][:,:,dt.join(new_cases_strict_14)][:,:,dt.join(new_cases_delay)]
    byDayTable.key = "DatenstandTag"
    #print("byDayTable rows = {}".format(byDayTable.nrows))
    print(byDayTable)

    return byDayTable
Code Example #12
File: test-f.py Project: imvansh25/datatable
def test_max():
    import datatable as dt
    from datatable import f
    from tests import assert_equals  # datatable test-suite helper
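    # f.A.max() is method-style sugar for dt.max(f.A); both build the same expression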
    assert str(dt.max(f.A)) == str(f.A.max())
    assert str(dt.max(f[:])) == str(f[:].max())
    DT = dt.Frame(A=range(1, 10))
    assert_equals(DT[:, f.A.max()], DT[:, dt.max(f.A)])
Code Example #13
File: database.py Project: kathsel/Corona
def analyzeDaily(fullTable, filter, prefix, postfix, byDateColName):
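    # assumes: import datatable as dt; addRunningSumColumn() is a project-local
    # helper that derives a cumulative-total column from a daily-delta column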

    print("analyzeDaily prefix='{}' postfix='{}' byDateColName='{}'".format(prefix, postfix, byDateColName))
    #print("analyzeDaily filter='{}' '".format(filter))
    byDate = dt.f[byDateColName]
    #print("----- analyzeDaily:"+postfix)
    #dayTable = fullTable[(dt.f.DatenstandTag >= fromDay) & (dt.f.DatenstandTag < toDay) & (dt.f.IdLandkreis == forIdLandkreis),:]

    dayTable = fullTable[filter,:]

    cases_to_count = dayTable[(dt.f.NeuerFall == 0) | (dt.f.NeuerFall == 1),:]
    cases = cases_to_count[:, [dt.sum(dt.f.AnzahlFall)],dt.by(byDate)]
    cases.names = [byDateColName, prefix+"AnzahlFall"+postfix]
    cases.key = byDateColName
    print("cases rows = {}, cases_to_count = {}".format(cases.nrows, cases_to_count.nrows))
    #print(cases)
    byDayTable = cases

    if byDateColName == "DatenstandTag":
        new_cases_to_count = dayTable[(dt.f.NeuerFall == -1) | (dt.f.NeuerFall == 1),:]
        new_cases = new_cases_to_count[:, [dt.sum(dt.f.AnzahlFall)],dt.by(byDate)]
        new_cases.names = [byDateColName, prefix+"AnzahlFallNeu"+postfix]
        new_cases.key = byDateColName
        print("new_cases rows = {}, new_cases_to_count = {}".format(new_cases.nrows, new_cases_to_count.nrows))
        #new_cases_to_count.to_csv("new_cases_to_count.csv")
        byDayTable = byDayTable[:,:,dt.join(new_cases)]
    else:
        # add days by MeldeTag
        byDayTable.names = {prefix+"AnzahlFall"+postfix: prefix+"AnzahlFallNeu"+postfix}
        byDayTable = addRunningSumColumn(byDayTable, prefix+"AnzahlFallNeu"+postfix, prefix+"AnzahlFall"+postfix)

    dead_to_count = dayTable[(dt.f.NeuerTodesfall == 0) | (dt.f.NeuerTodesfall == 1),:]
    dead = dead_to_count[:, [dt.sum(dt.f.AnzahlTodesfall)],dt.by(byDate)]
    dead.names = [byDateColName, prefix+"AnzahlTodesfall"+postfix]
    dead.key = byDateColName
    #print("dead rows = {}".format(dead.nrows))
    byDayTable = byDayTable[:,:,dt.join(dead)]

    if byDateColName == "DatenstandTag":
        new_dead_to_count = dayTable[(dt.f.NeuerTodesfall == -1) | (dt.f.NeuerTodesfall == 1),:]
        new_dead = new_dead_to_count[:, [dt.sum(dt.f.AnzahlTodesfall)],dt.by(byDate)]
        new_dead.names = [byDateColName, prefix+"AnzahlTodesfallNeu"+postfix]
        new_dead.key = byDateColName
        #print("new_dead rows = {}".format(new_dead.nrows))
        byDayTable = byDayTable[:,:,dt.join(new_dead)]
    else:
        # add days by MeldeTag
        byDayTable.names = {prefix+"AnzahlTodesfall"+postfix: prefix+"AnzahlTodesfallNeu"+postfix}
        byDayTable = addRunningSumColumn(byDayTable, prefix+"AnzahlTodesfallNeu"+postfix, prefix+"AnzahlTodesfall"+postfix)

    byDayTable.key = byDateColName

    if postfix == "" and prefix == "" and byDateColName == "DatenstandTag":
        new_cases_to_count_delay = new_cases_to_count[(dt.f.AnzahlFall > 0), :]  # measure delay only for positive cases
        new_cases_to_count_delay.materialize()
        new_cases_delay = new_cases_to_count_delay[:, [dt.min(dt.f.MeldeDelay), dt.max(dt.f.MeldeDelay),
                                                       dt.mean(dt.f.MeldeDelay), dt.median(dt.f.MeldeDelay),
                                                       dt.sd(dt.f.MeldeDelay), dt.sum(dt.f.AnzahlFall),
                                                       dt.max(dt.f.DatenstandTag)], dt.by(byDate)]
        new_cases_delay.names = ["DatenstandTag",
                                 "PublikationsdauerFallNeu_Min" + postfix, "PublikationsdauerFallNeu_Max" + postfix,
                                 "PublikationsdauerFallNeu_Schnitt" + postfix, "PublikationsdauerFallNeu_Median" + postfix,
                                 "PublikationsdauerFallNeu_StdAbw" + postfix, "PublikationsdauerFallNeu_Fallbasis" + postfix,
                                 "DatenstandTag_Max" + postfix]
        new_cases_delay.key = "DatenstandTag"
        print("new_cases_delay rows = {}, new_cases_to_count_delay = {}".format(new_cases_delay.nrows,
                                                                                new_cases_to_count_delay.nrows))

        recovered_to_count = dayTable[(dt.f.NeuGenesen == 0) | (dt.f.NeuGenesen == 1),:]
        recovered = recovered_to_count[:, [dt.sum(dt.f.AnzahlGenesen)],dt.by(byDate)]
        recovered.names = ["DatenstandTag", "AnzahlGenesen"+postfix]
        recovered.key = "DatenstandTag"
        #print("recovered rows = {}".format(recovered.nrows))

        new_recovered_to_count = dayTable[(dt.f.NeuGenesen == -1) | (dt.f.NeuGenesen == 1),:]
        new_recovered = new_recovered_to_count[:, [dt.sum(dt.f.AnzahlGenesen)],dt.by(byDate)]
        new_recovered.names = ["DatenstandTag", "AnzahlGenesenNeu"+postfix]
        new_recovered.key = "DatenstandTag"
        #print("new_recovered rows = {}".format(new_recovered.nrows))

        byDayTable = byDayTable[:, :, dt.join(recovered)][:, :, dt.join(new_recovered)][:, :,dt.join(new_cases_delay)]
        #byDayTable = byDayTable[:,:,dt.join(recovered)][:,:,dt.join(new_recovered)]\
        #    [:,:,dt.join(new_cases_strict)][:,:,dt.join(new_cases_strict_14)][:,:,dt.join(new_cases_delay)]

    byDayTable.key = byDateColName
    #print("byDayTable rows = {}".format(byDayTable.nrows))
    #print(byDayTable)
    return byDayTable
Code Example #14
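# assumes: import datatable as dt; from datatable import f, count, by, sort;
# import altair as alt; amigos_info_dt holds the Friends episode data as a
# datatable Frame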
# Glance
amigos_info_dt

# Episode count per season
amigos_info_dt[:,count(),by(f.season)]

# Unique episodes per season
amigos_info_dt[:,count(),by(f.season,f.episode)
              ][:,{'unique_episodes':count()},by(f.season)
               ]

# average views and ratings per season
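# f[-2:] picks the last two columns (presumably the views and ratings columns)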
amigos_info_dt[:,dt.mean(f[-2:]),by(f.season)]

# Highest-rated title
amigos_info_dt[f.imdb_rating==dt.max(f.imdb_rating),:]

# Lowest-rated title
amigos_info_dt[f.imdb_rating==dt.min(f.imdb_rating),:]

# Top 2 highest-rated titles per season
amigos_info_dt[:2,:,by(f.season),sort(-f.imdb_rating)]

# Look up a single title
amigos_info_dt[f.title=="The Last One",:]

# Select rows up to index 235 (the slice stop is exclusive)
amigos_info_dt[[slice(None,235)],:]

alt.Chart(amigos_info_dt[:,[f.season,f.episode,f.us_views_millions]].to_pandas()).mark_point().encode(
    alt.X('episode'),
Code Example #15
#ans = x[:, {"median_v3": median(f.v3), "sd_v3": sd(f.v3)}, by(f.id2, f.id4)]
#print(ans.shape, flush=True)
#t = timeit.default_timer() - t_start
#m = memory_usage()
#t_start = timeit.default_timer()
#chk = ans[:, [sum(f.median_v3), sum(f.sd_v3)]]
#chkt = timeit.default_timer() - t_start
#write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt)
#print(ans.head(3).to_pandas(), flush=True)
#print(ans.tail(3).to_pandas(), flush=True)
#del ans

question = "max v1 - min v2 by id2 id4" # q7
gc.collect()
t_start = timeit.default_timer()
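# q7, first timed run; the query is repeated below so that write_log can
# record run=1 and run=2 separately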
ans = x[:, {"range_v1_v2": max(f.v1)-min(f.v2)}, by(f.id2, f.id4)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.range_v1_v2)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt)
del ans
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"range_v1_v2": max(f.v1)-min(f.v2)}, by(f.id2, f.id4)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
Code Example #16
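# assumes: import datatable as dt; from datatable import f, by, count, update;
# penguins_dt is the palmerpenguins dataset as a datatable Frame

# Last 5 rows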
penguins_dt[-5:, :]

# All rows of the last 3 columns
penguins_dt[:, -3:]

# Rows where sex is missing but body_mass_g is present
penguins_dt[(dt.isna(f.sex) & ~dt.isna(f.body_mass_g)), :]

# Mean of all numeric columns (except year) per penguin sex
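# f[dt.int32] selects the int32 columns, .remove(f.year) drops year, and
# f[dt.float64] adds the float columns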
penguins_dt[~dt.isna(f.sex), :][:,
                                dt.mean((f[dt.int32].remove(f.year),
                                         f[dt.float64])),
                                by(f.sex)]

# Step 1: flag rows where body_mass_g equals the per-sex maximum
penguins_dt[:, update(temp=f.body_mass_g == dt.max(f.body_mass_g)), by(f.sex)]

# Step 2: keep the flagged rows and drop the helper column
penguins_dt[f.temp == 1, f[:].remove(f.temp)]

# Step 1: flag rows where body_mass_g equals the per-sex minimum
penguins_dt[:, update(temp=f.body_mass_g == dt.min(f.body_mass_g)), by(f.sex)]

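# Step 2: keep the flagged rows and drop the helper column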
penguins_dt[f.temp == 1, f[:].remove(f.temp)]

del penguins_dt["temp"]

penguins_tidy_dt = penguins_dt[~dt.isna(f.sex), :]

penguins_year_island = penguins_tidy_dt[:, {
    'total': count()