def test_rows_min_max():
    from datatable import min, max
    df0 = dt.Frame(A=range(10))   # min = 0, max = 9
    df1 = df0[f.A > (min(f.A) + max(f.A)) / 2, :]
    frame_integrity_check(df1)    # integrity helper from datatable's test suite
    assert df1.to_list() == [[5, 6, 7, 8, 9]]
def test_groups_multiple():
    f0 = dt.Frame({"color": ["red", "blue", "green", "red", "green"],
                   "size": [5, 2, 7, 13, 0]})
    f1 = f0[:, [min(f.size), max(f.size)], "color"]
    frame_integrity_check(f1)
    # groups come out ordered by the grouping column: blue, green, red
    assert f1.to_list() == [["blue", "green", "red"],
                            [2, 0, 5],
                            [2, 7, 13]]
# parameters
target_col = "Known_Fraud"
times = 5
random_seed = 123
new_dataset_name = "new_dataset_name_with_downsampled_majority"

# counts by target groups
g = X[:, {"count": count()}, by(target_col)]
if g.shape[0] != 2:   # one row per class: a binary target yields exactly two groups
    raise ValueError(
        "Not a binary target - target column must contain exactly 2 values.")

# find sizes and target values for minority and majority class partitions
n_minority = g[:, min(f.count)][0, 0]
n_majority = g[:, max(f.count)][0, 0]
target_minority = g[f.count == n_minority, target_col][0, 0]
target_majority = g[f.count == n_majority, target_col][0, 0]

# validate that `times` indeed downsamples the majority class
if times * n_minority >= n_majority:
    raise ValueError(
        "Downsampling coefficient `times` is too large: "
        "downsampled dataset results in inflated majority class.")

# downsample with a pandas frame (sklearn.utils.resample)
df_majority = X[f[target_col] == target_majority, :].to_pandas()
df_majority_downsampled = resample(df_majority, replace=False,
                                   n_samples=n_minority * times,
                                   random_state=random_seed)  # closing args assumed; reuses the seed defined above
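# The snippet above ends mid-recipe; a plausible completion, assuming pandas is
# available as pd and resample comes from sklearn.utils (this concat/return
# step is an assumption, not shown in the source):
df_minority = X[f[target_col] == target_minority, :].to_pandas()
df_downsampled = pd.concat([df_minority, df_majority_downsampled])
result = {new_dataset_name: dt.Frame(df_downsampled)}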
          fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache,
          chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt,
          on_disk=on_disk)
print(ans.head(3), flush=True)
print(ans.tail(3), flush=True)
del ans

question = 'max v1 - min v2 by id3'  # q7
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {'range_v1_v2': max(f.v1) - min(f.v2)}, by(f.id3)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.range_v1_v2)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question,
          out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution,
          version=ver, git=git,
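# Standalone illustration of the q7 pattern benchmarked above, on a toy frame
# (the frame below is illustrative, not the benchmark dataset):
import datatable as dt
from datatable import f, by, min, max
toy = dt.Frame(id3=["g1", "g1", "g2"], v1=[5, 7, 3], v2=[2, 1, 3])
out = toy[:, {'range_v1_v2': max(f.v1) - min(f.v2)}, by(f.id3)]
assert out.to_list() == [['g1', 'g2'], [6, 0]]  # per-group max(v1) - min(v2)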
#ans = x[:, {"median_v3": median(f.v3), "sd_v3": sd(f.v3)}, by(f.id2, f.id4)]
#print(ans.shape, flush=True)
#t = timeit.default_timer() - t_start
#m = memory_usage()
#t_start = timeit.default_timer()
#chk = ans[:, [sum(f.median_v3), sum(f.sd_v3)]]
#chkt = timeit.default_timer() - t_start
#write_log(task=task, data=data_name, in_rows=x.shape[0], question=question,
#          out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution,
#          version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m,
#          cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt)
#print(ans.head(3).to_pandas(), flush=True)
#print(ans.tail(3).to_pandas(), flush=True)
#del ans

question = "max v1 - min v2 by id2 id4"  # q7
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"range_v1_v2": max(f.v1) - min(f.v2)}, by(f.id2, f.id4)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.range_v1_v2)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question,
          out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution,
          version=ver, git=git,
def analyzeDaily(fullTable, filter, postfix):
    #print("----- analyzeDaily:" + postfix)
    #dayTable = fullTable[(dt.f.DatenstandTag >= fromDay) & (dt.f.DatenstandTag < toDay) & (dt.f.IdLandkreis == forIdLandkreis), :]
    dayTable = fullTable[filter, :]

    cases_to_count = dayTable[(dt.f.NeuerFall == 0) | (dt.f.NeuerFall == 1), :]
    cases = cases_to_count[:, [dt.sum(dt.f.AnzahlFall)], dt.by(dt.f.DatenstandTag)]
    cases.names = ["DatenstandTag", "AnzahlFall" + postfix]
    cases.key = "DatenstandTag"
    print("cases rows = {}, cases_to_count = {}".format(cases.nrows, cases_to_count.nrows))
    #print(cases)

    new_cases_to_count = dayTable[(dt.f.NeuerFall == -1) | (dt.f.NeuerFall == 1), :]
    new_cases = new_cases_to_count[:, [dt.sum(dt.f.AnzahlFall)], dt.by(dt.f.DatenstandTag)]
    new_cases.names = ["DatenstandTag", "AnzahlFallNeu" + postfix]
    new_cases.key = "DatenstandTag"
    print("new_cases rows = {}, new_cases_to_count = {}".format(new_cases.nrows, new_cases_to_count.nrows))
    #new_cases_to_count.to_csv("new_cases_to_count.csv")

    # measure delay only for positive cases
    new_cases_to_count_delay = new_cases_to_count[(dt.f.AnzahlFall > 0), :]
    new_cases_to_count_delay.materialize()
    new_cases_delay = new_cases_to_count_delay[:, [
        dt.min(dt.f.MeldeDelay), dt.max(dt.f.MeldeDelay), dt.mean(dt.f.MeldeDelay),
        dt.median(dt.f.MeldeDelay), dt.sd(dt.f.MeldeDelay),
        dt.sum(dt.f.AnzahlFall), dt.max(dt.f.DatenstandTag)
    ], dt.by(dt.f.DatenstandTag)]
    new_cases_delay.names = [
        "DatenstandTag",
        "MeldeDauerFallNeu-Min" + postfix,
        "MeldeDauerFallNeu-Max" + postfix,
        "MeldeDauerFallNeu-Schnitt" + postfix,
        "MeldeDauerFallNeu-Median" + postfix,
        "MeldeDauerFallNeu-StdAbw" + postfix,
        "MeldeDauerFallNeu-Fallbasis" + postfix,
        "DatenstandTag-Max" + postfix
    ]
    new_cases_delay.key = "DatenstandTag"
    print("new_cases_delay rows = {}, new_cases_to_count_delay = {}".format(
        new_cases_delay.nrows, new_cases_to_count_delay.nrows))
    #new_cases_delay = new_cases_to_count_delay[:, [dt.mean(dt.f.DatenstandTag - dt.f.MeldeTag)], dt.by(dt.f.DatenstandTag)]
    # delays = delayRecs[:, [dt.mean(dt.f.MeldeDelay), dt.median(dt.f.MeldeDelay), dt.sd(dt.f.MeldeDelay), dt.sum(dt.f.AnzahlFall)], dt.by(dt.f.Landkreis)]
    # new_cases_stddev = new_cases_to_count_delay[:, [dt.mean(dt.f.DatenstandTag - dt.f.MeldeTag)],
    #                                             dt.by(dt.f.DatenstandTag)]
    # new_cases_delay.names = ["DatenstandTag", "AnzahlFallNeu-MeldeDauer" + postfix]
    # new_cases_delay.key = "DatenstandTag"
    # print("new_cases_delay rows = {}, new_cases_to_count_delay = {}".format(new_cases_delay.nrows,
    #                                                                         new_cases_to_count_delay.nrows))

    new_cases_to_count_strict = new_cases_to_count[
        (dt.f.DatenstandTag - dt.f.MeldeTag < 7) | (dt.f.AnzahlFall < 0), :]
    new_cases_strict = new_cases_to_count_strict[:, [dt.sum(dt.f.AnzahlFall)], dt.by(dt.f.DatenstandTag)]
    new_cases_strict.names = ["DatenstandTag", "AnzahlFallNeu-Meldung-letze-7-Tage" + postfix]
    new_cases_strict.key = "DatenstandTag"
    print("new_cases_strict rows = {}, new_cases_to_count_strict = {}".format(
        new_cases_strict.nrows, new_cases_to_count_strict.nrows))
    #new_cases_to_count_strict.to_csv("new_cases_to_count_strict.csv")

    new_cases_to_count_strict_14 = new_cases_to_count[
        (dt.f.DatenstandTag - dt.f.MeldeTag < 14) | (dt.f.AnzahlFall < 0), :]
    new_cases_strict_14 = new_cases_to_count_strict_14[:, [dt.sum(dt.f.AnzahlFall)], dt.by(dt.f.DatenstandTag)]
    new_cases_strict_14.names = ["DatenstandTag", "AnzahlFallNeu-Meldung-letze-14-Tage" + postfix]
    new_cases_strict_14.key = "DatenstandTag"
    print("new_cases_strict_14 rows = {}, new_cases_to_count_strict_14 = {}".format(
        new_cases_strict_14.nrows, new_cases_to_count_strict_14.nrows))

    dead_to_count = dayTable[(dt.f.NeuerTodesfall == 0) | (dt.f.NeuerTodesfall == 1), :]
    dead = dead_to_count[:, [dt.sum(dt.f.AnzahlTodesfall)], dt.by(dt.f.DatenstandTag)]
    dead.names = ["DatenstandTag", "AnzahlTodesfall" + postfix]
    dead.key = "DatenstandTag"
    #print("dead rows = {}".format(dead.nrows))

    new_dead_to_count = dayTable[(dt.f.NeuerTodesfall == -1) | (dt.f.NeuerTodesfall == 1), :]
    new_dead = new_dead_to_count[:, [dt.sum(dt.f.AnzahlTodesfall)], dt.by(dt.f.DatenstandTag)]
    new_dead.names = ["DatenstandTag", "AnzahlTodesfallNeu" + postfix]
    new_dead.key = "DatenstandTag"
    #print("new_dead rows = {}".format(new_dead.nrows))

    recovered_to_count = dayTable[(dt.f.NeuGenesen == 0) | (dt.f.NeuGenesen == 1), :]
    recovered = recovered_to_count[:, [dt.sum(dt.f.AnzahlGenesen)], dt.by(dt.f.DatenstandTag)]
    recovered.names = ["DatenstandTag", "AnzahlGenesen" + postfix]
    recovered.key = "DatenstandTag"
    #print("recovered rows = {}".format(recovered.nrows))

    new_recovered_to_count = dayTable[(dt.f.NeuGenesen == -1) | (dt.f.NeuGenesen == 1), :]
    new_recovered = new_recovered_to_count[:, [dt.sum(dt.f.AnzahlGenesen)], dt.by(dt.f.DatenstandTag)]
    new_recovered.names = ["DatenstandTag", "AnzahlGenesenNeu" + postfix]
    new_recovered.key = "DatenstandTag"
    #print("new_recovered rows = {}".format(new_recovered.nrows))

    # join all per-day aggregates on the keyed DatenstandTag column
    byDayTable = cases[:, :, dt.join(new_cases)]\
        [:, :, dt.join(dead)][:, :, dt.join(new_dead)][:, :, dt.join(recovered)][:, :, dt.join(new_recovered)]\
        [:, :, dt.join(new_cases_strict)][:, :, dt.join(new_cases_strict_14)][:, :, dt.join(new_cases_delay)]
    byDayTable.key = "DatenstandTag"
    #print("byDayTable rows = {}".format(byDayTable.nrows))
    print(byDayTable)
    return byDayTable
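# A hypothetical invocation of analyzeDaily, mirroring the commented-out filter
# inside the function (fullTable, fromDay, toDay, forIdLandkreis and the
# "-Landkreis" postfix are illustrative placeholders, not defined here):
byDay = analyzeDaily(fullTable,
                     (dt.f.DatenstandTag >= fromDay) &
                     (dt.f.DatenstandTag < toDay) &
                     (dt.f.IdLandkreis == forIdLandkreis),
                     "-Landkreis")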
def test_min():
    # dt.min(expr) and expr.min() are two spellings of the same reducer
    assert str(dt.min(f.A)) == str(f.A.min())
    assert str(dt.min(f[:])) == str(f[:].min())
    DT = dt.Frame(A=[2, 3, 5, 5, 9, -1, 2.2])
    assert_equals(DT[:, f.A.min()], DT[:, dt.min(f.A)])
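# Minimal self-contained sketch of the equivalence asserted above, using only
# the public datatable API (the frame values are illustrative):
import datatable as dt
from datatable import f
DT = dt.Frame(A=[2, 3, 5, 5, 9, -1, 2.2])
assert DT[:, dt.min(f.A)][0, 0] == -1.0
assert DT[:, f.A.min()][0, 0] == -1.0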
def analyzeDaily(fullTable, filter, prefix, postfix, byDateColName):
    print("analyzeDaily prefix='{}' postfix='{}' byDateColName='{}'".format(prefix, postfix, byDateColName))
    #print("analyzeDaily filter='{}' '".format(filter))
    byDate = dt.f[byDateColName]
    #print("----- analyzeDaily:" + postfix)
    #dayTable = fullTable[(dt.f.DatenstandTag >= fromDay) & (dt.f.DatenstandTag < toDay) & (dt.f.IdLandkreis == forIdLandkreis), :]
    dayTable = fullTable[filter, :]

    cases_to_count = dayTable[(dt.f.NeuerFall == 0) | (dt.f.NeuerFall == 1), :]
    cases = cases_to_count[:, [dt.sum(dt.f.AnzahlFall)], dt.by(byDate)]
    cases.names = [byDateColName, prefix + "AnzahlFall" + postfix]
    cases.key = byDateColName
    print("cases rows = {}, cases_to_count = {}".format(cases.nrows, cases_to_count.nrows))
    #print(cases)
    byDayTable = cases

    if byDateColName == "DatenstandTag":
        new_cases_to_count = dayTable[(dt.f.NeuerFall == -1) | (dt.f.NeuerFall == 1), :]
        new_cases = new_cases_to_count[:, [dt.sum(dt.f.AnzahlFall)], dt.by(byDate)]
        new_cases.names = [byDateColName, prefix + "AnzahlFallNeu" + postfix]
        new_cases.key = byDateColName
        print("new_cases rows = {}, new_cases_to_count = {}".format(new_cases.nrows, new_cases_to_count.nrows))
        #new_cases_to_count.to_csv("new_cases_to_count.csv")
        byDayTable = byDayTable[:, :, dt.join(new_cases)]
    else:
        # add days by MeldeTag
        byDayTable.names = {prefix + "AnzahlFall" + postfix: prefix + "AnzahlFallNeu" + postfix}
        byDayTable = addRunningSumColumn(byDayTable, prefix + "AnzahlFallNeu" + postfix,
                                         prefix + "AnzahlFall" + postfix)

    dead_to_count = dayTable[(dt.f.NeuerTodesfall == 0) | (dt.f.NeuerTodesfall == 1), :]
    dead = dead_to_count[:, [dt.sum(dt.f.AnzahlTodesfall)], dt.by(byDate)]
    dead.names = [byDateColName, prefix + "AnzahlTodesfall" + postfix]
    dead.key = byDateColName
    #print("dead rows = {}".format(dead.nrows))
    byDayTable = byDayTable[:, :, dt.join(dead)]

    if byDateColName == "DatenstandTag":
        new_dead_to_count = dayTable[(dt.f.NeuerTodesfall == -1) | (dt.f.NeuerTodesfall == 1), :]
        new_dead = new_dead_to_count[:, [dt.sum(dt.f.AnzahlTodesfall)], dt.by(byDate)]
        new_dead.names = [byDateColName, prefix + "AnzahlTodesfallNeu" + postfix]
        new_dead.key = byDateColName
        #print("new_dead rows = {}".format(new_dead.nrows))
        byDayTable = byDayTable[:, :, dt.join(new_dead)]
    else:
        # add days by MeldeTag
        byDayTable.names = {prefix + "AnzahlTodesfall" + postfix: prefix + "AnzahlTodesfallNeu" + postfix}
        byDayTable = addRunningSumColumn(byDayTable, prefix + "AnzahlTodesfallNeu" + postfix,
                                         prefix + "AnzahlTodesfall" + postfix)
    byDayTable.key = byDateColName

    if postfix == "" and prefix == "" and byDateColName == "DatenstandTag":
        # measure delay only for positive cases
        new_cases_to_count_delay = new_cases_to_count[(dt.f.AnzahlFall > 0), :]
        new_cases_to_count_delay.materialize()
        new_cases_delay = new_cases_to_count_delay[:, [
            dt.min(dt.f.MeldeDelay), dt.max(dt.f.MeldeDelay), dt.mean(dt.f.MeldeDelay),
            dt.median(dt.f.MeldeDelay), dt.sd(dt.f.MeldeDelay),
            dt.sum(dt.f.AnzahlFall), dt.max(dt.f.DatenstandTag)
        ], dt.by(byDate)]
        new_cases_delay.names = [
            "DatenstandTag",
            "PublikationsdauerFallNeu_Min" + postfix,
            "PublikationsdauerFallNeu_Max" + postfix,
            "PublikationsdauerFallNeu_Schnitt" + postfix,
            "PublikationsdauerFallNeu_Median" + postfix,
            "PublikationsdauerFallNeu_StdAbw" + postfix,
            "PublikationsdauerFallNeu_Fallbasis" + postfix,
            "DatenstandTag_Max" + postfix
        ]
        new_cases_delay.key = "DatenstandTag"
        print("new_cases_delay rows = {}, new_cases_to_count_delay = {}".format(
            new_cases_delay.nrows, new_cases_to_count_delay.nrows))

        recovered_to_count = dayTable[(dt.f.NeuGenesen == 0) | (dt.f.NeuGenesen == 1), :]
        recovered = recovered_to_count[:, [dt.sum(dt.f.AnzahlGenesen)], dt.by(byDate)]
        recovered.names = ["DatenstandTag", "AnzahlGenesen" + postfix]
        recovered.key = "DatenstandTag"
        #print("recovered rows = {}".format(recovered.nrows))

        new_recovered_to_count = dayTable[(dt.f.NeuGenesen == -1) | (dt.f.NeuGenesen == 1), :]
        new_recovered = new_recovered_to_count[:, [dt.sum(dt.f.AnzahlGenesen)], dt.by(byDate)]
        new_recovered.names = ["DatenstandTag", "AnzahlGenesenNeu" + postfix]
        new_recovered.key = "DatenstandTag"
        #print("new_recovered rows = {}".format(new_recovered.nrows))

        byDayTable = byDayTable[:, :, dt.join(recovered)][:, :, dt.join(new_recovered)][:, :, dt.join(new_cases_delay)]
        #byDayTable = byDayTable[:,:,dt.join(recovered)][:,:,dt.join(new_recovered)]\
        #    [:,:,dt.join(new_cases_strict)][:,:,dt.join(new_cases_strict_14)][:,:,dt.join(new_cases_delay)]

    byDayTable.key = byDateColName
    #print("byDayTable rows = {}".format(byDayTable.nrows))
    #print(byDayTable)
    return byDayTable
# Observations per season
amigos_info_dt[:, count(), by(f.season)]

# Unique episodes per season
amigos_info_dt[:, count(), by(f.season, f.episode)
               ][:, {'unique_episodes': count()}, by(f.season)]

# Average views and ratings per season
amigos_info_dt[:, dt.mean(f[-2:]), by(f.season)]

# Highest-rated title
amigos_info_dt[f.imdb_rating == dt.max(f.imdb_rating), :]

# Lowest-rated title
amigos_info_dt[f.imdb_rating == dt.min(f.imdb_rating), :]

# Top 2 highest-rated titles per season
amigos_info_dt[:2, :, by(f.season), sort(-f.imdb_rating)]

# Look up a single title's info
amigos_info_dt[f.title == "The Last One", :]

# Select the first 235 observations
amigos_info_dt[[slice(None, 235)], :]

alt.Chart(amigos_info_dt[:, [f.season, f.episode, f.us_views_millions]].to_pandas()).mark_point().encode(
    alt.X('episode'),
    alt.Y('us_views_millions')
).properties(
    title='Episode and views'
)
#ans = x[:, {"median_v3": median(f.v3), "sd_v3": sd(f.v3)}, by(f.id2, f.id4)]
#print(ans.shape, flush=True)
#t = timeit.default_timer() - t_start
#m = memory_usage()
#t_start = timeit.default_timer()
#chk = ans[:, [sum(f.median_v3), sum(f.sd_v3)]]
#chkt = timeit.default_timer() - t_start
#write_log(task=task, data=data_name, in_rows=x.shape[0], question=question,
#          out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution,
#          version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m,
#          cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt)
#print(ans.head(3).to_pandas(), flush=True)
#print(ans.tail(3).to_pandas(), flush=True)
#del ans

question = "max v1 - min v2 by id2 id4"  # q7
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"range_v1_v2": max(f.v1) - min(f.v2)}, by(f.id2, f.id4)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.range_v1_v2)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question,
          out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution,
          version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m,
          cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt)
del ans

# second timed run
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"range_v1_v2": max(f.v1) - min(f.v2)}, by(f.id2, f.id4)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
# penguins with missing sex but a recorded body mass
penguins_dt[(dt.isna(f.sex) & ~dt.isna(f.body_mass_g)), :]

# mean of all numeric columns per penguin sex category
penguins_dt[~dt.isna(f.sex), :][:, dt.mean((f[dt.int32].remove(f.year), f[dt.float64])), by(f.sex)]

# step 1: flag the max body_mass per sex
penguins_dt[:, update(temp=f.body_mass_g == dt.max(f.body_mass_g)), by(f.sex)]
# step 2: keep the flagged rows, dropping the helper column
penguins_dt[f.temp == 1, f[:].remove(f.temp)]

# same two steps for the min body_mass per sex
penguins_dt[:, update(temp=f.body_mass_g == dt.min(f.body_mass_g)), by(f.sex)]
penguins_dt[f.temp == 1, f[:].remove(f.temp)]
del penguins_dt["temp"]

penguins_tidy_dt = penguins_dt[~dt.isna(f.sex), :]
penguins_year_island = penguins_tidy_dt[:, {'total': count()}, by(f.year, f.island)]
penguins_year = penguins_year_island[:, {'gr_total': dt.sum(f.total)}, by(f.year)]
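# The update()+filter idiom above generalizes to any per-group extreme lookup.
# A minimal self-contained sketch on a toy frame (the data is illustrative):
import datatable as dt
from datatable import f, by, update

DT = dt.Frame(grp=["a", "a", "b", "b"], val=[1, 4, 2, 3])
DT[:, update(is_max=f.val == dt.max(f.val)), by(f.grp)]   # flag per-group maxima
assert DT[f.is_max == 1, [f.grp, f.val]].to_list() == [["a", "b"], [4, 3]]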