                  fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache,
                  chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt,
                  on_disk=on_disk)
print(ans.head(3), flush=True)
print(ans.tail(3), flush=True)
del ans

question = 'sum v1 mean v3 by id3'  # q3
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {'v1': sum(f.v1), 'v3': mean(f.v3)}, by(f.id3)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, [sum(f.v1), sum(f.v3)]]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question,
          out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution,
          version=ver, git=git,
def test_mean_simple():
    DT = dt.Frame(A=range(5))
    RZ = DT[:, mean(f.A)]
    frame_integrity_check(RZ)
    assert RZ.stypes == (dt.float64,)
    assert RZ.to_list() == [[2.0]]
def loadAndProcessData(dataFilename):
    print("Loading " + dataFilename)
    fullTable = dt.fread(dataFilename)
    print("Done loading table from '" + dataFilename + "', keys:")
    print(fullTable.keys())

    cases = fullTable[:, 'AnzahlFall'].sum()[0, 0]
    dead = fullTable[:, 'AnzahlTodesfall'].sum()[0, 0]
    lastDay = fullTable[:, 'MeldeDay'].max()[0, 0]
    lastnewCaseOnDay = fullTable[:, 'newCaseOnDay'].max()[0, 0]
    print("File stats: lastDay {} lastnewCaseOnDay {} cases {} dead {}".format(
        lastDay, lastnewCaseOnDay, cases, dead))

    newTable = fullTable[:, dt.f[:].extend(
        {"erkMeldeDelay": dt.f.MeldeDay - dt.f.RefDay})]
    #print(newTable.keys())
    #dt.by(dt.f.Bundesland)]

    alldays = fullTable[:, [
        dt.sum(dt.f.AnzahlFall),
        dt.sum(dt.f.FaellePro100k),
        dt.sum(dt.f.AnzahlTodesfall),
        dt.sum(dt.f.TodesfaellePro100k),
        dt.mean(dt.f.Bevoelkerung),
        dt.max(dt.f.MeldeDay),
        dt.first(dt.f.LandkreisTyp),
        dt.first(dt.f.Bundesland)
    ], dt.by(dt.f.Landkreis)]

    last7days = fullTable[dt.f.newCaseOnDay > lastDay - 7, :][:, [
        dt.sum(dt.f.AnzahlFall),
        dt.sum(dt.f.FaellePro100k),
        dt.sum(dt.f.AnzahlTodesfall),
        dt.sum(dt.f.TodesfaellePro100k)
    ], dt.by(dt.f.Landkreis)]
    last7days.names = [
        "Landkreis", "AnzahlFallLetzte7Tage", "FaellePro100kLetzte7Tage",
        "AnzahlTodesfallLetzte7Tage", "TodesfaellePro100kLetzte7Tage"
    ]
    # Clamp negative correction entries to zero.
    last7days[dt.f.AnzahlFallLetzte7Tage < 0, "AnzahlFallLetzte7Tage"] = 0
    last7days[dt.f.FaellePro100kLetzte7Tage < 0, "FaellePro100kLetzte7Tage"] = 0
    last7days[dt.f.AnzahlTodesfallLetzte7Tage < 0, "AnzahlTodesfallLetzte7Tage"] = 0
    last7days[dt.f.TodesfaellePro100kLetzte7Tage < 0, "TodesfaellePro100kLetzte7Tage"] = 0

    lastWeek7days = fullTable[(dt.f.newCaseOnDay > lastDay - 14)
                              & (dt.f.newCaseOnDay <= lastDay - 7), :][:, [
        dt.sum(dt.f.AnzahlFall),
        dt.sum(dt.f.FaellePro100k),
        dt.sum(dt.f.AnzahlTodesfall),
        dt.sum(dt.f.TodesfaellePro100k)
    ], dt.by(dt.f.Landkreis)]
    #lastWeek7days[dt.f[1:] < 0, dt.f[1:]] = 0
    lastWeek7days.names = [
        "Landkreis", "AnzahlFallLetzte7TageDavor",
        "FaellePro100kLetzte7TageDavor", "AnzahlTodesfallLetzte7TageDavor",
        "TodesfaellePro100kLetzte7TageDavor"
    ]
    lastWeek7days[dt.f.AnzahlFallLetzte7TageDavor < 0, "AnzahlFallLetzte7TageDavor"] = 0
    lastWeek7days[dt.f.FaellePro100kLetzte7TageDavor < 0, "FaellePro100kLetzte7TageDavor"] = 0
    lastWeek7days[dt.f.AnzahlTodesfallLetzte7TageDavor < 0, "AnzahlTodesfallLetzte7TageDavor"] = 0
    lastWeek7days[dt.f.TodesfaellePro100kLetzte7TageDavor < 0, "TodesfaellePro100kLetzte7TageDavor"] = 0

    allDaysExt0 = merge(alldays, last7days, "Landkreis")
    allDaysExt1 = merge(allDaysExt0, lastWeek7days, "Landkreis")

    Rw = dt.f.AnzahlFallLetzte7Tage / dt.f.AnzahlFallLetzte7TageDavor
    allDaysExt2 = allDaysExt1[:, dt.f[:].extend({"AnzahlFallTrend": Rw})]
    allDaysExt3 = allDaysExt2[:, dt.f[:].extend({
        "FaellePro100kTrend":
            dt.f.FaellePro100kLetzte7Tage - dt.f.FaellePro100kLetzte7TageDavor
    })]
    allDaysExt4 = allDaysExt3[:, dt.f[:].extend({
        "TodesfaellePro100kTrend":
            dt.f.TodesfaellePro100kLetzte7Tage - dt.f.TodesfaellePro100kLetzte7TageDavor
    })]
    allDaysExt5 = allDaysExt4[:, dt.f[:].extend({
        "Kontaktrisiko":
            dt.f.Bevoelkerung / 6.25 /
            ((dt.f.AnzahlFallLetzte7Tage + dt.f.AnzahlFallLetzte7TageDavor) * Rw)
    })]
    allDaysExt6 = allDaysExt5[:, dt.f[:].extend(
        {"LetzteMeldung": lastDay - dt.f.MeldeDay})]

    # x * 2 == x holds only for 0 and infinity, so this flags the
    # divide-by-zero results of the Kontaktrisiko computation above.
    allDaysExt6[dt.f.Kontaktrisiko * 2 == dt.f.Kontaktrisiko, "Kontaktrisiko"] = 999999

    sortedByRisk = allDaysExt6.sort(
        ["Kontaktrisiko", "LetzteMeldung", "FaellePro100k"])
    #print(sortedByRisk)
    allDaysExt = sortedByRisk[:, dt.f[:].extend({"Rang": 0})]
    allDaysExt[:, "Rang"] = np.arange(1, allDaysExt.nrows + 1)

    #print(allDaysExt)
    print("Column names in frame order:", list(enumerate(allDaysExt.names)))
    data = allDaysExt.to_pandas()
    return data
def test_rows_mean():
    from datatable import mean
    df0 = dt.Frame(A=range(10))
    df1 = df0[f.A > mean(f.A), :]
    df1.internal.check()
    assert df1.to_list() == [[5, 6, 7, 8, 9]]
# First five observations of columns 2 through 5 in DT
penguins_dt[:5, 2:6]

# Last five observations from DT
penguins_dt[-5:, :]

# All observations for the last 3 columns
penguins_dt[:, -3:]

# Rows where sex is missing but body_mass_g is not
penguins_dt[(dt.isna(f.sex) & ~dt.isna(f.body_mass_g)), :]

# Mean of all numeric columns per penguin sex category
penguins_dt[~dt.isna(f.sex), :][:, dt.mean((f[dt.int32].remove(f.year),
                                            f[dt.float64])), by(f.sex)]

# Step 1: flag the max body_mass_g per sex
penguins_dt[:, update(temp=f.body_mass_g == dt.max(f.body_mass_g)), by(f.sex)]
# Step 2: select the flagged rows, dropping the helper column
penguins_dt[f.temp == 1, f[:].remove(f.temp)]

# Step 1: flag the min body_mass_g per sex
penguins_dt[:, update(temp=f.body_mass_g == dt.min(f.body_mass_g)), by(f.sex)]
# Step 2: select the flagged rows, dropping the helper column
penguins_dt[f.temp == 1, f[:].remove(f.temp)]
del penguins_dt["temp"]
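# The same per-group extremes can also be taken in one step by sorting within
# groups and keeping the first row of each group (the same idiom used with
# by()+sort() elsewhere in this collection) -- a sketch, not part of the
# original walkthrough; rows with missing body_mass_g may need filtering first:
penguins_dt[:1, :, by(f.sex), sort(-f.body_mass_g)]   # heaviest penguin per sex
penguins_dt[:1, :, by(f.sex), sort(f.body_mass_g)]    # lightest penguin per sex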
def analyzeDaily(fullTable, filter, postfix):
    #print("----- analyzeDaily:"+postfix)
    #dayTable = fullTable[(dt.f.DatenstandTag >= fromDay) & (dt.f.DatenstandTag < toDay) & (dt.f.IdLandkreis == forIdLandkreis),:]
    dayTable = fullTable[filter, :]

    cases_to_count = dayTable[(dt.f.NeuerFall == 0) | (dt.f.NeuerFall == 1), :]
    cases = cases_to_count[:, [dt.sum(dt.f.AnzahlFall)], dt.by(dt.f.DatenstandTag)]
    cases.names = ["DatenstandTag", "AnzahlFall" + postfix]
    cases.key = "DatenstandTag"
    print("cases rows = {}, cases_to_count = {}".format(
        cases.nrows, cases_to_count.nrows))
    #print(cases)

    new_cases_to_count = dayTable[(dt.f.NeuerFall == -1) | (dt.f.NeuerFall == 1), :]
    new_cases = new_cases_to_count[:, [dt.sum(dt.f.AnzahlFall)],
                                   dt.by(dt.f.DatenstandTag)]
    new_cases.names = ["DatenstandTag", "AnzahlFallNeu" + postfix]
    new_cases.key = "DatenstandTag"
    print("new_cases rows = {}, new_cases_to_count = {}".format(
        new_cases.nrows, new_cases_to_count.nrows))
    #new_cases_to_count.to_csv("new_cases_to_count.csv")

    new_cases_to_count_delay = new_cases_to_count[(dt.f.AnzahlFall > 0), :]  # measure delay only for positive cases
    new_cases_to_count_delay.materialize()
    new_cases_delay = new_cases_to_count_delay[:, [
        dt.min(dt.f.MeldeDelay),
        dt.max(dt.f.MeldeDelay),
        dt.mean(dt.f.MeldeDelay),
        dt.median(dt.f.MeldeDelay),
        dt.sd(dt.f.MeldeDelay),
        dt.sum(dt.f.AnzahlFall),
        dt.max(dt.f.DatenstandTag)
    ], dt.by(dt.f.DatenstandTag)]
    new_cases_delay.names = [
        "DatenstandTag", "MeldeDauerFallNeu-Min" + postfix,
        "MeldeDauerFallNeu-Max" + postfix, "MeldeDauerFallNeu-Schnitt" + postfix,
        "MeldeDauerFallNeu-Median" + postfix, "MeldeDauerFallNeu-StdAbw" + postfix,
        "MeldeDauerFallNeu-Fallbasis" + postfix, "DatenstandTag-Max" + postfix
    ]
    new_cases_delay.key = "DatenstandTag"
    print("new_cases_delay rows = {}, new_cases_to_count_delay = {}".format(
        new_cases_delay.nrows, new_cases_to_count_delay.nrows))
    #new_cases_delay = new_cases_to_count_delay[:, [dt.mean(dt.f.DatenstandTag-dt.f.MeldeTag)],dt.by(dt.f.DatenstandTag)]
    # delays = delayRecs[:, [dt.mean(dt.f.MeldeDelay), dt.median(dt.f.MeldeDelay), dt.sd(dt.f.MeldeDelay), dt.sum(dt.f.AnzahlFall)], dt.by(dt.f.Landkreis)]
    # new_cases_stddev = new_cases_to_count_delay[:, [dt.mean(dt.f.DatenstandTag - dt.f.MeldeTag)],
    #                                             dt.by(dt.f.DatenstandTag)]
    # new_cases_delay.names = ["DatenstandTag", "AnzahlFallNeu-MeldeDauer" + postfix]
    # new_cases_delay.key = "DatenstandTag"
    # print("new_cases_delay rows = {}, new_cases_to_count_delay = {}".format(new_cases_delay.nrows,
    #                                                                         new_cases_to_count_delay.nrows))

    new_cases_to_count_strict = new_cases_to_count[
        (dt.f.DatenstandTag - dt.f.MeldeTag < 7) | (dt.f.AnzahlFall < 0), :]
    new_cases_strict = new_cases_to_count_strict[:, [dt.sum(dt.f.AnzahlFall)],
                                                 dt.by(dt.f.DatenstandTag)]
    new_cases_strict.names = [
        "DatenstandTag", "AnzahlFallNeu-Meldung-letze-7-Tage" + postfix
    ]
    new_cases_strict.key = "DatenstandTag"
    print("new_cases_strict rows = {}, new_cases_to_count_strict = {}".format(
        new_cases_strict.nrows, new_cases_to_count_strict.nrows))
    #new_cases_to_count_strict.to_csv("new_cases_to_count_strict.csv")

    new_cases_to_count_strict_14 = new_cases_to_count[
        (dt.f.DatenstandTag - dt.f.MeldeTag < 14) | (dt.f.AnzahlFall < 0), :]
    new_cases_strict_14 = new_cases_to_count_strict_14[:, [dt.sum(dt.f.AnzahlFall)],
                                                       dt.by(dt.f.DatenstandTag)]
    new_cases_strict_14.names = [
        "DatenstandTag", "AnzahlFallNeu-Meldung-letze-14-Tage" + postfix
    ]
    new_cases_strict_14.key = "DatenstandTag"
    print("new_cases_strict_14 rows = {}, new_cases_to_count_strict_14 = {}".
          format(new_cases_strict_14.nrows, new_cases_to_count_strict_14.nrows))

    dead_to_count = dayTable[(dt.f.NeuerTodesfall == 0) | (dt.f.NeuerTodesfall == 1), :]
    dead = dead_to_count[:, [dt.sum(dt.f.AnzahlTodesfall)], dt.by(dt.f.DatenstandTag)]
    dead.names = ["DatenstandTag", "AnzahlTodesfall" + postfix]
    dead.key = "DatenstandTag"
    #print("dead rows = {}".format(dead.nrows))

    new_dead_to_count = dayTable[(dt.f.NeuerTodesfall == -1) | (dt.f.NeuerTodesfall == 1), :]
    new_dead = new_dead_to_count[:, [dt.sum(dt.f.AnzahlTodesfall)],
                                 dt.by(dt.f.DatenstandTag)]
    new_dead.names = ["DatenstandTag", "AnzahlTodesfallNeu" + postfix]
    new_dead.key = "DatenstandTag"
    #print("new_dead rows = {}".format(new_dead.nrows))

    recovered_to_count = dayTable[(dt.f.NeuGenesen == 0) | (dt.f.NeuGenesen == 1), :]
    recovered = recovered_to_count[:, [dt.sum(dt.f.AnzahlGenesen)],
                                   dt.by(dt.f.DatenstandTag)]
    recovered.names = ["DatenstandTag", "AnzahlGenesen" + postfix]
    recovered.key = "DatenstandTag"
    #print("recovered rows = {}".format(recovered.nrows))

    new_recovered_to_count = dayTable[(dt.f.NeuGenesen == -1) | (dt.f.NeuGenesen == 1), :]
    new_recovered = new_recovered_to_count[:, [dt.sum(dt.f.AnzahlGenesen)],
                                           dt.by(dt.f.DatenstandTag)]
    new_recovered.names = ["DatenstandTag", "AnzahlGenesenNeu" + postfix]
    new_recovered.key = "DatenstandTag"
    #print("new_recovered rows = {}".format(new_recovered.nrows))

    byDayTable = cases[:, :, dt.join(new_cases)]\
        [:, :, dt.join(dead)][:, :, dt.join(new_dead)]\
        [:, :, dt.join(recovered)][:, :, dt.join(new_recovered)]\
        [:, :, dt.join(new_cases_strict)][:, :, dt.join(new_cases_strict_14)]\
        [:, :, dt.join(new_cases_delay)]
    byDayTable.key = "DatenstandTag"
    #print("byDayTable rows = {}".format(byDayTable.nrows))
    print(byDayTable)
    return byDayTable
"""Compute new features based on aggregates, e.g. distance from mean""" # Compute per-column expressions (signed distance from the mean in this example) # for all numeric (int, float) columns with stats computed by groups and # new column added for each original numeric feature. # see: https://stackoverflow.com/questions/62974899/updating-or-adding-multiple-columns-with-pydatatable-in-style-of-r-datables-sd # # Specification: # Inputs: # X: datatable - primary data set # Parameters: # group_by_cols: list of column names - group columns to compute stats by # Output: # dataset augmented with computed statistics from datatable import f, by, sort, update, shift, isna, mean group_by_cols = ["user_id"] new_dataset_name = "new_dataset_name_with_stats" aggs = {f"{col}_dist_from_mean": mean(dt.f[col]) - f[col] for col in X[:, f[int].extend(f[float])].names} X[:, update(**aggs), by(*group_by_cols)] return {new_dataset_name: X}
              solution=solution, version=ver, git=git, fun=fun, run=2,
              time_sec=t, mem_gb=m, cache=cache,
              chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt)
del ans

question = "sum v1 mean v3 by id3"  #3
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"v1": sum(f.v1), "v3": mean(f.v3)}, by(f.id3)]
print(ans.shape)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, [sum(f.v1), sum(f.v3)]]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question,
          out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution,
          version=ver, git=git,
# Hour-wise arrests
hour_wise_arrests_dt = py_dt_two_group_proportions_summary(
    policia_tidy_dt_v1, 'stop_hour', 'is_arrested')

# Visualization
alt.Chart(hour_wise_arrests_dt.to_pandas()).mark_bar().encode(
    alt.X('stop_hour:N'),
    alt.Y('count'),
    alt.Color('is_arrested')
).properties(
    title='Hour-wise arrest trends'
)

# Hour-wise arrest rates
hour_wise_arrests_rates_dt = hour_wise_arrests_dt[f.is_arrested == True, :][
    :, dt.mean(f.count), by(f.stop_hour)]

# Visualization
alt.Chart(hour_wise_arrests_rates_dt.to_pandas()).mark_line().encode(
    alt.X('stop_hour'),
    alt.Y('count')
).properties(
    title='Hour-wise average arrest rates'
)

py_dt_one_group_proportions_summary(policia_tidy_dt_v1, 'drugs_related_stop')

# Stop date, converted to a pandas frame
stop_date_df = policia_tidy_dt[:, (f.stop_date)].to_pandas()
amigos_info_dt = dt.fread('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-08/friends_info.csv')
amigos_dt = dt.fread('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-08/friends.csv')

# Glance
amigos_info_dt

# Seasons
amigos_info_dt[:, count(), by(f.season)]

# Unique episodes per season
amigos_info_dt[:, count(), by(f.season, f.episode)
               ][:, {'unique_episodes': count()}, by(f.season)]

# Average views and ratings per season
amigos_info_dt[:, dt.mean(f[-2:]), by(f.season)]

# Highest-rated title
amigos_info_dt[f.imdb_rating == dt.max(f.imdb_rating), :]

# Lowest-rated title
amigos_info_dt[f.imdb_rating == dt.min(f.imdb_rating), :]

# Top 2 titles by rating per season
amigos_info_dt[:2, :, by(f.season), sort(-f.imdb_rating)]

# Look up a single title's info
amigos_info_dt[f.title == "The Last One", :]

# Select the first 235 observations (slice end is exclusive)
amigos_info_dt[[slice(None, 235)], :]
def agregar_variables_nuevas(dataset: Frame) -> Frame:
    # Adds derived card/account features; the trailing numbers in the
    # comments are the original author's experiment notes (feature rankings).
    dataset['tarjetas_status01'] = dataset[:, dt.rowmax([f.Master_status, f.Visa_status])]  # 3
    dataset['tarjetas_status02'] = dataset[:, dt.rowmin([f.Master_status, f.Visa_status])]  # 2
    dataset['tarjetas_fultimo_cierre01'] = dataset[:, dt.rowmax([f.Master_fultimo_cierre, f.Visa_fultimo_cierre])]  # 479
    dataset['tarjetas_fultimo_cierre02'] = dataset[:, dt.rowmin([f.Master_fultimo_cierre, f.Visa_fultimo_cierre])]  # 421
    dataset['tarjetas_Finiciomora'] = dataset[:, dt.rowmin([f.Master_Finiciomora, f.Visa_Finiciomora])]  # 12
    dataset['tarjetas_Fvencimiento'] = dataset[:, dt.rowmin([f.Master_Fvencimiento, f.Visa_Fvencimiento])]  # 359
    dataset['tarjetas_delinquency'] = dataset[:, dt.rowmax([f.Master_delinquency, f.Visa_delinquency])]  # 18
    dataset['tarjetas_mfinanciacion_limite'] = dataset[:, dt.rowsum([f.Master_mfinanciacion_limite, f.Visa_mfinanciacion_limite])]  # 230
    dataset['tarjetas_msaldototal'] = dataset[:, f.Master_msaldototal + f.Visa_msaldototal]  # 57
    dataset['tarjetas_msaldopesos'] = dataset[:, f.Master_msaldopesos + f.Visa_msaldopesos]  # 46
    dataset['tarjetas_msaldodolares'] = dataset[:, f.Master_msaldodolares + f.Visa_msaldodolares]  # 1142, but a derived feature ranks 104
    dataset['tarjetas_mconsumospesos'] = dataset[:, f.Master_mconsumospesos + f.Visa_mconsumospesos]  # 400
    dataset['tarjetas_mconsumosdolares'] = dataset[:, f.Master_mconsumosdolares + f.Visa_mconsumosdolares]  # 891, but with derived features 352
    dataset['tarjetas_mlimitecompra'] = dataset[:, f.Master_mlimitecompra + f.Visa_mlimitecompra]  # 186, but with derived features 26
    dataset['tarjetas_madelantopesos'] = dataset[:, f.Master_madelantopesos + f.Visa_madelantopesos]  # 666, but derived features 26
    dataset['tarjetas_madelantodolares'] = dataset[:, f.Master_madelantodolares + f.Visa_madelantodolares]  # 294, and derived features 33
    dataset['tarjetas_fultimo_cierre'] = dataset[:, dt.rowmax([f.Master_fultimo_cierre, f.Visa_fultimo_cierre])]  # 448
    dataset['tarjetas_mpagado'] = dataset[:, f.Master_mpagado + f.Visa_mpagado]  # 384, and derived features 29
    dataset['tarjetas_mpagospesos'] = dataset[:, f.Master_mpagospesos + f.Visa_mpagospesos]  # 28
    dataset['tarjetas_mpagosdolares'] = dataset[:, f.Master_mpagosdolares + f.Visa_mpagosdolares]  # 1017, and derived features 255
    dataset['tarjetas_fechaalta'] = dataset[:, dt.rowmax([f.Master_fechaalta, f.Visa_fechaalta])]  # 159
    dataset['tarjetas_mconsumototal'] = dataset[:, f.Master_mconsumototal + f.Visa_mconsumototal]  # 512, and derived features 365
    dataset['tarjetas_cconsumos'] = dataset[:, f.Master_cconsumos + f.Visa_cconsumos]  # 424
    dataset['tarjetas_cadelantosefectivo'] = dataset[:, f.Master_cadelantosefectivo + f.Visa_cadelantosefectivo]  # 750
    dataset['tarjetas_mpagominimo'] = dataset[:, f.Master_mpagominimo + f.Visa_mpagominimo]  # 98
    dataset['ratio_tarjetas_msaldodolares__tarjetas_mlimitecompra'] = dataset[:, f.tarjetas_msaldodolares / f.tarjetas_mlimitecompra]  # 104
    dataset['ratio_tarjetas_msaldodolares__tarjetas_msaldototal'] = dataset[:, f.tarjetas_msaldodolares / f.tarjetas_msaldototal]  # 611
    dataset['ratio_tarjetas_mconsumospesos__tarjetas_mlimitecompra'] = dataset[:, f.tarjetas_mconsumospesos / f.tarjetas_mlimitecompra]  # 244
    dataset['ratio_tarjetas_madelantopesos__tarjetas_mlimitecompra'] = dataset[:, f.tarjetas_madelantopesos / f.tarjetas_mlimitecompra]  # 26
    dataset['ratio_tarjetas_madelantodolares__tarjetas_mlimitecompra'] = dataset[:, f.tarjetas_madelantodolares / f.tarjetas_mlimitecompra]  # 33
    dataset['ratio_tarjetas_mpagospesos__tarjetas_mlimitecompra'] = dataset[:, f.tarjetas_mpagospesos / f.tarjetas_mlimitecompra]  # 38
    dataset['ratio_tarjetas_mpagominimo__tarjetas_mlimitecompra'] = dataset[:, f.tarjetas_mpagominimo / f.tarjetas_mlimitecompra]  # 100
    dataset['ratio_tarjetas_mpagado__tarjetas_mlimitecompra'] = dataset[:, f.tarjetas_mpagado / f.tarjetas_mlimitecompra]  # 29
    dataset['ratio_tarjetas_mpagosdolares__tarjetas_mlimitecompra'] = dataset[:, f.tarjetas_mpagosdolares / f.tarjetas_mlimitecompra]  # 255
    dataset['ratio_tarjetas_mconsumototal__tarjetas_mlimitecompra'] = dataset[:, f.tarjetas_mconsumototal / f.tarjetas_mlimitecompra]  # 365
    dataset['ratio_tarjetas_mconsumosdolares__tarjetas_mlimitecompra'] = dataset[:, f.tarjetas_mconsumosdolares / f.tarjetas_mlimitecompra]  # 352
    dataset['ratio_tarjetas_msaldopesos__tarjetas_mlimitecompra'] = dataset[:, f.tarjetas_msaldopesos / f.tarjetas_mlimitecompra]  # 270
    dataset['ratio_tarjetas_msaldopesos__tarjetas_msaldototal'] = dataset[:, f.tarjetas_msaldopesos / f.tarjetas_msaldototal]  # 414
    dataset['ratio_Master_mlimitecompra__tarjetas_mlimitecompra'] = dataset[:, f.Master_mlimitecompra / f.tarjetas_mlimitecompra]  # 367
    dataset['ratio_Visa_mlimitecompra__tarjetas_mlimitecompra'] = dataset[:, f.Visa_mlimitecompra / f.tarjetas_mlimitecompra]  # 192

    # v2
    dataset['ctarjetas_credito'] = dataset[:, f.ctarjeta_master + f.ctarjeta_visa]  # 27
    dataset['ctarjetas'] = dataset[:, f.ctarjetas_credito + f.ctarjeta_debito]  # 623
    dataset['ratio_mprestamos_personales__cprestamos_personales'] = dataset[:, f.mprestamos_personales / f.cprestamos_personales]  # 127
    dataset['cextracciones'] = dataset[:, f.cextraccion_autoservicio + f.ccajas_extracciones]  # 157
    dataset['ratio_mextraccion_autoservicio__mcuentas_saldo'] = dataset[:, f.mextraccion_autoservicio / f.mcuentas_saldo]  # 565
    dataset['ccomisiones'] = dataset[:, f.ccomisiones_mantenimiento + f.ccomisiones_otras]  # 578
    dataset['ratio_mcomisiones__ccomisiones'] = dataset[:, f.mcomisiones / f.ccomisiones]  # 508
    dataset['ctransacciones'] = dataset[:, f.ccallcenter_transacciones + f.chomebanking_transacciones + f.ccajas_transacciones]  # 485
    dataset['ratio_ctransacciones__cproductos'] = dataset[:, f.ctransacciones / f.cproductos]  # 472

    # v3
    dataset['mpayroll_total'] = dataset[:, f.mpayroll + f.mpayroll2]  # 68
    dataset['ratio_mpayroll_total__cliente_edad'] = dataset[:, f.mpayroll_total / f.cliente_edad]  # 87
    dataset['ratio_mcaja_ahorro__cliente_edad'] = dataset[:, f.mcaja_ahorro / f.cliente_edad]  # 23
    dataset['ratio_mcuentas_saldo__cliente_edad'] = dataset[:, f.mcuentas_saldo / f.cliente_edad]  # 102
    dataset['cseguros_total'] = dataset[:, f.cseguro_vida + f.cseguro_auto + f.cseguro_vivienda + f.cseguro_accidentes_personales]  # 454
    dataset['ratio_cseguros_total__cliente_antiguedad'] = dataset[:, f.cseguros_total / f.cliente_antiguedad]  # 628

    # v7
    dataset['tarjetas_mconsumo_mes'] = dataset[:, f.mtarjeta_visa_consumo + f.mtarjeta_master_consumo]  # 45
    dataset['tarjetas_mconsumototal'] = dataset[:, f.Master_mconsumototal + f.Visa_mconsumototal]  # 419
    dataset['ratio_tarjetas_consumo_mes__cliente_edad'] = dataset[:, f.tarjetas_mconsumo_mes / f.cliente_edad]  # 51
    dataset['score_04'] = dataset[:, (f.ctarjetas_credito * f.tarjetas_delinquency) / f.cliente_edad]  # 695
    dataset['score_04_relativo'] = dataset[:, f.score_04 / mean(f.score_04)]  # 267

    # These turned out not to be important
    # v1
    # dataset['ratio_tarjetas_msaldototal__tarjetas_mlimitecompra'] = dataset[:, f.tarjetas_mlimitecompra / f.tarjetas_mlimitecompra]  # 2544
    # v2
    # dataset['ratio_mrentabilidad__cproductos'] = dataset[:, f.mrentabilidad / f.cproductos]  # 911
    # dataset['dif_tarjetas_mconsumototal__tarjetas_mpagado'] = dataset[:, f.tarjetas_mconsumototal - f.tarjetas_mpagado]  # 1277
    # dataset['ratio_mrentabilidad__mcomisiones'] = dataset[:, f.mrentabilidad / f.mcomisiones]  # 1100
    # v3
    # dataset['ratio_mrentabilidad__mcuentas_saldo'] = dataset[:, f.mrentabilidad / f.mcuentas_saldo]  # 2042
    # dataset['ratio_mrentabilidad__cliente_antiguedad'] = dataset[:, f.mrentabilidad / f.cliente_antiguedad]  # 1854
    # dataset['ratio_mrentabilidad__cliente_edad'] = dataset[:, f.mrentabilidad / f.cliente_edad]  # 1811
    # dataset['ratio_cliente_antiguedad__cliente_edad'] = dataset[:, f.cliente_antiguedad / f.cliente_edad]  # 1719
    # v7
    # dataset['score_01_relativo'] = dataset[:, f.score_01 / mean(f.score_01)]  # does not appear
    # dataset['score_02_relativo'] = dataset[:, f.score_02 / mean(f.score_02)]  # 2507
    # dataset['score_03_relativo'] = dataset[:, f.score_03 / mean(f.score_03)]  # 2454
    # dataset['ratio_tarjetas_mconsumototal__cliente_edad'] = dataset[:, f.tarjetas_mconsumototal / f.cliente_edad]  # 2459
    # dataset['ratio_Visa_mconsumospesos__cliente_edad'] = dataset[:, f.Visa_mconsumospesos / f.cliente_edad]  # 2485
    # dataset['ratio_Visa_mconsumosdolares__cliente_edad'] = dataset[:, f.Visa_mconsumosdolares / f.cliente_edad]  # 2486
    # dataset['ratio_Visa_mconsumototal__cliente_edad'] = dataset[:, f.Visa_mconsumototal / f.cliente_edad]  # 2429
    # dataset['ratio_Master_mconsumospesos__cliente_edad'] = dataset[:, f.Master_mconsumospesos / f.cliente_edad]  # 2501
    # dataset['ratio_Master_mconsumosdolares__cliente_edad'] = dataset[:, f.Master_mconsumosdolares / f.cliente_edad]  # 2345
    # dataset['ratio_Master_mconsumototal__cliente_edad'] = dataset[:, f.Master_mconsumototal / f.cliente_edad]  # 2493
    # dataset['ratio_ctransacciones__cliente_edad'] = dataset[:, f.ctransacciones / f.cliente_edad]  # 2508
    # dataset['score_01'] = dataset[:, (f.ctarjetas * f.mrentabilidad) / f.ctrx_quarter]  # 2575
    # dataset['score_02'] = dataset[:, (f.ctarjetas * f.ctransacciones) / f.ctrx_quarter]  # 2507
    # dataset['score_03'] = dataset[:, (f.ctarjetas * f.ctransacciones) / f.cliente_edad]  # 2498

    return dataset
def score(self,
          actual: np.array,
          predicted: np.array,
          sample_weight: typing.Optional[np.array] = None,
          labels: typing.Optional[List[typing.Any]] = None,
          X: typing.Optional[dt.Frame] = None,
          **kwargs) -> float:
    # Get the logger if it exists
    logger = self.get_experiment_logger()

    # hard-coded as access to experiment parameters (such as self.tgc) not yet available
    tgc = ["Store", "Dept"]
    # tgc = ["state"]
    # tgc = None

    # enable weighted average over TS R2 scores: weighted based on TS share of rows
    isR2AverageWeighted = False

    # obtain a scorer for metric to use
    scorer = self.get_scorer()

    if tgc is None or not all(col in X.names for col in tgc):
        loggerinfo(
            logger,
            f"TS R2 computes single R2 on {X.nrows} rows as either tgc {tgc} is not defined or incorrect."
        )
        return scorer.score(actual, predicted, sample_weight, labels, **kwargs)
    else:
        tgc_values = X[:, {"weight": count() / X.nrows, "r2": 0.0}, by(tgc)]
        loggerinfo(
            logger,
            f"TS R2 computes multiple R2 on {X.nrows} rows, tgc {tgc} with weighting is {isR2AverageWeighted}."
        )
        none_values = [None] * X.nrows
        X = cbind(
            X[:, tgc],
            Frame(actual=actual,
                  predicted=predicted,
                  sample_weight=sample_weight
                  if sample_weight is not None else none_values))

        for i in range(0, tgc_values.nrows):
            current_tgc = tgc_values[i, :]
            current_tgc.key = tgc
            ts_frame = X[:, :, join(current_tgc)][~isna(f.r2), :]
            r2_score = scorer.score(
                ts_frame['actual'].to_numpy(),
                ts_frame['predicted'].to_numpy(),
                ts_frame['sample_weight'].to_numpy()
                if sample_weight is not None else None,
                labels, **kwargs)
            tgc_values[i, f.r2] = r2_score
            loggerinfo(
                logger,
                f"TS R2 = {r2_score} on {ts_frame.nrows} rows, tgc = {current_tgc[0, tgc].to_tuples()}"
            )

        if isR2AverageWeighted:
            # The per-group weights already sum to 1, so the weighted average
            # is the sum of r2 * weight (using mean here would divide by the
            # number of groups a second time).
            # return np.average(tgc_values["r2"].to_numpy(), weights=tgc_values["weight"].to_numpy())
            return tgc_values[:, dt.sum(f.r2 * f.weight)][0, 0]
        else:
            return tgc_values[:, mean(f.r2)][0, 0]
ans = x[:, {"v1": sum(f.v1)}, by(f.id1, f.id2)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.v1)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question,
          out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution,
          version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m,
          cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt)
print(ans.head(3).to_pandas(), flush=True)
print(ans.tail(3).to_pandas(), flush=True)
del ans

question = "sum v1 mean v3 by id3"  # q3
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"v1": sum(f.v1), "v3": mean(f.v3)}, by(f.id3)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, [sum(f.v1), sum(f.v3)]]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question,
          out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution,
          version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m,
          cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt)
del ans
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"v1": sum(f.v1), "v3": mean(f.v3)}, by(f.id3)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
dead = fullTable[:, 'AnzahlTodesfall'].sum()[0, 0]
lastDay = fullTable[:, 'MeldeDay'].max()[0, 0]
print("lastDay {} cases {} dead {}".format(lastDay, cases, dead))

newTable = fullTable[:, dt.f[:].extend({"erkMeldeDelay": dt.f.MeldeDay - dt.f.RefDay})]
#print(newTable.keys())
#dt.by(dt.f.Bundesland)]

alldays = fullTable[:, [dt.sum(dt.f.AnzahlFall),
                        dt.sum(dt.f.FaellePro100k),
                        dt.sum(dt.f.AnzahlTodesfall),
                        dt.sum(dt.f.TodesfaellePro100k),
                        dt.mean(dt.f.Bevoelkerung)],
                    dt.by(dt.f.Landkreis)]

last7days = fullTable[dt.f.newCaseOnDay > lastDay - 7, :][:, [
    dt.sum(dt.f.AnzahlFall),
    dt.sum(dt.f.FaellePro100k),
    dt.sum(dt.f.AnzahlTodesfall),
    dt.sum(dt.f.TodesfaellePro100k)], dt.by(dt.f.Landkreis)]
last7days.names = ["Landkreis", "AnzahlFallLetzte7Tage",
                   "FaellePro100kLetzte7Tage", "AnzahlTodesfallLetzte7Tage",
                   "TodesfaellePro100kLetzte7Tage"]

def merge(largerTable, smallerTable, keyFieldName):
    keys = smallerTable[:, keyFieldName].to_list()[0]
    extTable = largerTable.copy()
    for colName in smallerTable.names:
        if colName != keyFieldName:
def analyzeDaily(fullTable, filter, prefix, postfix, byDateColName):
    print("analyzeDaily prefix='{}' postfix='{}' byDateColName='{}'".format(
        prefix, postfix, byDateColName))
    #print("analyzeDaily filter='{}' '".format(filter))
    byDate = dt.f[byDateColName]
    #print("----- analyzeDaily:"+postfix)
    #dayTable = fullTable[(dt.f.DatenstandTag >= fromDay) & (dt.f.DatenstandTag < toDay) & (dt.f.IdLandkreis == forIdLandkreis),:]
    dayTable = fullTable[filter, :]

    cases_to_count = dayTable[(dt.f.NeuerFall == 0) | (dt.f.NeuerFall == 1), :]
    cases = cases_to_count[:, [dt.sum(dt.f.AnzahlFall)], dt.by(byDate)]
    cases.names = [byDateColName, prefix + "AnzahlFall" + postfix]
    cases.key = byDateColName
    print("cases rows = {}, cases_to_count = {}".format(cases.nrows, cases_to_count.nrows))
    #print(cases)
    byDayTable = cases

    if byDateColName == "DatenstandTag":
        new_cases_to_count = dayTable[(dt.f.NeuerFall == -1) | (dt.f.NeuerFall == 1), :]
        new_cases = new_cases_to_count[:, [dt.sum(dt.f.AnzahlFall)], dt.by(byDate)]
        new_cases.names = [byDateColName, prefix + "AnzahlFallNeu" + postfix]
        new_cases.key = byDateColName
        print("new_cases rows = {}, new_cases_to_count = {}".format(
            new_cases.nrows, new_cases_to_count.nrows))
        #new_cases_to_count.to_csv("new_cases_to_count.csv")
        byDayTable = byDayTable[:, :, dt.join(new_cases)]
    else:
        # add days by MeldeTag
        byDayTable.names = {prefix + "AnzahlFall" + postfix: prefix + "AnzahlFallNeu" + postfix}
        byDayTable = addRunningSumColumn(byDayTable, prefix + "AnzahlFallNeu" + postfix,
                                         prefix + "AnzahlFall" + postfix)

    dead_to_count = dayTable[(dt.f.NeuerTodesfall == 0) | (dt.f.NeuerTodesfall == 1), :]
    dead = dead_to_count[:, [dt.sum(dt.f.AnzahlTodesfall)], dt.by(byDate)]
    dead.names = [byDateColName, prefix + "AnzahlTodesfall" + postfix]
    dead.key = byDateColName
    #print("dead rows = {}".format(dead.nrows))
    byDayTable = byDayTable[:, :, dt.join(dead)]

    if byDateColName == "DatenstandTag":
        new_dead_to_count = dayTable[(dt.f.NeuerTodesfall == -1) | (dt.f.NeuerTodesfall == 1), :]
        new_dead = new_dead_to_count[:, [dt.sum(dt.f.AnzahlTodesfall)], dt.by(byDate)]
        new_dead.names = [byDateColName, prefix + "AnzahlTodesfallNeu" + postfix]
        new_dead.key = byDateColName
        #print("new_dead rows = {}".format(new_dead.nrows))
        byDayTable = byDayTable[:, :, dt.join(new_dead)]
    else:
        # add days by MeldeTag
        byDayTable.names = {prefix + "AnzahlTodesfall" + postfix: prefix + "AnzahlTodesfallNeu" + postfix}
        byDayTable = addRunningSumColumn(byDayTable, prefix + "AnzahlTodesfallNeu" + postfix,
                                         prefix + "AnzahlTodesfall" + postfix)

    byDayTable.key = byDateColName

    if postfix == "" and prefix == "" and byDateColName == "DatenstandTag":
        new_cases_to_count_delay = new_cases_to_count[(dt.f.AnzahlFall > 0), :]  # measure delay only for positive cases
        new_cases_to_count_delay.materialize()
        new_cases_delay = new_cases_to_count_delay[:, [
            dt.min(dt.f.MeldeDelay),
            dt.max(dt.f.MeldeDelay),
            dt.mean(dt.f.MeldeDelay),
            dt.median(dt.f.MeldeDelay),
            dt.sd(dt.f.MeldeDelay),
            dt.sum(dt.f.AnzahlFall),
            dt.max(dt.f.DatenstandTag)
        ], dt.by(byDate)]
        new_cases_delay.names = [
            "DatenstandTag",
            "PublikationsdauerFallNeu_Min" + postfix,
            "PublikationsdauerFallNeu_Max" + postfix,
            "PublikationsdauerFallNeu_Schnitt" + postfix,
            "PublikationsdauerFallNeu_Median" + postfix,
            "PublikationsdauerFallNeu_StdAbw" + postfix,
            "PublikationsdauerFallNeu_Fallbasis" + postfix,
            "DatenstandTag_Max" + postfix
        ]
        new_cases_delay.key = "DatenstandTag"
        print("new_cases_delay rows = {}, new_cases_to_count_delay = {}".format(
            new_cases_delay.nrows, new_cases_to_count_delay.nrows))

        recovered_to_count = dayTable[(dt.f.NeuGenesen == 0) | (dt.f.NeuGenesen == 1), :]
        recovered = recovered_to_count[:, [dt.sum(dt.f.AnzahlGenesen)], dt.by(byDate)]
        recovered.names = ["DatenstandTag", "AnzahlGenesen" + postfix]
        recovered.key = "DatenstandTag"
        #print("recovered rows = {}".format(recovered.nrows))

        new_recovered_to_count = dayTable[(dt.f.NeuGenesen == -1) | (dt.f.NeuGenesen == 1), :]
        new_recovered = new_recovered_to_count[:, [dt.sum(dt.f.AnzahlGenesen)], dt.by(byDate)]
        new_recovered.names = ["DatenstandTag", "AnzahlGenesenNeu" + postfix]
        new_recovered.key = "DatenstandTag"
        #print("new_recovered rows = {}".format(new_recovered.nrows))

        byDayTable = byDayTable[:, :, dt.join(recovered)][:, :, dt.join(new_recovered)][:, :, dt.join(new_cases_delay)]
        #byDayTable = byDayTable[:,:,dt.join(recovered)][:,:,dt.join(new_recovered)]\
        #    [:,:,dt.join(new_cases_strict)][:,:,dt.join(new_cases_strict_14)][:,:,dt.join(new_cases_delay)]

    byDayTable.key = byDateColName
    #print("byDayTable rows = {}".format(byDayTable.nrows))
    #print(byDayTable)
    return byDayTable
def test_rows_mean():
    from datatable import mean
    df0 = dt.Frame(range(10), names=["A"])
    df1 = df0(f.A > mean(f.A), engine="eager")
    df1.internal.check()
    assert df1.topython() == [[5, 6, 7, 8, 9]]
def test_mean():
    assert str(dt.mean(f.A)) == str(f.A.mean())
    assert str(dt.mean(f[:])) == str(f[:].mean())
    DT = dt.Frame(A=range(1, 10))
    assert_equals(DT[:, f.A.mean()], DT[:, dt.mean(f.A)])
def test_rows_mean():
    from datatable import mean
    df0 = dt.Frame(A=range(10))
    df1 = df0[f.A > mean(f.A), :]
    frame_integrity_check(df1)
    assert df1.to_list() == [[5, 6, 7, 8, 9]]
def test_groups1():
    f0 = dt.Frame({"A": [1, 2, 1, 2, 1, 3, 1, 1],
                   "B": [0, 1, 2, 3, 4, 5, 6, 7]})
    f1 = f0(select=mean(f.B), groupby=f.A)
    assert f1.stypes == (dt.float64,)
    assert f1.topython() == [[3.8, 2.0, 5.0]]
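# The call-style frame API in the two tests above (f0(select=..., groupby=...),
# engine="eager", .topython()) comes from an older datatable release. A sketch
# of the equivalent in recent datatable versions, where the DT[i, j, by()]
# syntax replaced the select=/groupby= keywords (note that by() also returns
# the group-key column):
import datatable as dt
from datatable import f, by, mean

f0 = dt.Frame({"A": [1, 2, 1, 2, 1, 3, 1, 1],
               "B": [0, 1, 2, 3, 4, 5, 6, 7]})
f1 = f0[:, mean(f.B), by(f.A)]
assert f1.to_list() == [[1, 2, 3], [3.8, 2.0, 5.0]]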