def etl_pandas(filename, columns_names, columns_types, etl_keys):
    etl_times = {key: 0.0 for key in etl_keys}

    t0 = timer()
    train_pd = load_data_pandas(
        filename=filename,
        columns_names=columns_names,
        columns_types=columns_types,
        header=0,
        nrows=None,
        use_gzip=filename.endswith(".gz"),
        pd=run_benchmark.__globals__["pd"],
    )
    etl_times["t_readcsv"] = round((timer() - t0) * 1000)

    t_etl_begin = timer()

    # For each of the 200 feature columns, count how often every value occurs
    # and join that count back onto the frame as "<col>_count".
    for i in range(200):
        col = "var_%d" % i
        var_count = train_pd.groupby(col).agg({col: "count"})
        var_count.columns = ["%s_count" % col]
        var_count = var_count.reset_index()
        train_pd = train_pd.merge(var_count, on=col, how="left")

    # Copy values that occur more than once into new "<col>_gt1" columns.
    for i in range(200):
        col = "var_%d" % i
        mask = train_pd["%s_count" % col] > 1
        train_pd.loc[mask, "%s_gt1" % col] = train_pd.loc[mask, col]

    train_pd = train_pd.drop(["ID_code"], axis=1)

    etl_times["t_etl"] = round((timer() - t_etl_begin) * 1000)
    return train_pd, etl_times
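# --- Hedged usage sketch (not part of the original source) ---
# Illustrates how the Santander-style etl_pandas above might be invoked by the
# benchmark driver. The wrapper name, file path, and dtype list are hypothetical;
# `load_data_pandas` and `run_benchmark` are assumed to be provided by the
# surrounding harness, as in the function body itself.
def example_run_santander_etl():
    columns_names = ["ID_code", "target"] + ["var_%d" % i for i in range(200)]
    columns_types = ["object", "int64"] + ["float64"] * 200  # hypothetical dtypes
    train_pd, etl_times = etl_pandas(
        filename="santander_train.csv.gz",  # hypothetical path
        columns_names=columns_names,
        columns_types=columns_types,
        etl_keys=["t_readcsv", "t_etl"],
    )
    print(etl_times)
    return train_pd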
def etl_pandas(
    filename,
    files_limit,
    columns_names,
    columns_types,
):
    queries = {
        "Query1": q1_pandas,
        "Query2": q2_pandas,
        "Query3": q3_pandas,
        "Query4": q4_pandas,
    }
    etl_times = {x: 0.0 for x in queries.keys()}

    t0 = time.time()
    df_from_each_file = [
        load_data_pandas(
            filename=f,
            columns_names=columns_names,
            header=0,
            nrows=None,
            use_gzip=f.endswith(".gz"),
            parse_dates=["pickup_datetime", "dropoff_datetime"],
            pd=run_benchmark.__globals__["pd"],
        )
        for f in filename
    ]
    concatenated_df = pd.concat(df_from_each_file, ignore_index=True)
    etl_times["t_readcsv"] = time.time() - t0

    queries_parameters = {"df": concatenated_df}
    return run_queries(queries=queries, parameters=queries_parameters, etl_times=etl_times)
def etl_pandas(filename, columns_names, columns_types, etl_keys, pandas_mode):
    etl_times = {key: 0.0 for key in etl_keys}

    t0 = timer()
    if pandas_mode == "Modin_on_omnisci":
        df = load_data_modin_on_omnisci(
            filename=filename,
            columns_names=columns_names,
            columns_types=columns_types,
            skiprows=1,
            pd=run_benchmark.__globals__["pd"],
        )
    else:
        df = load_data_pandas(
            filename=filename,
            columns_names=columns_names,
            columns_types=columns_types,
            header=0,
            nrows=None,
            use_gzip=filename.endswith(".gz"),
            pd=run_benchmark.__globals__["pd"],
        )
    etl_times["t_readcsv"] = timer() - t0

    t_etl_start = timer()

    keep_cols = [
        "YEAR0", "DATANUM", "SERIAL", "CBSERIAL", "HHWT", "CPI99", "GQ",
        "PERNUM", "SEX", "AGE", "INCTOT", "EDUC", "EDUCD", "EDUC_HEAD",
        "EDUC_POP", "EDUC_MOM", "EDUCD_MOM2", "EDUCD_POP2", "INCTOT_MOM",
        "INCTOT_POP", "INCTOT_MOM2", "INCTOT_POP2", "INCTOT_HEAD", "SEX_HEAD",
    ]
    df = df[keep_cols]

    # Drop rows carrying sentinel values for income and education.
    df = df[df["INCTOT"] != 9999999]
    df = df[df["EDUC"] != -1]
    df = df[df["EDUCD"] != -1]

    # Scale income by the CPI99 deflator.
    df["INCTOT"] = df["INCTOT"] * df["CPI99"]

    # Fill missing values and cast every kept column to float64.
    for column in keep_cols:
        df[column] = df[column].fillna(-1)
        df[column] = df[column].astype("float64")

    y = df["EDUC"]
    X = df.drop(columns=["EDUC", "CPI99"])

    # trigger computation
    df.shape
    y.shape
    X.shape

    etl_times["t_etl"] = timer() - t_etl_start

    print("DataFrame shape:", X.shape)

    return df, X, y, etl_times
def etl_pandas(filename, columns_names, columns_types, etl_keys):
    etl_times = {key: 0.0 for key in etl_keys}

    t0 = timer()
    df = load_data_pandas(
        filename=filename,
        columns_names=columns_names,
        columns_types=columns_types,
        header=0,
        nrows=None,
        use_gzip=filename.endswith(".gz"),
        pd=run_benchmark.__globals__["pd"],
    )
    etl_times["t_readcsv"] = round((timer() - t0) * 1000)

    t_etl_start = timer()

    keep_cols = [
        "YEAR0", "DATANUM", "SERIAL", "CBSERIAL", "HHWT", "CPI99", "GQ",
        "PERNUM", "SEX", "AGE", "INCTOT", "EDUC", "EDUCD", "EDUC_HEAD",
        "EDUC_POP", "EDUC_MOM", "EDUCD_MOM2", "EDUCD_POP2", "INCTOT_MOM",
        "INCTOT_POP", "INCTOT_MOM2", "INCTOT_POP2", "INCTOT_HEAD", "SEX_HEAD",
    ]
    df = df[keep_cols]

    df = df.query("INCTOT != 9999999")
    df = df.query("EDUC != -1")
    df = df.query("EDUCD != -1")

    df["INCTOT"] = df["INCTOT"] * df["CPI99"]

    for column in keep_cols:
        df[column] = df[column].fillna(-1)
        df[column] = df[column].astype("float64")

    y = df["EDUC"]
    X = df.drop(columns=["EDUC", "CPI99"])

    etl_times["t_etl"] = round((timer() - t_etl_start) * 1000)

    print("DataFrame shape:", X.shape)

    return df, X, y, etl_times
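# --- Hedged usage sketch (not part of the original source) ---
# Shows how the census-style etl_pandas above might be driven and how the
# returned pieces are typically consumed for a train/test split. The wrapper
# name, file path, and column metadata are hypothetical; only the etl_keys
# names come from the function body.
def example_run_census_etl(census_columns_names, census_columns_types):
    df, X, y, etl_times = etl_pandas(
        filename="ipums_census.csv.gz",  # hypothetical path
        columns_names=census_columns_names,
        columns_types=census_columns_types,
        etl_keys=["t_readcsv", "t_etl"],
    )
    print("read_csv: %s ms, ETL: %s ms" % (etl_times["t_readcsv"], etl_times["t_etl"]))
    return X, y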
def etl_pandas(
    filename, files_limit, columns_names, columns_types, output_for_validation, pandas_mode
):
    if pandas_mode == "Modin_on_omnisci" and any(f.endswith(".gz") for f in filename):
        raise NotImplementedError(
            "Modin_on_omnisci mode doesn't support import of compressed files yet"
        )

    queries = {"Query1": q1_pandas, "Query2": q2_pandas, "Query3": q3_pandas, "Query4": q4_pandas}
    etl_results = {x: 0.0 for x in queries.keys()}

    t0 = timer()
    if pandas_mode == "Modin_on_omnisci":
        df_from_each_file = [
            load_data_modin_on_omnisci(
                filename=f,
                columns_names=columns_names,
                columns_types=columns_types,
                parse_dates=["timestamp"],
                pd=run_benchmark.__globals__["pd"],
            )
            for f in filename
        ]
    else:
        df_from_each_file = [
            load_data_pandas(
                filename=f,
                columns_names=columns_names,
                header=None,
                nrows=None,
                use_gzip=f.endswith(".gz"),
                parse_dates=["pickup_datetime", "dropoff_datetime"],
                pd=run_benchmark.__globals__["pd"],
                pandas_mode=pandas_mode,
            )
            for f in filename
        ]
    concatenated_df = pd.concat(df_from_each_file, ignore_index=True)

    # This is to trigger data import in `Modin_on_omnisci` mode.
    if pandas_mode == "Modin_on_omnisci":
        from modin.experimental.engines.omnisci_on_ray.frame.omnisci_worker import OmnisciServer

        concatenated_df.shape
        concatenated_df._query_compiler._modin_frame._partitions[0][
            0
        ].frame_id = OmnisciServer().put_arrow_to_omnisci(
            concatenated_df._query_compiler._modin_frame._partitions[0][0].get()
        )

    etl_results["t_readcsv"] = timer() - t0

    # Each query receives its own frame copy in Modin_on_omnisci mode.
    queries_parameters = {
        query_name: {
            "df": concatenated_df.copy() if pandas_mode == "Modin_on_omnisci" else concatenated_df,
            "pandas_mode": pandas_mode,
        }
        for query_name in list(queries.keys())
    }
    return run_queries(
        queries=queries,
        parameters=queries_parameters,
        etl_results=etl_results,
        output_for_validation=output_for_validation,
    )
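# --- Hedged usage sketch (not part of the original source) ---
# Shows how the taxi-style etl_pandas above might be called for a list of CSV
# chunks in plain pandas mode. The wrapper name and argument values are
# hypothetical; q1_pandas..q4_pandas, run_queries, and the loaders are assumed
# to be defined alongside the function in the benchmark harness.
def example_run_taxi_etl(taxi_files, taxi_columns_names, taxi_columns_types):
    return etl_pandas(
        filename=taxi_files,                # e.g. a list of per-month trip CSV paths
        files_limit=len(taxi_files),
        columns_names=taxi_columns_names,
        columns_types=taxi_columns_types,
        output_for_validation=None,         # no validation output collected
        pandas_mode="Pandas",               # "Modin_on_omnisci" switches loaders above
    )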