def run_benchmark(parameters):
    ignored_parameters = {
        "optimizer": parameters["optimizer"],
        "no_ml": parameters["no_ml"],
        "gpu_memory": parameters["gpu_memory"],
    }
    warnings.warn(f"Parameters {ignored_parameters} are ignored", RuntimeWarning)

    parameters["data_file"] = parameters["data_file"].replace("'", "")

    columns_names = [
        "trip_id", "vendor_id", "pickup_datetime", "dropoff_datetime",
        "store_and_fwd_flag", "rate_code_id", "pickup_longitude", "pickup_latitude",
        "dropoff_longitude", "dropoff_latitude", "passenger_count", "trip_distance",
        "fare_amount", "extra", "mta_tax", "tip_amount",
        "tolls_amount", "ehail_fee", "improvement_surcharge", "total_amount",
        "payment_type", "trip_type", "pickup", "dropoff",
        "cab_type", "precipitation", "snow_depth", "snowfall",
        "max_temperature", "min_temperature", "average_wind_speed", "pickup_nyct2010_gid",
        "pickup_ctlabel", "pickup_borocode", "pickup_boroname", "pickup_ct2010",
        "pickup_boroct2010", "pickup_cdeligibil", "pickup_ntacode", "pickup_ntaname",
        "pickup_puma", "dropoff_nyct2010_gid", "dropoff_ctlabel", "dropoff_borocode",
        "dropoff_boroname", "dropoff_ct2010", "dropoff_boroct2010", "dropoff_cdeligibil",
        "dropoff_ntacode", "dropoff_ntaname", "dropoff_puma",
    ]
    columns_types = [
        "int64", "int64", "timestamp", "timestamp", "string", "int64", "float64", "float64",
        "float64", "float64", "int64", "float64", "float64", "float64", "float64", "float64",
        "float64", "float64", "float64", "float64", "int64", "float64", "string", "string",
        "string", "float64", "int64", "float64", "int64", "int64", "float64", "float64",
        "float64", "float64", "string", "float64", "float64", "string", "string", "string",
        "float64", "float64", "float64", "float64", "string", "float64", "float64", "string",
        "string", "string", "float64",
    ]

    if parameters["dfiles_num"] <= 0:
        print("Bad number of data files specified: ", parameters["dfiles_num"])
        sys.exit(1)

    try:
        import_pandas_into_module_namespace(
            namespace=run_benchmark.__globals__,
            mode=parameters["pandas_mode"],
            ray_tmpdir=parameters["ray_tmpdir"],
            ray_memory=parameters["ray_memory"],
        )

        etl_times_ibis = None
        if not parameters["no_ibis"]:
            etl_times_ibis = etl_ibis(
                filename=parameters["data_file"],
                files_limit=parameters["dfiles_num"],
                columns_names=columns_names,
                columns_types=columns_types,
                database_name=parameters["database_name"],
                table_name=parameters["table"],
                omnisci_server_worker=parameters["omnisci_server_worker"],
                delete_old_database=not parameters["dnd"],
                ipc_connection=parameters["ipc_connection"],
                create_new_table=not parameters["dni"],
                validation=parameters["validation"],
            )

            print_results(results=etl_times_ibis, backend="Ibis", unit="ms")
            etl_times_ibis["Backend"] = "Ibis"

        pandas_files_limit = parameters["dfiles_num"]
        filename = files_names_from_pattern(parameters["data_file"])[:pandas_files_limit]
        etl_times = etl_pandas(
            filename=filename,
            files_limit=pandas_files_limit,
            columns_names=columns_names,
            columns_types=columns_types,
        )

        print_results(results=etl_times, backend=parameters["pandas_mode"], unit="ms")
        etl_times["Backend"] = parameters["pandas_mode"]

        return {"ETL": [etl_times_ibis, etl_times], "ML": []}
    except Exception:
        traceback.print_exc(file=sys.stdout)
        sys.exit(1)
def etl_ibis(
    filename,
    files_limit,
    columns_names,
    columns_types,
    database_name,
    table_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    validation,
):
    queries = {
        "Query1": q1_ibis,
        "Query2": q2_ibis,
        "Query3": q3_ibis,
        "Query4": q4_ibis,
    }
    etl_times = {x: 0.0 for x in queries.keys()}

    queries_validation_results = {"q%s" % i: False for i in range(1, 5)}
    queries_validation_flags = {"q%s" % i: False for i in range(1, 5)}

    omnisci_server_worker.connect_to_server()

    data_files_names = files_names_from_pattern(filename)
    if len(data_files_names) == 0:
        print("Could not find any data files matching ", filename)
        sys.exit(2)

    omnisci_server_worker.create_database(database_name, delete_if_exists=delete_old_database)

    if create_new_table:
        # TODO t_import_pandas, t_import_ibis = omnisci_server_worker.import_data_by_ibis
        t0 = time.time()
        omnisci_server_worker.import_data(
            table_name=table_name,
            data_files_names=data_files_names,
            files_limit=files_limit,
            columns_names=columns_names,
            columns_types=columns_types,
            header=False,
        )
        etl_times["t_readcsv"] = time.time() - t0
        # etl_times["t_readcsv"] = t_import_pandas + t_import_ibis

    omnisci_server_worker.connect_to_server(database=database_name, ipc=ipc_connection)
    table = omnisci_server_worker.database(database_name).table(table_name)

    df_pandas = None
    if validation:
        df_pandas = validation_prereqs(
            omnisci_server_worker, data_files_names, files_limit, columns_names
        )

    queries_parameters = {
        "table": table,
        "df_pandas": df_pandas,
        "queries_validation_results": queries_validation_results,
        "queries_validation_flags": queries_validation_flags,
        "validation": validation,
    }
    return run_queries(queries=queries, parameters=queries_parameters, etl_times=etl_times)
def etl_ibis(
    filename,
    files_limit,
    columns_names,
    columns_types,
    database_name,
    table_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    input_for_validation,
    import_mode,
    fragments_size,
    debug_mode,
):
    import ibis

    fragments_size = check_fragments_size(fragments_size, count_table=1, import_mode=import_mode)

    queries = {"Query1": q1_ibis, "Query2": q2_ibis, "Query3": q3_ibis, "Query4": q4_ibis}
    etl_results = {x: 0.0 for x in queries.keys()}
    etl_results["t_readcsv"] = 0.0
    etl_results["t_connect"] = 0.0

    omnisci_server_worker.connect_to_server()

    data_files_names = files_names_from_pattern(filename)
    if len(data_files_names) == 0:
        raise FileNotFoundError(f"Could not find any data files matching: [{filename}]")

    data_files_extension = data_files_names[0].split(".")[-1]
    if not all([name.endswith(data_files_extension) for name in data_files_names]):
        raise NotImplementedError(
            "Import of data files with different extensions is not supported"
        )

    omnisci_server_worker.create_database(database_name, delete_if_exists=delete_old_database)

    # Create table and import data for ETL queries
    if create_new_table:
        schema_table = ibis.Schema(names=columns_names, types=columns_types)

        if import_mode == "copy-from":
            t0 = timer()
            omnisci_server_worker.create_table(
                table_name=table_name,
                schema=schema_table,
                database=database_name,
                fragment_size=fragments_size[0],
            )
            etl_results["t_connect"] += timer() - t0

            table_import = omnisci_server_worker.database(database_name).table(table_name)
            etl_results["t_connect"] += omnisci_server_worker.get_conn_creation_time()

            for file_to_import in data_files_names[:files_limit]:
                t0 = timer()
                table_import.read_csv(file_to_import, header=False, quotechar='"', delimiter=",")
                etl_results["t_readcsv"] += timer() - t0

        elif import_mode == "pandas":
            # pymapd's load_table (called recursively by import_data_by_ibis) needs
            # homogeneous data; since the vendor_id and payment_type fields of the
            # trips_xad file contain text data, the workaround and check below are used
            columns_types[1] = "int64"
            columns_types[20] = "int64"
            files_names = [
                file_path.split("/")[-1].split(".")[0]
                for file_path in data_files_names[:files_limit]
            ]
            if not all(
                [
                    file_name in accepted_data_files_for_pandas_import_mode
                    for file_name in files_names
                ]
            ):
                raise AttributeError(
                    f"pandas import_mode is supported only for "
                    f"{accepted_data_files_for_pandas_import_mode} data files, "
                    f"actually passed {files_names}"
                )

            t_import_pandas, t_import_ibis = omnisci_server_worker.import_data_by_ibis(
                table_name=table_name,
                data_files_names=data_files_names,
                files_limit=files_limit,
                columns_names=columns_names,
                columns_types=columns_types,
                header=None,
                nrows=None,
                compression_type="gzip" if data_files_extension == "gz" else None,
                use_columns_types_for_pd=False,
            )
            etl_results["t_readcsv"] = t_import_pandas + t_import_ibis
            etl_results["t_connect"] = omnisci_server_worker.get_conn_creation_time()

        elif import_mode == "fsi":
            with FilesCombiner(
                data_files_names=data_files_names,
                combined_filename=f"taxibench-{files_limit}--files-fsi.csv",
                files_limit=files_limit,
            ) as data_file_path:
                t0 = timer()
                omnisci_server_worker.get_conn().create_table_from_csv(
                    table_name,
                    data_file_path,
                    schema_table,
                    header=False,
                    fragment_size=fragments_size[0],
                )
                etl_results["t_readcsv"] += timer() - t0
                etl_results["t_connect"] = omnisci_server_worker.get_conn_creation_time()

    # Second connection - this is ibis's ipc connection for DML
    omnisci_server_worker.connect_to_server(database_name, ipc=ipc_connection)
    etl_results["t_connect"] += omnisci_server_worker.get_conn_creation_time()

    t0 = timer()
    table = omnisci_server_worker.database(database_name).table(table_name)
    etl_results["t_connect"] += timer() - t0

    queries_parameters = {
        query_name: {
            "table": table,
            "input_for_validation": input_for_validation,
            "debug_mode": debug_mode,
        }
        for query_name in queries.keys()
    }
    return run_queries(queries=queries, parameters=queries_parameters, etl_results=etl_results)
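# Hypothetical illustration, not the repository's actual run_queries helper: a minimal
# dispatcher shaped like the call above, assuming each q*_ibis callable accepts the keyword
# arguments prepared in queries_parameters and returns its elapsed time, which is recorded
# into etl_results under the query name. The real helper lives elsewhere in this repository
# and may differ in how it accumulates timings.
def run_queries_sketch(queries, parameters, etl_results):
    for query_name, query_func in queries.items():
        # parameters is keyed by query name; each value is the kwargs dict for that query
        etl_results[query_name] = query_func(**parameters[query_name])
    return etl_results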
def run_benchmark(parameters):
    check_support(parameters, unsupported_params=["optimizer", "no_ml", "gpu_memory"])

    parameters["data_file"] = parameters["data_file"].replace("'", "")

    columns_names = [
        "trip_id", "vendor_id", "pickup_datetime", "dropoff_datetime",
        "store_and_fwd_flag", "rate_code_id", "pickup_longitude", "pickup_latitude",
        "dropoff_longitude", "dropoff_latitude", "passenger_count", "trip_distance",
        "fare_amount", "extra", "mta_tax", "tip_amount",
        "tolls_amount", "ehail_fee", "improvement_surcharge", "total_amount",
        "payment_type", "trip_type", "pickup", "dropoff",
        "cab_type", "precipitation", "snow_depth", "snowfall",
        "max_temperature", "min_temperature", "average_wind_speed", "pickup_nyct2010_gid",
        "pickup_ctlabel", "pickup_borocode", "pickup_boroname", "pickup_ct2010",
        "pickup_boroct2010", "pickup_cdeligibil", "pickup_ntacode", "pickup_ntaname",
        "pickup_puma", "dropoff_nyct2010_gid", "dropoff_ctlabel", "dropoff_borocode",
        "dropoff_boroname", "dropoff_ct2010", "dropoff_boroct2010", "dropoff_cdeligibil",
        "dropoff_ntacode", "dropoff_ntaname", "dropoff_puma",
    ]
    columns_types = [
        "int64", "category", "timestamp", "timestamp", "category", "int64", "float64", "float64",
        "float64", "float64", "int64", "float64", "float64", "float64", "float64", "float64",
        "float64", "float64", "float64", "float64", "category", "float64", "category", "category",
        "category", "float64", "int64", "float64", "int64", "int64", "float64", "float64",
        "float64", "float64", "category", "float64", "float64", "category", "category", "category",
        "float64", "float64", "float64", "float64", "category", "float64", "float64", "category",
        "category", "category", "float64",
    ]

    if parameters["dfiles_num"] <= 0:
        raise ValueError(f"Bad number of data files specified: {parameters['dfiles_num']}")

    if not parameters["no_pandas"]:
        import_pandas_into_module_namespace(
            namespace=run_benchmark.__globals__,
            mode=parameters["pandas_mode"],
            ray_tmpdir=parameters["ray_tmpdir"],
            ray_memory=parameters["ray_memory"],
        )

    etl_results_ibis = None
    etl_results = None
    pd_queries_outputs = {} if parameters["validation"] else None

    if not parameters["no_pandas"]:
        pandas_files_limit = parameters["dfiles_num"]
        filename = files_names_from_pattern(parameters["data_file"])[:pandas_files_limit]
        etl_results = etl_pandas(
            filename=filename,
            files_limit=pandas_files_limit,
            columns_names=columns_names,
            columns_types=columns_types,
            output_for_validation=pd_queries_outputs,
            pandas_mode=parameters["pandas_mode"],
        )

        print_results(results=etl_results, backend=parameters["pandas_mode"], unit="ms")
        etl_results["Backend"] = parameters["pandas_mode"]
        etl_results["dfiles_num"] = parameters["dfiles_num"]
        etl_results["dataset_size"] = get_ny_taxi_dataset_size(parameters["dfiles_num"])

    if not parameters["no_ibis"]:
        etl_results_ibis = etl_ibis(
            filename=parameters["data_file"],
            files_limit=parameters["dfiles_num"],
            columns_names=columns_names,
            columns_types=columns_types,
            database_name=parameters["database_name"],
            table_name=parameters["table"],
            omnisci_server_worker=parameters["omnisci_server_worker"],
            delete_old_database=not parameters["dnd"],
            ipc_connection=parameters["ipc_connection"],
            create_new_table=not parameters["dni"],
            input_for_validation=pd_queries_outputs,
            import_mode=parameters["import_mode"],
            fragments_size=parameters["fragments_size"],
            debug_mode=parameters["debug_mode"],
        )

        print_results(results=etl_results_ibis, backend="Ibis", unit="ms")
        etl_results_ibis["Backend"] = "Ibis"
        etl_results_ibis["dfiles_num"] = parameters["dfiles_num"]
        etl_results_ibis["dataset_size"] = get_ny_taxi_dataset_size(parameters["dfiles_num"])

    return {"ETL": [etl_results_ibis, etl_results], "ML": []}
def queries_modin(filename, pandas_mode, extended_functionality):
    data_files_names = files_names_from_pattern(filename)
    data_for_groupby_queries = []
    data_for_join_queries = []
    for f in data_files_names:
        if f.split("/")[-1].startswith("G1"):
            data_for_groupby_queries.append(f)
        elif f.split("/")[-1].startswith("J1"):
            data_for_join_queries.append(f)
        else:
            raise AttributeError(f"Unrecognized file passed via the -data_file flag: {f}")

    groupby_queries_files_number = len(data_for_groupby_queries)
    join_queries_files_number = len(data_for_join_queries)
    accepted_number_of_files_for_join_queries = [0, 1, 4]

    if all([groupby_queries_files_number, join_queries_files_number]):
        raise AttributeError(
            "Only one type of queries (groupby or join) can be executed during one run, "
            "but files for both query types were passed with the -data_file flag"
        )
    elif groupby_queries_files_number > 1:
        raise AttributeError(
            f"Only one file per run is accepted for groupby queries, "
            f"actually passed {groupby_queries_files_number}: {data_for_groupby_queries}"
        )
    elif join_queries_files_number not in accepted_number_of_files_for_join_queries:
        raise AttributeError(
            f"Accepted numbers of files for join queries are "
            f"{accepted_number_of_files_for_join_queries}, "
            f"actually passed {join_queries_files_number}: {data_for_join_queries}"
        )
    elif join_queries_files_number and sum("NA" in f for f in data_for_join_queries) != 1:
        raise FileNotFoundError(
            "Data files for join queries should contain exactly one file "
            "with an NA component in the file name"
        )

    queries_results_fields = ["t_run1", "chk_t_run1", "t_run2", "chk_t_run2"]

    if groupby_queries_files_number:
        print(f"loading dataset {data_for_groupby_queries[0]}")
        t0 = timer()
        x = pd.read_csv(data_for_groupby_queries[0])
        x_data_file_import_time = timer() - t0

        queries = {
            "groupby_query1": groupby_query1_modin,
            "groupby_query2": groupby_query2_modin,
            "groupby_query3": groupby_query3_modin,
            "groupby_query4": groupby_query4_modin,
            "groupby_query5": groupby_query5_modin,
            "groupby_query6": groupby_query6_modin,
            "groupby_query7": groupby_query7_modin,
            "groupby_query8": groupby_query8_modin,
            "groupby_query9": groupby_query9_modin,
            "groupby_query10": groupby_query10_modin,
        }
        if pandas_mode == "Modin_on_omnisci":
            del queries["groupby_query6"]  # NotImplementedError: unsupported aggregate median
            del queries["groupby_query8"]  # Query execution in `Modin_on_omnisci` mode is under development
            del queries["groupby_query9"]  # core dumped issue
            del queries["groupby_query10"]  # core dumped issue

        queries_results = {x: {y: 0.0 for y in queries_results_fields} for x in queries.keys()}
        x_data_file_size = getsize(data_for_groupby_queries[0])
        query_data_file_sizes = {x: x_data_file_size for x in queries.keys()}
        query_data_file_import_times = {x: x_data_file_import_time for x in queries.keys()}
        queries_parameters = {
            "x": x,
            "queries_results": queries_results,
            "extended_functionality": extended_functionality,
        }

    if join_queries_files_number:
        data_name = next(
            (f for f in data_for_join_queries if "NA" in f), None
        )  # gets the file name with the "NA" component
        data_files_paths, data_files_sizes = join_to_tbls(data_name)

        data_files_import_times = {}
        data_df = {}
        print(f"loading dataset {[path for path in data_files_paths.values()]}")
        for data_id, data_path in data_files_paths.items():
            t0 = timer()
            data_df[data_id] = pd.read_csv(data_path)
            data_files_import_times[data_id] = timer() - t0

        print(len(data_df["x"].index), flush=True)
        print(len(data_df["small"].index), flush=True)
        print(len(data_df["medium"].index), flush=True)
        print(len(data_df["big"].index), flush=True)

        queries = {
            "join_query1": join_query1_modin,
            "join_query2": join_query2_modin,
            "join_query3": join_query3_modin,
            "join_query4": join_query4_modin,
            "join_query5": join_query5_modin,
        }
        queries_results = {x: {y: 0.0 for y in queries_results_fields} for x in queries.keys()}
        queries_parameters = {
            "x": data_df["x"],
            "ys": [data_df["small"], data_df["medium"], data_df["big"]],
            "queries_results": queries_results,
            "extended_functionality": extended_functionality,
        }
        query_data_file_sizes = {
            "join_query1": data_files_sizes["x"] + data_files_sizes["small"],
            "join_query2": data_files_sizes["x"] + data_files_sizes["medium"],
            "join_query3": data_files_sizes["x"] + data_files_sizes["medium"],
            "join_query4": data_files_sizes["x"] + data_files_sizes["medium"],
            "join_query5": data_files_sizes["x"] + data_files_sizes["big"],
        }
        query_data_file_import_times = {
            "join_query1": data_files_import_times["x"] + data_files_import_times["small"],
            "join_query2": data_files_import_times["x"] + data_files_import_times["medium"],
            "join_query3": data_files_import_times["x"] + data_files_import_times["medium"],
            "join_query4": data_files_import_times["x"] + data_files_import_times["medium"],
            "join_query5": data_files_import_times["x"] + data_files_import_times["big"],
        }

    for query_name, query_func in queries.items():
        query_func(**queries_parameters)
        print(f"{pandas_mode} {query_name} results:")
        print_results(results=queries_results[query_name], unit="s")
        queries_results[query_name]["Backend"] = pandas_mode
        queries_results[query_name]["t_readcsv"] = query_data_file_import_times[query_name]
        queries_results[query_name]["dataset_size"] = query_data_file_sizes[query_name]

    return queries_results