def main():
    """Entry point for the census benchmark runner.

    Parses CLI options, optionally runs the Ibis (OmniSci-backed) ETL + ML
    path, then always runs the Pandas/Modin ETL + ML path, and optionally
    validates the Ibis results against the Pandas results.

    Exits with status 1 on any failure; always terminates a launched
    OmniSci server in the ``finally`` block.
    """
    # Kept for the finally-clause cleanup: only set once the server launches.
    omnisci_server = None

    parser = argparse.ArgumentParser(
        description="Run internal tests from ibis project")
    optional = parser._action_groups.pop()
    required = parser.add_argument_group("required arguments")
    parser._action_groups.append(optional)

    required.add_argument(
        "-f",
        "--file",
        dest="file",
        required=True,
        help="A datafile that should be loaded",
    )
    optional.add_argument("-dnd", action="store_true",
                          help="Do not delete old table.")
    optional.add_argument(
        "-dni",
        action="store_true",
        help="Do not create new table and import any data from CSV files.",
    )
    optional.add_argument(
        "-val",
        action="store_true",
        help="validate queries results (by comparison with Pandas queries results).",
    )
    optional.add_argument(
        "-o",
        "--optimizer",
        choices=["intel", "stock"],
        dest="optimizer",
        default="intel",
        help="Which optimizer is used",
    )
    # MySQL database parameters (where benchmark results get stored).
    optional.add_argument(
        "-db-server",
        dest="db_server",
        default="localhost",
        help="Host name of MySQL server.",
    )
    optional.add_argument(
        "-db-port",
        dest="db_port",
        default=3306,
        type=int,
        help="Port number of MySQL server.",
    )
    optional.add_argument(
        "-db-user",
        dest="db_user",
        default="",
        help="Username to use to connect to MySQL database. "
        "If user name is specified, script attempts to store results in MySQL "
        "database using other -db-* parameters.",
    )
    optional.add_argument(
        "-db-pass",
        dest="db_password",
        default="omniscidb",
        help="Password to use to connect to MySQL database.",
    )
    optional.add_argument(
        "-db-name",
        dest="db_name",
        default="omniscidb",
        help="MySQL database to use to store benchmark results.",
    )
    optional.add_argument(
        "-db-table",
        dest="db_table",
        help="Table to use to store results for this benchmark.",
    )
    # Omnisci server parameters.
    optional.add_argument(
        "-e",
        "--executable",
        dest="omnisci_executable",
        required=False,
        help="Path to omnisci_server executable.",
    )
    optional.add_argument(
        "-w",
        "--workdir",
        dest="omnisci_cwd",
        help="Path to omnisci working directory. "
        "By default parent directory of executable location is used. "
        "Data directory is used in this location.",
    )
    optional.add_argument(
        "-port",
        "--omnisci_port",
        dest="omnisci_port",
        default=6274,
        type=int,
        help="TCP port number to run omnisci_server on.",
    )
    optional.add_argument(
        "-u",
        "--user",
        dest="user",
        default="admin",
        help="User name to use on omniscidb server.",
    )
    optional.add_argument(
        "-p",
        "--password",
        dest="password",
        default="HyperInteractive",
        help="User password to use on omniscidb server.",
    )
    optional.add_argument(
        "-n",
        "--name",
        dest="name",
        default="census_database",
        help="Database name to use in omniscidb server.",
    )
    optional.add_argument(
        "-t",
        "--table",
        dest="table",
        default="census_table",
        help="Table name name to use in omniscidb server.",
    )
    optional.add_argument(
        "-commit_omnisci",
        dest="commit_omnisci",
        default="1234567890123456789012345678901234567890",
        help="Omnisci commit hash to use for benchmark.",
    )
    optional.add_argument(
        "-commit_ibis",
        dest="commit_ibis",
        default="1234567890123456789012345678901234567890",
        help="Ibis commit hash to use for benchmark.",
    )
    optional.add_argument(
        "-no_ibis",
        action="store_true",
        help="Do not run Ibis benchmark, run only Pandas (or Modin) version")
    optional.add_argument(
        "-pandas_mode",
        choices=["pandas", "modin_on_ray", "modin_on_dask"],
        default="pandas",
        help="Specifies which version of Pandas to use: plain Pandas, Modin runing on Ray or on Dask")
    optional.add_argument(
        "-ray_tmpdir",
        default="/tmp",
        help="Location where to keep Ray plasma store. It should have enough space to keep -ray_memory")
    optional.add_argument(
        "-ray_memory",
        default=200 * 1024 * 1024 * 1024,
        help="Size of memory to allocate for Ray plasma store")
    optional.add_argument(
        "-no_ml",
        action="store_true",
        help="Do not run machine learning benchmark, only ETL part")

    args = parser.parse_args()
    # Strip stray single quotes that can survive shell quoting.
    args.file = args.file.replace("'", "")

    # ML specific constants.
    N_RUNS = 50
    TRAIN_SIZE = 0.9
    RANDOM_STATE = 777

    # Census dataset column schema shared by the Ibis and Pandas ETL paths.
    columns_names = [
        "YEAR0", "DATANUM", "SERIAL", "CBSERIAL", "HHWT", "CPI99", "GQ",
        "QGQ", "PERNUM", "PERWT", "SEX", "AGE", "EDUC", "EDUCD", "INCTOT",
        "SEX_HEAD", "SEX_MOM", "SEX_POP", "SEX_SP", "SEX_MOM2", "SEX_POP2",
        "AGE_HEAD", "AGE_MOM", "AGE_POP", "AGE_SP", "AGE_MOM2", "AGE_POP2",
        "EDUC_HEAD", "EDUC_MOM", "EDUC_POP", "EDUC_SP", "EDUC_MOM2",
        "EDUC_POP2", "EDUCD_HEAD", "EDUCD_MOM", "EDUCD_POP", "EDUCD_SP",
        "EDUCD_MOM2", "EDUCD_POP2", "INCTOT_HEAD", "INCTOT_MOM",
        "INCTOT_POP", "INCTOT_SP", "INCTOT_MOM2", "INCTOT_POP2",
    ]
    columns_types = [
        "int64", "int64", "int64", "float64", "int64", "float64", "int64",
        "float64", "int64", "int64", "int64", "int64", "int64", "int64",
        "int64", "float64", "float64", "float64", "float64", "float64",
        "float64", "float64", "float64", "float64", "float64", "float64",
        "float64", "float64", "float64", "float64", "float64", "float64",
        "float64", "float64", "float64", "float64", "float64", "float64",
        "float64", "float64", "float64", "float64", "float64", "float64",
        "float64",
    ]

    try:
        if not args.no_ibis:
            if args.omnisci_executable is None:
                parser.error(
                    "Omnisci executable should be specified with -e/--executable"
                )
            omnisci_server = OmnisciServer(
                omnisci_executable=args.omnisci_executable,
                omnisci_port=args.omnisci_port,
                database_name=args.name,
                user=args.user,
                password=args.password,
            )
            omnisci_server.launch()

            from server_worker import OmnisciServerWorker
            omnisci_server_worker = OmnisciServerWorker(omnisci_server)

            X_ibis, y_ibis, etl_times_ibis = etl_ibis(
                filename=args.file,
                columns_names=columns_names,
                columns_types=columns_types,
                database_name=args.name,
                table_name=args.table,
                omnisci_server_worker=omnisci_server_worker,
                delete_old_database=not args.dnd,
                create_new_table=not args.dni,
            )

            # The server is only needed for ETL; release it before ML.
            omnisci_server.terminate()
            omnisci_server = None

            print_times(etl_times_ibis, name='Ibis')

            if not args.no_ml:
                mse_mean, cod_mean, mse_dev, cod_dev, ml_times = ml(
                    X_ibis, y_ibis, RANDOM_STATE, N_RUNS, TRAIN_SIZE,
                    args.optimizer)
                print_times(ml_times)
                print("mean MSE ± deviation: {:.9f} ± {:.9f}".format(
                    mse_mean, mse_dev))
                print("mean COD ± deviation: {:.9f} ± {:.9f}".format(
                    cod_mean, cod_dev))

        import_pandas_into_module_namespace(main.__globals__,
                                            args.pandas_mode,
                                            args.ray_tmpdir, args.ray_memory)
        X, y, etl_times = etl_pandas(args.file,
                                     columns_names=columns_names,
                                     columns_types=columns_types)
        print_times(etl_times, name=args.pandas_mode)

        if not args.no_ml:
            mse_mean, cod_mean, mse_dev, cod_dev, ml_times = ml(
                X, y, RANDOM_STATE, N_RUNS, TRAIN_SIZE, args.optimizer)
            print_times(ml_times)
            print("mean MSE ± deviation: {:.9f} ± {:.9f}".format(
                mse_mean, mse_dev))
            print("mean COD ± deviation: {:.9f} ± {:.9f}".format(
                cod_mean, cod_dev))

        # BUGFIX: X_ibis/y_ibis exist only when the Ibis part ran; with
        # "-no_ibis -val" the old code raised NameError inside the except.
        if args.val and not args.no_ibis:
            compare_dataframes(ibis_df=(X_ibis, y_ibis), pandas_df=(X, y))
    except Exception as err:
        print("Failed: ", err)
        sys.exit(1)
    finally:
        if omnisci_server:
            omnisci_server.terminate()
def main():
    """Entry point of the generic benchmark runner.

    Builds the CLI, resolves the benchmark module from ``-bench_name``,
    optionally launches/terminates an OmniSci server around every iteration,
    and optionally reports converted ETL/ML timing results to MySQL.
    """
    args = None
    omnisci_server = None
    port_default_value = -1  # sentinel meaning "pick a free port at startup"

    # Maps CLI benchmark name -> module that provides run_benchmark().
    benchmarks = {
        "ny_taxi": "taxi",
        "santander": "santander",
        "census": "census",
        "plasticc": "plasticc",
        "mortgage": "mortgage",
        "h2o": "h2o",
    }
    # Only these benchmarks have an Ibis flavor and thus need a server.
    benchmarks_with_ibis_queries = [
        "ny_taxi", "santander", "census", "plasticc", "mortgage"
    ]
    # Fields stripped from results right before DB submission.
    ignore_fields_for_bd_report_etl = ["t_connect"]
    ignore_fields_for_bd_report_ml = []
    # Fields that must not go through time-unit conversion.
    ignore_fields_for_results_unit_conversion = [
        "Backend",
        "dfiles_num",
        "dataset_size",
        "query_name",
    ]

    parser = argparse.ArgumentParser(
        description="Run internal tests from ibis project")
    # Reorder groups so "required arguments" is printed before optional ones.
    optional = parser._action_groups.pop()
    required = parser.add_argument_group("required arguments")
    parser._action_groups.append(optional)

    required.add_argument(
        "-bench_name",
        dest="bench_name",
        choices=sorted(benchmarks.keys()),
        help="Benchmark name.",
        required=True,
    )
    required.add_argument(
        "-data_file",
        dest="data_file",
        help="A datafile that should be loaded.",
        required=True,
    )
    optional.add_argument(
        "-dfiles_num",
        dest="dfiles_num",
        default=None,
        type=int,
        help="Number of datafiles to input into database for processing.",
    )
    optional.add_argument(
        "-iterations",
        dest="iterations",
        default=1,
        type=int,
        help="Number of iterations to run every query. Best result is selected.",
    )
    optional.add_argument("-dnd",
                          default=False,
                          type=str_arg_to_bool,
                          help="Do not delete old table.")
    optional.add_argument(
        "-dni",
        default=False,
        type=str_arg_to_bool,
        help="Do not create new table and import any data from CSV files.",
    )
    optional.add_argument(
        "-validation",
        dest="validation",
        default=False,
        type=str_arg_to_bool,
        help="validate queries results (by comparison with Pandas queries results).",
    )
    optional.add_argument(
        "-import_mode",
        dest="import_mode",
        default="fsi",
        help="measure 'COPY FROM' import, FSI import, import through pandas",
    )
    optional.add_argument(
        "-optimizer",
        choices=["intel", "stock"],
        dest="optimizer",
        default=None,
        help="Which optimizer is used",
    )
    optional.add_argument(
        "-no_ibis",
        default=False,
        type=str_arg_to_bool,
        help="Do not run Ibis benchmark, run only Pandas (or Modin) version",
    )
    optional.add_argument(
        "-no_pandas",
        default=False,
        type=str_arg_to_bool,
        help="Do not run Pandas version of benchmark",
    )
    optional.add_argument(
        "-pandas_mode",
        choices=[
            "Pandas", "Modin_on_ray", "Modin_on_dask", "Modin_on_python",
            "Modin_on_omnisci"
        ],
        default="Pandas",
        help="Specifies which version of Pandas to use: plain Pandas, Modin runing on Ray or on Dask",
    )
    optional.add_argument(
        "-ray_tmpdir",
        default="/tmp",
        help="Location where to keep Ray plasma store. It should have enough space to keep -ray_memory",
    )
    optional.add_argument(
        "-ray_memory",
        default=200 * 1024 * 1024 * 1024,
        type=int,
        help="Size of memory to allocate for Ray plasma store",
    )
    optional.add_argument(
        "-no_ml",
        default=None,
        type=str_arg_to_bool,
        help="Do not run machine learning benchmark, only ETL part",
    )
    optional.add_argument(
        "-gpu_memory",
        dest="gpu_memory",
        type=int,
        help="specify the memory of your gpu"
        "(This controls the lines to be used. Also work for CPU version. )",
        default=None,
    )
    optional.add_argument(
        "-extended_functionality",
        dest="extended_functionality",
        default=False,
        type=str_arg_to_bool,
        help="Extends functionality of H2O benchmark by adding 'chk' functions and verbose local reporting of results",
    )
    # MySQL database parameters (benchmark-results storage).
    optional.add_argument(
        "-db_server",
        dest="db_server",
        default="localhost",
        help="Host name of MySQL server.",
    )
    optional.add_argument(
        "-db_port",
        dest="db_port",
        default=3306,
        type=int,
        help="Port number of MySQL server.",
    )
    optional.add_argument(
        "-db_user",
        dest="db_user",
        help="Username to use to connect to MySQL database. "
        "If user name is specified, script attempts to store results in MySQL "
        "database using other -db-* parameters.",
    )
    optional.add_argument(
        "-db_pass",
        dest="db_pass",
        default="omniscidb",
        help="Password to use to connect to MySQL database.",
    )
    optional.add_argument(
        "-db_name",
        dest="db_name",
        default="omniscidb",
        help="MySQL database to use to store benchmark results.",
    )
    optional.add_argument(
        "-db_table_etl",
        dest="db_table_etl",
        help="Table to use to store ETL results for this benchmark.",
    )
    optional.add_argument(
        "-db_table_ml",
        dest="db_table_ml",
        help="Table to use to store ML results for this benchmark.",
    )
    # Omnisci server parameters.
    optional.add_argument(
        "-executable",
        dest="executable",
        help="Path to omnisci_server executable.",
    )
    optional.add_argument(
        "-omnisci_cwd",
        dest="omnisci_cwd",
        help="Path to omnisci working directory. "
        "By default parent directory of executable location is used. "
        "Data directory is used in this location.",
    )
    optional.add_argument(
        "-port",
        dest="port",
        default=port_default_value,
        type=int,
        help="TCP port number to run omnisci_server on.",
    )
    optional.add_argument(
        "-http_port",
        dest="http_port",
        default=port_default_value,
        type=int,
        help="HTTP port number to run omnisci_server on.",
    )
    optional.add_argument(
        "-calcite_port",
        dest="calcite_port",
        default=port_default_value,
        type=int,
        help="Calcite port number to run omnisci_server on.",
    )
    optional.add_argument(
        "-user",
        dest="user",
        default="admin",
        help="User name to use on omniscidb server.",
    )
    optional.add_argument(
        "-password",
        dest="password",
        default="HyperInteractive",
        help="User password to use on omniscidb server.",
    )
    optional.add_argument(
        "-database_name",
        dest="database_name",
        default="omnisci",
        help="Database name to use in omniscidb server.",
    )
    optional.add_argument(
        "-table",
        dest="table",
        default="benchmark_table",
        help="Table name name to use in omniscidb server.",
    )
    # NOTE(review): help text below looks copy-pasted from -table — presumably
    # this flag toggles the IPC connection to the server; confirm and fix text.
    optional.add_argument(
        "-ipc_conn",
        dest="ipc_conn",
        default=True,
        type=str_arg_to_bool,
        help="Table name name to use in omniscidb server.",
    )
    optional.add_argument(
        "-debug_timer",
        dest="debug_timer",
        default=False,
        type=str_arg_to_bool,
        help="Enable fine-grained query execution timers for debug.",
    )
    optional.add_argument(
        "-columnar_output",
        dest="columnar_output",
        default=True,
        type=str_arg_to_bool,
        help="Allows OmniSci Core to directly materialize intermediate projections \
and the final ResultSet in Columnar format where appropriate.",
    )
    optional.add_argument(
        "-lazy_fetch",
        dest="lazy_fetch",
        default=None,
        type=str_arg_to_bool,
        help="[lazy_fetch help message]",
    )
    optional.add_argument(
        "-multifrag_rs",
        dest="multifrag_rs",
        default=None,
        type=str_arg_to_bool,
        help="[multifrag_rs help message]",
    )
    optional.add_argument(
        "-fragments_size",
        dest="fragments_size",
        default=None,
        nargs="*",
        type=int,
        help="Number of rows per fragment that is a unit of the table for query processing. \
Should be specified for each table in workload",
    )
    optional.add_argument(
        "-omnisci_run_kwargs",
        dest="omnisci_run_kwargs",
        default={},
        metavar="KEY1=VAL1,KEY2=VAL2...",
        action=KeyValueListParser,
        help="options to start omnisci server",
    )
    # Additional information recorded with reported results.
    optional.add_argument(
        "-commit_omnisci",
        dest="commit_omnisci",
        default="1234567890123456789012345678901234567890",
        help="Omnisci commit hash used for benchmark.",
    )
    optional.add_argument(
        "-commit_ibis",
        dest="commit_ibis",
        default="1234567890123456789012345678901234567890",
        help="Ibis commit hash used for benchmark.",
    )
    optional.add_argument(
        "-commit_omniscripts",
        dest="commit_omniscripts",
        default="1234567890123456789012345678901234567890",
        help="Omniscripts commit hash used for benchmark.",
    )
    optional.add_argument(
        "-commit_modin",
        dest="commit_modin",
        default="1234567890123456789012345678901234567890",
        help="Modin commit hash used for benchmark.",
    )
    optional.add_argument(
        "-debug_mode",
        dest="debug_mode",
        default=False,
        type=str_arg_to_bool,
        help="Enable debug mode.",
    )

    # No `except` clause: exceptions propagate to the caller after cleanup.
    try:
        os.environ["PYTHONIOENCODING"] = "UTF-8"
        os.environ["PYTHONUNBUFFERED"] = "1"
        omnisci_server_worker = None
        omnisci_server = None

        args = parser.parse_args()

        # A server is launched only for benchmarks that have Ibis queries.
        launch_omnisci_server = (not args.no_ibis and
                                 args.bench_name in benchmarks_with_ibis_queries)

        # Resolve the port sentinels to actually free ports.
        if args.port == port_default_value:
            args.port = find_free_port()
        if args.http_port == port_default_value:
            args.http_port = find_free_port()
        if args.calcite_port == port_default_value:
            args.calcite_port = find_free_port()

        # Dynamic import of the benchmark module selected on the CLI.
        run_benchmark = __import__(benchmarks[args.bench_name]).run_benchmark

        parameters = {
            "data_file": args.data_file,
            "dfiles_num": args.dfiles_num,
            "no_ml": args.no_ml,
            "no_ibis": args.no_ibis,
            "optimizer": args.optimizer,
            "pandas_mode": args.pandas_mode,
            "ray_tmpdir": args.ray_tmpdir,
            "ray_memory": args.ray_memory,
            "gpu_memory": args.gpu_memory,
            "validation": args.validation,
            "no_pandas": args.no_pandas,
            "debug_mode": args.debug_mode,
            "extended_functionality": args.extended_functionality,
        }

        if launch_omnisci_server:
            if args.executable is None:
                parser.error(
                    "Omnisci executable should be specified with -e/--executable for Ibis part"
                )
            from server import OmnisciServer
            omnisci_server = OmnisciServer(
                omnisci_executable=args.executable,
                omnisci_port=args.port,
                http_port=args.http_port,
                calcite_port=args.calcite_port,
                database_name=args.database_name,
                omnisci_cwd=args.omnisci_cwd,
                user=args.user,
                password=args.password,
                debug_timer=args.debug_timer,
                columnar_output=args.columnar_output,
                lazy_fetch=args.lazy_fetch,
                multifrag_rs=args.multifrag_rs,
                omnisci_run_kwargs=args.omnisci_run_kwargs,
            )
            # Server-specific knobs only make sense when Ibis runs.
            parameters["database_name"] = args.database_name
            parameters["table"] = args.table
            parameters["dnd"] = args.dnd
            parameters["dni"] = args.dni
            parameters["import_mode"] = args.import_mode
            parameters["fragments_size"] = args.fragments_size

        # Validation compares Ibis vs Pandas, so both must actually run.
        if parameters["validation"] and (parameters["no_pandas"] or
                                         parameters["no_ibis"]):
            parameters["validation"] = False
            print(
                "WARNING: validation was turned off as it requires both sides to compare."
            )

        etl_results = []
        ml_results = []
        print(parameters)
        run_id = int(round(time.time()))
        for iter_num in range(1, args.iterations + 1):
            print(f"Iteration #{iter_num}")
            if launch_omnisci_server:
                # Fresh worker + freshly launched server per iteration.
                from server_worker import OmnisciServerWorker
                omnisci_server_worker = OmnisciServerWorker(omnisci_server)
                parameters["omnisci_server_worker"] = omnisci_server_worker
                parameters["ipc_connection"] = args.ipc_conn
                omnisci_server.launch()

            # Expand $VARS in every string-valued parameter.
            parameters = {
                key: os.path.expandvars(value) if isinstance(value, str) else value
                for key, value in parameters.items()
            }
            benchmark_results = run_benchmark(parameters)

            if launch_omnisci_server:
                omnisci_server_worker.terminate()
                omnisci_server.terminate()

            additional_fields_for_reporting = {
                "ETL": {"Iteration": iter_num, "run_id": run_id},
                "ML": {"Iteration": iter_num, "run_id": run_id},
            }
            etl_ml_results = refactor_results_for_reporting(
                benchmark_results=benchmark_results,
                ignore_fields_for_results_unit_conversion=ignore_fields_for_results_unit_conversion,
                additional_fields=additional_fields_for_reporting,
                reporting_unit="ms",
            )
            etl_results = list(etl_ml_results["ETL"])
            ml_results = list(etl_ml_results["ML"])

            # Reporting to MySQL database.
            if args.db_user is not None:
                if iter_num == 1:
                    # Connect and create reporters once, on the first iteration.
                    db = mysql.connector.connect(
                        host=args.db_server,
                        port=args.db_port,
                        user=args.db_user,
                        passwd=args.db_pass,
                        db=args.db_name,
                    )
                    reporting_init_fields = {
                        "OmnisciCommitHash": args.commit_omnisci,
                        "IbisCommitHash": args.commit_ibis,
                        "OmniscriptsCommitHash": args.commit_omniscripts,
                        "ModinCommitHash": args.commit_modin,
                    }
                    # Schema is derived from the first (and, if present,
                    # second) result dict's keys.
                    reporting_fields_benchmark_etl = {
                        x: "VARCHAR(500) NOT NULL" for x in etl_results[0]
                    }
                    if len(etl_results) != 1:
                        reporting_fields_benchmark_etl.update({
                            x: "VARCHAR(500) NOT NULL" for x in etl_results[1]
                        })
                    db_reporter_etl = DbReport(
                        db,
                        args.db_table_etl,
                        reporting_fields_benchmark_etl,
                        reporting_init_fields,
                    )
                    if len(ml_results) != 0:
                        reporting_fields_benchmark_ml = {
                            x: "VARCHAR(500) NOT NULL" for x in ml_results[0]
                        }
                        if len(ml_results) != 1:
                            reporting_fields_benchmark_ml.update({
                                x: "VARCHAR(500) NOT NULL" for x in ml_results[1]
                            })
                        db_reporter_ml = DbReport(
                            db,
                            args.db_table_ml,
                            reporting_fields_benchmark_ml,
                            reporting_init_fields,
                        )
                # Submit only after the last iteration.
                if iter_num == args.iterations:
                    for result_etl in etl_results:
                        remove_fields_from_dict(result_etl,
                                                ignore_fields_for_bd_report_etl)
                        db_reporter_etl.submit(result_etl)
                    if len(ml_results) != 0:
                        for result_ml in ml_results:
                            remove_fields_from_dict(result_ml,
                                                    ignore_fields_for_bd_report_ml)
                            db_reporter_ml.submit(result_ml)
    finally:
        if omnisci_server_worker:
            omnisci_server_worker.terminate()
        if omnisci_server:
            omnisci_server.terminate()
def main():
    """Entry point for the mortgage-style benchmark runner.

    Runs the Ibis ETL/split/ML path (unless ``-no_ibis``), then the Pandas
    path, and optionally validates the two result sets against each other.
    The OmniSci worker is always terminated in the ``finally`` block.
    """
    omnisci_server_worker = None
    # Kept around so the validation step can tell whether the Ibis part ran.
    train_final, test_final = None, None

    parser, args, skip_rows = get_args()

    try:
        if not args.no_ibis:
            sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
            from server import OmnisciServer

            if args.omnisci_executable is None:
                parser.error(
                    "Omnisci executable should be specified with -e/--executable"
                )
            omnisci_server = OmnisciServer(
                omnisci_executable=args.omnisci_executable,
                omnisci_port=args.omnisci_port,
                database_name=args.name,
                omnisci_cwd=args.omnisci_cwd,
                user=args.user,
                password=args.password,
            )
            omnisci_server.launch()

            from server_worker import OmnisciServerWorker
            omnisci_server_worker = OmnisciServerWorker(omnisci_server)

            train_final, test_final, etl_times = etl_all_ibis(
                filename=args.dataset_path,
                database_name=args.name,
                omnisci_server_worker=omnisci_server_worker,
                delete_old_database=not args.dnd,
                create_new_table=not args.dni,
                skip_rows=skip_rows,
                validation=args.val,
            )
            ml_data, etl_times = split_step(train_final, test_final,
                                            etl_times)
            print_times(etl_times)

            # Worker is no longer needed once the data is materialized.
            omnisci_server_worker.terminate()
            omnisci_server_worker = None

            if not args.no_ml:
                print("using ml with dataframes from ibis")
                ml_times = ml(ml_data)
                print_times(ml_times)

        # Pandas path always runs.
        ptrain_final, ptest_final, petl_times = etl_all_pandas(
            args.dataset_path, skip_rows)
        ml_data, petl_times = split_step(ptrain_final, ptest_final,
                                         petl_times)
        print_times(petl_times)

        if not args.no_ml:
            print("using ml with dataframes from pandas")
            ml_times = ml(ml_data)
            print_times(ml_times)

        # FIX: use "is not None" (PEP 8) instead of "not x is None".
        if args.val and train_final is not None and test_final is not None:
            print("validating result ...")
            compare_dataframes((train_final, test_final),
                               (ptrain_final, ptest_final))
    finally:
        if omnisci_server_worker:
            omnisci_server_worker.terminate()
def main():
    """Entry point of the legacy benchmark runner.

    Builds the CLI, imports the selected benchmark's ``run_benchmark``,
    optionally manages an OmniSci server per iteration, and optionally
    reports ETL/ML results to a MySQL database. Exits with status 1 on
    any exception (traceback printed to stdout).
    """
    args = None
    omnisci_server = None
    # Initialized BEFORE the try so the finally clause can never NameError.
    omnisci_server_worker = None
    port_default_value = -1  # sentinel meaning "pick a free port at startup"
    benchmarks = ["ny_taxi", "santander", "census", "plasticc"]

    parser = argparse.ArgumentParser(
        description="Run internal tests from ibis project")
    optional = parser._action_groups.pop()
    required = parser.add_argument_group("required arguments")
    parser._action_groups.append(optional)

    required.add_argument(
        "-bench_name",
        dest="bench_name",
        choices=benchmarks,
        help="Benchmark name.",
        required=True,
    )
    required.add_argument(
        "-data_file",
        dest="data_file",
        help="A datafile that should be loaded.",
        required=True,
    )
    optional.add_argument(
        "-dfiles_num",
        dest="dfiles_num",
        default=1,
        type=int,
        help="Number of datafiles to input into database for processing.",
    )
    optional.add_argument(
        "-iterations",
        dest="iterations",
        default=1,
        type=int,
        help="Number of iterations to run every query. Best result is selected.",
    )
    optional.add_argument("-dnd",
                          default=False,
                          type=str_arg_to_bool,
                          help="Do not delete old table.")
    optional.add_argument(
        "-dni",
        default=False,
        type=str_arg_to_bool,
        help="Do not create new table and import any data from CSV files.",
    )
    optional.add_argument(
        "-validation",
        dest="validation",
        default=False,
        type=str_arg_to_bool,
        help="validate queries results (by comparison with Pandas queries results).",
    )
    optional.add_argument(
        "-optimizer",
        choices=["intel", "stock"],
        dest="optimizer",
        default="intel",
        help="Which optimizer is used",
    )
    optional.add_argument(
        "-no_ibis",
        default=False,
        type=str_arg_to_bool,
        help="Do not run Ibis benchmark, run only Pandas (or Modin) version",
    )
    optional.add_argument(
        "-pandas_mode",
        choices=["Pandas", "Modin_on_ray", "Modin_on_dask", "Modin_on_python"],
        default="Pandas",
        help="Specifies which version of Pandas to use: plain Pandas, Modin runing on Ray or on Dask",
    )
    optional.add_argument(
        "-ray_tmpdir",
        default="/tmp",
        help="Location where to keep Ray plasma store. It should have enough space to keep -ray_memory",
    )
    optional.add_argument(
        "-ray_memory",
        default=200 * 1024 * 1024 * 1024,
        help="Size of memory to allocate for Ray plasma store",
    )
    optional.add_argument(
        "-no_ml",
        default=False,
        type=str_arg_to_bool,
        help="Do not run machine learning benchmark, only ETL part",
    )
    optional.add_argument(
        "-gpu_memory",
        dest="gpu_memory",
        type=int,
        help="specify the memory of your gpu, default 16. (This controls the lines to be used. Also work for CPU version. )",
        default=16,
    )
    # MySQL database parameters (benchmark-results storage).
    optional.add_argument(
        "-db_server",
        dest="db_server",
        default="localhost",
        help="Host name of MySQL server.",
    )
    optional.add_argument(
        "-db_port",
        dest="db_port",
        default=3306,
        type=int,
        help="Port number of MySQL server.",
    )
    optional.add_argument(
        "-db_user",
        dest="db_user",
        help="Username to use to connect to MySQL database. "
        "If user name is specified, script attempts to store results in MySQL "
        "database using other -db-* parameters.",
    )
    optional.add_argument(
        "-db_pass",
        dest="db_pass",
        default="omniscidb",
        help="Password to use to connect to MySQL database.",
    )
    optional.add_argument(
        "-db_name",
        dest="db_name",
        default="omniscidb",
        help="MySQL database to use to store benchmark results.",
    )
    optional.add_argument(
        "-db_table_etl",
        dest="db_table_etl",
        help="Table to use to store ETL results for this benchmark.",
    )
    optional.add_argument(
        "-db_table_ml",
        dest="db_table_ml",
        help="Table to use to store ML results for this benchmark.",
    )
    # Omnisci server parameters.
    optional.add_argument(
        "-executable",
        dest="executable",
        help="Path to omnisci_server executable.",
    )
    optional.add_argument(
        "-omnisci_cwd",
        dest="omnisci_cwd",
        help="Path to omnisci working directory. "
        "By default parent directory of executable location is used. "
        "Data directory is used in this location.",
    )
    optional.add_argument(
        "-port",
        dest="port",
        default=port_default_value,
        type=int,
        help="TCP port number to run omnisci_server on.",
    )
    optional.add_argument(
        "-http_port",
        dest="http_port",
        default=port_default_value,
        type=int,
        help="HTTP port number to run omnisci_server on.",
    )
    optional.add_argument(
        "-calcite_port",
        dest="calcite_port",
        default=port_default_value,
        type=int,
        help="Calcite port number to run omnisci_server on.",
    )
    optional.add_argument(
        "-user",
        dest="user",
        default="admin",
        help="User name to use on omniscidb server.",
    )
    optional.add_argument(
        "-password",
        dest="password",
        default="HyperInteractive",
        help="User password to use on omniscidb server.",
    )
    optional.add_argument(
        "-database_name",
        dest="database_name",
        default="omnisci",
        help="Database name to use in omniscidb server.",
    )
    optional.add_argument(
        "-table",
        dest="table",
        default="benchmark_table",
        help="Table name name to use in omniscidb server.",
    )
    optional.add_argument(
        "-ipc_conn",
        dest="ipc_connection",
        default=True,
        type=str_arg_to_bool,
        help="Table name name to use in omniscidb server.",
    )
    # Additional information recorded with reported results.
    optional.add_argument(
        "-commit_omnisci",
        dest="commit_omnisci",
        default="1234567890123456789012345678901234567890",
        help="Omnisci commit hash to use for benchmark.",
    )
    optional.add_argument(
        "-commit_ibis",
        dest="commit_ibis",
        default="1234567890123456789012345678901234567890",
        help="Ibis commit hash to use for benchmark.",
    )

    try:
        os.environ["PYTHONIOENCODING"] = "UTF-8"
        os.environ["PYTHONUNBUFFERED"] = "1"

        args = parser.parse_args()

        # Resolve the port sentinels to actually free ports.
        if args.port == port_default_value:
            args.port = find_free_port()
        if args.http_port == port_default_value:
            args.http_port = find_free_port()
        if args.calcite_port == port_default_value:
            args.calcite_port = find_free_port()

        # Import the selected benchmark's entry point.
        if args.bench_name == "ny_taxi":
            from taxi import run_benchmark
        elif args.bench_name == "santander":
            from santander import run_benchmark
        elif args.bench_name == "census":
            from census import run_benchmark
        elif args.bench_name == "plasticc":
            from plasticc import run_benchmark

        parameters = {
            "data_file": args.data_file,
            "dfiles_num": args.dfiles_num,
            "no_ml": args.no_ml,
            "no_ibis": args.no_ibis,
            "optimizer": args.optimizer,
            "pandas_mode": args.pandas_mode,
            "ray_tmpdir": args.ray_tmpdir,
            "ray_memory": args.ray_memory,
            "gpu_memory": args.gpu_memory,
        }

        if not args.no_ibis:
            if args.executable is None:
                parser.error(
                    "Omnisci executable should be specified with -e/--executable for Ibis part"
                )
            omnisci_server = OmnisciServer(
                omnisci_executable=args.executable,
                omnisci_port=args.port,
                http_port=args.http_port,
                calcite_port=args.calcite_port,
                database_name=args.database_name,
                user=args.user,
                password=args.password,
            )
            parameters["database_name"] = args.database_name
            parameters["table"] = args.table
            parameters["dnd"] = args.dnd
            parameters["dni"] = args.dni
            parameters["validation"] = args.validation

        etl_results = []
        ml_results = []
        print(parameters)
        run_id = int(round(time.time()))
        for iter_num in range(1, args.iterations + 1):
            print(f"Iteration #{iter_num}")
            if not args.no_ibis:
                # Fresh worker + freshly launched server per iteration.
                omnisci_server_worker = OmnisciServerWorker(omnisci_server)
                parameters["omnisci_server_worker"] = omnisci_server_worker
                parameters["ipc_connection"] = args.ipc_connection
                omnisci_server.launch()

            result = run_benchmark(parameters)

            if not args.no_ibis:
                omnisci_server.terminate()

            # Tag each backend's result with iteration metadata.
            for backend_res in result["ETL"]:
                if backend_res:
                    backend_res["Iteration"] = iter_num
                    backend_res["run_id"] = run_id
                    etl_results.append(backend_res)
            for backend_res in result["ML"]:
                if backend_res:
                    backend_res["Iteration"] = iter_num
                    backend_res["run_id"] = run_id
                    ml_results.append(backend_res)

            # Reporting to MySQL database.
            if args.db_user is not None:
                if iter_num == 1:
                    # Connect and create reporters once, on the first iteration.
                    db = mysql.connector.connect(
                        host=args.db_server,
                        port=args.db_port,
                        user=args.db_user,
                        passwd=args.db_pass,
                        db=args.db_name,
                    )
                    reporting_init_fields = {
                        "OmnisciCommitHash": args.commit_omnisci,
                        "IbisCommitHash": args.commit_ibis
                    }
                    # FIX: use "!=" — the original compared ints with
                    # "is not", which checks identity, not value, and emits
                    # a SyntaxWarning on CPython >= 3.8.
                    reporting_fields_benchmark_etl = {
                        x: "VARCHAR(500) NOT NULL" for x in etl_results[0]
                    }
                    if len(etl_results) != 1:
                        reporting_fields_benchmark_etl.update({
                            x: "VARCHAR(500) NOT NULL" for x in etl_results[1]
                        })
                    db_reporter_etl = DbReport(db, args.db_table_etl,
                                               reporting_fields_benchmark_etl,
                                               reporting_init_fields)
                    if len(ml_results) != 0:
                        reporting_fields_benchmark_ml = {
                            x: "VARCHAR(500) NOT NULL" for x in ml_results[0]
                        }
                        if len(ml_results) != 1:
                            reporting_fields_benchmark_ml.update({
                                x: "VARCHAR(500) NOT NULL" for x in ml_results[1]
                            })
                        db_reporter_ml = DbReport(db, args.db_table_ml,
                                                  reporting_fields_benchmark_ml,
                                                  reporting_init_fields)
                for result_etl in etl_results:
                    db_reporter_etl.submit(result_etl)
                if len(ml_results) != 0:
                    for result_ml in ml_results:
                        db_reporter_ml.submit(result_ml)
    except Exception:
        traceback.print_exc(file=sys.stdout)
        sys.exit(1)
    finally:
        if omnisci_server_worker:
            omnisci_server_worker.terminate()
def etl_ibis(args, run_import_queries, columns_names, columns_types,
             validation=False):
    """Run the Santander ETL phase through Ibis on an OmniSci server.

    Launches an OmniSci server, (re)creates the database, optionally
    benchmarks three CSV-import strategies (ibis read_csv, FSI, COPY FROM),
    builds the 600-column feature table, and splits it into train/validation
    parts.

    Args:
        args: parsed CLI namespace (file, name, table, connection flags, ...).
        run_import_queries: truthy string/bool — also benchmark import paths.
        columns_names: column names for the benchmark table schema.
        columns_types: column types for the benchmark table schema.
        validation: truthy string/bool; accepted for interface compatibility
            (not used inside this function).

    Returns:
        (x_train, y_train, x_valid, y_valid, etl_times) where etl_times maps
        step name -> elapsed seconds.
    """
    filename = args.file
    database_name = args.name
    table_name = args.table
    delete_old_database = not args.dnd
    create_new_table = not args.dni
    # CLI passes these as strings; normalize to real booleans.
    run_import_queries = str_arg_to_bool(run_import_queries)
    validation = str_arg_to_bool(validation)

    tmp_table_name = "tmp_table"

    etl_times = {
        "t_groupby_merge_where": 0.0,
        "t_train_test_split": 0.0,
        "t_etl": 0.0,
    }
    if run_import_queries:
        etl_times_import = {
            "t_readcsv_by_ibis": 0.0,
            "t_readcsv_by_COPY": 0.0,
            "t_readcsv_by_FSI": 0.0,
        }
        etl_times.update(etl_times_import)

    omnisci_server = OmnisciServer(
        omnisci_executable=args.omnisci_executable,
        omnisci_port=args.omnisci_port,
        database_name=args.name,
        user=args.user,
        password=args.password,
        debug_timer=True,
        columnar_output=args.server_columnar_output,
        lazy_fetch=args.server_lazy_fetch,
    )
    omnisci_server.launch()

    import ibis
    from server_worker import OmnisciServerWorker

    omnisci_server_worker = OmnisciServerWorker(omnisci_server)
    omnisci_server_worker.create_database(
        database_name, delete_if_exists=delete_old_database
    )

    # Give the freshly launched server a moment before connecting.
    time.sleep(2)
    omnisci_server_worker.connect_to_server()

    if run_import_queries:
        # SQL statement preparation for the data-file import benchmarks.
        # (Unused connect/drop templates from the original were removed.)
        create_table_sql_template = """
        CREATE TABLE {0} ({1});
        """
        import_by_COPY_sql_template = """
        COPY {0} FROM '{1}' WITH (header='{2}');
        """
        import_by_FSI_sql_template = """
        CREATE TEMPORARY TABLE {0} ({1}) WITH (storage_type='CSV:{2}');
        """
        import_query_cols_list = (
            ["ID_code TEXT ENCODING NONE, \n", "target SMALLINT, \n"]
            + ["var_%s DOUBLE, \n" % i for i in range(199)]
            + ["var_199 DOUBLE"]
        )
        import_query_cols_str = "".join(import_query_cols_list)
        create_table_sql = create_table_sql_template.format(
            tmp_table_name, import_query_cols_str
        )
        import_by_COPY_sql = import_by_COPY_sql_template.format(
            tmp_table_name, filename, "true"
        )
        import_by_FSI_sql = import_by_FSI_sql_template.format(
            tmp_table_name, import_query_cols_str, filename
        )

        # 1) data file import by ibis
        columns_types_import_query = ["string", "int64"] + [
            "float64" for _ in range(200)
        ]
        schema_table_import = ibis.Schema(
            names=columns_names, types=columns_types_import_query
        )
        omnisci_server_worker.get_conn().create_table(
            table_name=tmp_table_name,
            schema=schema_table_import,
            database=database_name,
            fragment_size=args.fragment_size,
        )
        table_import_query = omnisci_server_worker.database(
            database_name).table(tmp_table_name)
        t0 = timer()
        table_import_query.read_csv(filename, delimiter=",")
        etl_times["t_readcsv_by_ibis"] = timer() - t0

        # 2) data file import by FSI
        omnisci_server_worker.drop_table(tmp_table_name)
        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_FSI_sql)
        etl_times["t_readcsv_by_FSI"] = timer() - t0

        omnisci_server_worker.drop_table(tmp_table_name)

        # 3) data file import by SQL COPY statement
        omnisci_server_worker.execute_sql_query(create_table_sql)
        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_COPY_sql)
        etl_times["t_readcsv_by_COPY"] = timer() - t0

        omnisci_server_worker.drop_table(tmp_table_name)

    if create_new_table:
        # Create table and import data for the ETL queries proper.
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        omnisci_server_worker.get_conn().create_table(
            table_name=table_name,
            schema=schema_table,
            database=database_name,
            fragment_size=args.fragment_size,
        )
        table_import = omnisci_server_worker.database(
            database_name).table(table_name)
        table_import.read_csv(filename, delimiter=",")

    if args.server_conn_type == "regular":
        omnisci_server_worker.connect_to_server()
    elif args.server_conn_type == "ipc":
        omnisci_server_worker.ipc_connect_to_server()
    else:
        print("Wrong connection type is specified!")
        # FIX: exit non-zero — the original used sys.exit(0), which reported
        # success to the calling shell on this error path.
        sys.exit(1)

    db = omnisci_server_worker.database(database_name)
    table = db.table(table_name)

    # group_by/count, merge (join) and filtration queries.
    # We are making 400 columns and then insert them into original table thus
    # avoiding nested sql requests.
    t0 = timer()
    count_cols = []
    orig_cols = ["ID_code", "target"] + ['var_%s' % i for i in range(200)]
    cast_cols = []
    cast_cols.append(table["target"].cast("int64").name("target0"))
    gt1_cols = []
    for i in range(200):
        col = "var_%d" % i
        col_count = "var_%d_count" % i
        col_gt1 = "var_%d_gt1" % i
        w = ibis.window(group_by=col)
        count_cols.append(table[col].count().over(w).name(col_count))
        gt1_cols.append(
            ibis.case()
            .when(
                table[col].count().over(w).name(col_count) > 1,
                table[col].cast("float32"),
            )
            .else_(ibis.null())
            .end()
            # Use the precomputed name (identical string to "var_%d_gt1" % i).
            .name(col_gt1)
        )
        cast_cols.append(table[col].cast("float32").name(col))

    table = table.mutate(count_cols)
    table = table.drop(orig_cols)
    table = table.mutate(gt1_cols)
    table = table.mutate(cast_cols)

    table_df = table.execute()
    etl_times["t_groupby_merge_where"] = timer() - t0

    # rows split query: last 10k rows become the validation part.
    t0 = timer()
    training_part, validation_part = table_df[:-10000], table_df[-10000:]
    etl_times["t_train_test_split"] = timer() - t0

    etl_times["t_etl"] = (etl_times["t_groupby_merge_where"]
                          + etl_times["t_train_test_split"])

    x_train = training_part.drop(['target0'], axis=1)
    y_train = training_part['target0']
    x_valid = validation_part.drop(['target0'], axis=1)
    y_valid = validation_part['target0']

    omnisci_server.terminate()
    omnisci_server = None

    return x_train, y_train, x_valid, y_valid, etl_times
def main():
    """Entry point: run the census benchmark.

    Unless ``-no_ibis`` is given, launches an OmniSci server, runs the Ibis
    ETL (and optionally ML) pass against it, then shuts the server down.
    Always runs the Pandas ETL (and optionally ML) pass afterwards.
    Exits with status 1 on any failure; the server is terminated in all cases.
    """
    omnisci_server = None  # set only on the Ibis path; checked in `finally`
    parser, args, skip_rows = get_args()

    try:
        if not args.no_ibis:
            # server/server_worker live one directory up from this script
            sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
            from server import OmnisciServer

            if args.omnisci_executable is None:
                parser.error(
                    "Omnisci executable should be specified with -e/--executable"
                )
            omnisci_server = OmnisciServer(
                omnisci_executable=args.omnisci_executable,
                omnisci_port=args.omnisci_port,
                database_name=args.name,
                omnisci_cwd=args.omnisci_cwd,
                user=args.user,
                password=args.password,
            )
            omnisci_server.launch()

            from server_worker import OmnisciServerWorker

            omnisci_server_worker = OmnisciServerWorker(omnisci_server)
            (
                X_train,
                y_train,
                X_test,
                y_test,
                Xt,
                classes,
                class_weights,
                etl_times,
            ) = etl_all_ibis(
                filename=args.dataset_path,
                database_name=args.name,
                omnisci_server_worker=omnisci_server_worker,
                delete_old_database=not args.dnd,
                create_new_table=not args.dni,
                skip_rows=skip_rows,
            )
            print_times(etl_times)
            # The server is only needed for the Ibis ETL; terminate it now and
            # clear the reference so the `finally` block does not re-terminate.
            omnisci_server.terminate()
            omnisci_server = None

            if not args.no_ml:
                print("using ml with dataframes from ibis")
                ml_times = ml(
                    X_train, y_train, X_test, y_test, Xt, classes, class_weights
                )
                print_times(ml_times)

        # Pandas pass always runs; it rebinds the same result names.
        (
            X_train,
            y_train,
            X_test,
            y_test,
            Xt,
            classes,
            class_weights,
            etl_times,
        ) = etl_all_pandas(args.dataset_path, skip_rows)
        print_times(etl_times)

        if not args.no_ml:
            print("using ml with dataframes from pandas")
            ml_times = ml(X_train, y_train, X_test, y_test, Xt, classes, class_weights)
            print_times(ml_times)

        if args.val:
            # Direct dataframe comparison is not straightforward here, so
            # validation is done by comparing the ML results instead.
            # compare_dataframes(ibis_df=(X_train_ibis, y_train_ibis), pandas_df=(X, y))
            print("validate by ml results")
    except Exception as err:
        print("Failed: ", err)
        sys.exit(1)
    finally:
        if omnisci_server:
            omnisci_server.terminate()
print("Bad number of data files specified", args.df) sys.exit(1) if args.i < 1: print("Bad number of iterations specified", args.i) database_name = args.n omnisci_server = OmnisciServer( omnisci_executable=args.e, omnisci_port=args.port, database_name=database_name, user=args.u, password=args.p, ) omnisci_server.launch() omnisci_server_worker = OmnisciServerWorker(omnisci_server) time.sleep(2) omnisci_server_worker.connect_to_server() taxibench_columns_names = [ "trip_id", "vendor_id", "pickup_datetime", "dropoff_datetime", "store_and_fwd_flag", "rate_code_id", "pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude",
args = parser.parse_args() if args.df <= 0: print("Bad number of data files specified", args.df) sys.exit(1) if args.i < 1: print("Bad number of iterations specified", args.i) database_name = args.n omnisci_server = OmnisciServer(omnisci_executable=args.e, omnisci_port=args.port, database_name=database_name, user=args.u, password=args.p) omnisci_server.launch() omnisci_server_worker = OmnisciServerWorker(omnisci_server) time.sleep(2) conn = omnisci_server_worker.connect_to_server() taxibench_columns_names = [ "trip_id", "vendor_id", "pickup_datetime", "dropoff_datetime", "store_and_fwd_flag", "rate_code_id", "pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude", "passenger_count", "trip_distance", "fare_amount", "extra", "mta_tax", "tip_amount", "tolls_amount", "ehail_fee", "improvement_surcharge", "total_amount", "payment_type", "trip_type", "pickup", "dropoff", "cab_type", "precipitation", "snow_depth", "snowfall", "max_temperature", "min_temperature", "average_wind_speed", "pickup_nyct2010_gid", "pickup_ctlabel", "pickup_borocode", "pickup_boroname", "pickup_ct2010", "pickup_boroct2010",
"var_" + str(index) for index in range(200) ] datafile_columns_types = ["string", "int64" ] + ["float64" for _ in range(200)] schema_train = ibis.Schema(names=datafile_columns_names, types=datafile_columns_types) database_name = args.n omnisci_server = OmnisciServer(omnisci_executable=args.e, omnisci_port=args.port, database_name=database_name, user=args.u, password=args.p) omnisci_server.launch() omnisci_server_worker = OmnisciServerWorker(omnisci_server) time.sleep(2) conn = omnisci_server_worker.connect_to_server() db_reporter = None if args.db_user is not "": print("Connecting to database") db = mysql.connector.connect(host=args.db_server, port=args.db_port, user=args.db_user, passwd=args.db_pass, db=args.db_name) db_reporter = DbReport( db, args.db_table, { 'QueryName': 'VARCHAR(500) NOT NULL',