def import_data_from_pd_df(self, table_name, pd_obj, columns_names, columns_types):
    "Import data into OmniSciDB from a pandas.DataFrame using Ibis load_data."
    schema_table = ibis.Schema(names=columns_names, types=columns_types)
    if not self._conn.exists_table(name=table_name,
                                   database=self.omnisci_server.database_name):
        try:
            self._conn.create_table(
                table_name=table_name,
                schema=schema_table,
                database=self.omnisci_server.database_name,
            )
        except Exception as err:
            print("Failed to create table:", err)

    self._conn.load_data(
        table_name=table_name,
        obj=pd_obj,
        database=self.omnisci_server.database_name,
        method="columnar",
    )
    return self._conn.database(self.omnisci_server.database_name).table(table_name)
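
# Usage sketch (illustrative, not from the original sources): assuming a
# worker object exposing import_data_from_pd_df with a live OmniSciDB
# connection; the column names and data below are hypothetical.
import pandas as pd

sample_df = pd.DataFrame({"id": [1, 2, 3], "value": [0.1, 0.2, 0.3]})
sample_table = worker.import_data_from_pd_df(
    table_name="sample_table",
    pd_obj=sample_df,
    columns_names=["id", "value"],
    columns_types=["int64", "float64"],
)
print(sample_table.count().execute())  # row count via the returned Ibis table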
def import_data(
    self,
    table_name,
    data_files_names,
    files_limit,
    columns_names,
    columns_types,
    header=False,
):
    "Import CSV files into OmniSciDB using the COPY SQL statement."
    # header is a boolean, so a ternary replaces the original
    # if/elif/else whose else branch was unreachable
    header_value = "true" if header else "false"

    schema_table = ibis.Schema(names=columns_names, types=columns_types)
    if not self._conn.exists_table(name=table_name,
                                   database=self.omnisci_server.database_name):
        try:
            self._conn.create_table(
                table_name=table_name,
                schema=schema_table,
                database=self.omnisci_server.database_name,
            )
        except Exception as err:
            print("Failed to create table:", err)

    for f in data_files_names[:files_limit]:
        print("Importing datafile", f)
        copy_str = self._command_2_import_CSV % (table_name, f, header_value)

        omnisci_cmd_line = self._get_omnisci_cmd_line()
        try:
            import_process = subprocess.Popen(
                omnisci_cmd_line,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                stdin=subprocess.PIPE,
            )
            output = import_process.communicate(copy_str.encode())
        except OSError as err:
            print(f"Failed to start '{omnisci_cmd_line}'", err)
            continue  # `output` is undefined if Popen failed

        print(str(output[0].strip().decode()))
        print("Command returned", import_process.returncode)
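
# Illustrative note: the COPY template referenced above
# (self._command_2_import_CSV) is defined elsewhere in the worker class; based
# on the COPY statement used later in this file, it presumably expands to an
# OmniSciDB COPY command of this shape (an assumption, not the verbatim value):
_command_2_import_CSV_example = "COPY %s FROM '%s' WITH (header='%s');"
# _command_2_import_CSV_example % ("trips", "/data/trips.csv", "false")
# -> "COPY trips FROM '/data/trips.csv' WITH (header='false');"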
def import_data_by_ibis(self, table_name, data_files_names, files_limit,
                        columns_names, columns_types, cast_dict, header=None):
    "Import CSV files via a pandas.DataFrame using Ibis load_data."
    schema_table = ibis.Schema(names=columns_names, types=columns_types)
    if not self._conn.exists_table(name=table_name, database=self._database_name):
        try:
            self._conn.create_table(table_name=table_name,
                                    schema=schema_table,
                                    database=self._database_name)
        except Exception as err:
            print("Failed to create table:", err)

    t0 = time.time()
    if files_limit > 1:
        pandas_df_from_each_file = (
            self._read_csv_datafile(file_name, columns_names, header)
            for file_name in data_files_names[:files_limit])
        self._imported_pd_df[table_name] = pd.concat(pandas_df_from_each_file,
                                                     ignore_index=True)
    else:
        self._imported_pd_df[table_name] = self._read_csv_datafile(
            data_files_names, columns_names, header)
    t_import_pandas = time.time() - t0

    pandas_concatenated_df_casted = self._imported_pd_df[table_name].astype(
        dtype=cast_dict, copy=True)

    t0 = time.time()
    self._conn.load_data(table_name=table_name,
                         obj=pandas_concatenated_df_casted,
                         database=self._database_name)
    t_import_ibis = time.time() - t0

    return t_import_pandas, t_import_ibis
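
# Usage sketch (hypothetical names and paths): timing a single-file import via
# pandas + load_data; cast_dict narrows the pandas dtypes before the Ibis load.
t_pandas, t_ibis = worker.import_data_by_ibis(
    table_name="training",
    data_files_names="/data/training_set.csv",
    files_limit=1,
    columns_names=["ID_code", "target"],
    columns_types=["string", "int64"],
    cast_dict={"target": "int64"},
    header=0,
)
print(f"pandas read: {t_pandas:.2f}s, ibis load_data: {t_ibis:.2f}s")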
def etl_ibis(
    filename,
    columns_names,
    columns_types,
    database_name,
    table_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    validation,
    etl_keys,
    import_mode,
):
    import ibis

    etl_times = {key: 0.0 for key in etl_keys}

    omnisci_server_worker.create_database(database_name,
                                          delete_if_exists=delete_old_database)

    # Create table and import data
    if create_new_table:
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        if import_mode == "copy-from":
            # Create table and import data for ETL queries
            omnisci_server_worker.create_table(
                table_name=table_name,
                schema=schema_table,
                database=database_name,
            )
            table_import = omnisci_server_worker.database(database_name).table(table_name)

            t0 = timer()
            table_import.read_csv(filename, header=True, quotechar="", delimiter=",")
            etl_times["t_readcsv"] = round((timer() - t0) * 1000)
        elif import_mode == "pandas":
            # Datafiles import
            t_import_pandas, t_import_ibis = omnisci_server_worker.import_data_by_ibis(
                table_name=table_name,
                data_files_names=filename,
                files_limit=1,
                columns_names=columns_names,
                columns_types=columns_types,
                header=0,
                nrows=None,
                compression_type="gzip" if filename.endswith("gz") else None,
                validation=validation,
            )
            etl_times["t_readcsv"] = round((t_import_pandas + t_import_ibis) * 1000)
        elif import_mode == "fsi":
            try:
                unzip_name = None
                if filename.endswith("gz"):
                    import gzip

                    unzip_name = "/tmp/census-fsi.csv"
                    with gzip.open(filename, "rb") as gz_input:
                        with open(unzip_name, "wb") as output:
                            output.write(gz_input.read())

                t0 = timer()
                omnisci_server_worker._conn.create_table_from_csv(
                    table_name, unzip_name or filename, schema_table)
                etl_times["t_readcsv"] = round((timer() - t0) * 1000)
            finally:
                if filename.endswith("gz"):
                    import os

                    os.remove(unzip_name)

    # Second connection - this is Ibis's IPC connection for DML
    omnisci_server_worker.connect_to_server(database_name, ipc=ipc_connection)
    table = omnisci_server_worker.database(database_name).table(table_name)

    t_etl_start = timer()

    keep_cols = [
        "YEAR0", "DATANUM", "SERIAL", "CBSERIAL", "HHWT",
        "CPI99", "GQ", "PERNUM", "SEX", "AGE",
        "INCTOT", "EDUC", "EDUCD", "EDUC_HEAD", "EDUC_POP",
        "EDUC_MOM", "EDUCD_MOM2", "EDUCD_POP2", "INCTOT_MOM", "INCTOT_POP",
        "INCTOT_MOM2", "INCTOT_POP2", "INCTOT_HEAD", "SEX_HEAD",
    ]

    if import_mode == "pandas" and validation:
        keep_cols.append("id")

    table = table[keep_cols]

    # first, apply all filters and eliminate redundant fillna operations for EDUC and EDUCD
    table = table[table.INCTOT != 9999999]
    table = table[table["EDUC"].notnull()]
    table = table[table["EDUCD"].notnull()]

    table = table.set_column("INCTOT", table["INCTOT"] * table["CPI99"])

    cols = []
    # final fillna and casting for necessary columns
    for column in keep_cols:
        cols.append(
            ibis.case()
            .when(table[column].notnull(), table[column])
            .else_(-1)
            .end()
            .cast("float64")
            .name(column))

    table = table.mutate(cols)

    df = table.execute()

    if import_mode == "pandas" and validation:
        df.index = df["id"].values

    # here we use pandas to split the table
    y = df["EDUC"]
    X = df.drop(["EDUC", "CPI99"], axis=1)

    etl_times["t_etl"] = round((timer() - t_etl_start) * 1000)
    print("DataFrame shape:", X.shape)

    return df, X, y, etl_times
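
# For orientation: the ibis.case() loop above builds, per column, the
# expression-level equivalent of this pandas post-processing (a sketch over a
# demo frame, not code from the original file):
import pandas as pd

df_demo = pd.DataFrame({"INCTOT": [100.0, None]})
for column in df_demo.columns:
    df_demo[column] = df_demo[column].fillna(-1).astype("float64")
print(df_demo)  # NULLs replaced by -1, dtype float64, matching case/else_(-1)/cast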
def load_data_ibis(
    dataset_path,
    database_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    skip_rows,
    validation,
    dtypes,
    meta_dtypes,
    import_mode,
    fragments_size,
):
    fragments_size = check_fragments_size(
        fragments_size,
        count_table=4,
        import_mode=import_mode,
        default_fragments_size=[32000000, 32000000, 32000000, 32000000],
    )

    omnisci_server_worker.create_database(database_name,
                                          delete_if_exists=delete_old_database)

    t_readcsv = 0.0
    t_connect = 0.0

    # Create tables and import data
    if create_new_table:
        import ibis

        training_file = "%s/training_set.csv" % dataset_path
        # COPY FROM doesn't have a skip_rows option
        test_file = "%s/test_set_skiprows.csv" % dataset_path
        training_meta_file = "%s/training_set_metadata.csv" % dataset_path
        test_meta_file = "%s/test_set_metadata.csv" % dataset_path

        schema = ibis.Schema(names=dtypes.keys(), types=dtypes.values())
        meta_schema = ibis.Schema(names=meta_dtypes.keys(), types=meta_dtypes.values())

        target = meta_dtypes.pop("target")
        meta_schema_without_target = ibis.Schema(
            names=meta_dtypes.keys(), types=meta_dtypes.values()
        )
        meta_dtypes["target"] = target

        if import_mode == "copy-from":
            # create tables
            t0 = timer()
            omnisci_server_worker.create_table(
                table_name="training",
                schema=schema,
                database=database_name,
                fragment_size=fragments_size[0],
            )
            omnisci_server_worker.create_table(
                table_name="test",
                schema=schema,
                database=database_name,
                fragment_size=fragments_size[1],
            )
            omnisci_server_worker.create_table(
                table_name="training_meta",
                schema=meta_schema,
                database=database_name,
                fragment_size=fragments_size[2],
            )
            omnisci_server_worker.create_table(
                table_name="test_meta",
                schema=meta_schema_without_target,
                database=database_name,
                fragment_size=fragments_size[3],
            )

            # get tables
            db = omnisci_server_worker.database(database_name)
            training_table = db.table("training")
            test_table = db.table("test")
            training_meta_table = db.table("training_meta")
            test_meta_table = db.table("test_meta")
            t_connect = timer() - t0

            # measure time of reading
            t0 = timer()
            training_table.read_csv(training_file, header=True, quotechar="", delimiter=",")
            test_table.read_csv(test_file, header=True, quotechar="", delimiter=",")
            training_meta_table.read_csv(
                training_meta_file, header=True, quotechar="", delimiter=","
            )
            test_meta_table.read_csv(test_meta_file, header=True, quotechar="", delimiter=",")
            t_readcsv = timer() - t0
        elif import_mode == "pandas":
            general_options = {
                "files_limit": 1,
                "columns_names": list(dtypes.keys()),
                "columns_types": list(dtypes.values()),
                "header": 0,
                "nrows": None,
                "compression_type": None,
                "validation": validation,
            }

            # create table #1
            t_import_pandas_1, t_import_ibis_1 = omnisci_server_worker.import_data_by_ibis(
                table_name="training",
                data_files_names="%s/training_set.csv" % dataset_path,
                **general_options,
            )

            # create table #2
            t_import_pandas_2, t_import_ibis_2 = omnisci_server_worker.import_data_by_ibis(
                table_name="test",
                data_files_names="%s/test_set.csv" % dataset_path,
                skiprows=skip_rows,
                **general_options,
            )

            # before reading the meta files, update columns_names and
            # columns_types in general_options with their proper values
            general_options["columns_names"] = list(meta_dtypes.keys())
            general_options["columns_types"] = list(meta_dtypes.values())

            # create table #3
            t_import_pandas_3, t_import_ibis_3 = omnisci_server_worker.import_data_by_ibis(
                table_name="training_meta",
                data_files_names="%s/training_set_metadata.csv" % dataset_path,
                **general_options,
            )

            target = meta_dtypes.pop("target")
            general_options["columns_names"] = list(meta_dtypes.keys())
            general_options["columns_types"] = list(meta_dtypes.values())

            # create table #4
            t_import_pandas_4, t_import_ibis_4 = omnisci_server_worker.import_data_by_ibis(
                table_name="test_meta",
                data_files_names="%s/test_set_metadata.csv" % dataset_path,
                **general_options,
            )
            meta_dtypes["target"] = target

            t_import_pandas = (
                t_import_pandas_1 + t_import_pandas_2 + t_import_pandas_3 + t_import_pandas_4
            )
            t_import_ibis = (
                t_import_ibis_1 + t_import_ibis_2 + t_import_ibis_3 + t_import_ibis_4
            )
            print(f"import times: pandas - {t_import_pandas}s, ibis - {t_import_ibis}s")
            t_readcsv = t_import_pandas + t_import_ibis
            t_connect += omnisci_server_worker.get_conn_creation_time()
        elif import_mode == "fsi":
            t0 = timer()
            omnisci_server_worker._conn.create_table_from_csv(
                "training",
                training_file,
                schema,
                fragment_size=fragments_size[0],
            )
            omnisci_server_worker._conn.create_table_from_csv(
                "test",
                test_file,
                schema,
                fragment_size=fragments_size[1],
            )
            omnisci_server_worker._conn.create_table_from_csv(
                "training_meta",
                training_meta_file,
                meta_schema,
                fragment_size=fragments_size[2],
            )
            omnisci_server_worker._conn.create_table_from_csv(
                "test_meta",
                test_meta_file,
                meta_schema_without_target,
                fragment_size=fragments_size[3],
            )
            t_readcsv = timer() - t0
            t_connect += omnisci_server_worker.get_conn_creation_time()

    # Second connection - this is Ibis's IPC connection for DML
    t0 = timer()
    omnisci_server_worker.connect_to_server(database_name, ipc=ipc_connection)
    db = omnisci_server_worker.database(database_name)
    t_connect += timer() - t0
    training_table = db.table("training")
    test_table = db.table("test")
    training_meta_table = db.table("training_meta")
    test_meta_table = db.table("test_meta")

    return (
        training_table,
        training_meta_table,
        test_table,
        test_meta_table,
        t_readcsv,
        t_connect,
    )
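
# Usage sketch (hypothetical paths and a truncated dtype map): loading the
# four PLAsTiCC tables in copy-from mode; `worker` stands for an
# OmnisciServerWorker instance, and the real dtype dictionaries contain the
# full column sets.
tables = load_data_ibis(
    dataset_path="/data/plasticc",
    database_name="plasticc_database",
    omnisci_server_worker=worker,
    delete_old_database=True,
    create_new_table=True,
    ipc_connection=True,
    skip_rows=1,
    validation=False,
    dtypes={"object_id": "int32", "mjd": "float32", "flux": "float32"},
    meta_dtypes={"object_id": "int32", "target": "int32"},
    import_mode="copy-from",
    fragments_size=None,  # falls back to the 32,000,000 defaults above
)
training_table, training_meta_table, test_table, test_meta_table, t_readcsv, t_connect = tables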
def etl_ibis(
    filename,
    columns_names,
    columns_types,
    database_name,
    table_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    validation,
    etl_keys,
    import_mode,
    fragments_size,
):
    etl_times = {key: 0.0 for key in etl_keys}

    fragments_size = check_fragments_size(fragments_size,
                                          count_table=1,
                                          import_mode=import_mode)

    omnisci_server_worker.create_database(database_name,
                                          delete_if_exists=delete_old_database)

    if create_new_table:
        # Create table and import data for ETL queries
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        if import_mode == "copy-from":
            t0 = timer()
            omnisci_server_worker.create_table(
                table_name=table_name,
                schema=schema_table,
                database=database_name,
                fragment_size=fragments_size[0],
            )
            table_import = omnisci_server_worker.database(database_name).table(table_name)
            etl_times["t_connect"] += timer() - t0

            t0 = timer()
            table_import.read_csv(filename, header=True, quotechar="", delimiter=",")
            etl_times["t_readcsv"] = timer() - t0
        elif import_mode == "pandas":
            # decimal(8, 4) is converted to decimal(9, 6) to provide better data
            # conversion accuracy during import from pandas into OmniSciDB
            # for proper results validation
            columns_types = [
                "decimal(9, 6)" if (x == "decimal(8, 4)") else x for x in columns_types
            ]
            t_import_pandas, t_import_ibis = omnisci_server_worker.import_data_by_ibis(
                table_name=table_name,
                data_files_names=filename,
                files_limit=1,
                columns_names=columns_names,
                columns_types=columns_types,
                header=0,
                nrows=None,
                compression_type="gzip" if filename.endswith(".gz") else None,
                use_columns_types_for_pd=False,
            )
            etl_times["t_readcsv"] = t_import_pandas + t_import_ibis
            etl_times["t_connect"] += omnisci_server_worker.get_conn_creation_time()
        elif import_mode == "fsi":
            try:
                unzip_name = None
                if filename.endswith(".gz"):
                    import gzip

                    unzip_name = get_tmp_filepath("santander-fsi.csv")
                    with gzip.open(filename, "rb") as gz_input:
                        with open(unzip_name, "wb") as output:
                            output.write(gz_input.read())

                t0 = timer()
                omnisci_server_worker._conn.create_table_from_csv(
                    table_name,
                    unzip_name or filename,
                    schema_table,
                    fragment_size=fragments_size[0],
                )
                etl_times["t_readcsv"] = timer() - t0
                etl_times["t_connect"] += omnisci_server_worker.get_conn_creation_time()
            finally:
                # match the ".gz" check used above (the original tested "gz" here)
                if filename.endswith(".gz"):
                    import os

                    os.remove(unzip_name)

    # Second connection - this is Ibis's IPC connection for DML
    t0 = timer()
    omnisci_server_worker.connect_to_server(database_name, ipc=ipc_connection)
    table = omnisci_server_worker.database(database_name).table(table_name)
    etl_times["t_connect"] += timer() - t0

    # group_by/count, merge (join) and filtration queries
    # We create 400 columns and then insert them into the original table,
    # thus avoiding nested SQL requests
    t_etl_start = timer()

    count_cols = []
    orig_cols = ["ID_code", "target"] + ["var_%s" % i for i in range(200)]
    cast_cols = []
    cast_cols.append(table["target"].cast("int64").name("target0"))
    gt1_cols = []
    for i in range(200):
        col = "var_%d" % i
        col_count = "var_%d_count" % i
        col_gt1 = "var_%d_gt1" % i
        w = ibis.window(group_by=col)
        count_cols.append(table[col].count().over(w).name(col_count))
        gt1_cols.append(
            ibis.case()
            .when(
                table[col].count().over(w).name(col_count) > 1,
                table[col].cast("float32"),
            )
            .else_(ibis.null())
            .end()
            .name(col_gt1))
        cast_cols.append(table[col].cast("float32").name(col))

    table = table.mutate(count_cols)
    table = table.drop(orig_cols)
    table = table.mutate(gt1_cols)
    table = table.mutate(cast_cols)

    table_df = table.execute()

    etl_times["t_etl"] = timer() - t_etl_start
    return table_df, etl_times
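
# For orientation (pandas sketch, not from the original file): each loop
# iteration above computes a per-value window count and a copy of the column
# masked to repeated values, i.e. the equivalent of:
import pandas as pd

demo = pd.DataFrame({"var_0": [1.0, 1.0, 2.0]})
var_0_count = demo.groupby("var_0")["var_0"].transform("count")
demo["var_0_gt1"] = demo["var_0"].where(var_0_count > 1).astype("float32")
print(demo)  # gt1 keeps 1.0 (seen twice) and is NaN for the unique 2.0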
def etl_ibis(args, run_import_queries, columns_names, columns_types, validation=False):
    filename = args.file
    database_name = args.name
    table_name = args.table
    delete_old_database = not args.dnd
    create_new_table = not args.dni
    run_import_queries = str_arg_to_bool(run_import_queries)
    validation = str_arg_to_bool(validation)

    tmp_table_name = "tmp_table"

    etl_times = {"t_groupby_merge_where": 0.0, "t_train_test_split": 0.0, "t_etl": 0.0}

    if run_import_queries:
        etl_times_import = {
            "t_readcsv_by_ibis": 0.0,
            "t_readcsv_by_COPY": 0.0,
            "t_readcsv_by_FSI": 0.0,
        }
        etl_times.update(etl_times_import)

    omnisci_server = OmnisciServer(
        omnisci_executable=args.omnisci_executable,
        omnisci_port=args.omnisci_port,
        database_name=args.name,
        user=args.user,
        password=args.password,
        debug_timer=True,
        columnar_output=args.server_columnar_output,
        lazy_fetch=args.server_lazy_fetch,
    )
    omnisci_server.launch()

    import ibis
    from server_worker import OmnisciServerWorker

    omnisci_server_worker = OmnisciServerWorker(omnisci_server)
    omnisci_server_worker.create_database(
        database_name, delete_if_exists=delete_old_database
    )

    time.sleep(2)
    omnisci_server_worker.connect_to_server()

    if run_import_queries:
        # SQL statements preparation for data file import queries
        connect_to_db_sql_template = r"\c {0} admin HyperInteractive"
        create_table_sql_template = """
        CREATE TABLE {0} ({1});
        """
        import_by_COPY_sql_template = """
        COPY {0} FROM '{1}' WITH (header='{2}');
        """
        import_by_FSI_sql_template = """
        CREATE TEMPORARY TABLE {0} ({1}) WITH (storage_type='CSV:{2}');
        """
        drop_table_sql_template = """
        DROP TABLE IF EXISTS {0};
        """

        import_query_cols_list = (
            ["ID_code TEXT ENCODING NONE, \n", "target SMALLINT, \n"]
            + ["var_%s DOUBLE, \n" % i for i in range(199)]
            + ["var_199 DOUBLE"]
        )
        import_query_cols_str = "".join(import_query_cols_list)

        connect_to_db_sql = connect_to_db_sql_template.format(database_name)
        create_table_sql = create_table_sql_template.format(
            tmp_table_name, import_query_cols_str
        )
        import_by_COPY_sql = import_by_COPY_sql_template.format(
            tmp_table_name, filename, "true"
        )
        import_by_FSI_sql = import_by_FSI_sql_template.format(
            tmp_table_name, import_query_cols_str, filename
        )

        # data file import by Ibis
        columns_types_import_query = ["string", "int64"] + [
            "float64" for _ in range(200)
        ]
        schema_table_import = ibis.Schema(
            names=columns_names, types=columns_types_import_query
        )
        omnisci_server_worker.get_conn().create_table(
            table_name=tmp_table_name,
            schema=schema_table_import,
            database=database_name,
            fragment_size=args.fragment_size,
        )

        table_import_query = omnisci_server_worker.database(database_name).table(tmp_table_name)
        t0 = timer()
        table_import_query.read_csv(filename, delimiter=",")
        etl_times["t_readcsv_by_ibis"] = timer() - t0

        # data file import by FSI
        omnisci_server_worker.drop_table(tmp_table_name)
        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_FSI_sql)
        etl_times["t_readcsv_by_FSI"] = timer() - t0

        omnisci_server_worker.drop_table(tmp_table_name)

        # data file import by SQL COPY statement
        omnisci_server_worker.execute_sql_query(create_table_sql)
        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_COPY_sql)
        etl_times["t_readcsv_by_COPY"] = timer() - t0

        omnisci_server_worker.drop_table(tmp_table_name)

    if create_new_table:
        # Create table and import data for ETL queries
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        omnisci_server_worker.get_conn().create_table(
            table_name=table_name,
            schema=schema_table,
            database=database_name,
            fragment_size=args.fragment_size,
        )
        table_import = omnisci_server_worker.database(database_name).table(table_name)
        table_import.read_csv(filename, delimiter=",")

    if args.server_conn_type == "regular":
        omnisci_server_worker.connect_to_server()
    elif args.server_conn_type == "ipc":
        omnisci_server_worker.ipc_connect_to_server()
    else:
        print("Wrong connection type is specified!")
        sys.exit(0)

    db = omnisci_server_worker.database(database_name)
    table = db.table(table_name)

    # group_by/count, merge (join) and filtration queries
    # We create 400 columns and then insert them into the original table,
    # thus avoiding nested SQL requests
    t0 = timer()
    count_cols = []
    orig_cols = ["ID_code", "target"] + ["var_%s" % i for i in range(200)]
    cast_cols = []
    cast_cols.append(table["target"].cast("int64").name("target0"))
    gt1_cols = []
    for i in range(200):
        col = "var_%d" % i
        col_count = "var_%d_count" % i
        col_gt1 = "var_%d_gt1" % i
        w = ibis.window(group_by=col)
        count_cols.append(table[col].count().over(w).name(col_count))
        gt1_cols.append(
            ibis.case()
            .when(
                table[col].count().over(w).name(col_count) > 1,
                table[col].cast("float32"),
            )
            .else_(ibis.null())
            .end()
            .name(col_gt1)
        )
        cast_cols.append(table[col].cast("float32").name(col))

    table = table.mutate(count_cols)
    table = table.drop(orig_cols)
    table = table.mutate(gt1_cols)
    table = table.mutate(cast_cols)

    table_df = table.execute()
    etl_times["t_groupby_merge_where"] = timer() - t0

    # rows split query
    t0 = timer()
    training_part, validation_part = table_df[:-10000], table_df[-10000:]
    etl_times["t_train_test_split"] = timer() - t0

    etl_times["t_etl"] = etl_times["t_groupby_merge_where"] + etl_times["t_train_test_split"]

    x_train = training_part.drop(["target0"], axis=1)
    y_train = training_part["target0"]
    x_valid = validation_part.drop(["target0"], axis=1)
    y_valid = validation_part["target0"]

    omnisci_server.terminate()
    omnisci_server = None

    return x_train, y_train, x_valid, y_valid, etl_times
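
# `str_arg_to_bool` is imported from elsewhere in the repo; a minimal sketch
# consistent with how it is used above (the real helper may differ):
def str_arg_to_bool(value):
    if isinstance(value, bool):
        return value
    if str(value).lower() in ("true", "1", "yes"):
        return True
    if str(value).lower() in ("false", "0", "no"):
        return False
    raise ValueError(f"Cannot interpret {value!r} as a boolean")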
parser.add_argument(
    "-commit",
    default="1234567890123456789012345678901234567890",
    help="Commit hash used to record these benchmark results.",
)

args = parser.parse_args()

if args.i < 1:
    print("Bad number of iterations specified", args.i)


def print_omnisci_output(stdout):
    for line in iter(stdout.readline, b""):
        print("OMNISCI>>", line.decode().strip())


datafile_columns_names = ["ID_code", "target"] + [
    "var_" + str(index) for index in range(200)
]
datafile_columns_types = ["string", "int16"] + ["float32" for _ in range(200)]

schema_train = ibis.Schema(names=datafile_columns_names,
                           types=datafile_columns_types)

omnisci_server = server.Omnisci_server(omnisci_executable=args.e,
                                       omnisci_port=args.port,
                                       database_name=database_name)
omnisci_server.launch()

time.sleep(2)
conn = omnisci_server.connect_to_server()

db_reporter = None
# identity comparison with a string literal is undefined behavior; use !=
if args.db_user != "":
    print("Connecting to database")
    db = mysql.connector.connect(host=args.db_server,
                                 port=args.db_port,
                                 user=args.db_user,
                                 passwd=args.db_pass,
                                 db=args.db_name)
    db_reporter = report.DbReport(db, args.db_table, {
        'QueryName': 'VARCHAR(500) NOT NULL',
        'FirstExecTimeMS': 'BIGINT UNSIGNED',
def etl_ibis(
    filename,
    files_limit,
    columns_names,
    columns_types,
    database_name,
    table_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    input_for_validation,
    import_mode,
    fragments_size,
    debug_mode,
):
    import ibis

    fragments_size = check_fragments_size(fragments_size,
                                          count_table=1,
                                          import_mode=import_mode)

    queries = {"Query1": q1_ibis, "Query2": q2_ibis, "Query3": q3_ibis, "Query4": q4_ibis}
    etl_results = {x: 0.0 for x in queries.keys()}
    etl_results["t_readcsv"] = 0.0
    etl_results["t_connect"] = 0.0

    omnisci_server_worker.connect_to_server()

    data_files_names = files_names_from_pattern(filename)
    if len(data_files_names) == 0:
        raise FileNotFoundError(f"Could not find any data files matching: [{filename}]")

    data_files_extension = data_files_names[0].split(".")[-1]
    if not all([name.endswith(data_files_extension) for name in data_files_names]):
        raise NotImplementedError(
            "Import of data files with different extensions is not supported"
        )

    omnisci_server_worker.create_database(database_name,
                                          delete_if_exists=delete_old_database)

    # Create table and import data for ETL queries
    if create_new_table:
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        if import_mode == "copy-from":
            t0 = timer()
            omnisci_server_worker.create_table(
                table_name=table_name,
                schema=schema_table,
                database=database_name,
                fragment_size=fragments_size[0],
            )
            etl_results["t_connect"] += timer() - t0
            table_import = omnisci_server_worker.database(database_name).table(table_name)
            etl_results["t_connect"] += omnisci_server_worker.get_conn_creation_time()

            for file_to_import in data_files_names[:files_limit]:
                t0 = timer()
                table_import.read_csv(file_to_import, header=False, quotechar='"', delimiter=",")
                etl_results["t_readcsv"] += timer() - t0
        elif import_mode == "pandas":
            # pymapd load_table (called internally by import_data_by_ibis)
            # needs homogeneous data; since the vendor_id and payment_type
            # fields of the trips_xad file contain text data, the following
            # workaround and check are used
            columns_types[1] = "int64"
            columns_types[20] = "int64"
            files_names = [
                file_path.split("/")[-1].split(".")[0]
                for file_path in data_files_names[:files_limit]
            ]
            if not all(
                [
                    file_name in accepted_data_files_for_pandas_import_mode
                    for file_name in files_names
                ]
            ):
                raise AttributeError(
                    f"pandas import_mode is supported only for "
                    f"{accepted_data_files_for_pandas_import_mode} data files, "
                    f"actually passed {files_names}"
                )

            t_import_pandas, t_import_ibis = omnisci_server_worker.import_data_by_ibis(
                table_name=table_name,
                data_files_names=data_files_names,
                files_limit=files_limit,
                columns_names=columns_names,
                columns_types=columns_types,
                header=None,
                nrows=None,
                compression_type="gzip" if data_files_extension == "gz" else None,
                use_columns_types_for_pd=False,
            )
            etl_results["t_readcsv"] = t_import_pandas + t_import_ibis
            etl_results["t_connect"] = omnisci_server_worker.get_conn_creation_time()
        elif import_mode == "fsi":
            with FilesCombiner(
                data_files_names=data_files_names,
                combined_filename=f"taxibench-{files_limit}--files-fsi.csv",
                files_limit=files_limit,
            ) as data_file_path:
                t0 = timer()
                omnisci_server_worker.get_conn().create_table_from_csv(
                    table_name,
                    data_file_path,
                    schema_table,
                    header=False,
                    fragment_size=fragments_size[0],
                )
                etl_results["t_readcsv"] += timer() - t0
                etl_results["t_connect"] = omnisci_server_worker.get_conn_creation_time()

    # Second connection - this is Ibis's IPC connection for DML
    omnisci_server_worker.connect_to_server(database_name, ipc=ipc_connection)
    etl_results["t_connect"] += omnisci_server_worker.get_conn_creation_time()
    t0 = timer()
    table = omnisci_server_worker.database(database_name).table(table_name)
    etl_results["t_connect"] += timer() - t0

    queries_parameters = {
        query_name: {
            "table": table,
            "input_for_validation": input_for_validation,
            "debug_mode": debug_mode,
        }
        for query_name in queries.keys()
    }
    return run_queries(queries=queries, parameters=queries_parameters, etl_results=etl_results)
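
# `run_queries` is defined elsewhere in the benchmark; a minimal sketch
# consistent with the call above (the actual implementation may differ, e.g.
# the queries may record their own timings): run each query callable with its
# parameters and store the elapsed time under the query's name.
from timeit import default_timer as timer

def run_queries(queries, parameters, etl_results):
    for query_name, query_func in queries.items():
        t0 = timer()
        query_func(**parameters[query_name])
        etl_results[query_name] = timer() - t0
    return etl_results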
default="1234567890123456789012345678901234567890", help="Ibis commit hash to use for tests.") try: args = parser.parse_args() if args.i < 1: print("Bad number of iterations specified", args.i) datafile_columns_names = ["ID_code", "target"] + [ "var_" + str(index) for index in range(200) ] datafile_columns_types = ["string", "int64" ] + ["float64" for _ in range(200)] schema_train = ibis.Schema(names=datafile_columns_names, types=datafile_columns_types) database_name = args.n omnisci_server = OmnisciServer(omnisci_executable=args.e, omnisci_port=args.port, database_name=database_name, user=args.u, password=args.p) omnisci_server.launch() omnisci_server_worker = OmnisciServerWorker(omnisci_server) time.sleep(2) conn = omnisci_server_worker.connect_to_server() db_reporter = None if args.db_user is not "":