Example #1
    def import_data_from_pd_df(self, table_name, pd_obj, columns_names,
                               columns_types):
        "Import table data using Ibis load_data to the OmniSciDB from the Pandas.DataFrame"

        schema_table = ibis.Schema(names=columns_names, types=columns_types)

        if not self._conn.exists_table(
                name=table_name, database=self.omnisci_server.database_name):
            try:
                self._conn.create_table(
                    table_name=table_name,
                    schema=schema_table,
                    database=self.omnisci_server.database_name,
                )
            except Exception as err:
                print("Failed to create table:", err)

        self._conn.load_data(
            table_name=table_name,
            obj=pd_obj,
            database=self.omnisci_server.database_name,
            method="columnar",
        )

        return self._conn.database(
            self.omnisci_server.database_name).table(table_name)
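A minimal usage sketch for import_data_from_pd_df; the worker instance, table name, columns, and DataFrame contents below are assumptions for illustration, not part of the original example.

import pandas as pd

# "worker" stands in for an instance of the class that defines the method above
df = pd.DataFrame({"id": [1, 2, 3], "value": [0.1, 0.2, 0.3]})
table = worker.import_data_from_pd_df(
    table_name="demo_table",
    pd_obj=df,
    columns_names=["id", "value"],
    columns_types=["int64", "float64"],
)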
Example #2
    def import_data(
        self,
        table_name,
        data_files_names,
        files_limit,
        columns_names,
        columns_types,
        header=False,
    ):
        "Import CSV files to the OmniSciDB using COPY SQL statement"

        header_value = "true" if header else "false"

        schema_table = ibis.Schema(names=columns_names, types=columns_types)

        if not self._conn.exists_table(
                name=table_name, database=self.omnisci_server.database_name):
            try:
                self._conn.create_table(
                    table_name=table_name,
                    schema=schema_table,
                    database=self.omnisci_server.database_name,
                )
            except Exception as err:
                print("Failed to create table:", err)

        for f in data_files_names[:files_limit]:
            print("Importing datafile", f)
            copy_str = self._command_2_import_CSV % (table_name, f,
                                                     header_value)

            omnisci_cmd_line = self._get_omnisci_cmd_line()
            try:
                import_process = subprocess.Popen(
                    omnisci_cmd_line,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT,
                    stdin=subprocess.PIPE,
                )
                output = import_process.communicate(copy_str.encode())
            except OSError as err:
                print(f"Failed to start '{omnisci_cmd_line}'", err)
                continue

            print(output[0].strip().decode())
            print("Command returned", import_process.returncode)
Example #3
    def import_data_by_ibis(self,
                            table_name,
                            data_files_names,
                            files_limit,
                            columns_names,
                            columns_types,
                            cast_dict,
                            header=None):
        "Import CSV files using Ibis load_data from the Pandas.DataFrame"

        schema_table = ibis.Schema(names=columns_names, types=columns_types)

        if not self._conn.exists_table(name=table_name,
                                       database=self._database_name):
            try:
                self._conn.create_table(table_name=table_name,
                                        schema=schema_table,
                                        database=self._database_name)
            except Exception as err:
                print("Failed to create table:", err)

        t0 = time.time()
        if files_limit > 1:
            pandas_df_from_each_file = (
                self._read_csv_datafile(file_name, columns_names, header)
                for file_name in data_files_names[:files_limit]
            )
            self._imported_pd_df[table_name] = pd.concat(
                pandas_df_from_each_file, ignore_index=True)
        else:
            self._imported_pd_df[table_name] = self._read_csv_datafile(
                data_files_names, columns_names, header)

        t_import_pandas = time.time() - t0

        pandas_concatenated_df_casted = self._imported_pd_df[
            table_name].astype(dtype=cast_dict, copy=True)

        t0 = time.time()
        self._conn.load_data(table_name=table_name,
                             obj=pandas_concatenated_df_casted,
                             database=self._database_name)
        t_import_ibis = time.time() - t0

        return t_import_pandas, t_import_ibis
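Since cast_dict is handed straight to pandas.DataFrame.astype, it maps column names to target dtypes. A hypothetical call matching the signature above (all names, types, and the file below are placeholders):

t_pandas, t_ibis = worker.import_data_by_ibis(
    table_name="demo_table",
    data_files_names="demo.csv",
    files_limit=1,
    columns_names=["id", "value"],
    columns_types=["int64", "float64"],
    cast_dict={"id": "int64", "value": "float32"},
    header=0,
)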
Example #4
def etl_ibis(
    filename,
    columns_names,
    columns_types,
    database_name,
    table_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    validation,
    etl_keys,
    import_mode,
):
    import ibis

    etl_times = {key: 0.0 for key in etl_keys}

    omnisci_server_worker.create_database(database_name,
                                          delete_if_exists=delete_old_database)

    # Create table and import data
    if create_new_table:
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        if import_mode == "copy-from":
            # Create table and import data for ETL queries
            omnisci_server_worker.create_table(
                table_name=table_name,
                schema=schema_table,
                database=database_name,
            )
            table_import = omnisci_server_worker.database(database_name).table(
                table_name)

            t0 = timer()
            table_import.read_csv(filename,
                                  header=True,
                                  quotechar="",
                                  delimiter=",")
            etl_times["t_readcsv"] = round((timer() - t0) * 1000)

        elif import_mode == "pandas":
            # Datafiles import
            t_import_pandas, t_import_ibis = omnisci_server_worker.import_data_by_ibis(
                table_name=table_name,
                data_files_names=filename,
                files_limit=1,
                columns_names=columns_names,
                columns_types=columns_types,
                header=0,
                nrows=None,
                compression_type="gzip" if filename.endswith("gz") else None,
                validation=validation,
            )
            etl_times["t_readcsv"] = round(
                (t_import_pandas + t_import_ibis) * 1000)

        elif import_mode == "fsi":
            try:
                unzip_name = None
                if filename.endswith("gz"):
                    import gzip

                    unzip_name = "/tmp/census-fsi.csv"

                    with gzip.open(filename, "rb") as gz_input:
                        with open(unzip_name, "wb") as output:
                            output.write(gz_input.read())

                t0 = timer()
                omnisci_server_worker._conn.create_table_from_csv(
                    table_name, unzip_name or filename, schema_table)
                etl_times["t_readcsv"] = round((timer() - t0) * 1000)

            finally:
                if unzip_name is not None:
                    import os

                    os.remove(unzip_name)

    # Second connection - this is ibis's ipc connection for DML
    omnisci_server_worker.connect_to_server(database_name, ipc=ipc_connection)
    table = omnisci_server_worker.database(database_name).table(table_name)

    t_etl_start = timer()

    keep_cols = [
        "YEAR0",
        "DATANUM",
        "SERIAL",
        "CBSERIAL",
        "HHWT",
        "CPI99",
        "GQ",
        "PERNUM",
        "SEX",
        "AGE",
        "INCTOT",
        "EDUC",
        "EDUCD",
        "EDUC_HEAD",
        "EDUC_POP",
        "EDUC_MOM",
        "EDUCD_MOM2",
        "EDUCD_POP2",
        "INCTOT_MOM",
        "INCTOT_POP",
        "INCTOT_MOM2",
        "INCTOT_POP2",
        "INCTOT_HEAD",
        "SEX_HEAD",
    ]

    if import_mode == "pandas" and validation:
        keep_cols.append("id")

    table = table[keep_cols]

    # First, apply all the filters; this eliminates redundant fillna operations for EDUC and EDUCD
    table = table[table.INCTOT != 9999999]
    table = table[table["EDUC"].notnull()]
    table = table[table["EDUCD"].notnull()]

    table = table.set_column("INCTOT", table["INCTOT"] * table["CPI99"])

    cols = []
    # final fillna and casting for necessary columns
    for column in keep_cols:
        cols.append(ibis.case().when(
            table[column].notnull(),
            table[column]).else_(-1).end().cast("float64").name(column))

    table = table.mutate(cols)

    df = table.execute()

    if import_mode == "pandas" and validation:
        df.index = df["id"].values

    # here we use pandas to split the table
    y = df["EDUC"]
    X = df.drop(["EDUC", "CPI99"], axis=1)

    etl_times["t_etl"] = round((timer() - t_etl_start) * 1000)
    print("DataFrame shape:", X.shape)

    return df, X, y, etl_times
Example #5
def load_data_ibis(
    dataset_path,
    database_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    skip_rows,
    validation,
    dtypes,
    meta_dtypes,
    import_mode,
    fragments_size,
):
    fragments_size = check_fragments_size(
        fragments_size,
        count_table=4,
        import_mode=import_mode,
        default_fragments_size=[32000000, 32000000, 32000000, 32000000],
    )

    omnisci_server_worker.create_database(database_name, delete_if_exists=delete_old_database)

    t_readcsv = 0.0
    t_connect = 0.0

    # Create tables and import data
    if create_new_table:
        import ibis

        training_file = "%s/training_set.csv" % dataset_path
        # COPY FROM doesn't have skip_rows option
        test_file = "%s/test_set_skiprows.csv" % dataset_path
        training_meta_file = "%s/training_set_metadata.csv" % dataset_path
        test_meta_file = "%s/test_set_metadata.csv" % dataset_path

        schema = ibis.Schema(names=dtypes.keys(), types=dtypes.values())
        meta_schema = ibis.Schema(names=meta_dtypes.keys(), types=meta_dtypes.values())

        target = meta_dtypes.pop("target")
        meta_schema_without_target = ibis.Schema(
            names=meta_dtypes.keys(), types=meta_dtypes.values()
        )
        meta_dtypes["target"] = target

        if import_mode == "copy-from":
            # create tables
            t0 = timer()
            omnisci_server_worker.create_table(
                table_name="training",
                schema=schema,
                database=database_name,
                fragment_size=fragments_size[0],
            )
            omnisci_server_worker.create_table(
                table_name="test",
                schema=schema,
                database=database_name,
                fragment_size=fragments_size[1],
            )
            omnisci_server_worker.create_table(
                table_name="training_meta",
                schema=meta_schema,
                database=database_name,
                fragment_size=fragments_size[2],
            )
            omnisci_server_worker.create_table(
                table_name="test_meta",
                schema=meta_schema_without_target,
                database=database_name,
                fragment_size=fragments_size[3],
            )

            # get tables
            db = omnisci_server_worker.database(database_name)
            training_table = db.table("training")
            test_table = db.table("test")
            training_meta_table = db.table("training_meta")
            test_meta_table = db.table("test_meta")
            t_connect = timer() - t0

            # measuring time of reading
            t0 = timer()
            training_table.read_csv(training_file, header=True, quotechar="", delimiter=",")
            test_table.read_csv(test_file, header=True, quotechar="", delimiter=",")
            training_meta_table.read_csv(
                training_meta_file, header=True, quotechar="", delimiter=","
            )
            test_meta_table.read_csv(test_meta_file, header=True, quotechar="", delimiter=",")
            t_readcsv = timer() - t0

        elif import_mode == "pandas":
            general_options = {
                "files_limit": 1,
                "columns_names": list(dtypes.keys()),
                "columns_types": list(dtypes.values()),
                "header": 0,
                "nrows": None,
                "compression_type": None,
                "validation": validation,
            }

            # create table #1
            t_import_pandas_1, t_import_ibis_1 = omnisci_server_worker.import_data_by_ibis(
                table_name="training",
                data_files_names="%s/training_set.csv" % dataset_path,
                **general_options,
            )

            # create table #2
            t_import_pandas_2, t_import_ibis_2 = omnisci_server_worker.import_data_by_ibis(
                table_name="test",
                data_files_names="%s/test_set.csv" % dataset_path,
                skiprows=skip_rows,
                **general_options,
            )

            # before reading meta test files, we should update columns_names
            # and columns_types in general options with its proper values
            general_options["columns_names"] = list(meta_dtypes.keys())
            general_options["columns_types"] = list(meta_dtypes.values())

            # create table #3
            t_import_pandas_3, t_import_ibis_3 = omnisci_server_worker.import_data_by_ibis(
                table_name="training_meta",
                data_files_names="%s/training_set_metadata.csv" % dataset_path,
                **general_options,
            )

            target = meta_dtypes.pop("target")
            general_options["columns_names"] = list(meta_dtypes.keys())
            general_options["columns_types"] = list(meta_dtypes.values())

            # create table #4
            t_import_pandas_4, t_import_ibis_4 = omnisci_server_worker.import_data_by_ibis(
                table_name="test_meta",
                data_files_names="%s/test_set_metadata.csv" % dataset_path,
                **general_options,
            )
            meta_dtypes["target"] = target

            t_import_pandas = (
                t_import_pandas_1 + t_import_pandas_2 + t_import_pandas_3 + t_import_pandas_4
            )
            t_import_ibis = t_import_ibis_1 + t_import_ibis_2 + t_import_ibis_3 + t_import_ibis_4
            print(f"import times: pandas - {t_import_pandas}s, ibis - {t_import_ibis}s")
            t_readcsv = t_import_pandas + t_import_ibis
            t_connect += omnisci_server_worker.get_conn_creation_time()

        elif import_mode == "fsi":
            t0 = timer()
            omnisci_server_worker._conn.create_table_from_csv(
                "training",
                training_file,
                schema,
                fragment_size=fragments_size[0],
            )
            omnisci_server_worker._conn.create_table_from_csv(
                "test",
                test_file,
                schema,
                fragment_size=fragments_size[1],
            )
            omnisci_server_worker._conn.create_table_from_csv(
                "training_meta",
                training_meta_file,
                meta_schema,
                fragment_size=fragments_size[2],
            )
            omnisci_server_worker._conn.create_table_from_csv(
                "test_meta",
                test_meta_file,
                meta_schema_without_target,
                fragment_size=fragments_size[3],
            )
            t_readcsv = timer() - t0
            t_connect += omnisci_server_worker.get_conn_creation_time()

    # Second connection - this is ibis's ipc connection for DML
    t0 = timer()
    omnisci_server_worker.connect_to_server(database_name, ipc=ipc_connection)
    db = omnisci_server_worker.database(database_name)
    t_connect += timer() - t0

    training_table = db.table("training")
    test_table = db.table("test")

    training_meta_table = db.table("training_meta")
    test_meta_table = db.table("test_meta")

    return (
        training_table,
        training_meta_table,
        test_table,
        test_meta_table,
        t_readcsv,
        t_connect,
    )
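check_fragments_size is referenced in these examples but never defined. The sketch below is an assumption, not the original helper; it is consistent with both call sites (one passes default_fragments_size, one does not):

def check_fragments_size(fragments_size, count_table, import_mode, default_fragments_size=None):
    # import_mode is accepted for parity with the callers; this sketch ignores it
    if fragments_size is None:
        # fall back to the supplied defaults, or to "no explicit fragment size" per table
        fragments_size = default_fragments_size or [None] * count_table
    assert len(fragments_size) == count_table, "expected one fragment size per table"
    return fragments_size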
Example #6
def etl_ibis(
    filename,
    columns_names,
    columns_types,
    database_name,
    table_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    validation,
    etl_keys,
    import_mode,
    fragments_size,
):
    etl_times = {key: 0.0 for key in etl_keys}

    fragments_size = check_fragments_size(fragments_size,
                                          count_table=1,
                                          import_mode=import_mode)

    omnisci_server_worker.create_database(database_name,
                                          delete_if_exists=delete_old_database)

    if create_new_table:
        # Create table and import data for ETL queries
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        if import_mode == "copy-from":
            t0 = timer()
            omnisci_server_worker.create_table(
                table_name=table_name,
                schema=schema_table,
                database=database_name,
                fragment_size=fragments_size[0],
            )
            table_import = omnisci_server_worker.database(database_name).table(
                table_name)
            etl_times["t_connect"] += timer() - t0

            t0 = timer()
            table_import.read_csv(filename,
                                  header=True,
                                  quotechar="",
                                  delimiter=",")
            etl_times["t_readcsv"] = timer() - t0

        elif import_mode == "pandas":
            # decimal(8, 4) is widened to decimal(9, 6) to improve conversion accuracy
            # when importing from pandas into OmniSciDB, so that results validate properly
            columns_types = [
                "decimal(9, 6)" if (x == "decimal(8, 4)") else x
                for x in columns_types
            ]
            t_import_pandas, t_import_ibis = omnisci_server_worker.import_data_by_ibis(
                table_name=table_name,
                data_files_names=filename,
                files_limit=1,
                columns_names=columns_names,
                columns_types=columns_types,
                header=0,
                nrows=None,
                compression_type="gzip" if filename.endswith(".gz") else None,
                use_columns_types_for_pd=False,
            )
            etl_times["t_readcsv"] = t_import_pandas + t_import_ibis
            etl_times["t_connect"] += omnisci_server_worker.get_conn_creation_time()

        elif import_mode == "fsi":
            try:
                unzip_name = None
                if filename.endswith(".gz"):
                    import gzip

                    unzip_name = get_tmp_filepath("santander-fsi.csv")

                    with gzip.open(filename, "rb") as gz_input:
                        with open(unzip_name, "wb") as output:
                            output.write(gz_input.read())

                t0 = timer()
                omnisci_server_worker._conn.create_table_from_csv(
                    table_name,
                    unzip_name or filename,
                    schema_table,
                    fragment_size=fragments_size[0],
                )
                etl_times["t_readcsv"] = timer() - t0
                etl_times[
                    "t_connect"] += omnisci_server_worker.get_conn_creation_time(
                    )

            finally:
                if unzip_name is not None:
                    import os

                    os.remove(unzip_name)

    # Second connection - this is ibis's ipc connection for DML
    t0 = timer()
    omnisci_server_worker.connect_to_server(database_name, ipc=ipc_connection)
    table = omnisci_server_worker.database(database_name).table(table_name)
    etl_times["t_connect"] += timer() - t0

    # group_by/count, merge (join) and filtration queries
    # We create 400 new columns and then insert them into the original table,
    # thus avoiding nested SQL requests
    t_etl_start = timer()
    count_cols = []
    orig_cols = ["ID_code", "target"] + ["var_%s" % i for i in range(200)]
    cast_cols = []
    cast_cols.append(table["target"].cast("int64").name("target0"))
    gt1_cols = []
    for i in range(200):
        col = "var_%d" % i
        col_count = "var_%d_count" % i
        col_gt1 = "var_%d_gt1" % i
        w = ibis.window(group_by=col)
        count_cols.append(table[col].count().over(w).name(col_count))
        gt1_cols.append(ibis.case().when(
            table[col].count().over(w) > 1,
            table[col].cast("float32"),
        ).else_(ibis.null()).end().name(col_gt1))
        cast_cols.append(table[col].cast("float32").name(col))

    table = table.mutate(count_cols)
    table = table.drop(orig_cols)
    table = table.mutate(gt1_cols)
    table = table.mutate(cast_cols)

    table_df = table.execute()

    etl_times["t_etl"] = timer() - t_etl_start
    return table_df, etl_times
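The per-column count built in the loop above is a window aggregate. For a single column, the fragment below (illustrative, not from the original; "table" is the ibis table from the example) shows the expression and the SQL it roughly corresponds to:

# Illustrative fragment for one column, var_0
w = ibis.window(group_by="var_0")
var_0_count = table["var_0"].count().over(w).name("var_0_count")
# roughly: COUNT("var_0") OVER (PARTITION BY "var_0") AS "var_0_count"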
Example #7
def etl_ibis(args, run_import_queries, columns_names, columns_types, validation=False):

    filename = args.file
    database_name = args.name
    table_name = args.table
    delete_old_database = not args.dnd
    create_new_table = not args.dni
    run_import_queries = str_arg_to_bool(run_import_queries)
    validation = str_arg_to_bool(validation)

    tmp_table_name = "tmp_table"

    etl_times = {"t_groupby_merge_where": 0.0, "t_train_test_split": 0.0, "t_etl": 0.0}

    if run_import_queries:
        etl_times_import = {
            "t_readcsv_by_ibis": 0.0,
            "t_readcsv_by_COPY": 0.0,
            "t_readcsv_by_FSI": 0.0,
        }
        etl_times.update(etl_times_import)

    omnisci_server = OmnisciServer(
        omnisci_executable=args.omnisci_executable,
        omnisci_port=args.omnisci_port,
        database_name=args.name,
        user=args.user,
        password=args.password,
        debug_timer=True,
        columnar_output=args.server_columnar_output,
        lazy_fetch=args.server_lazy_fetch,
    )
    omnisci_server.launch()

    import ibis
    from server_worker import OmnisciServerWorker

    omnisci_server_worker = OmnisciServerWorker(omnisci_server)
    omnisci_server_worker.create_database(
        database_name, delete_if_exists=delete_old_database
    )

    time.sleep(2)
    omnisci_server_worker.connect_to_server()

    if run_import_queries:
        # SQL statement templates for the data file import queries
        connect_to_db_sql_template = r"\c {0} admin HyperInteractive"
        create_table_sql_template = """
        CREATE TABLE {0} ({1});
        """
        import_by_COPY_sql_template = """
        COPY {0} FROM '{1}' WITH (header='{2}');
        """
        import_by_FSI_sql_template = """
        CREATE TEMPORARY TABLE {0} ({1}) WITH (storage_type='CSV:{2}');
        """
        drop_table_sql_template = """
        DROP TABLE IF EXISTS {0};
        """

        import_query_cols_list = (
            ["ID_code TEXT ENCODING NONE, \n", "target SMALLINT, \n"]
            + ["var_%s DOUBLE, \n" % i for i in range(199)]
            + ["var_199 DOUBLE"]
        )
        import_query_cols_str = "".join(import_query_cols_list)

        connect_to_db_sql = connect_to_db_sql_template.format(database_name)
        create_table_sql = create_table_sql_template.format(
            tmp_table_name, import_query_cols_str
        )
        import_by_COPY_sql = import_by_COPY_sql_template.format(
            tmp_table_name, filename, "true"
        )
        import_by_FSI_sql = import_by_FSI_sql_template.format(
            tmp_table_name, import_query_cols_str, filename
        )

        # data file import by ibis
        columns_types_import_query = ["string", "int64"] + [
            "float64" for _ in range(200)
        ]
        schema_table_import = ibis.Schema(
            names=columns_names, types=columns_types_import_query
        )
        omnisci_server_worker.get_conn().create_table(
            table_name=tmp_table_name,
            schema=schema_table_import,
            database=database_name,
            fragment_size=args.fragment_size,
        )

        table_import_query = omnisci_server_worker.database(database_name).table(tmp_table_name)
        t0 = timer()
        table_import_query.read_csv(filename, delimiter=",")
        etl_times["t_readcsv_by_ibis"] = timer() - t0

        # data file import by FSI
        omnisci_server_worker.drop_table(tmp_table_name)
        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_FSI_sql)
        etl_times["t_readcsv_by_FSI"] = timer() - t0

        omnisci_server_worker.drop_table(tmp_table_name)

        # data file import by SQL COPY statement
        omnisci_server_worker.execute_sql_query(create_table_sql)

        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_COPY_sql)
        etl_times["t_readcsv_by_COPY"] = timer() - t0

        omnisci_server_worker.drop_table(tmp_table_name)

    if create_new_table:
        # Create table and import data for ETL queries
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        omnisci_server_worker.get_conn().create_table(
            table_name=table_name,
            schema=schema_table,
            database=database_name,
            fragment_size=args.fragment_size,
        )

        table_import = omnisci_server_worker.database(database_name).table(table_name)
        table_import.read_csv(filename, delimiter=",")

    if args.server_conn_type == "regular":
        omnisci_server_worker.connect_to_server()
    elif args.server_conn_type == "ipc":
        omnisci_server_worker.ipc_connect_to_server()
    else:
        print("Wrong connection type specified:", args.server_conn_type)
        sys.exit(2)

    db = omnisci_server_worker.database(database_name)
    table = db.table(table_name)

    # group_by/count, merge (join) and filtration queries
    # We create 400 new columns and then insert them into the original table,
    # thus avoiding nested SQL requests
    t0 = timer()
    count_cols = []
    orig_cols = ["ID_code", "target"] + ["var_%s" % i for i in range(200)]
    cast_cols = []
    cast_cols.append(table["target"].cast("int64").name("target0"))
    gt1_cols = []
    for i in range(200):
        col = "var_%d" % i
        col_count = "var_%d_count" % i
        col_gt1 = "var_%d_gt1" % i
        w = ibis.window(group_by=col)
        count_cols.append(table[col].count().over(w).name(col_count))
        gt1_cols.append(
            ibis.case()
            .when(
                table[col].count().over(w) > 1,
                table[col].cast("float32"),
            )
            .else_(ibis.null())
            .end()
            .name(col_gt1)
        )
        cast_cols.append(table[col].cast("float32").name(col))

    table = table.mutate(count_cols)
    table = table.drop(orig_cols)
    table = table.mutate(gt1_cols)
    table = table.mutate(cast_cols)

    table_df = table.execute()
    etl_times["t_groupby_merge_where"] = timer() - t0

    # rows split query
    t0 = timer()
    training_part, validation_part = table_df[:-10000], table_df[-10000:]
    etl_times["t_train_test_split"] = timer() - t0
    
    etl_times["t_etl"] = etl_times["t_groupby_merge_where"] + etl_times["t_train_test_split"]
    
    x_train = training_part.drop(["target0"], axis=1)
    y_train = training_part["target0"]
    x_valid = validation_part.drop(["target0"], axis=1)
    y_valid = validation_part["target0"]
    
    omnisci_server.terminate()
    omnisci_server = None

    return x_train, y_train, x_valid, y_valid, etl_times
parser.add_argument("-commit", default="1234567890123456789012345678901234567890", help="Commit hash to use to record this benchmark results.")

args = parser.parse_args()

if args.i < 1:
    print("Bad number of iterations specified", args.i)
    sys.exit(1)

def print_omnisci_output(stdout):
    for line in iter(stdout.readline, b''):
        print("OMNISCI>>", line.decode().strip())
    
datafile_columns_names = ["ID_code", "target"] + ["var_" + str(index) for index in range(200)]
datafile_columns_types = ["string", "int16"] + ["float32" for _ in range(200)]

schema_train = ibis.Schema(
    names=datafile_columns_names,
    types=datafile_columns_types,
)

omnisci_server = server.Omnisci_server(omnisci_executable=args.e, omnisci_port=args.port, database_name=database_name)
omnisci_server.launch()

time.sleep(2)
conn = omnisci_server.connect_to_server()

db_reporter = None
if args.db_user != "":
    print("Connecting to database")
    db = mysql.connector.connect(host=args.db_server, port=args.db_port, user=args.db_user, passwd=args.db_pass, db=args.db_name)
    db_reporter = report.DbReport(db, args.db_table, {
        'QueryName': 'VARCHAR(500) NOT NULL',
        'FirstExecTimeMS': 'BIGINT UNSIGNED',
Example #9
def etl_ibis(
    filename,
    files_limit,
    columns_names,
    columns_types,
    database_name,
    table_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    input_for_validation,
    import_mode,
    fragments_size,
    debug_mode,
):
    import ibis

    fragments_size = check_fragments_size(fragments_size, count_table=1, import_mode=import_mode)

    queries = {"Query1": q1_ibis, "Query2": q2_ibis, "Query3": q3_ibis, "Query4": q4_ibis}
    etl_results = {x: 0.0 for x in queries.keys()}
    etl_results["t_readcsv"] = 0.0
    etl_results["t_connect"] = 0.0

    omnisci_server_worker.connect_to_server()

    data_files_names = files_names_from_pattern(filename)

    if len(data_files_names) == 0:
        raise FileNotFoundError(f"Could not find any data files matching: [{filename}]")

    data_files_extension = data_files_names[0].split(".")[-1]
    if not all([name.endswith(data_files_extension) for name in data_files_names]):
        raise NotImplementedError(
            "Import of data files with different extensions is not supported"
        )

    omnisci_server_worker.create_database(database_name, delete_if_exists=delete_old_database)

    # Create table and import data for ETL queries
    if create_new_table:
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        if import_mode == "copy-from":
            t0 = timer()
            omnisci_server_worker.create_table(
                table_name=table_name,
                schema=schema_table,
                database=database_name,
                fragment_size=fragments_size[0],
            )
            etl_results["t_connect"] += timer() - t0
            table_import = omnisci_server_worker.database(database_name).table(table_name)
            etl_results["t_connect"] += omnisci_server_worker.get_conn_creation_time()

            for file_to_import in data_files_names[:files_limit]:
                t0 = timer()
                table_import.read_csv(file_to_import, header=False, quotechar='"', delimiter=",")
                etl_results["t_readcsv"] += timer() - t0

        elif import_mode == "pandas":
            # pymapd load_table (which is called internally by import_data_by_ibis)
            # needs homogeneous data, and since the vendor_id and payment_type fields
            # from the trips_xad file contain text data, the following workaround and check are used
            columns_types[1] = "int64"
            columns_types[20] = "int64"
            files_names = [
                file_path.split("/")[-1].split(".")[0]
                for file_path in data_files_names[:files_limit]
            ]
            if not all(
                [
                    file_name in accepted_data_files_for_pandas_import_mode
                    for file_name in files_names
                ]
            ):
                raise AttributeError(
                    f"pandas import_mode is supported only for {accepted_data_files_for_pandas_import_mode} data files, actually passed {files_names}"
                )
            t_import_pandas, t_import_ibis = omnisci_server_worker.import_data_by_ibis(
                table_name=table_name,
                data_files_names=data_files_names,
                files_limit=files_limit,
                columns_names=columns_names,
                columns_types=columns_types,
                header=None,
                nrows=None,
                compression_type="gzip" if data_files_extension == "gz" else None,
                use_columns_types_for_pd=False,
            )

            etl_results["t_readcsv"] = t_import_pandas + t_import_ibis
            etl_results["t_connect"] = omnisci_server_worker.get_conn_creation_time()

        elif import_mode == "fsi":
            with FilesCombiner(
                data_files_names=data_files_names,
                combined_filename=f"taxibench-{files_limit}--files-fsi.csv",
                files_limit=files_limit,
            ) as data_file_path:
                t0 = timer()
                omnisci_server_worker.get_conn().create_table_from_csv(
                    table_name,
                    data_file_path,
                    schema_table,
                    header=False,
                    fragment_size=fragments_size[0],
                )
                etl_results["t_readcsv"] += timer() - t0
                etl_results["t_connect"] = omnisci_server_worker.get_conn_creation_time()

    # Second connection - this is ibis's ipc connection for DML
    omnisci_server_worker.connect_to_server(database_name, ipc=ipc_connection)
    etl_results["t_connect"] += omnisci_server_worker.get_conn_creation_time()
    t0 = timer()
    table = omnisci_server_worker.database(database_name).table(table_name)
    etl_results["t_connect"] += timer() - t0

    queries_parameters = {
        query_name: {
            "table": table,
            "input_for_validation": input_for_validation,
            "debug_mode": debug_mode,
        }
        for query_name in queries.keys()
    }
    return run_queries(queries=queries, parameters=queries_parameters, etl_results=etl_results)
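files_names_from_pattern is not defined in these examples; since it expands a filename pattern into an ordered list of data file paths, a glob-based sketch is a reasonable assumption:

import glob

def files_names_from_pattern(files_pattern):
    # assumed behavior: expand the pattern and keep a stable order
    return sorted(glob.glob(files_pattern))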
Example #10
                    default="1234567890123456789012345678901234567890",
                    help="Ibis commit hash to use for tests.")

try:
    args = parser.parse_args()

    if args.i < 1:
        print("Bad number of iterations specified", args.i)
        sys.exit(1)

    datafile_columns_names = ["ID_code", "target"] + [
        "var_" + str(index) for index in range(200)
    ]
    datafile_columns_types = ["string", "int64"
                              ] + ["float64" for _ in range(200)]

    schema_train = ibis.Schema(names=datafile_columns_names,
                               types=datafile_columns_types)

    database_name = args.n
    omnisci_server = OmnisciServer(omnisci_executable=args.e,
                                   omnisci_port=args.port,
                                   database_name=database_name,
                                   user=args.u,
                                   password=args.p)
    omnisci_server.launch()
    omnisci_server_worker = OmnisciServerWorker(omnisci_server)

    time.sleep(2)
    conn = omnisci_server_worker.connect_to_server()

    db_reporter = None
    if args.db_user != "":