def run_benchmark(parameters):
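    # Relies on helpers defined elsewhere in the benchmark module (files_names_from_pattern,
    # import_pandas_into_module_namespace, print_results, etl_pandas, etl_ibis) and on
    # module-level imports of sys, traceback and warnings.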

    ignored_parameters = {
        "optimizer": parameters["optimizer"],
        "no_ml": parameters["no_ml"],
        "gpu_memory": parameters["gpu_memory"],
    }
    warnings.warn(f"Parameters {ignored_parameters} are ignored",
                  RuntimeWarning)

    parameters["data_file"] = parameters["data_file"].replace("'", "")

    columns_names = [
        "trip_id",
        "vendor_id",
        "pickup_datetime",
        "dropoff_datetime",
        "store_and_fwd_flag",
        "rate_code_id",
        "pickup_longitude",
        "pickup_latitude",
        "dropoff_longitude",
        "dropoff_latitude",
        "passenger_count",
        "trip_distance",
        "fare_amount",
        "extra",
        "mta_tax",
        "tip_amount",
        "tolls_amount",
        "ehail_fee",
        "improvement_surcharge",
        "total_amount",
        "payment_type",
        "trip_type",
        "pickup",
        "dropoff",
        "cab_type",
        "precipitation",
        "snow_depth",
        "snowfall",
        "max_temperature",
        "min_temperature",
        "average_wind_speed",
        "pickup_nyct2010_gid",
        "pickup_ctlabel",
        "pickup_borocode",
        "pickup_boroname",
        "pickup_ct2010",
        "pickup_boroct2010",
        "pickup_cdeligibil",
        "pickup_ntacode",
        "pickup_ntaname",
        "pickup_puma",
        "dropoff_nyct2010_gid",
        "dropoff_ctlabel",
        "dropoff_borocode",
        "dropoff_boroname",
        "dropoff_ct2010",
        "dropoff_boroct2010",
        "dropoff_cdeligibil",
        "dropoff_ntacode",
        "dropoff_ntaname",
        "dropoff_puma",
    ]

    # Column dtypes, positionally aligned with columns_names above
    columns_types = [
        "int64",
        "int64",
        "timestamp",
        "timestamp",
        "string",
        "int64",
        "float64",
        "float64",
        "float64",
        "float64",
        "int64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "int64",
        "float64",
        "string",
        "string",
        "string",
        "float64",
        "int64",
        "float64",
        "int64",
        "int64",
        "float64",
        "float64",
        "float64",
        "float64",
        "string",
        "float64",
        "float64",
        "string",
        "string",
        "string",
        "float64",
        "float64",
        "float64",
        "float64",
        "string",
        "float64",
        "float64",
        "string",
        "string",
        "string",
        "float64",
    ]

    if parameters["dfiles_num"] <= 0:
        print("Bad number of data files specified: ", parameters["dfiles_num"])
        sys.exit(1)
    try:
        import_pandas_into_module_namespace(
            namespace=run_benchmark.__globals__,
            mode=parameters["pandas_mode"],
            ray_tmpdir=parameters["ray_tmpdir"],
            ray_memory=parameters["ray_memory"],
        )

        etl_times_ibis = None
        if not parameters["no_ibis"]:
            etl_times_ibis = etl_ibis(
                filename=parameters["data_file"],
                files_limit=parameters["dfiles_num"],
                columns_names=columns_names,
                columns_types=columns_types,
                database_name=parameters["database_name"],
                table_name=parameters["table"],
                omnisci_server_worker=parameters["omnisci_server_worker"],
                delete_old_database=not parameters["dnd"],
                ipc_connection=parameters["ipc_connection"],
                create_new_table=not parameters["dni"],
                validation=parameters["validation"],
            )

            print_results(results=etl_times_ibis, backend="Ibis", unit="ms")
            etl_times_ibis["Backend"] = "Ibis"

        pandas_files_limit = parameters["dfiles_num"]
        filename = files_names_from_pattern(
            parameters["data_file"])[:pandas_files_limit]
        etl_times = etl_pandas(
            filename=filename,
            files_limit=pandas_files_limit,
            columns_names=columns_names,
            columns_types=columns_types,
        )

        print_results(results=etl_times,
                      backend=parameters["pandas_mode"],
                      unit="ms")
        etl_times["Backend"] = parameters["pandas_mode"]

        return {"ETL": [etl_times_ibis, etl_times], "ML": []}
    except Exception:
        traceback.print_exc(file=sys.stdout)
        sys.exit(1)


def etl_ibis(
    filename,
    files_limit,
    columns_names,
    columns_types,
    database_name,
    table_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    validation,
):
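    # ETL phase of the Ibis benchmark: connect to the OmniSci server, (re)create the
    # database and table, import the CSV data, then run the four taxi queries via
    # run_queries, accumulating timings in etl_times.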

    queries = {
        "Query1": q1_ibis,
        "Query2": q2_ibis,
        "Query3": q3_ibis,
        "Query4": q4_ibis,
    }
    etl_times = {x: 0.0 for x in queries.keys()}

    queries_validation_results = {"q%s" % i: False for i in range(1, 5)}
    queries_validation_flags = {"q%s" % i: False for i in range(1, 5)}

    omnisci_server_worker.connect_to_server()

    data_files_names = files_names_from_pattern(filename)

    if len(data_files_names) == 0:
        print("Could not find any data files matching ", filename)
        sys.exit(2)

    omnisci_server_worker.create_database(database_name,
                                          delete_if_exists=delete_old_database)

    if create_new_table:
        # TODO t_import_pandas, t_import_ibis = omnisci_server_worker.import_data_by_ibis
        t0 = time.time()
        omnisci_server_worker.import_data(
            table_name=table_name,
            data_files_names=data_files_names,
            files_limit=files_limit,
            columns_names=columns_names,
            columns_types=columns_types,
            header=False,
        )
        etl_times["t_readcsv"] = time.time() - t0
        # etl_times["t_readcsv"] = t_import_pandas + t_import_ibis

    omnisci_server_worker.connect_to_server(database=database_name,
                                            ipc=ipc_connection)
    table = omnisci_server_worker.database(database_name).table(table_name)

    df_pandas = None
    if validation:
        df_pandas = validation_prereqs(omnisci_server_worker, data_files_names,
                                       files_limit, columns_names)

    queries_parameters = {
        "table": table,
        "df_pandas": df_pandas,
        "queries_validation_results": queries_validation_results,
        "queries_validation_flags": queries_validation_flags,
        "validation": validation,
    }
    return run_queries(queries=queries,
                       parameters=queries_parameters,
                       etl_times=etl_times)
Example #3
def etl_ibis(
    filename,
    files_limit,
    columns_names,
    columns_types,
    database_name,
    table_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    input_for_validation,
    import_mode,
    fragments_size,
    debug_mode,
):
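    # Ibis ETL with three data-import modes: "copy-from" (the OmniSci table reads the CSV
    # files directly), "pandas" (files loaded through pandas and imported via Ibis) and
    # "fsi" (a combined CSV file imported with create_table_from_csv). Connection-creation
    # and CSV-import times are accumulated separately in etl_results.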
    import ibis

    fragments_size = check_fragments_size(fragments_size, count_table=1, import_mode=import_mode)

    queries = {"Query1": q1_ibis, "Query2": q2_ibis, "Query3": q3_ibis, "Query4": q4_ibis}
    etl_results = {x: 0.0 for x in queries.keys()}
    etl_results["t_readcsv"] = 0.0
    etl_results["t_connect"] = 0.0

    omnisci_server_worker.connect_to_server()

    data_files_names = files_names_from_pattern(filename)

    if len(data_files_names) == 0:
        raise FileNotFoundError(f"Could not find any data files matching: [{filename}]")

    data_files_extension = data_files_names[0].split(".")[-1]
    if not all([name.endswith(data_files_extension) for name in data_files_names]):
        raise NotImplementedError(
            "Import of data files with different extensions is not supported"
        )

    omnisci_server_worker.create_database(database_name, delete_if_exists=delete_old_database)

    # Create table and import data for ETL queries
    if create_new_table:
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        if import_mode == "copy-from":
            t0 = timer()
            omnisci_server_worker.create_table(
                table_name=table_name,
                schema=schema_table,
                database=database_name,
                fragment_size=fragments_size[0],
            )
            etl_results["t_connect"] += timer() - t0
            table_import = omnisci_server_worker.database(database_name).table(table_name)
            etl_results["t_connect"] += omnisci_server_worker.get_conn_creation_time()

            for file_to_import in data_files_names[:files_limit]:
                t0 = timer()
                table_import.read_csv(file_to_import, header=False, quotechar='"', delimiter=",")
                etl_results["t_readcsv"] += timer() - t0

        elif import_mode == "pandas":
            # pymapd load_table (called internally by import_data_by_ibis) needs
            # homogeneous data; since the vendor_id and payment_type fields in the
            # trips_xad file contain text data, the workaround and check below are used
            columns_types[1] = "int64"
            columns_types[20] = "int64"
            files_names = [
                file_path.split("/")[-1].split(".")[0]
                for file_path in data_files_names[:files_limit]
            ]
            if not all(
                [
                    file_name in accepted_data_files_for_pandas_import_mode
                    for file_name in files_names
                ]
            ):
                raise AttributeError(
                    f"pandas import_mode is supported only for {accepted_data_files_for_pandas_import_mode} data files, actually passed {files_names}"
                )
            t_import_pandas, t_import_ibis = omnisci_server_worker.import_data_by_ibis(
                table_name=table_name,
                data_files_names=data_files_names,
                files_limit=files_limit,
                columns_names=columns_names,
                columns_types=columns_types,
                header=None,
                nrows=None,
                compression_type="gzip" if data_files_extension == "gz" else None,
                use_columns_types_for_pd=False,
            )

            etl_results["t_readcsv"] = t_import_pandas + t_import_ibis
            etl_results["t_connect"] = omnisci_server_worker.get_conn_creation_time()

        elif import_mode == "fsi":
            with FilesCombiner(
                data_files_names=data_files_names,
                combined_filename=f"taxibench-{files_limit}--files-fsi.csv",
                files_limit=files_limit,
            ) as data_file_path:
                t0 = timer()
                omnisci_server_worker.get_conn().create_table_from_csv(
                    table_name,
                    data_file_path,
                    schema_table,
                    header=False,
                    fragment_size=fragments_size[0],
                )
                etl_results["t_readcsv"] += timer() - t0
                etl_results["t_connect"] = omnisci_server_worker.get_conn_creation_time()

    # Second connection - this is ibis's ipc connection for DML
    omnisci_server_worker.connect_to_server(database_name, ipc=ipc_connection)
    etl_results["t_connect"] += omnisci_server_worker.get_conn_creation_time()
    t0 = timer()
    table = omnisci_server_worker.database(database_name).table(table_name)
    etl_results["t_connect"] += timer() - t0

    queries_parameters = {
        query_name: {
            "table": table,
            "input_for_validation": input_for_validation,
            "debug_mode": debug_mode,
        }
        for query_name in queries.keys()
    }
    return run_queries(queries=queries, parameters=queries_parameters, etl_results=etl_results)
Example #4
def run_benchmark(parameters):
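    # Benchmark driver: rejects unsupported parameters, then optionally runs the
    # pandas/Modin ETL and the Ibis ETL, returning both sets of timing results.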
    check_support(parameters, unsupported_params=["optimizer", "no_ml", "gpu_memory"])

    parameters["data_file"] = parameters["data_file"].replace("'", "")

    columns_names = [
        "trip_id",
        "vendor_id",
        "pickup_datetime",
        "dropoff_datetime",
        "store_and_fwd_flag",
        "rate_code_id",
        "pickup_longitude",
        "pickup_latitude",
        "dropoff_longitude",
        "dropoff_latitude",
        "passenger_count",
        "trip_distance",
        "fare_amount",
        "extra",
        "mta_tax",
        "tip_amount",
        "tolls_amount",
        "ehail_fee",
        "improvement_surcharge",
        "total_amount",
        "payment_type",
        "trip_type",
        "pickup",
        "dropoff",
        "cab_type",
        "precipitation",
        "snow_depth",
        "snowfall",
        "max_temperature",
        "min_temperature",
        "average_wind_speed",
        "pickup_nyct2010_gid",
        "pickup_ctlabel",
        "pickup_borocode",
        "pickup_boroname",
        "pickup_ct2010",
        "pickup_boroct2010",
        "pickup_cdeligibil",
        "pickup_ntacode",
        "pickup_ntaname",
        "pickup_puma",
        "dropoff_nyct2010_gid",
        "dropoff_ctlabel",
        "dropoff_borocode",
        "dropoff_boroname",
        "dropoff_ct2010",
        "dropoff_boroct2010",
        "dropoff_cdeligibil",
        "dropoff_ntacode",
        "dropoff_ntaname",
        "dropoff_puma",
    ]

    columns_types = [
        "int64",
        "category",
        "timestamp",
        "timestamp",
        "category",
        "int64",
        "float64",
        "float64",
        "float64",
        "float64",
        "int64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "category",
        "float64",
        "category",
        "category",
        "category",
        "float64",
        "int64",
        "float64",
        "int64",
        "int64",
        "float64",
        "float64",
        "float64",
        "float64",
        "category",
        "float64",
        "float64",
        "category",
        "category",
        "category",
        "float64",
        "float64",
        "float64",
        "float64",
        "category",
        "float64",
        "float64",
        "category",
        "category",
        "category",
        "float64",
    ]

    if parameters["dfiles_num"] <= 0:
        raise ValueError(f"Bad number of data files specified: {parameters['dfiles_num']}")

    if not parameters["no_pandas"]:
        import_pandas_into_module_namespace(
            namespace=run_benchmark.__globals__,
            mode=parameters["pandas_mode"],
            ray_tmpdir=parameters["ray_tmpdir"],
            ray_memory=parameters["ray_memory"],
        )

    etl_results_ibis = None
    etl_results = None
    pd_queries_outputs = {} if parameters["validation"] else None
    if not parameters["no_pandas"]:
        pandas_files_limit = parameters["dfiles_num"]
        filename = files_names_from_pattern(parameters["data_file"])[:pandas_files_limit]
        etl_results = etl_pandas(
            filename=filename,
            files_limit=pandas_files_limit,
            columns_names=columns_names,
            columns_types=columns_types,
            output_for_validation=pd_queries_outputs,
            pandas_mode=parameters["pandas_mode"],
        )

        print_results(results=etl_results, backend=parameters["pandas_mode"], unit="ms")
        etl_results["Backend"] = parameters["pandas_mode"]
        etl_results["dfiles_num"] = parameters["dfiles_num"]
        etl_results["dataset_size"] = get_ny_taxi_dataset_size(parameters["dfiles_num"])

    if not parameters["no_ibis"]:
        etl_results_ibis = etl_ibis(
            filename=parameters["data_file"],
            files_limit=parameters["dfiles_num"],
            columns_names=columns_names,
            columns_types=columns_types,
            database_name=parameters["database_name"],
            table_name=parameters["table"],
            omnisci_server_worker=parameters["omnisci_server_worker"],
            delete_old_database=not parameters["dnd"],
            ipc_connection=parameters["ipc_connection"],
            create_new_table=not parameters["dni"],
            input_for_validation=pd_queries_outputs,
            import_mode=parameters["import_mode"],
            fragments_size=parameters["fragments_size"],
            debug_mode=parameters["debug_mode"],
        )

        print_results(results=etl_results_ibis, backend="Ibis", unit="ms")
        etl_results_ibis["Backend"] = "Ibis"
        etl_results_ibis["dfiles_num"] = parameters["dfiles_num"]
        etl_results_ibis["dataset_size"] = get_ny_taxi_dataset_size(parameters["dfiles_num"])

    return {"ETL": [etl_results_ibis, etl_results], "ML": []}
Example #5
def queries_modin(filename, pandas_mode, extended_functionality):
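    # Runs either the groupby queries (data files whose names start with "G1") or the
    # join queries ("J1" files); mixing both kinds of files in one run is rejected below.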
    data_files_names = files_names_from_pattern(filename)
    data_for_groupby_queries = []
    data_for_join_queries = []
    for f in data_files_names:
        if f.split("/")[-1].startswith("G1"):
            data_for_groupby_queries.append(f)
        elif f.split("/")[-1].startswith("J1"):
            data_for_join_queries.append(f)
        else:
            raise AttributeError(f"Unrecognized file is passed as -data_file flag argument: {f}")

    groupby_queries_files_number = len(data_for_groupby_queries)
    join_queries_files_number = len(data_for_join_queries)
    accepted_number_of_files_for_join_queries = [0, 1, 4]

    if all([groupby_queries_files_number, join_queries_files_number]):
        raise AttributeError(
            "Only one type of queries (groupby or join) can be executed during one run, but files for both queries are passed with -data_file flag"
        )
    elif groupby_queries_files_number > 1:
        raise AttributeError(
            f"Only one file for one run is accepted for groupby queries, actually passed {groupby_queries_files_number}: {data_for_groupby_queries}"
        )
    elif join_queries_files_number not in accepted_number_of_files_for_join_queries:
        raise AttributeError(
            f"Accepted numbers of files for join queries are {accepted_number_of_files_for_join_queries}, actually passed {join_queries_files_number}: {data_for_join_queries}"
        )
    elif join_queries_files_number and sum("NA" in f for f in data_for_join_queries) != 1:
        raise FileNotFoundError(
            "Data files for join queries should contain file (only one) with NA component in the file name"
        )

    queries_results_fields = ["t_run1", "chk_t_run1", "t_run2", "chk_t_run2"]
    if groupby_queries_files_number:
        print(f"loading dataset {data_for_groupby_queries[0]}")
        t0 = timer()
        x = pd.read_csv(data_for_groupby_queries[0])
        x_data_file_import_time = timer() - t0

        queries = {
            "groupby_query1": groupby_query1_modin,
            "groupby_query2": groupby_query2_modin,
            "groupby_query3": groupby_query3_modin,
            "groupby_query4": groupby_query4_modin,
            "groupby_query5": groupby_query5_modin,
            "groupby_query6": groupby_query6_modin,
            "groupby_query7": groupby_query7_modin,
            "groupby_query8": groupby_query8_modin,
            "groupby_query9": groupby_query9_modin,
            "groupby_query10": groupby_query10_modin,
        }
        if pandas_mode == "Modin_on_omnisci":
            del queries["groupby_query6"]  # NotImplementedError: unsupported aggregate median
            del queries["groupby_query8"]  # Query execution in `Modin_on_omnisci` mode
            # is under development
            del queries["groupby_query9"]  # core dumped issue
            del queries["groupby_query10"]  # core dumped issue

        queries_results = {x: {y: 0.0 for y in queries_results_fields} for x in queries.keys()}
        x_data_file_size = getsize(data_for_groupby_queries[0])
        query_data_file_sizes = {x: x_data_file_size for x in queries.keys()}
        query_data_file_import_times = {x: x_data_file_import_time for x in queries.keys()}

        queries_parameters = {
            "x": x,
            "queries_results": queries_results,
            "extended_functionality": extended_functionality,
        }

    if join_queries_files_number:
        data_name = next(
            (f for f in data_for_join_queries if "NA" in f), None
        )  # gets the file name with "NA" component
        data_files_paths, data_files_sizes = join_to_tbls(data_name)

        data_files_import_times = {}
        data_df = {}
        print(f"loading dataset {[path for path in data_files_paths.values()]}")
        for data_id, data_path in data_files_paths.items():
            t0 = timer()
            data_df[data_id] = pd.read_csv(data_path)
            data_files_import_times[data_id] = timer() - t0

        print(len(data_df["x"].index), flush=True)
        print(len(data_df["small"].index), flush=True)
        print(len(data_df["medium"].index), flush=True)
        print(len(data_df["big"].index), flush=True)
        queries = {
            "join_query1": join_query1_modin,
            "join_query2": join_query2_modin,
            "join_query3": join_query3_modin,
            "join_query4": join_query4_modin,
            "join_query5": join_query5_modin,
        }
        queries_results = {x: {y: 0.0 for y in queries_results_fields} for x in queries.keys()}
        queries_parameters = {
            "x": data_df["x"],
            "ys": [data_df["small"], data_df["medium"], data_df["big"]],
            "queries_results": queries_results,
            "extended_functionality": extended_functionality,
        }

        query_data_file_sizes = {
            "join_query1": data_files_sizes["x"] + data_files_sizes["small"],
            "join_query2": data_files_sizes["x"] + data_files_sizes["medium"],
            "join_query3": data_files_sizes["x"] + data_files_sizes["medium"],
            "join_query4": data_files_sizes["x"] + data_files_sizes["medium"],
            "join_query5": data_files_sizes["x"] + data_files_sizes["big"],
        }
        query_data_file_import_times = {
            "join_query1": data_files_import_times["x"] + data_files_import_times["small"],
            "join_query2": data_files_import_times["x"] + data_files_import_times["medium"],
            "join_query3": data_files_import_times["x"] + data_files_import_times["medium"],
            "join_query4": data_files_import_times["x"] + data_files_import_times["medium"],
            "join_query5": data_files_import_times["x"] + data_files_import_times["big"],
        }

    for query_name, query_func in queries.items():
        query_func(**queries_parameters)
        print(f"{pandas_mode} {query_name} results:")
        print_results(results=queries_results[query_name], unit="s")
        queries_results[query_name]["Backend"] = pandas_mode
        queries_results[query_name]["t_readcsv"] = query_data_file_import_times[query_name]
        queries_results[query_name]["dataset_size"] = query_data_file_sizes[query_name]

    return queries_results
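
A minimal usage sketch for queries_modin, assuming a single groupby data file (a name starting with "G1", as checked above); the file path and pandas_mode string are illustrative only:

results = queries_modin(
    filename="G1_1e7_1e2_0_0.csv",   # illustrative; must start with "G1" or "J1"
    pandas_mode="Modin_on_ray",      # illustrative mode string
    extended_functionality=False,
)
for query_name, fields in results.items():
    print(query_name, fields)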