Example #1
File: runTest.py Project: gcca/blazingsql
def logger_results(
    logger,
    test_name,
    input_type,
    test_id,
    sql,
    resultComparisson,
    error_message,
    load_time,
    engine_time,
    total_time,
):
    commitHash = get_CommitHash()
    branchName = get_Branch()
    # dateNow=datetime.now()
    inputType = cs.get_extension(input_type)

    logger.info(get_QueryId(inputType, test_name, test_id))  # QueryID
    logger.info(Settings.dateNow)  # TimeStamp
    logger.info(test_name)  # TestGroup
    logger.info(inputType)  # InputType
    logger.info(sql)  # Query
    logger.info(get_resultId(resultComparisson))  # Result
    logger.info(error_message)  # Error
    logger.info(branchName)  # PR
    logger.info(commitHash)  # CommitHash
    logger.info(Settings.data["RunSettings"]["nRals"])  # nRals
    logger.info(Settings.data["RunSettings"]["nGPUs"])  # nGPUs
    logger.info(Settings.data["TestSettings"]["dataDirectory"])  # DataDirectory
    logger.info(test_id)  # TestId
    logger.info(load_time)  # LoadTime
    logger.info(engine_time)  # EngineTime
    logger.info(total_time)  # TotalTime
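
For reference, a minimal sketch of how this helper might be called from a test driver. The logger name and all field values below are hypothetical, chosen only to illustrate the argument order; they are not taken from the project.

import logging

logger = logging.getLogger("e2e-tests")  # hypothetical logger name

logger_results(
    logger,
    test_name="Order by",                                      # TestGroup
    input_type="parquet",                                       # mapped to InputType via cs.get_extension
    test_id=1,
    sql="SELECT c_custkey FROM customer ORDER BY c_custkey",
    resultComparisson="Success",                                # mapped to a result id by get_resultId
    error_message="",
    load_time=0,
    engine_time=152,
    total_time=180,
)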
Example #2
    def __loadTestCaseConfig(self, test_name, fileSchemaType):
        config = copy.deepcopy(self.configLocal)
        if "SETUP" in self.data[test_name]:
            setup = self.data[test_name]["SETUP"]

            if setup.get("SKIP_WITH") is not None: config.skip_with = setup.get("SKIP_WITH")
            if setup.get("COMPARING") is not None: config.comparing = setup.get("COMPARING")
            if setup.get("APPLY_ORDER") is not None: config.apply_order = setup.get("APPLY_ORDER")
            if setup.get("ORDER_BY_COL") is not None: config.order_by_col = setup.get("ORDER_BY_COL")
            if setup.get("PRINT_RESULT") is not None: config.print_result = setup.get("PRINT_RESULT")
            if setup.get("COMPARE_WITH") is not None: config.compare_with = setup.get("COMPARE_WITH")
            if setup.get("USE_PERCENTAGE") is not None: config.use_percentage = setup.get("USE_PERCENTAGE")
            if setup.get("MESSAGE_VALIDATION") is not None: config.message_validation = setup.get("MESSAGE_VALIDATION")
            if setup.get("ACCEPTABLE_DIFFERENCE") is not None: config.acceptable_difference = setup.get("ACCEPTABLE_DIFFERENCE")

        if "SETUP" in self.data[test_name]:
            if self.data[test_name]["SETUP"].get("COMPARE_WITH") == "spark":
                config.spark_query = self.data[test_name]["SQL"]

        if isinstance(config.compare_with, dict):
            formatList = list(config.compare_with.keys())
            ext = createSchema.get_extension(fileSchemaType)
            if ext.upper() in formatList:
                config.compare_with = config.compare_with[ext.upper()]
            else:
                config.compare_with = config.compare_with["OTHER"]

        return config
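
To make the lookups above concrete, here is a hedged illustration of the shape of a per-test entry in self.data. The test name and values are invented; only the keys mirror the ones this method reads. Note that COMPARE_WITH may be either a single engine name or a dict keyed by file-format extension, which is what the final isinstance check resolves (falling back to the "OTHER" entry).

# Hypothetical entry in self.data; the keys mirror the SETUP lookups above.
example_entry = {
    "SQL": "SELECT COUNT(*) FROM nation",
    "SETUP": {
        "SKIP_WITH": ["json"],
        "APPLY_ORDER": True,
        "ACCEPTABLE_DIFFERENCE": 0.01,
        # A single engine name is used as-is:
        # "COMPARE_WITH": "spark",
        # A dict is resolved per file-format extension, with an "OTHER" fallback:
        "COMPARE_WITH": {"ORC": "spark", "OTHER": "drill"},
    },
}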
Example #3
File: runTest.py Project: gcca/blazingsql
def print_comparison_results(sql, queryId, queryType, pdf1, pdf2, print_result,
                             engine, input_type, total_time, error_message,
                             stringResult, columnNamesComparison,
                             resultComparisson):
    if print_result:
        print("#BLZ:")
        print(pdf1)
        if not isinstance(engine, str):
            if isinstance(engine, PyDrill):
                print("#DRILL:")
            else:
                print("#PYSPARK:")
        else:
            if engine == "drill":
                print("#DRILL:")
            else:
                print("#PYSPARK:")
        print(pdf2)
    data_type = cs.get_extension(input_type)
    print(str(queryId) + " Test " + queryType + " - " + data_type)
    print("#QUERY:")
    print(sql)
    print("RESULT:")
    print(stringResult)
    if columnNamesComparison is not True:
        print("Columns:")
        print(pdf1.columns)
        print(pdf2.columns)
        print("ERROR:")
        print(error_message)
    if resultComparisson != "Success":
        print("ERROR:")
        print(error_message)

    print("TOTAL TIME: ")
    print(total_time)
    print("CRASHED NODES: ")
    # print(resultgdf.n_crashed_nodes)
    print("TOTAL NODES: ")
    # print(resultgdf.total_nodes)
    print("===================================================")
Example #4
File: runTest.py Project: gcca/blazingsql
def run_query(bc, engine, query, queryId, queryType, worder, orderBy,
              acceptable_difference, use_percentage, input_type, **kwargs):
    """
        This function execute the query with blazingsql and drill/spark and call the functions to compare, print results
        and logs.

        ----------
        bc : blazing context
        engine: It's the instance of the engine (pydrill/ỳspark).
        query: Executed query.
        queryId: Query Id.
        worder : (True/False) parameter to indicate if it's neccesary to order the results.
        orderBy : It indicate by what column we want to order the results.
        acceptable_difference: This parameter is related to the acceptable difference beetween values
        from blazingsql results and drill/spark results.
        use_percentage: (True/False) to indicate if the results will be compared by percentage or difference.
        input_type: The data type (CSV, PARQUET, DASK_CUDF, JSON, ORC, GDF) that we use to run the query.
    """

    print(query)

    worder = 1 if worder == True else worder

    query_spark = kwargs.get("query_spark", query)

    algebra = kwargs.get("algebra", "")

    comparing = kwargs.get("comparing", True)

    nRals = Settings.data["RunSettings"]["nRals"]

    print_result = kwargs.get("print_result")
    if print_result is None:
        print_result = False

    message_validation = kwargs.get("message_validation", "")
    if message_validation is None:
        message_validation = False

    nested_query = kwargs.get("nested_query", False)

    blz_result = None
    if nested_query:
        blz_result = kwargs.get("blz_result", [])

    data_type = cs.get_extension(input_type)

    if Settings.execution_mode != "generator":
        print("\n=============== New query: " + str(queryId) + " - " +
              data_type + " (" + queryType + ")" + "=================")

    str_code_test = str(get_codTest(queryType)).upper()
    filename = str_code_test + "-" + str(queryId) + ".parquet"

    result_dir = Settings.data["TestSettings"]["fileResultsDirectory"]
    file_results_dir = str(result_dir)

    testsWithNulls = Settings.data["RunSettings"]["testsWithNulls"]

    result_gdf, load_time, engine_time, total_time, error_message = run_query_blazing(
        bc, nested_query, query, algebra, message_validation, blz_result)

    base_results_gd = None

    compareResults = True

    resultFile = ""

    str_engine = ""

    if not message_validation == "":
        print_validation_results(query, queryId, input_type, queryType,
                                 error_message, message_validation)
    elif not isinstance(engine, str):
        if isinstance(engine, PyDrill):
            # Drill
            query_drill = get_drill_query(query)
            base_results_gd = run_query_drill(engine, query_drill)
            str_engine = "drill"

        elif isinstance(engine, SparkSession):
            # Spark
            base_results_gd = run_query_spark(engine, query_spark)
            str_engine = "spark"

    else:  # GPUCI
        if "compare_result_values" in Settings.data["RunSettings"]:
            compareResults = Settings.data["RunSettings"][
                "compare_result_values"]

        if compareResults:
            if testsWithNulls != "true":
                resultFile = file_results_dir + "/" + str(
                    engine) + "/" + filename
            else:
                resultFile = file_results_dir + "/" + str(
                    engine) + "-nulls" + "/" + filename

            #base_results_gd = get_results(resultFile)

    results_processing(result_gdf, base_results_gd, worder, orderBy,
                       testsWithNulls, filename, query, queryId, queryType,
                       acceptable_difference, use_percentage, print_result,
                       engine, input_type, load_time, engine_time, total_time,
                       comparing, compareResults, resultFile, file_results_dir,
                       str_engine)
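
For orientation, a hedged sketch of a call site for this entry point. The BlazingContext and PyDrill objects, the query, and the DataType enum value are illustrative assumptions, not code from the project.

# Hypothetical call; assumes bc is a BlazingContext with the tables already
# registered and drill is a connected PyDrill instance.
run_query(
    bc,
    drill,
    "SELECT n_name FROM nation ORDER BY n_nationkey",
    queryId="TEST_01",
    queryType="TPCH Queries",
    worder=1,                      # ask the harness to sort results before comparing
    orderBy="n_name",
    acceptable_difference=0.01,
    use_percentage=False,
    input_type=DataType.PARQUET,   # assumed enum used elsewhere in the suite
    print_result=True,
)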
Example #5
def test_name(queryType, fileSchemaType):
    ext = get_extension(fileSchemaType)
    tname = "%s%s%s" % (queryType, test_name_delimiter, ext)
    return tname
Example #6
def print_query_results(sql, queryId, queryType, pdf1, pdf2, resultgdf,
                        acceptable_difference, use_percentage, print_result,
                        engine, input_type, load_time, engine_time,
                        total_time):
    if print_result:
        print("#BLZ:")
        print(pdf1)
        if isinstance(engine, PyDrill):
            print("#DRILL:")
        else:
            print("#PYSPARK:")
        print(pdf2)
    data_type = cs.get_extension(input_type)
    print(str(queryId) + " Test " + queryType + " - " + data_type)
    print("#QUERY:")
    print(sql)
    print("RESULT:")

    error_message = ""
    stringResult = ""

    compareResults = True
    if 'compare_results' in Settings.data['RunSettings']:
        compareResults = Settings.data['RunSettings']['compare_results']

    if compareResults:
        columnNamesComparison = compare_column_names(pdf1, pdf2)
        if columnNamesComparison != True:
            print("Columns:")
            print(pdf1.columns)
            print(pdf2.columns)

            error_message = "Column names are not the same"
            print("ERROR:")
            print(error_message)

        resultComparisson = compare_results(pdf1, pdf2, acceptable_difference,
                                            use_percentage, engine)
        if resultComparisson != "Success":
            error_message = resultComparisson[6:]
            print("ERROR:")
            print(error_message)

        stringResult = resultComparisson
        if resultComparisson != "Success" or columnNamesComparison == False:
            stringResult = "Fail"
    else:
        stringResult = "Success"
    print(stringResult)

    print("TOTAL TIME: ")
    print(total_time)
    print("CRASHED NODES: ")
    #print(resultgdf.n_crashed_nodes)
    print("TOTAL NODES: ")
    #print(resultgdf.total_nodes)
    print('===================================================')

    logger = logginghelper(name)

    #TODO percy kharoly bindings we need to get the number from internal api
    #print_fixed_log(logger, queryType, queryId, sql, stringResult, error_message, 1, 1, 2)
    print_fixed_log(logger, queryType, input_type, queryId, sql, stringResult,
                    error_message, load_time, engine_time, total_time)
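
compare_column_names and compare_results themselves are not part of this listing. As a rough, simplified sketch of the kind of check the acceptable_difference / use_percentage pair implies, a tolerance comparison might look like the following; this is an assumed illustration, not the project's implementation.

import numpy as np

def values_match(expected, actual, acceptable_difference, use_percentage):
    # Simplified tolerance check: the allowed delta is either a fraction of the
    # expected value (use_percentage=True) or an absolute difference.
    expected = np.asarray(expected, dtype="float64")
    actual = np.asarray(actual, dtype="float64")
    if expected.shape != actual.shape:
        return False
    diff = np.abs(expected - actual)
    if use_percentage:
        tolerance = np.abs(expected) * acceptable_difference
    else:
        tolerance = acceptable_difference
    return bool(np.all(diff <= tolerance))

# values_match([100.0], [100.4], 0.01, use_percentage=True) -> True (within 1%)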
Example #7
def run_query(bc, engine, query, queryId, queryType, worder, orderBy,
              acceptable_difference, use_percentage, input_type, **kwargs):

    query_spark = kwargs.get('query_spark', query)

    algebra = kwargs.get('algebra', "")

    nRals = Settings.data['RunSettings']['nRals']

    print_result = kwargs.get('print_result')
    if print_result is None:
        print_result = False

    data_type = cs.get_extension(input_type)

    if Settings.execution_mode != "Generator":
        print("\n=============== New query: " + str(queryId) + " - " +
              data_type + " =================")

    load_time = 0
    engine_time = 0
    total_time = 0

    nested_query = kwargs.get('nested_query')
    if nested_query is None:
        nested_query = False

    if nested_query == False:
        #if int(nRals) == 1:  # Single Node
        query_blz = query  #get_blazingsql_query('main', query)
        if algebra == "":
            start_time = time.time()
            result_gdf = bc.sql(query_blz)
            end_time = time.time()
            total_time = (end_time - start_time) * 1000
            #SUM(CASE WHEN info = 'evaluate_split_query load_data' THEN duration ELSE 0 END) AS load_time,
            #MAX(load_time) AS load_time,
            log_result = bc.log("""SELECT
                    MAX(end_time) as end_time, query_id, 
                    MAX(total_time) AS total_time 
                FROM (
                    SELECT
                        query_id, node_id,
                        
                        SUM(CASE WHEN info = 'Query Execution Done' THEN duration ELSE 0 END) AS total_time,
                        MAX(log_time) AS end_time
                    FROM
                        bsql_logs
                    WHERE
                        info = 'evaluate_split_query load_data'
                        OR info = 'Query Execution Done'
                    GROUP BY
                        node_id, query_id
                    )
                GROUP BY
                    query_id
                ORDER BY
                    end_time DESC limit 1""")

            if int(nRals) == 1:  # Single Node
                n_log = log_result
            else:  # Simple Distribution
                n_log = log_result.compute()

            load_time = 0  #n_log['load_time'][0]
            engine_time = n_log['total_time'][0]
        else:
            result_gdf = bc.sql(query_blz, algebra=algebra)

    else:  # for nested queries as column basis test
        result_gdf = kwargs.get('blz_result')
        if result_gdf is None:
            result_gdf = []

    filename = str(
        get_codTest(queryType)).upper() + "-" + str(queryId) + ".parquet"

    file_results_dir = str(
        Settings.data['TestSettings']['fileResultsDirectory'])

    if not isinstance(engine, str):
        if isinstance(engine, PyDrill):
            # Drill
            query_drill = get_drill_query(query)
            result_drill_gd = run_query_drill(engine, query_drill)
            if result_gdf is not None:
                if result_gdf.columns is not None:
                    #FOR DASK CUDF
                    import dask_cudf
                    if type(result_gdf) is dask_cudf.core.DataFrame:
                        result_gdf = result_gdf.compute()

                    expected_dtypes = result_gdf.dtypes.to_list()
                    pdf1 = upcast_to_float(result_gdf).fillna(
                        get_null_constants(result_gdf)).to_pandas()
                    pdf2 = to_pandas_f64_engine(result_drill_gd.resultSet,
                                                expected_dtypes)
                    pdf2 = upcast_to_float(pdf2).fillna(
                        get_null_constants(pdf2))
                    formatResults(pdf1, pdf2, worder, orderBy)

                    if Settings.execution_mode == ExecutionMode.GENERATOR:
                        file_res_drill_dir = file_results_dir + "/" + "drill" + "/" + filename

                        if not os.path.exists(file_res_drill_dir):
                            save_results_parquet(file_res_drill_dir, pdf2)

                        print("Drill: " + filename + " generated.")

                    else:
                        print_query_results(query, queryId, queryType, pdf1,
                                            pdf2, result_gdf,
                                            acceptable_difference,
                                            use_percentage, print_result,
                                            engine, input_type, load_time,
                                            engine_time, total_time)

                else:
                    print_query_results2(query, queryId, queryType,
                                         result_gdf.error_message)
        elif isinstance(engine, SparkSession):
            #Spark
            result_spark_df = run_query_spark(engine, query_spark)

            if result_gdf is not None:
                if result_gdf.columns is not None:

                    import dask_cudf
                    if type(result_gdf) is dask_cudf.core.DataFrame:
                        result_gdf = result_gdf.compute()

                    expected_dtypes = result_gdf.dtypes.to_list()
                    pdf1 = upcast_to_float(result_gdf).fillna(
                        get_null_constants(result_gdf)).to_pandas()
                    pdf2 = to_pandas_f64_engine(result_spark_df.resultSet,
                                                expected_dtypes)
                    pdf2 = upcast_to_float(pdf2).fillna(
                        get_null_constants(pdf2))
                    formatResults(pdf1, pdf2, worder, orderBy)

                    if Settings.execution_mode == ExecutionMode.GENERATOR:

                        file_res_spark_dir = file_results_dir + "/" + "spark" + "/" + filename

                        if not os.path.exists(file_res_spark_dir):
                            save_results_parquet(file_res_spark_dir, pdf2)
                            print("Spark: " + filename + " generated.")

                    else:
                        print_query_results(query_spark, queryId, queryType,
                                            pdf1, pdf2, result_gdf,
                                            acceptable_difference,
                                            use_percentage, print_result,
                                            engine, input_type, load_time,
                                            engine_time, total_time)
                else:
                    print_query_results2(query_spark, queryId, queryType,
                                         result_gdf.error_message)
    else:  #GPUCI

        compareResults = True
        if 'compare_results' in Settings.data['RunSettings']:
            compareResults = Settings.data['RunSettings']['compare_results']

        if compareResults == "true":
            resultFile = file_results_dir + "/" + str(engine) + "/" + filename
            pdf2 = get_results(resultFile)
            if result_gdf is not None:
                if result_gdf.columns is not None:
                    #FOR DASK CUDF
                    import dask_cudf
                    if type(result_gdf) is dask_cudf.core.DataFrame:
                        result_gdf = result_gdf.compute()

                    expected_dtypes = result_gdf.dtypes.to_list()
                    pdf1 = upcast_to_float(result_gdf).fillna(
                        get_null_constants(result_gdf)).to_pandas()
                    format_pdf(pdf1, worder, orderBy)
                    print(pdf2)

                    print_query_results(query, queryId, queryType, pdf1, pdf2,
                                        result_gdf, acceptable_difference,
                                        use_percentage, print_result, engine,
                                        input_type, load_time, engine_time,
                                        total_time)

                else:
                    print_query_results2(query, queryId, queryType,
                                         result_gdf.error_message)
        else:
            if result_gdf is not None:
                if result_gdf.columns is not None:
                    #FOR DASK CUDF
                    import dask_cudf
                    if type(result_gdf) is dask_cudf.core.DataFrame:
                        result_gdf = result_gdf.compute()

                    expected_dtypes = result_gdf.dtypes.to_list()
                    pdf1 = upcast_to_float(result_gdf).fillna(
                        get_null_constants(result_gdf)).to_pandas()
                    pdf2 = pd.DataFrame()
                    formatResults(pdf1, pdf2, worder, orderBy)

                    print_query_results(query, queryId, queryType, pdf1, pdf2,
                                        result_gdf, acceptable_difference,
                                        use_percentage, print_result, engine,
                                        input_type, load_time, engine_time,
                                        total_time)
                else:
                    print_query_results2(query, queryId, queryType,
                                         result_gdf.error_message)