def logger_results(
    logger,
    test_name,
    input_type,
    test_id,
    sql,
    resultComparisson,
    error_message,
    load_time,
    engine_time,
    total_time,
):
    """Write one test-result record to `logger`, one field per info() call.

    The fields are emitted in a fixed order (QueryID, TimeStamp, TestGroup,
    InputType, Query, Result, Error, PR, CommitHash, nRals, nGPUs,
    dataDirectory, test_id, load/engine/total times); keep that order when
    editing — consumers of the log presumably read it positionally (verify).
    """
    # Resolve repository / input metadata up front, in the same order the
    # original code did, in case any of these getters has side effects.
    commit_hash = get_CommitHash()
    branch_name = get_Branch()
    extension = cs.get_extension(input_type)

    record = (
        get_QueryId(extension, test_name, test_id),      # QueryID
        Settings.dateNow,                                # TimeStamp
        test_name,                                       # TestGroup
        extension,                                       # InputType
        sql,                                             # Query
        get_resultId(resultComparisson),                 # Result
        error_message,                                   # Error
        branch_name,                                     # PR
        commit_hash,                                     # CommitHash
        Settings.data["RunSettings"]["nRals"],
        Settings.data["RunSettings"]["nGPUs"],
        Settings.data["TestSettings"]["dataDirectory"],
        test_id,
        load_time,
        engine_time,
        total_time,
    )
    for value in record:
        logger.info(value)
def __loadTestCaseConfig(self, test_name, fileSchemaType):
    """Build the effective configuration for one test case.

    Starts from a deep copy of the suite-level defaults (``self.configLocal``)
    and applies any per-test overrides found under the test's optional
    ``SETUP`` section. When the test compares against Spark, its raw SQL is
    also used as the Spark query. Finally, if ``compare_with`` is a dict
    keyed by file format, it is resolved to the entry matching
    ``fileSchemaType`` (falling back to the ``"OTHER"`` entry).

    Parameters
    ----------
    test_name : key into ``self.data`` identifying the test case.
    fileSchemaType : file schema type, converted to an extension via
        ``createSchema.get_extension`` when resolving ``compare_with``.

    Returns
    -------
    The per-test config object (a deep copy; defaults are never mutated).
    """
    config = copy.deepcopy(self.configLocal)

    setup = self.data[test_name].get("SETUP")
    if setup is not None:
        # SETUP key -> config attribute. An override is applied only when
        # the key is present and not None, so local defaults survive.
        overrides = {
            "SKIP_WITH": "skip_with",
            "COMPARING": "comparing",
            "APPLY_ORDER": "apply_order",
            "ORDER_BY_COL": "order_by_col",
            "PRINT_RESULT": "print_result",
            "COMPARE_WITH": "compare_with",
            "USE_PERCENTAGE": "use_percentage",
            "MESSAGE_VALIDATION": "message_validation",
            "ACCEPTABLE_DIFFERENCE": "acceptable_difference",
        }
        for key, attr in overrides.items():
            value = setup.get(key)
            if value is not None:
                setattr(config, attr, value)

        # When comparing against Spark, the test's SQL doubles as the
        # Spark-side query.
        if setup.get("COMPARE_WITH") == "spark":
            config.spark_query = self.data[test_name]["SQL"]

    # compare_with may be a per-format mapping, e.g. {"CSV": ..., "OTHER": ...}
    if isinstance(config.compare_with, dict):
        format_keys = list(config.compare_with.keys())
        ext = createSchema.get_extension(fileSchemaType)
        if ext.upper() in format_keys:
            config.compare_with = config.compare_with[ext.upper()]
        else:
            config.compare_with = config.compare_with["OTHER"]

    return config
def print_comparison_results(sql, queryId, queryType, pdf1, pdf2, print_result,
                             engine, input_type, total_time, error_message,
                             stringResult, columnNamesComparison,
                             resultComparisson):
    """Print a human-readable report for one query comparison.

    Parameters
    ----------
    sql : the executed query text.
    queryId, queryType : identifiers used in the report header.
    pdf1 : BlazingSQL result frame (printed under "#BLZ:").
    pdf2 : reference-engine result frame (printed under "#DRILL:"/"#PYSPARK:").
    print_result : when True, dump both result frames.
    engine : engine instance (PyDrill / Spark session) or an engine-name
        string ("drill"/other) — only used here to pick the label.
    input_type : data format; converted to an extension for the header.
    total_time : total elapsed time, printed as-is.
    error_message : printed when columns differ or the comparison failed.
    stringResult : summary status string printed under "RESULT:".
    columnNamesComparison : True when column names matched.
    resultComparisson : "Success" or an error description.
    """
    if print_result:
        print("#BLZ:")
        print(pdf1)
        if not isinstance(engine, str):
            # Engine instance: label by type, then dump the reference result.
            if isinstance(engine, PyDrill):
                print("#DRILL:")
            else:
                print("#PYSPARK:")
            print(pdf2)
        else:
            # Engine given by name. NOTE(review): this branch prints only the
            # label, never pdf2 — looks like an oversight; confirm intended.
            if engine == "drill":
                print("#DRILL:")
            else:
                print("#PYSPARK:")
    data_type = cs.get_extension(input_type)
    print(str(queryId) + " Test " + queryType + " - " + data_type)
    print("#QUERY:")
    print(sql)
    print("RESULT:")
    print(stringResult)
    if columnNamesComparison is not True:
        # Column-name mismatch: show both column sets and the error.
        print("Columns:")
        print(pdf1.columns)
        print(pdf2.columns)
        print("ERROR:")
        print(error_message)
    if resultComparisson != "Success":
        # Value comparison failed; error_message may be printed twice when
        # both the columns and the values differ (preserved behavior).
        print("ERROR:")
        print(error_message)
    print("TOTAL TIME: ")
    print(total_time)
    print("CRASHED NODES: ")
    # print(resultgdf.n_crashed_nodes)
    print("TOTAL NODES: ")
    # print(resultgdf.total_nodes)
    print("===================================================")
def run_query(bc, engine, query, queryId, queryType, worder, orderBy,
              acceptable_difference, use_percentage, input_type, **kwargs):
    """Execute `query` with BlazingSQL, fetch the reference result, and hand
    both to `results_processing` for comparison, printing and logging.

    Parameters
    ----------
    bc : blazing context.
    engine : instance of the reference engine (PyDrill / SparkSession), or a
        string engine name in GPUCI mode (reference results read from files).
    query : executed query.
    queryId : query id.
    queryType : test group name.
    worder : (True/False) whether it's necessary to order the results.
    orderBy : column by which to order the results.
    acceptable_difference : acceptable difference between blazingsql results
        and drill/spark results.
    use_percentage : (True/False) compare results by percentage or by
        difference.
    input_type : data type (CSV, PARQUET, DASK_CUDF, JSON, ORC, GDF) used to
        run the query.

    Keyword arguments
    -----------------
    query_spark, algebra, comparing, print_result, message_validation,
    nested_query, blz_result.
    """
    print(query)
    # Normalize a boolean True into the numeric flag used downstream.
    worder = 1 if worder == True else worder
    query_spark = kwargs.get("query_spark", query)
    algebra = kwargs.get("algebra", "")
    comparing = kwargs.get("comparing", True)
    nRals = Settings.data["RunSettings"]["nRals"]
    print_result = kwargs.get("print_result")
    if print_result is None:
        print_result = False
    message_validation = kwargs.get("message_validation", "")
    if message_validation is None:
        message_validation = False
    nested_query = kwargs.get("nested_query", False)
    blz_result = None
    if nested_query:
        # Nested queries reuse a previously computed BlazingSQL result.
        blz_result = kwargs.get("blz_result", [])
    data_type = cs.get_extension(input_type)
    if Settings.execution_mode != "generator":
        print("\n=============== New query: " + str(queryId) + " - " +
              data_type + " (" + queryType + ")" + "=================")
    str_code_test = str(get_codTest(queryType)).upper()
    # Per-query parquet file name used for stored reference results.
    filename = str_code_test + "-" + str(queryId) + ".parquet"
    result_dir = Settings.data["TestSettings"]["fileResultsDirectory"]
    file_results_dir = str(result_dir)
    testsWithNulls = Settings.data["RunSettings"]["testsWithNulls"]
    result_gdf, load_time, engine_time, total_time, error_message = \
        run_query_blazing(bc, nested_query, query, algebra,
                          message_validation, blz_result)
    base_results_gd = None
    compareResults = True
    resultFile = ""
    str_engine = ""
    if not message_validation == "":
        # Message-validation tests only check the produced error message.
        print_validation_results(query, queryId, input_type, queryType,
                                 error_message, message_validation)
    elif not isinstance(engine, str):
        if isinstance(engine, PyDrill):
            # Drill
            query_drill = get_drill_query(query)
            base_results_gd = run_query_drill(engine, query_drill)
            str_engine = "drill"
        elif isinstance(engine, SparkSession):
            # Spark
            base_results_gd = run_query_spark(engine, query_spark)
            str_engine = "spark"
    else:  # GPUCI
        if "compare_result_values" in Settings.data["RunSettings"]:
            compareResults = Settings.data["RunSettings"][
                "compare_result_values"]
        if compareResults:
            # Stored reference results live in a per-engine folder, with a
            # "-nulls" suffix for the null-enabled datasets.
            if testsWithNulls != "true":
                resultFile = file_results_dir + "/" + str(
                    engine) + "/" + filename
            else:
                resultFile = file_results_dir + "/" + str(
                    engine) + "-nulls" + "/" + filename
            # NOTE(review): loading the stored result is commented out, so
            # base_results_gd stays None here — confirm results_processing
            # loads resultFile itself.
            #base_results_gd = get_results(resultFile)
    results_processing(result_gdf, base_results_gd, worder, orderBy,
                       testsWithNulls, filename, query, queryId, queryType,
                       acceptable_difference, use_percentage, print_result,
                       engine, input_type, load_time, engine_time, total_time,
                       comparing, compareResults, resultFile,
                       file_results_dir, str_engine)
def test_name(queryType, fileSchemaType):
    """Compose a test's display name: query type, delimiter, file extension."""
    ext = get_extension(fileSchemaType)
    return f"{queryType}{test_name_delimiter}{ext}"
def print_query_results(sql, queryId, queryType, pdf1, pdf2, resultgdf,
                        acceptable_difference, use_percentage, print_result,
                        engine, input_type, load_time, engine_time,
                        total_time):
    """Compare BlazingSQL results against the reference engine, print a
    report, and write a fixed-format log record.

    Parameters
    ----------
    sql : executed query text.
    queryId, queryType : identifiers for the report header and log.
    pdf1 : BlazingSQL result frame.
    pdf2 : reference-engine (Drill/Spark) result frame.
    resultgdf : raw BlazingSQL result (only referenced in commented-out code).
    acceptable_difference, use_percentage : comparison tolerances, forwarded
        to compare_results.
    print_result : when True, dump both result frames.
    engine : engine instance; PyDrill selects the "#DRILL:" label.
    input_type : data format; converted to an extension for the header.
    load_time, engine_time, total_time : timings forwarded to the log.
    """
    if print_result:
        print("#BLZ:")
        print(pdf1)
        if isinstance(engine, PyDrill):
            print("#DRILL:")
        else:
            print("#PYSPARK:")
        print(pdf2)
    data_type = cs.get_extension(input_type)
    print(str(queryId) + " Test " + queryType + " - " + data_type)
    print("#QUERY:")
    print(sql)
    print("RESULT:")
    error_message = ""
    stringResult = ""
    # Comparison may be globally disabled via RunSettings.
    compareResults = True
    if 'compare_results' in Settings.data['RunSettings']:
        compareResults = Settings.data['RunSettings']['compare_results']
    if compareResults:
        columnNamesComparison = compare_column_names(pdf1, pdf2)
        if columnNamesComparison != True:
            print("Columns:")
            print(pdf1.columns)
            print(pdf2.columns)
            error_message = "Column names are not the same"
            print("ERROR:")
            print(error_message)
        resultComparisson = compare_results(pdf1, pdf2,
                                            acceptable_difference,
                                            use_percentage, engine)
        if resultComparisson != "Success":
            # compare_results prefixes failures; strip the first 6 chars
            # (presumably a "Fail: "-style prefix — confirm against helper).
            error_message = resultComparisson[6:]
            print("ERROR:")
            print(error_message)
        stringResult = resultComparisson
        if resultComparisson != "Success" or columnNamesComparison == False:
            stringResult = "Fail"
    else:
        # Comparison disabled: report success unconditionally.
        stringResult = "Success"
    print(stringResult)
    print("TOTAL TIME: ")
    print(total_time)
    print("CRASHED NODES: ")
    #print(resultgdf.n_crashed_nodes)
    print("TOTAL NODES: ")
    #print(resultgdf.total_nodes)
    print('===================================================')
    logger = logginghelper(name)
    #TODO percy kharoly bindings we need to get the number from internal api
    #print_fixed_log(logger, queryType, queryId, sql, stringResult,
    #                error_message, 1, 1, 2)
    print_fixed_log(logger, queryType, input_type, queryId, sql, stringResult,
                    error_message, load_time, engine_time, total_time)
def run_query(bc, engine, query, queryId, queryType, worder, orderBy,
              acceptable_difference, use_percentage, input_type, **kwargs):
    """Run `query` on BlazingSQL, obtain a reference result (Drill, Spark,
    stored parquet in GPUCI mode, or an empty frame), and print/compare.

    NOTE(review): this appears to be an older variant of run_query that does
    the engine dispatch inline rather than via results_processing — confirm
    which variant the suite actually uses.

    Keyword arguments: query_spark, algebra, print_result, nested_query,
    blz_result.
    """
    query_spark = kwargs.get('query_spark', query)
    algebra = kwargs.get('algebra', "")
    nRals = Settings.data['RunSettings']['nRals']
    print_result = kwargs.get('print_result')
    if print_result is None:
        print_result = False
    data_type = cs.get_extension(input_type)
    if Settings.execution_mode != "Generator":
        print("\n=============== New query: " + str(queryId) + " - " +
              data_type + " =================")
    load_time = 0
    engine_time = 0
    total_time = 0
    nested_query = kwargs.get('nested_query')
    if nested_query is None:
        nested_query = False
    if nested_query == False:
        #if int(nRals) == 1:  # Single Node
        query_blz = query  #get_blazingsql_query('main', query)
        if algebra == "":
            # Time the whole sql() call in milliseconds.
            start_time = time.time()
            result_gdf = bc.sql(query_blz)
            end_time = time.time()
            total_time = (end_time - start_time) * 1000
            # Pull engine-side timings out of the BlazingSQL internal logs.
            #SUM(CASE WHEN info = 'evaluate_split_query load_data' THEN duration ELSE 0 END) AS load_time,
            #MAX(load_time) AS load_time,
            log_result = bc.log("""SELECT MAX(end_time) as end_time, query_id, MAX(total_time) AS total_time FROM ( SELECT query_id, node_id, SUM(CASE WHEN info = 'Query Execution Done' THEN duration ELSE 0 END) AS total_time, MAX(log_time) AS end_time FROM bsql_logs WHERE info = 'evaluate_split_query load_data' OR info = 'Query Execution Done' GROUP BY node_id, query_id ) GROUP BY query_id ORDER BY end_time DESC limit 1""")
            if int(nRals) == 1:  # Single Node
                n_log = log_result
            else:  # Simple Distribution: log_result is a dask frame
                n_log = log_result.compute()
            load_time = 0  #n_log['load_time'][0]
            engine_time = n_log['total_time'][0]
        else:
            # Caller supplied a relational-algebra plan; no timing capture.
            result_gdf = bc.sql(query_blz, algebra=algebra)
    else:  # for nested queries as column basis test
        result_gdf = kwargs.get('blz_result')
        if result_gdf is None:
            result_gdf = []
    filename = str(
        get_codTest(queryType)).upper() + "-" + str(queryId) + ".parquet"
    file_results_dir = str(
        Settings.data['TestSettings']['fileResultsDirectory'])
    if not isinstance(engine, str):
        if isinstance(engine, PyDrill):
            # Drill
            query_drill = get_drill_query(query)
            result_drill_gd = run_query_drill(engine, query_drill)
            if result_gdf is not None:
                if result_gdf.columns is not None:
                    #FOR DASK CUDF
                    import dask_cudf
                    if type(result_gdf) is dask_cudf.core.DataFrame:
                        result_gdf = result_gdf.compute()
                    expected_dtypes = result_gdf.dtypes.to_list()
                    # Upcast both sides to float64 and fill nulls so the
                    # numeric comparison is type-stable.
                    pdf1 = upcast_to_float(result_gdf).fillna(
                        get_null_constants(result_gdf)).to_pandas()
                    pdf2 = to_pandas_f64_engine(result_drill_gd.resultSet,
                                                expected_dtypes)
                    pdf2 = upcast_to_float(pdf2).fillna(
                        get_null_constants(pdf2))
                    formatResults(pdf1, pdf2, worder, orderBy)
                    if Settings.execution_mode == ExecutionMode.GENERATOR:
                        # Generator mode: persist the reference result.
                        file_res_drill_dir = file_results_dir + "/" + "drill" + "/" + filename
                        if not os.path.exists(file_res_drill_dir):
                            save_results_parquet(file_res_drill_dir, pdf2)
                        print("Drill: " + filename + " generated.")
                    else:
                        print_query_results(query, queryId, queryType, pdf1,
                                            pdf2, result_gdf,
                                            acceptable_difference,
                                            use_percentage, print_result,
                                            engine, input_type, load_time,
                                            engine_time, total_time)
                else:
                    print_query_results2(query, queryId, queryType,
                                         result_gdf.error_message)
        elif isinstance(engine, SparkSession):
            #Spark
            result_spark_df = run_query_spark(engine, query_spark)
            if result_gdf is not None:
                if result_gdf.columns is not None:
                    import dask_cudf
                    if type(result_gdf) is dask_cudf.core.DataFrame:
                        result_gdf = result_gdf.compute()
                    expected_dtypes = result_gdf.dtypes.to_list()
                    pdf1 = upcast_to_float(result_gdf).fillna(
                        get_null_constants(result_gdf)).to_pandas()
                    pdf2 = to_pandas_f64_engine(result_spark_df.resultSet,
                                                expected_dtypes)
                    pdf2 = upcast_to_float(pdf2).fillna(
                        get_null_constants(pdf2))
                    formatResults(pdf1, pdf2, worder, orderBy)
                    if Settings.execution_mode == ExecutionMode.GENERATOR:
                        # NOTE(review): variable name says "drill" but this is
                        # the spark reference path (copy-paste naming).
                        file_res_drill_dir = file_results_dir + "/" + "spark" + "/" + filename
                        if not os.path.exists(file_res_drill_dir):
                            save_results_parquet(file_res_drill_dir, pdf2)
                        print("Spark: " + filename + " generated.")
                    else:
                        print_query_results(query_spark, queryId, queryType,
                                            pdf1, pdf2, result_gdf,
                                            acceptable_difference,
                                            use_percentage, print_result,
                                            engine, input_type, load_time,
                                            engine_time, total_time)
                else:
                    print_query_results2(query_spark, queryId, queryType,
                                         result_gdf.error_message)
    else:  #GPUCI
        compareResults = True
        if 'compare_results' in Settings.data['RunSettings']:
            compareResults = Settings.data['RunSettings']['compare_results']
        # NOTE(review): compared against the string "true" here, while the
        # other run_query variant treats the setting as truthy — confirm the
        # setting's type in RunSettings.
        if compareResults == "true":
            # Compare against the stored per-engine parquet reference.
            resultFile = file_results_dir + "/" + str(engine) + "/" + filename
            pdf2 = get_results(resultFile)
            if result_gdf is not None:
                if result_gdf.columns is not None:
                    #FOR DASK CUDF
                    import dask_cudf
                    if type(result_gdf) is dask_cudf.core.DataFrame:
                        result_gdf = result_gdf.compute()
                    expected_dtypes = result_gdf.dtypes.to_list()
                    pdf1 = upcast_to_float(result_gdf).fillna(
                        get_null_constants(result_gdf)).to_pandas()
                    format_pdf(pdf1, worder, orderBy)
                    print(pdf2)
                    print_query_results(query, queryId, queryType, pdf1, pdf2,
                                        result_gdf, acceptable_difference,
                                        use_percentage, print_result, engine,
                                        input_type, load_time, engine_time,
                                        total_time)
                else:
                    print_query_results2(query, queryId, queryType,
                                         result_gdf.error_message)
        else:
            # Comparison disabled: run against an empty reference frame so
            # the reporting path still executes.
            if result_gdf is not None:
                if result_gdf.columns is not None:
                    #FOR DASK CUDF
                    import dask_cudf
                    if type(result_gdf) is dask_cudf.core.DataFrame:
                        result_gdf = result_gdf.compute()
                    expected_dtypes = result_gdf.dtypes.to_list()
                    pdf1 = upcast_to_float(result_gdf).fillna(
                        get_null_constants(result_gdf)).to_pandas()
                    pdf2 = pd.DataFrame()
                    formatResults(pdf1, pdf2, worder, orderBy)
                    print_query_results(query, queryId, queryType, pdf1, pdf2,
                                        result_gdf, acceptable_difference,
                                        use_percentage, print_result, engine,
                                        input_type, load_time, engine_time,
                                        total_time)
                else:
                    print_query_results2(query, queryId, queryType,
                                         result_gdf.error_message)