Example #1
0
    drill = "drill"  # None

    compareResults = True
    if "compare_results" in Settings.data["RunSettings"]:
        compareResults = Settings.data["RunSettings"]["compare_results"]

    if ((Settings.execution_mode == ExecutionMode.FULL
         and compareResults == "true")
            or Settings.execution_mode == ExecutionMode.GENERATOR):
        # Create Table Drill ------------------------------------------------
        print("starting drill")
        from pydrill.client import PyDrill

        drill = PyDrill(host="localhost", port=8047)
        cs.init_drill_schema(drill,
                             Settings.data["TestSettings"]["dataDirectory"])

    # Create Context For BlazingSQL

    bc, dask_client = init_context()

    nRals = Settings.data["RunSettings"]["nRals"]

    main(dask_client, drill, Settings.data["TestSettings"]["dataDirectory"],
         bc, nRals)

    if Settings.execution_mode != ExecutionMode.GENERATOR:
        runTest.save_log()
        gpuMemory.print_log_gpu_memory()
Example #2
0
    Execution.getArgs()
    
    nvmlInit()

    drill = "drill" #None
    spark = "spark"

    compareResults = True
    if 'compare_results' in Settings.data['RunSettings']:
        compareResults = Settings.data['RunSettings']['compare_results'] 

    if (Settings.execution_mode == ExecutionMode.FULL and compareResults == "true") or Settings.execution_mode == ExecutionMode.GENERATOR:
        # Create Table Drill ------------------------------------------------------------------------------------------------------
        from pydrill.client import PyDrill
        drill = PyDrill(host = 'localhost', port = 8047)
        cs.init_drill_schema(drill, Settings.data['TestSettings']['dataDirectory'])

        # Create Table Spark ------------------------------------------------------------------------------------------------------
        spark = SparkSession.builder.appName("timestampTest").getOrCreate()
        cs.init_spark_schema(spark, Settings.data['TestSettings']['dataDirectory'])

    #Create Context For BlazingSQL
    
    bc, dask_client = init_context()

    nRals = Settings.data['RunSettings']['nRals']

    main(dask_client, drill, spark, Settings.data['TestSettings']['dataDirectory'], bc, nRals)
    
    if Settings.execution_mode != ExecutionMode.GENERATOR:
        runTest.save_log()
Example #3
0
    drill = "drill"  # None

    compareResults = True
    if "compare_results" in Settings.data["RunSettings"]:
        compareResults = Settings.data["RunSettings"]["compare_results"]

    if ((Settings.execution_mode == ExecutionMode.FULL
         and compareResults == "true")
            or Settings.execution_mode == ExecutionMode.GENERATOR):
        # Create Table Drill ------------------------------------------------
        print("starting drill")
        from pydrill.client import PyDrill

        drill = PyDrill(host="localhost", port=8047)
        cs.init_drill_schema(drill,
                             Settings.data["TestSettings"]["dataDirectory"],
                             bool_test=True)

    # Create Context For BlazingSQL

    bc, dask_client = init_context()

    nRals = Settings.data["RunSettings"]["nRals"]

    main(dask_client, drill, Settings.data["TestSettings"]["dataDirectory"],
         bc, nRals)

    if Settings.execution_mode != ExecutionMode.GENERATOR:
        runTest.save_log()
        gpuMemory.print_log_gpu_memory()
Example #4
0
def main():
    print("**init end2end**")
    Execution.getArgs()
    nvmlInit()
    dir_data_file = Settings.data["TestSettings"]["dataDirectory"]
    nRals = Settings.data["RunSettings"]["nRals"]

    drill = "drill"
    spark = "spark"
    compareResults = True
    if "compare_results" in Settings.data["RunSettings"]:
        compareResults = Settings.data["RunSettings"]["compare_results"]

    if (Settings.execution_mode == ExecutionMode.FULL and compareResults
            == "true") or Settings.execution_mode == ExecutionMode.GENERATOR:

        # Create Table Drill -----------------------------------------
        from pydrill.client import PyDrill

        drill = PyDrill(host="localhost", port=8047)
        createSchema.init_drill_schema(
            drill,
            Settings.data["TestSettings"]["dataDirectory"],
            bool_test=True)

        # Create Table Spark -------------------------------------------------
        from pyspark.sql import SparkSession

        spark = SparkSession.builder.appName("allE2ETest").getOrCreate()
        createSchema.init_spark_schema(
            spark, Settings.data["TestSettings"]["dataDirectory"])

    # Create Context For BlazingSQL
    bc, dask_client = init_context()

    targetTestGroups = Settings.data["RunSettings"]["targetTestGroups"]
    runAllTests = (
        len(targetTestGroups) == 0
    )  # if targetTestGroups was empty the user wants to run all the tests

    if runAllTests or ("aggregationsWithoutGroupByTest" in targetTestGroups):
        aggregationsWithoutGroupByTest.main(dask_client, drill, dir_data_file,
                                            bc, nRals)

    if runAllTests or ("coalesceTest" in targetTestGroups):
        coalesceTest.main(dask_client, drill, dir_data_file, bc,
                          nRals)  # we are not supporting coalesce yet

    if runAllTests or ("columnBasisTest" in targetTestGroups):
        columnBasisTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("commonTableExpressionsTest" in targetTestGroups):
        commonTableExpressionsTest.main(dask_client, drill, dir_data_file, bc,
                                        nRals)

    # we are not supporting count distinct yet
    # countDistincTest.main(dask_client, drill, dir_data_file, bc)

    if runAllTests or ("countWithoutGroupByTest" in targetTestGroups):
        countWithoutGroupByTest.main(dask_client, drill, dir_data_file, bc,
                                     nRals)

    if runAllTests or ("dateTest" in targetTestGroups):
        dateTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("timestampTest" in targetTestGroups):
        timestampTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("fullOuterJoinsTest" in targetTestGroups):
        fullOuterJoinsTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("groupByTest" in targetTestGroups):
        groupByTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("GroupByWitoutAggregations" in targetTestGroups):
        GroupByWitoutAggregations.main(dask_client, drill, dir_data_file, bc,
                                       nRals)

    if runAllTests or ("innerJoinsTest" in targetTestGroups):
        innerJoinsTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("crossJoinsTest" in targetTestGroups):
        crossJoinsTest.main(dask_client, spark, dir_data_file, bc, nRals)

    if runAllTests or ("" in targetTestGroups):
        leftOuterJoinsTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("nonEquiJoinsTest" in targetTestGroups):
        nonEquiJoinsTest.main(dask_client, drill, dir_data_file, bc, nRals)

    # loadDataTest.main(dask_client, bc) #check this

    if runAllTests or ("nestedQueriesTest" in targetTestGroups):
        nestedQueriesTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("orderbyTest" in targetTestGroups):
        orderbyTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("predicatesWithNulls" in targetTestGroups):
        predicatesWithNulls.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("stringTests" in targetTestGroups):
        stringTests.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("tablesFromPandasTest" in targetTestGroups):
        tablesFromPandasTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("unaryOpsTest" in targetTestGroups):
        unaryOpsTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("unifyTablesTest" in targetTestGroups):
        unifyTablesTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("unionTest" in targetTestGroups):
        unionTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("useLimitTest" in targetTestGroups):
        useLimitTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("whereClauseTest" in targetTestGroups):
        whereClauseTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("bindableAliasTest" in targetTestGroups):
        bindableAliasTest.main(dask_client, drill, spark, dir_data_file, bc,
                               nRals)

    if runAllTests or ("booleanTest" in targetTestGroups):
        booleanTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("caseTest" in targetTestGroups):
        caseTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("castTest" in targetTestGroups):
        castTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("concatTest" in targetTestGroups):
        concatTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("literalTest" in targetTestGroups):
        literalTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("dirTest" in targetTestGroups):
        dirTest.main(dask_client, drill, dir_data_file, bc, nRals)

    # HDFS is not working yet
    # fileSystemHdfsTest.main(dask_client, drill, dir_data_file, bc)

    # HDFS is not working yet
    # mixedFileSystemTest.main(dask_client, drill, dir_data_file, bc)

    if runAllTests or ("likeTest" in targetTestGroups):
        likeTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("simpleDistributionTest" in targetTestGroups):
        simpleDistributionTest.main(dask_client, drill, spark, dir_data_file,
                                    bc, nRals)

    if runAllTests or ("substringTest" in targetTestGroups):
        substringTest.main(dask_client, drill, spark, dir_data_file, bc, nRals)

    if runAllTests or ("wildCardTest" in targetTestGroups):
        wildCardTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("tpchQueriesTest" in targetTestGroups):
        tpchQueriesTest.main(dask_client, drill, spark, dir_data_file, bc,
                             nRals)

    if runAllTests or ("roundTest" in targetTestGroups):
        roundTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("fileSystemLocalTest" in targetTestGroups):
        fileSystemLocalTest.main(dask_client, drill, dir_data_file, bc, nRals)

    if runAllTests or ("messageValidationTest" in targetTestGroups):
        messageValidationTest.main(dask_client, drill, dir_data_file, bc,
                                   nRals)

    if Settings.execution_mode != ExecutionMode.GPUCI:
        if runAllTests or ("fileSystemS3Test" in targetTestGroups):
            fileSystemS3Test.main(dask_client, drill, dir_data_file, bc, nRals)

        if runAllTests or ("fileSystemGSTest" in targetTestGroups):
            fileSystemGSTest.main(dask_client, drill, dir_data_file, bc, nRals)

    # timestampdiffTest.main(dask_client, spark, dir_data_file, bc, nRals)

    if Settings.execution_mode != ExecutionMode.GENERATOR:

        result, error_msgs = runTest.save_log(
            Settings.execution_mode == ExecutionMode.GPUCI)

        max = 0
        for i in range(0, len(Settings.memory_list)):
            if (Settings.memory_list[i].delta) > max:
                max = Settings.memory_list[i].delta

        print("MAX DELTA: " + str(max))
        print("""***********************************************************
              ********************""")

        for i in range(0, len(Settings.memory_list)):
            print(Settings.memory_list[i].name + ":" + "   Start Mem: " +
                  str(Settings.memory_list[i].start_mem) + "   End Mem: " +
                  str(Settings.memory_list[i].end_mem) + "   Diff: " +
                  str(Settings.memory_list[i].delta))

        return result, error_msgs

    return True, []
Example #5
0
def main(dask_client, bc):

    # Create Table Drill ------------------------------------------------------------------------------------------------------

    drill = PyDrill(host='localhost', port=8047)

    dir_data_lc = Settings.data['TestSettings']['dataDirectory']

    for x in range(5):

        # [numberOfFiles, type_nation, type_region, type_supplier, type_customer, type_lineitem, type_orders]

        run = []

        if x == 0:
            run = [1, 'psv', 'psv', 'psv', 'psv', 'psv', 'psv']
        elif x == 1:
            run = [
                2, 'parquet', 'parquet', 'parquet', 'parquet', 'parquet',
                'parquet'
            ]
        elif x == 2:
            run = [6, 'parquet', 'psv', 'parquet', 'psv', 'parquet', 'psv']
        elif x == 3:
            run = [10, 'psv', 'parquet', 'psv', 'parquet', 'psv', 'parquet']
        elif x == 4:
            run = [12, 'psv', 'psv', 'parquet', 'parquet', 'psv', 'parquet']

        print(
            "======================================================================="
        )
        print("Running " + str(x + 1) + ":")
        print("NĂºmero de Archivos: " + str(run[0]))
        print("Type of files for Nation: " + run[1])
        print("Type of files for Region: " + run[2])
        print("Type of files for Supplier: " + run[3])
        print("Type of files for Customer: " + run[4])
        print("Type of files for Lineitem: " + run[5])
        print("Type of files for Orders: " + run[6])
        print(
            "======================================================================="
        )
        print("1")
        num_files = run[0]
        print("2")
        cs.init_drill_schema(drill, dir_data_lc, n_files=num_files)
        print("3")
        #Read Data TPCH------------------------------------------------------------------------------------------------------------
        nation_files = cs.get_filenames_table('nation', dir_data_lc, num_files,
                                              run[1])
        bc.create_table('nation',
                        nation_files,
                        delimiter='|',
                        dtype=cs.get_dtypes('nation'),
                        names=cs.get_column_names('nation'))

        region_files = cs.get_filenames_table('region', dir_data_lc, num_files,
                                              run[2])
        bc.create_table('region',
                        region_files,
                        delimiter='|',
                        dtype=cs.get_dtypes('region'),
                        names=cs.get_column_names('region'))

        supplier_files = cs.get_filenames_table('supplier', dir_data_lc,
                                                num_files, run[3])
        bc.create_table('supplier',
                        supplier_files,
                        delimiter='|',
                        dtype=cs.get_dtypes('supplier'),
                        names=cs.get_column_names('supplier'))

        customer_files = cs.get_filenames_table('customer', dir_data_lc,
                                                num_files, run[4])
        bc.create_table('customer',
                        customer_files,
                        delimiter='|',
                        dtype=cs.get_dtypes('customer'),
                        names=cs.get_column_names('customer'))

        lineitem_files = cs.get_filenames_table('lineitem', dir_data_lc,
                                                num_files, run[5])
        bc.create_table('lineitem',
                        lineitem_files,
                        delimiter='|',
                        dtype=cs.get_dtypes('lineitem'),
                        names=cs.get_column_names('lineitem'))

        orders_files = cs.get_filenames_table('orders', dir_data_lc, num_files,
                                              run[6])
        bc.create_table('orders',
                        orders_files,
                        delimiter='|',
                        dtype=cs.get_dtypes('orders'),
                        names=cs.get_column_names('orders'))

        #Run Query -----------------------------------------------------------------------------
        worder = 1  #Parameter to indicate if its necessary to order the resulsets before compare them
        use_percentage = False
        acceptable_difference = 0.01
        queryType = 'Load Data Test'

        print('==============================')
        print(queryType)
        print('==============================')

        queryId = 'TEST_01'
        query = "select count(c_custkey) as c1, count(c_acctbal) as c2 from customer"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_02'
        query = "select count(n_nationkey), count(n_regionkey) from nation"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_03'
        query = "select count(s_suppkey), count(s_nationkey) from supplier"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_04'
        query = "select count(c_custkey), sum(c_acctbal), sum(c_acctbal)/count(c_acctbal), min(c_custkey), max(c_nationkey), (max(c_nationkey) + min(c_nationkey))/2 c_nationkey from customer where c_custkey < 100 group by c_nationkey"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference,
                          True)  #TODO: Change sum/count for avg KC

        queryId = 'TEST_05'
        query = "select c.c_custkey, c.c_nationkey, n.n_regionkey from customer as c inner join nation as n on c.c_nationkey = n.n_nationkey where n.n_regionkey = 1 and c.c_custkey < 50"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_06'
        query = "select c_custkey, c_nationkey, c_acctbal from customer order by c_nationkey, c_acctbal"
        runTest.run_query(bc, drill, query, queryId, queryType, 0, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_07'
        query = "select c_custkey + c_nationkey, c_acctbal from customer order by 1, 2"
        runTest.run_query(bc, drill, query, queryId, queryType, 0, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_08'
        query = "select n1.n_nationkey as supp_nation, n2.n_nationkey as cust_nation, l.l_extendedprice * l.l_discount from supplier as s inner join lineitem as l on s.s_suppkey = l.l_suppkey inner join orders as o on o.o_orderkey = l.l_orderkey inner join customer as c on c.c_custkey = o.o_custkey inner join nation as n1 on s.s_nationkey = n1.n_nationkey inner join nation as n2 on c.c_nationkey = n2.n_nationkey where n1.n_nationkey = 1 and n2.n_nationkey = 2 and o.o_orderkey < 10000"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_09'
        query = "select c_custkey, c_nationkey as nkey from customer where c_custkey < 0 and c_nationkey >=30"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_10'
        query = "select sin(c_acctbal), cos(c_acctbal), asin(c_acctbal), acos(c_acctbal), ln(c_acctbal), tan(c_acctbal), atan(c_acctbal), floor(c_acctbal), c_acctbal from customer"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_11'
        query = "select n1.n_nationkey as n1key, n2.n_nationkey as n2key, n1.n_nationkey + n2.n_nationkey from nation as n1 full outer join nation as n2 on n1.n_nationkey = n2.n_nationkey + 6 where n1.n_nationkey < 10 and n1.n_nationkey > 5"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_12'
        query = "select count(n1.n_nationkey) as n1key, count(n2.n_nationkey) as n2key, count(*) as cstar from nation as n1 full outer join nation as n2 on n1.n_nationkey = n2.n_nationkey + 6"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_13'
        query = "select o_orderkey, o_custkey from orders where o_orderkey < 10 and o_orderkey >= 1"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_14'
        query = "select 100168549 - sum(o_orderkey)/count(o_orderkey), 56410984/sum(o_totalprice), (123 - 945/max(o_orderkey))/(sum(81619/o_orderkey)/count(81619/o_orderkey)) from orders"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference,
                          True)  #TODO: Change sum/count for avg KC

        queryId = 'TEST_15'
        query = "select o_orderkey, sum(o_totalprice)/count(o_orderstatus) from orders where o_custkey < 100 group by o_orderstatus, o_orderkey"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_16'
        query = "select o_orderkey, o_orderstatus from orders where o_custkey < 10 and o_orderstatus <> 'O' order by o_orderkey, o_orderstatus limit 50"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_17'
        query = "select count(o_orderstatus) from orders where o_orderstatus <> 'O'"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_18'
        query = "select count(o_orderkey), sum(o_orderkey), o_clerk from orders where o_custkey < 1000 group by o_clerk, o_orderstatus"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_19'
        query = "select sum(o_orderkey)/count(o_orderkey) from orders group by o_orderstatus"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference,
                          True)  #TODO: Change sum/count for avg KC

        queryId = 'TEST_20'
        query = "select count(o_shippriority), sum(o_totalprice) from orders group by o_shippriority"
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_21'
        query = """with regionTemp as ( select r_regionkey, r_name from region where r_regionkey > 2 ),
        nationTemp as(select n_nationkey, n_regionkey as fkey, n_name from nation where n_nationkey > 3 order by n_nationkey)
        select regionTemp.r_name, nationTemp.n_name from regionTemp inner join nationTemp on regionTemp.r_regionkey = nationTemp.fkey"""
        runTest.run_query(bc, drill, query, queryId, queryType, worder, '',
                          acceptable_difference, use_percentage,
                          fileSchemaType)

        queryId = 'TEST_22'
        query = """select o.o_totalprice, l.l_partkey from orders as o
            left outer join lineitem as l on o.o_custkey = l.l_linenumber and l.l_suppkey = o.o_orderkey where l.l_linenumber < 1000"""
        runTest.run_query_performance(bc, drill, query, queryId, queryType,
                                      worder, '', acceptable_difference,
                                      use_percentage, fileSchemaType)

        queryId = 'TEST_23'
        query = """select o.o_orderkey, o.o_totalprice, l.l_partkey, l.l_returnflag from lineitem as l 
            inner join orders as o on o.o_orderkey = l.l_orderkey
            inner join customer as c on c.c_custkey = o.o_custkey where c.c_custkey < 1000"""
        runTest.run_query_performance(bc, drill, query, queryId, queryType,
                                      worder, '', acceptable_difference,
                                      use_percentage, fileSchemaType)

        queryId = 'TEST_24'
        query = """select o.o_orderkey, o.o_totalprice, l.l_partkey, l.l_linestatus from orders as o
            full outer join lineitem as l on l.l_orderkey = o.o_orderkey where o.o_orderkey < 1000"""
        runTest.run_query_performance(bc, drill, query, queryId, queryType,
                                      worder, '', acceptable_difference,
                                      use_percentage, fileSchemaType)

        runTest.save_log()