Example #1
0
def main(sf, parts, sharded, expected_result):
    """Run the synthetic baseline join over customer, orders and lineitem.

    Builds a ``SyntheticBaselineJoinSettings`` whose per-table filters are
    pandas predicates, asks ``synthetic_join_baseline`` for the query plan and
    hands it to ``runner.run``.

    Filters passed to the settings:

    * customer: ``c_acctbal`` (cast to float) ``<= -999.0``
    * orders: ``o_orderdate`` earlier than 1995-01-01
    * lineitem: ``l_shipdate`` earlier than 1995-01-01

    :param sf: scale factor; stored in the settings and printed in the banner
    :param parts: value used for ``table_A_parts``, ``table_B_parts`` and
        ``table_C_parts``
    :param sharded: value used for the ``table_*_sharded`` setting of all
        three tables
    :param expected_result: forwarded unchanged to ``runner.run``
    """
    max_orderdate = datetime.strptime('1995-01-01', '%Y-%m-%d')
    max_shipdate = datetime.strptime('1995-01-01', '%Y-%m-%d')

    settings = SyntheticBaselineJoinSettings(
        parallel=True,
        use_pandas=True,
        secure=False,
        use_native=False,
        buffer_size=0,
        use_shared_mem=False,
        shared_memory_size=-1,
        sf=sf,
        table_A_key='customer',
        table_A_parts=parts,
        table_A_sharded=sharded,
        table_A_field_names=[
            'c_custkey', 'c_name', 'c_address', 'c_nationkey', 'c_phone',
            'c_acctbal', 'c_mktsegment', 'c_comment'
        ],
        # The builtin ``float`` is used instead of ``np.float``: the latter
        # was only an alias for the builtin and has been removed from NumPy
        # (1.24+), where it raises AttributeError.
        table_A_filter_fn=lambda df: df['c_acctbal'].astype(float) <= -999.0,
        table_A_AB_join_key='c_custkey',
        table_B_key='orders',
        table_B_parts=parts,
        table_B_sharded=sharded,
        table_B_field_names=[
            'o_orderkey', 'o_custkey', 'o_orderstatus', 'o_totalprice',
            'o_orderdate', 'o_orderpriority', 'o_clerk', 'o_shippriority',
            'o_comment'
        ],
        table_B_filter_fn=lambda df: pd.to_datetime(df['o_orderdate']
                                                    ) < max_orderdate,
        table_B_AB_join_key='o_custkey',
        table_B_BC_join_key='o_orderkey',
        table_B_detail_field_name=None,
        table_C_key='lineitem',
        table_C_parts=parts,
        table_C_sharded=sharded,
        table_C_field_names=[
            'l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber',
            'l_quantity', 'l_extendedprice', 'l_discount', 'l_tax',
            'l_returnflag', 'l_linestatus', 'l_shipdate', 'l_commitdate',
            'l_receiptdate', 'l_shipinstruct', 'l_shipmode', 'l_comment'
        ],
        table_C_filter_fn=lambda df: pd.to_datetime(df['l_shipdate']
                                                    ) < max_shipdate,
        table_C_BC_join_key='l_orderkey',
        table_C_detail_field_name='l_extendedprice')

    print("--- TEST: {} ---".format(gen_test_id()))
    print("--- SCALE FACTOR: {} ---".format(sf))

    query_plan = synthetic_join_baseline.query_plan(settings)

    runner.run(query_plan,
               expected_result=expected_result,
               test_id=gen_test_id())
Example #2
0
def main(sf, parts, sharded, fp_rate, table_a_filter_sql, table_b_filter_sql,
         expected_result):
    """Run the synthetic semi join between customer and orders.

    Assembles the keyword arguments for ``SyntheticSemiJoinSettings`` (every
    table C setting is ``None``), builds the plan with
    ``synthetic_join_semi.query_plan`` and executes it through ``runner.run``.

    :param sf: scale factor; stored in the settings and printed in the banner
    :param parts: value used for ``table_A_parts`` and ``table_B_parts``
    :param sharded: value used for ``table_A_sharded`` and ``table_B_sharded``
    :param fp_rate: false positive rate; stored in the settings and printed
    :param table_a_filter_sql: passed as ``table_A_filter_sql``
    :param table_b_filter_sql: passed as ``table_B_filter_sql``
    :param expected_result: forwarded unchanged to ``runner.run``
    """
    # Execution options shared by the whole plan.
    settings_kwargs = {
        'parallel': True,
        'use_pandas': True,
        'secure': False,
        'use_native': False,
        'buffer_size': 0,
        'use_shared_mem': False,
        'shared_memory_size': -1,
        'sf': sf,
        'fp_rate': fp_rate,
    }

    # Table A: customer.
    settings_kwargs.update({
        'table_A_key': 'customer',
        'table_A_parts': parts,
        'table_A_sharded': sharded,
        'table_A_field_names': ['c_custkey'],
        'table_A_filter_sql': table_a_filter_sql,
        'table_A_AB_join_key': 'c_custkey',
    })

    # Table B: orders.
    settings_kwargs.update({
        'table_B_key': 'orders',
        'table_B_parts': parts,
        'table_B_sharded': sharded,
        'table_B_field_names': [
            'o_orderkey', 'o_custkey', 'o_orderstatus', 'o_totalprice',
            'o_orderdate', 'o_orderpriority', 'o_clerk', 'o_shippriority',
            'o_comment',
        ],
        'table_B_filter_sql': table_b_filter_sql,
        'table_B_AB_join_key': 'o_custkey',
        'table_B_BC_join_key': 'o_orderkey',
        'table_B_primary_key': 'o_orderkey',
        'table_B_detail_field_name': 'o_totalprice',
    })

    # Table C: every setting is explicitly None.
    settings_kwargs.update(dict.fromkeys((
        'table_C_key',
        'table_C_parts',
        'table_C_sharded',
        'table_C_field_names',
        'table_C_filter_sql',
        'table_C_BC_join_key',
        'table_C_primary_key',
        'table_C_detail_field_name',
    )))

    join_settings = SyntheticSemiJoinSettings(**settings_kwargs)

    banner = (
        ("--- TEST: {} ---", gen_test_id()),
        ("--- SCALE FACTOR: {} ---", sf),
        ("--- FALSE POSITIVE RATE: {} ---", fp_rate),
    )
    for template, value in banner:
        print(template.format(value))

    plan = synthetic_join_semi.query_plan(join_settings)

    runner.run(plan, expected_result=expected_result, test_id=gen_test_id())
Example #3
0
def main(sf, parts, sharded, fp_rate, expected_result):
    """Run the synthetic semi join over customer, orders and lineitem.

    Uses fixed SQL filters: customers with ``c_acctbal`` (cast to float)
    ``<= -999.0``, and orders / lineitems whose ``o_orderdate`` /
    ``l_shipdate`` (cast to timestamp) is earlier than 1995-01-01. The
    resulting ``SyntheticSemiJoinSettings`` is turned into a plan by
    ``synthetic_join_semi.query_plan`` and executed via ``runner.run``.

    :param sf: scale factor; stored in the settings and printed in the banner
    :param parts: value used for the ``table_*_parts`` setting of all tables
    :param sharded: value used for the ``table_*_sharded`` setting of all
        tables
    :param fp_rate: false positive rate; stored in the settings and printed
    :param expected_result: forwarded unchanged to ``runner.run``
    """
    # Both date filters share one shape; only the column name differs.
    before_1995 = "cast({} as timestamp) < cast('1995-01-01' as timestamp)"
    customer_filter = 'cast(c_acctbal as float) <= -999.0'
    orders_filter = before_1995.format('o_orderdate')
    lineitem_filter = before_1995.format('l_shipdate')

    customer_fields = ['c_custkey']
    orders_fields = ['o_orderkey', 'o_custkey']
    lineitem_fields = ['l_orderkey', 'l_orderkey_2']

    join_settings = SyntheticSemiJoinSettings(
        parallel=True, use_pandas=True, secure=False, use_native=False,
        buffer_size=0, use_shared_mem=False, shared_memory_size=-1,
        sf=sf, fp_rate=fp_rate,
        # Table A: customer
        table_A_key='customer',
        table_A_parts=parts,
        table_A_sharded=sharded,
        table_A_field_names=customer_fields,
        table_A_filter_sql=customer_filter,
        table_A_AB_join_key='c_custkey',
        # Table B: orders
        table_B_key='orders',
        table_B_parts=parts,
        table_B_sharded=sharded,
        table_B_field_names=orders_fields,
        table_B_filter_sql=orders_filter,
        table_B_AB_join_key='o_custkey',
        table_B_BC_join_key='o_orderkey',
        table_B_primary_key='o_orderkey',
        table_B_detail_field_name=None,
        # Table C: lineitem
        table_C_key='lineitem',
        table_C_parts=parts,
        table_C_sharded=sharded,
        table_C_field_names=lineitem_fields,
        table_C_filter_sql=lineitem_filter,
        table_C_BC_join_key='l_orderkey',
        table_C_primary_key='l_orderkey',
        table_C_detail_field_name='l_extendedprice',
    )

    test_banner = "--- TEST: {} ---".format(gen_test_id())
    print(test_banner)
    scale_banner = "--- SCALE FACTOR: {} ---".format(sf)
    print(scale_banner)
    fp_banner = "--- FALSE POSITIVE RATE: {} ---".format(fp_rate)
    print(fp_banner)

    plan = synthetic_join_semi.query_plan(join_settings)

    runner.run(plan, expected_result=expected_result, test_id=gen_test_id())
Example #4
0
def main(sf, format_, parts, sharded, other_parts, table_a_filter_val,
         table_b_filter_val, expected_result, trial):
    """Run the synthetic filtered join of customer and orders, logging to a file.

    The SQL filters come from ``runner.build_filters``. While the plan runs,
    everything printed goes to
    ``ROOT_DIR/../aws-exps/join/synthetic_join_2_filtered_sf{sf}_aval{a}_bval{b}_trial{trial}.txt``;
    afterwards that file is echoed with ``cat``.

    ``sys.stdout`` is always restored to the stream that was active on entry,
    including when the run raises, so later prints in the same process keep
    working and the log file is never left open.

    :param sf: scale factor; stored in the settings, printed, and used in the
        output file name
    :param format_: passed as ``format_`` to the settings and printed
    :param parts: value used for ``table_A_parts`` and ``table_B_parts``
    :param sharded: value used for ``table_A_sharded`` and ``table_B_sharded``
    :param other_parts: passed as ``other_parts`` to the settings
    :param table_a_filter_val: first argument to ``runner.build_filters``;
        also used in the output file name
    :param table_b_filter_val: second argument to ``runner.build_filters``;
        also used in the output file name
    :param expected_result: forwarded unchanged to ``runner.run``
    :param trial: used only in the output file name
    """
    table_a_filter_sql, _, table_b_filter_sql, _ = runner.build_filters(
        table_a_filter_val, table_b_filter_val)

    settings = SyntheticFilteredJoinSettings(
        parallel=True,
        use_pandas=True,
        secure=False,
        use_native=False,
        buffer_size=0,
        use_shared_mem=False,
        shared_memory_size=-1,
        format_=format_,
        sf=sf,
        table_A_key='customer',
        table_A_parts=parts,
        table_A_sharded=sharded,
        table_A_field_names=['c_custkey'],
        table_A_filter_sql=table_a_filter_sql,
        table_A_AB_join_key='c_custkey',
        table_B_key='orders',
        table_B_parts=parts,
        table_B_sharded=sharded,
        table_B_field_names=[
            'o_orderkey', 'o_custkey', 'o_orderstatus', 'o_totalprice',
            'o_orderdate', 'o_orderpriority', 'o_clerk', 'o_shippriority',
            'o_comment'
        ],
        table_B_filter_sql=table_b_filter_sql,
        table_B_AB_join_key='o_custkey',
        table_B_BC_join_key=None,
        table_B_detail_field_name='o_totalprice',
        table_C_key=None,
        table_C_parts=None,
        table_C_sharded=None,
        table_C_field_names=None,
        table_C_filter_sql=None,
        table_C_BC_join_key=None,
        table_C_detail_field_name=None,
        other_parts=other_parts)

    path = os.path.join(ROOT_DIR, "../aws-exps/join")
    filesystem_util.create_dirs(path)
    out_file = "synthetic_join_2_filtered_sf{}_aval{}_bval{}_trial{}.txt" \
        .format(sf, table_a_filter_val, table_b_filter_val, trial)
    out_path = os.path.join(path, out_file)

    # Remember the caller's stream so it can be put back afterwards; leaving
    # sys.stdout bound to a closed file would break every later print.
    original_stdout = sys.stdout
    with open(out_path, "w+") as log_file:
        sys.stdout = log_file
        try:
            print("--- TEST: {} ---".format(gen_test_id()))
            print("--- SCALE FACTOR: {} ---".format(sf))
            print("--- FORMAT: {} ---".format(format_))
            print("--- CUSTOMER FILTER: {} ---".format(table_a_filter_sql))
            print("--- ORDER FILTER: {} ---".format(table_b_filter_sql))

            query_plan = synthetic_join_filtered.query_plan(settings)

            runner.run(query_plan,
                       expected_result=expected_result,
                       test_id=gen_test_id())
        finally:
            # Restore before the file is closed by the ``with`` block.
            sys.stdout = original_stdout

    # Echo the captured log (only reached when the run succeeded).
    subprocess.call(['cat', out_path])