def main(sf, parts, sharded, expected_result):
    """Run the synthetic baseline join over customer, orders and lineitem.

    Builds a ``SyntheticBaselineJoinSettings`` describing the three tables,
    prints the test id and scale factor, asks ``synthetic_join_baseline`` for
    a query plan and hands that plan to ``runner.run``.

    :param sf: scale factor; stored in the settings and echoed in the output
    :param parts: value passed as ``table_*_parts`` for all three tables
    :param sharded: value passed as ``table_*_sharded`` for all three tables
    :param expected_result: forwarded unchanged to ``runner.run``
    """
    # Both date predicates below use the same cut-off.
    max_orderdate = datetime.strptime('1995-01-01', '%Y-%m-%d')
    max_shipdate = datetime.strptime('1995-01-01', '%Y-%m-%d')

    customer_fields = [
        'c_custkey', 'c_name', 'c_address', 'c_nationkey', 'c_phone',
        'c_acctbal', 'c_mktsegment', 'c_comment'
    ]
    orders_fields = [
        'o_orderkey', 'o_custkey', 'o_orderstatus', 'o_totalprice',
        'o_orderdate', 'o_orderpriority', 'o_clerk', 'o_shippriority',
        'o_comment'
    ]
    lineitem_fields = [
        'l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber', 'l_quantity',
        'l_extendedprice', 'l_discount', 'l_tax', 'l_returnflag',
        'l_linestatus', 'l_shipdate', 'l_commitdate', 'l_receiptdate',
        'l_shipinstruct', 'l_shipmode', 'l_comment'
    ]

    settings = SyntheticBaselineJoinSettings(
        parallel=True,
        use_pandas=True,
        secure=False,
        use_native=False,
        buffer_size=0,
        use_shared_mem=False,
        shared_memory_size=-1,
        sf=sf,
        # Table A: customer
        table_A_key='customer',
        table_A_parts=parts,
        table_A_sharded=sharded,
        table_A_field_names=customer_fields,
        # Use the builtin ``float``: ``np.float`` was only an alias for it and
        # has been removed from NumPy (deprecated 1.20, removed 1.24).
        table_A_filter_fn=lambda df: df['c_acctbal'].astype(float) <= -999.0,
        table_A_AB_join_key='c_custkey',
        # Table B: orders
        table_B_key='orders',
        table_B_parts=parts,
        table_B_sharded=sharded,
        table_B_field_names=orders_fields,
        table_B_filter_fn=lambda df: pd.to_datetime(df['o_orderdate']) < max_orderdate,
        table_B_AB_join_key='o_custkey',
        table_B_BC_join_key='o_orderkey',
        table_B_detail_field_name=None,
        # Table C: lineitem
        table_C_key='lineitem',
        table_C_parts=parts,
        table_C_sharded=sharded,
        table_C_field_names=lineitem_fields,
        table_C_filter_fn=lambda df: pd.to_datetime(df['l_shipdate']) < max_shipdate,
        table_C_BC_join_key='l_orderkey',
        table_C_detail_field_name='l_extendedprice')

    print("--- TEST: {} ---".format(gen_test_id()))
    print("--- SCALE FACTOR: {} ---".format(sf))

    query_plan = synthetic_join_baseline.query_plan(settings)

    runner.run(query_plan, expected_result=expected_result, test_id=gen_test_id())
def main(sf, parts, sharded, fp_rate, table_a_filter_sql, table_b_filter_sql, expected_result):
    """Run the synthetic semi join between customer and orders.

    Assembles a ``SyntheticSemiJoinSettings`` in which only tables A
    (customer) and B (orders) are configured and every table C setting is
    ``None``. It then prints the test id, scale factor and false positive
    rate, builds the plan via ``synthetic_join_semi.query_plan`` and passes
    it to ``runner.run``.

    :param sf: scale factor; stored in the settings and echoed in the output
    :param parts: value passed as ``table_*_parts`` for customer and orders
    :param sharded: value passed as ``table_*_sharded`` for customer and orders
    :param fp_rate: stored in the settings as ``fp_rate`` and echoed in the output
    :param table_a_filter_sql: passed as ``table_A_filter_sql`` (customer)
    :param table_b_filter_sql: passed as ``table_B_filter_sql`` (orders)
    :param expected_result: forwarded unchanged to ``runner.run``
    """
    orders_columns = [
        'o_orderkey', 'o_custkey', 'o_orderstatus', 'o_totalprice',
        'o_orderdate', 'o_orderpriority', 'o_clerk', 'o_shippriority',
        'o_comment'
    ]

    settings = SyntheticSemiJoinSettings(
        parallel=True,
        use_pandas=True,
        secure=False,
        use_native=False,
        buffer_size=0,
        use_shared_mem=False,
        shared_memory_size=-1,
        sf=sf,
        fp_rate=fp_rate,
        # Table A: customer
        table_A_key='customer',
        table_A_parts=parts,
        table_A_sharded=sharded,
        table_A_field_names=['c_custkey'],
        table_A_filter_sql=table_a_filter_sql,
        table_A_AB_join_key='c_custkey',
        # Table B: orders
        table_B_key='orders',
        table_B_parts=parts,
        table_B_sharded=sharded,
        table_B_field_names=orders_columns,
        table_B_filter_sql=table_b_filter_sql,
        table_B_AB_join_key='o_custkey',
        table_B_BC_join_key='o_orderkey',
        table_B_primary_key='o_orderkey',
        table_B_detail_field_name='o_totalprice',
        # Table C: not configured here
        table_C_key=None,
        table_C_parts=None,
        table_C_sharded=None,
        table_C_field_names=None,
        table_C_filter_sql=None,
        table_C_BC_join_key=None,
        table_C_primary_key=None,
        table_C_detail_field_name=None)

    print("--- TEST: {} ---".format(gen_test_id()))
    print("--- SCALE FACTOR: {} ---".format(sf))
    print("--- FALSE POSITIVE RATE: {} ---".format(fp_rate))

    plan = synthetic_join_semi.query_plan(settings)

    runner.run(plan, expected_result=expected_result, test_id=gen_test_id())
def main(sf, parts, sharded, fp_rate, expected_result):
    """Run the synthetic semi join over customer, orders and lineitem.

    Assembles a ``SyntheticSemiJoinSettings`` with fixed SQL filter strings
    for all three tables, prints the test id, scale factor and false positive
    rate, builds the plan via ``synthetic_join_semi.query_plan`` and passes
    it to ``runner.run``.

    :param sf: scale factor; stored in the settings and echoed in the output
    :param parts: value passed as ``table_*_parts`` for all three tables
    :param sharded: value passed as ``table_*_sharded`` for all three tables
    :param fp_rate: stored in the settings as ``fp_rate`` and echoed in the output
    :param expected_result: forwarded unchanged to ``runner.run``
    """
    # Filter expressions handed to the settings as plain SQL text.
    customer_filter = 'cast(c_acctbal as float) <= -999.0'
    orders_filter = "cast(o_orderdate as timestamp) < cast('1995-01-01' as timestamp)"
    lineitem_filter = "cast(l_shipdate as timestamp) < cast('1995-01-01' as timestamp)"

    settings = SyntheticSemiJoinSettings(
        parallel=True,
        use_pandas=True,
        secure=False,
        use_native=False,
        buffer_size=0,
        use_shared_mem=False,
        shared_memory_size=-1,
        sf=sf,
        fp_rate=fp_rate,
        # Table A: customer
        table_A_key='customer',
        table_A_parts=parts,
        table_A_sharded=sharded,
        table_A_field_names=['c_custkey'],
        table_A_filter_sql=customer_filter,
        table_A_AB_join_key='c_custkey',
        # Table B: orders
        table_B_key='orders',
        table_B_parts=parts,
        table_B_sharded=sharded,
        table_B_field_names=['o_orderkey', 'o_custkey'],
        table_B_filter_sql=orders_filter,
        table_B_AB_join_key='o_custkey',
        table_B_BC_join_key='o_orderkey',
        table_B_primary_key='o_orderkey',
        table_B_detail_field_name=None,
        # Table C: lineitem
        # NOTE(review): 'l_extendedprice' is the detail field but is not listed
        # in table_C_field_names -- confirm this is intended.
        table_C_key='lineitem',
        table_C_parts=parts,
        table_C_sharded=sharded,
        table_C_field_names=['l_orderkey', 'l_orderkey_2'],
        table_C_filter_sql=lineitem_filter,
        table_C_BC_join_key='l_orderkey',
        table_C_primary_key='l_orderkey',
        table_C_detail_field_name='l_extendedprice')

    print("--- TEST: {} ---".format(gen_test_id()))
    print("--- SCALE FACTOR: {} ---".format(sf))
    print("--- FALSE POSITIVE RATE: {} ---".format(fp_rate))

    plan = synthetic_join_semi.query_plan(settings)

    runner.run(plan, expected_result=expected_result, test_id=gen_test_id())
def main(sf, format_, parts, sharded, other_parts, table_a_filter_val, table_b_filter_val, expected_result, trial):
    """Run the synthetic filtered join (customer x orders), logging to a file.

    Obtains the two SQL filters from ``runner.build_filters``, builds a
    ``SyntheticFilteredJoinSettings`` with only tables A and B configured,
    and runs the resulting plan with ``sys.stdout`` redirected to a
    per-trial text file under ``ROOT_DIR/../aws-exps/join``. After the run
    the file is echoed with ``cat``.

    ``sys.stdout`` is restored to its previous value when the run finishes or
    raises, and the output file is always closed.

    :param sf: scale factor; stored in the settings, echoed, and used in the file name
    :param format_: stored in the settings as ``format_`` and echoed in the output
    :param parts: value passed as ``table_*_parts`` for customer and orders
    :param sharded: value passed as ``table_*_sharded`` for customer and orders
    :param other_parts: stored in the settings as ``other_parts``
    :param table_a_filter_val: first argument to ``runner.build_filters``; also used in the file name
    :param table_b_filter_val: second argument to ``runner.build_filters``; also used in the file name
    :param expected_result: forwarded unchanged to ``runner.run``
    :param trial: used only in the output file name
    """
    # build_filters returns four values; only the 1st and 3rd are used here.
    table_a_filter_sql, _, table_b_filter_sql, _ = runner.build_filters(
        table_a_filter_val, table_b_filter_val)

    settings = SyntheticFilteredJoinSettings(
        parallel=True,
        use_pandas=True,
        secure=False,
        use_native=False,
        buffer_size=0,
        use_shared_mem=False,
        shared_memory_size=-1,
        format_=format_,
        sf=sf,
        # Table A: customer
        table_A_key='customer',
        table_A_parts=parts,
        table_A_sharded=sharded,
        table_A_field_names=['c_custkey'],
        table_A_filter_sql=table_a_filter_sql,
        table_A_AB_join_key='c_custkey',
        # Table B: orders
        table_B_key='orders',
        table_B_parts=parts,
        table_B_sharded=sharded,
        table_B_field_names=[
            'o_orderkey', 'o_custkey', 'o_orderstatus', 'o_totalprice',
            'o_orderdate', 'o_orderpriority', 'o_clerk', 'o_shippriority',
            'o_comment'
        ],
        table_B_filter_sql=table_b_filter_sql,
        table_B_AB_join_key='o_custkey',
        table_B_BC_join_key=None,
        table_B_detail_field_name='o_totalprice',
        # Table C: not configured here
        table_C_key=None,
        table_C_parts=None,
        table_C_sharded=None,
        table_C_field_names=None,
        table_C_filter_sql=None,
        table_C_BC_join_key=None,
        table_C_detail_field_name=None,
        other_parts=other_parts)

    path = os.path.join(ROOT_DIR, "../aws-exps/join")
    filesystem_util.create_dirs(path)
    out_file = "synthetic_join_2_filtered_sf{}_aval{}_bval{}_trial{}.txt" \
        .format(sf, table_a_filter_val, table_b_filter_val, trial)
    out_path = os.path.join(path, out_file)

    # Redirect prints into the output file for the duration of the run. The
    # previous stream is put back in ``finally`` so a failing run (or any code
    # executing after this function) is not left with a closed sys.stdout.
    previous_stdout = sys.stdout
    with open(out_path, "w+") as out:
        sys.stdout = out
        try:
            print("--- TEST: {} ---".format(gen_test_id()))
            print("--- SCALE FACTOR: {} ---".format(sf))
            print("--- FORMAT: {} ---".format(format_))
            print("--- CUSTOMER FILTER: {} ---".format(table_a_filter_sql))
            print("--- ORDER FILTER: {} ---".format(table_b_filter_sql))

            query_plan = synthetic_join_filtered.query_plan(settings)

            runner.run(query_plan, expected_result=expected_result, test_id=gen_test_id())
        finally:
            sys.stdout = previous_stdout

    # The file is closed (and therefore flushed) before it is echoed.
    subprocess.call(['cat', out_path])