def test_join_topk(): """Tests a top k with a join :return: None """ limit = 5 query_plan = QueryPlan() # Query plan ts1 = query_plan.add_operator(SQLTableScan('supplier.csv', 'select * from S3Object;', False, 'ts1', query_plan, False)) ts1_project = query_plan.add_operator( Project([ProjectExpression(lambda t_: t_['_3'], 's_nationkey')], 'ts1_project', query_plan, False)) ts2 = query_plan.add_operator(SQLTableScan('nation.csv', 'select * from S3Object;', False, 'ts2', query_plan, False)) ts2_project = query_plan.add_operator( Project([ProjectExpression(lambda t_: t_['_0'], 'n_nationkey')], 'ts2_project', query_plan, False)) j = query_plan.add_operator(HashJoin(JoinExpression('s_nationkey', 'n_nationkey'), 'j', query_plan, False)) t = query_plan.add_operator(Limit(limit, 't', query_plan, False)) c = query_plan.add_operator(Collate('c', query_plan, False)) ts1.connect(ts1_project) ts2.connect(ts2_project) j.connect_left_producer(ts1_project) j.connect_right_producer(ts2_project) j.connect(t) t.connect(c) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() # Assert the results # num_rows = 0 # for t in c.tuples(): # num_rows += 1 # print("{}:{}".format(num_rows, t)) c.print_tuples() field_names = ['s_nationkey', 'n_nationkey'] assert len(c.tuples()) == limit + 1 assert c.tuples()[0] == field_names num_rows = 0 for t in c.tuples(): num_rows += 1 # Assert that the nation_key in table 1 has been joined with the record in table 2 with the same nation_key if num_rows > 1: lt = IndexedTuple.build(t, field_names) assert lt['s_nationkey'] == lt['n_nationkey'] # Write the metrics query_plan.print_metrics()
def test_join_empty(): """Executes a join where no records are returned. We tst this as it's somewhat peculiar with s3 select, in so much as s3 does not return column names when selecting data, meaning, unlike a traditional DBMS, no field names tuple should be present in the results. :return: None """ query_plan = QueryPlan() # Query plan supplier_scan = query_plan.add_operator( SQLTableScan('supplier.csv', 'select * from S3Object limit 0;', False, 'supplier_scan', query_plan, False)) supplier_project = query_plan.add_operator( Project([ProjectExpression(lambda t_: t_['_3'], 's_nationkey')], 'supplier_project', query_plan, False)) nation_scan = query_plan.add_operator( SQLTableScan('nation.csv', 'select * from S3Object limit 0;', False, 'nation_scan', query_plan, False)) nation_project = query_plan.add_operator( Project([ProjectExpression(lambda t_: t_['_0'], 'n_nationkey')], 'nation_project', query_plan, False)) supplier_nation_join = query_plan.add_operator( HashJoin(JoinExpression('s_nationkey', 'n_nationkey'), 'supplier_nation_join', query_plan, False)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) supplier_scan.connect(supplier_project) nation_scan.connect(nation_project) supplier_nation_join.connect_left_producer(supplier_project) supplier_nation_join.connect_right_producer(nation_project) supplier_nation_join.connect(collate) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() # Assert the results # num_rows = 0 # for t in collate.tuples(): # num_rows += 1 # print("{}:{}".format(num_rows, t)) assert len(collate.tuples()) == 0 # Write the metrics query_plan.print_metrics()
def test_scan_empty(): """Executes a scan where no records are returned. We tst this as it's somewhat peculiar with s3 select, in so much as s3 does not return column names when selecting data, meaning, unlike a traditional DBMS, no field names tuple should be present in the results. :return: None """ query_plan = QueryPlan() # Query plan ts = query_plan.add_operator( SQLTableScan('nation.csv', "select * from s3object limit 0", False, 'ts', query_plan, False)) c = query_plan.add_operator(Collate('c', query_plan, False)) ts.connect(c) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() # Assert the results # num_rows = 0 # for t in c.tuples(): # num_rows += 1 # print("{}:{}".format(num_rows, t)) assert len(c.tuples()) == 0 # Write the metrics query_plan.print_metrics()
def sql_scan_lineitem_select_partkey_quantity_extendedprice_discount_shipinstruct_shipmode_where_filtered_op( sharded, shard, num_shards, use_pandas, secure, use_native, name, query_plan, sf, format_): return SQLTableScan( get_file_key('lineitem', sharded, shard, sf, format_), "select " " l_partkey, " " l_quantity, " " l_extendedprice, " " l_discount, " " l_shipinstruct, " " l_shipmode " "from " " S3Object " "where " " ( " " ( " " cast(l_quantity as integer) >= 3 and cast(l_quantity as integer) <= 3 + 10 " " and l_shipmode in ('AIR', 'AIR REG') " " and l_shipinstruct = 'DELIVER IN PERSON' " " ) " " or " " ( " " cast(l_quantity as integer) >= 16 and cast(l_quantity as integer) <= 16 + 10 " " and l_shipmode in ('AIR', 'AIR REG') " " and l_shipinstruct = 'DELIVER IN PERSON' " " ) " " or " " ( " " cast(l_quantity as integer) >= 24 and cast(l_quantity as integer) <= 24 + 10 " " and l_shipmode in ('AIR', 'AIR REG') " " and l_shipinstruct = 'DELIVER IN PERSON' " " ) " " ) {}".format(get_sql_suffix('lineitem', num_shards, shard, sharded)), format_, use_pandas, secure, use_native, name, query_plan, False)
def sql_scan_part_partkey_brand_size_container_where_extra_filtered_op(sharded, shard, num_shards, use_pandas, secure, use_native, name, query_plan, sf): return SQLTableScan(get_file_key('part', sharded, shard, sf), "select " " p_partkey, " " p_brand, " " p_size, " " p_container " "from " " S3Object " "where " " (" " p_partkey = '103853' or " " p_partkey = '104277' or " " p_partkey = '104744' " " ) and " " ( " " ( " " p_brand = 'Brand#11' " " and p_container in (" " 'SM CASE', " " 'SM BOX', " " 'SM PACK', " " 'SM PKG'" " ) " " and cast(p_size as integer) between 1 and 5 " " ) " " or " " ( " " p_brand = 'Brand#44' " " and p_container in (" " 'MED BAG', " " 'MED BOX', " " 'MED PKG', " " 'MED PACK'" " ) " " and cast(p_size as integer) between 1 and 10 " " ) " " or " " ( " " p_brand = 'Brand#53' " " and p_container in (" " 'LG CASE', " " 'LG BOX', " " 'LG PACK', " " 'LG PKG'" " ) " " and cast(p_size as integer) between 1 and 15 " " ) " " ) {}" .format(get_sql_suffix('part', num_shards, shard, sharded)), use_pandas, secure, use_native, name, query_plan, False)
def test_pandas_project_simple(): """Tests a projection :return: None """ query_plan = QueryPlan() # Query plan ts = query_plan.add_operator( SQLTableScan('nation.csv', 'select * from S3Object ' 'limit 3;', True, 'ts', query_plan, False)) p = query_plan.add_operator( Project([ ProjectExpression(lambda t_: t_['_2'], 'n_regionkey'), ProjectExpression(lambda t_: t_['_0'], 'n_nationkey'), ProjectExpression(lambda t_: t_['_3'], 'n_comment') ], 'p', query_plan, False)) c = query_plan.add_operator(Collate('c', query_plan, False)) ts.connect(p) p.connect(c) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() # Assert the results # num_rows = 0 # for t in c.tuples(): # num_rows += 1 # print("{}:{}".format(num_rows, t)) field_names = ['n_regionkey', 'n_nationkey', 'n_comment'] assert len(c.tuples()) == 3 + 1 assert c.tuples()[0] == field_names assert c.tuples()[1] == [ '0', '0', ' haggle. carefully final deposits detect slyly agai' ] assert c.tuples()[2] == [ '1', '1', 'al foxes promise slyly according to the regular accounts. bold requests alon' ] assert c.tuples()[3] == [ '1', '2', 'y alongside of the pending deposits. carefully special packages ' 'are about the ironic forges. slyly special ' ] # Write the metrics query_plan.print_metrics()
def sql_filtered_scan_lineitem_operator_def(max_shipped_date, sharded, shard, sf, use_pandas, secure, use_native, name, query_plan, format_): return SQLTableScan(get_file_key('lineitem', sharded, shard, sf), " select l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate " " from S3Object " " where cast(l_shipdate as timestamp) <= cast(\'{}\' as timestamp) " ";".format(max_shipped_date.strftime('%Y-%m-%d')), format_, use_pandas, secure, use_native, name, query_plan, False)
def test_group_count(): """Tests a group by query with a count aggregate :return: None """ num_rows = 0 query_plan = QueryPlan() # Query plan # select s_nationkey, count(s_suppkey) from supplier.csv group by s_nationkey ts = query_plan.add_operator( SQLTableScan('supplier.csv', 'select * from S3Object;', False, 'ts', query_plan, False)) g = query_plan.add_operator( Group( ['_3'], [ AggregateExpression(AggregateExpression.COUNT, lambda t_: t_['_0']) # count(s_suppkey) ], 'g', query_plan, False)) c = query_plan.add_operator(Collate('c', query_plan, False)) query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) ts.connect(g) g.connect(c) # Start the query query_plan.execute() # Assert the results for _ in c.tuples(): num_rows += 1 # print("{}:{}".format(num_rows, t)) field_names = ['_0', '_1'] assert c.tuples()[0] == field_names assert len(c.tuples()) == 25 + 1 nation_24 = filter( lambda t: IndexedTuple.build(t, field_names)['_0'] == '24', c.tuples())[0] assert nation_24[1] == 393 assert num_rows == 25 + 1 # Write the metrics query_plan.print_metrics()
def start(self): """Executes the query and sends the tuples to consumers. :return: None """ # NOTE: There should only ever be one bloom filter if self.__bloom_filters[0].fp_rate() < 1.0: bloom_filter_sql_predicates = [] for bf in self.__bloom_filters: bloom_filter_sql_predicate = bf.build_bit_array_string_sql_predicate( self.__bloom_filter_field_name) bloom_filter_sql_predicates.append(bloom_filter_sql_predicate) # Don't need this anymore del self.__bloom_filters sql_suffix = self.__build_sql_suffix(self.s3sql, bloom_filter_sql_predicates) self.s3sql = self.s3sql + sql_suffix else: print( "{}('{}') | Bloom filter fp rate ({}) is 1.0, not using bloom filter" .format(self.__class__.__name__, self.name, self.__bloom_filters[0].fp_rate())) if self.log_enabled: print("{}('{}') | sql length: {}".format(self.__class__.__name__, self.name, len(self.s3sql))) if self.use_pandas: cur = SQLTableScan.execute_pandas_query(self) else: cur = SQLTableScan.execute_py_query(self) self.op_metrics.bytes_scanned = cur.bytes_scanned self.op_metrics.bytes_processed = cur.bytes_processed self.op_metrics.bytes_returned = cur.bytes_returned self.op_metrics.time_to_first_record_response = cur.time_to_first_record_response self.op_metrics.time_to_last_record_response = cur.time_to_last_record_response if not self.is_completed(): self.complete()
def sql_scan_lineitem_operator_def(sharded, shard, parts, use_pandas, secure, use_native, name, query_plan, sf, format_): return SQLTableScan( get_file_key('lineitem', sharded, shard, sf, format_), "select " " * " "from " " S3Object " " {} ".format( get_sql_suffix('lineitem', parts, shard, sharded, add_where=True)), format_, use_pandas, secure, use_native, name, query_plan, False)
def test_aggregate_count(): """Tests a group by query with a count aggregate :return: None """ query_plan = QueryPlan() # Query plan # select count(*) from supplier.csv ts = query_plan.add_operator( SQLTableScan('supplier.csv', 'select * from S3Object;', False, 'ts', query_plan, False)) a = query_plan.add_operator( Aggregate( [ AggregateExpression(AggregateExpression.COUNT, lambda t_: t_['_0']) # count(s_suppkey) ], 'a', query_plan, False)) c = query_plan.add_operator(Collate('c', query_plan, False)) ts.connect(a) a.connect(c) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() # Assert the results # num_rows = 0 # for t in c.tuples(): # num_rows += 1 # print("{}:{}".format(num_rows, t)) c.print_tuples() field_names = ['_0'] tuples = c.tuples() assert tuples[0] == field_names assert IndexedTuple.build(tuples[1], field_names)['_0'] == 10000 assert len(tuples) == 1 + 1 # Write the metrics query_plan.print_metrics()
def sql_scan_part_select_all_op(sharded, shard, num_shards, use_pandas, secure, use_native, name, query_plan, sf, format_): return SQLTableScan( get_file_key('part', sharded, shard, sf, format_), "select " " * " "from " " S3Object " " {} ".format( get_sql_suffix('part', num_shards, shard, sharded, add_where=True)), format_, use_pandas, secure, use_native, name, query_plan, False)
def sql_scan_lineitem_where_shipdate_sharded_operator_def( min_shipped_date, max_shipped_date, shard, sharded, use_pandas, secure, use_native, name, query_plan, sf, format_): return SQLTableScan( get_file_key('lineitem', sharded, shard, sf), "select * from S3Object " "where " " l_shipdate >= \'{}\' and " " l_shipdate < \'{}\' " ";".format(min_shipped_date.strftime('%Y-%m-%d'), max_shipped_date.strftime('%Y-%m-%d')), format_, use_pandas, secure, use_native, name, query_plan, False)
def test_sort_desc(): """Executes a sorted query. The results are collated. :return: None """ query_plan = QueryPlan() # Query plan ts = query_plan.add_operator( SQLTableScan('supplier.csv', 'select * from S3Object ' 'limit 3;', False, 'ts', query_plan, False)) s = query_plan.add_operator( Sort([SortExpression('_5', float, 'DESC')], 's', query_plan, False)) c = query_plan.add_operator(Collate('c', query_plan, False)) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) ts.connect(s) s.connect(c) # Start the query query_plan.execute() # Assert the results # num_rows = 0 # for t in c.tuples(): # num_rows += 1 # print("{}:{}".format(num_rows, t)) assert len(c.tuples()) == 3 + 1 assert c.tuples()[0] == ['_0', '_1', '_2', '_3', '_4', '_5', '_6'] assert c.tuples()[1] == [ '1', 'Supplier#000000001', ' N kD4on9OM Ipw3,gf0JBoQDd7tgrzrddZ', '17', '27-918-335-1736', '5755.94', 'each slyly above the careful' ] assert c.tuples()[2] == [ '3', 'Supplier#000000003', 'q1,G3Pj6OjIuUYfUoH18BFTKP5aU9bEV3', '1', '11-383-516-1199', '4192.40', 'blithely silent requests after the express dependencies are sl' ] assert c.tuples()[3] == [ '2', 'Supplier#000000002', '89eJ5ksX3ImxJQBvxObC,', '5', '15-679-861-2259', '4032.68', ' slyly bold instructions. idle dependen' ] # Write the metrics query_plan.print_metrics()
def test_aggregate_empty(): """Executes an aggregate query with no results returned. We tst this as it's somewhat peculiar with s3 select, in so much as s3 does not return column names when selecting data, though being an aggregate query we can generate the tuple field names based on the expressions supplied. TODO: Unsure whether the aggregate operator should return field names. It makes sense in one way, but is different to how all the other operators behave. :return: None """ query_plan = QueryPlan() # Query plan # select sum(float(s_acctbal)) from supplier.csv limit 0 ts = query_plan.add_operator( SQLTableScan('supplier.csv', 'select * from S3Object limit 0;', False, 'ts', query_plan, False)) a = query_plan.add_operator( Aggregate([ AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['_5'])) ], 'a', query_plan, False)) c = query_plan.add_operator(Collate('c', query_plan, False)) ts.connect(a) a.connect(c) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() # Assert the results # num_rows = 0 # for t in c.tuples(): # num_rows += 1 # print("{}:{}".format(num_rows, t)) field_names = ['_0'] assert c.tuples()[0] == field_names assert len(c.tuples()) == 0 + 1 # Write the metrics query_plan.print_metrics()
def sql_scan_part_partkey_type_part_where_brand12_partitioned_operator_def( sharded, part, parts, sf, use_pandas, secure, use_native, name, query_plan, format_): # key_lower = math.ceil((200000.0 / float(parts)) * part) # key_upper = math.ceil((200000.0 / float(parts)) * (part + 1)) return SQLTableScan( get_file_key('part', sharded, part, sf, format_), "select " " p_partkey, p_type " "from " " S3Object " " {} ".format( get_sql_suffix('part', parts, part, sharded, add_where=True)), format_, use_pandas, secure, use_native, name, query_plan, False)
def run(parallel, use_pandas, buffer_size, table_parts, perc, path, format_): secure = False use_native = False print('') print("Indexing Benchmark") print("------------------") # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Scan Index Files upper = perc * 100 scan = map( lambda p: query_plan.add_operator( SQLTableScan( '{}/data_{}.csv'.format(path, p), "select * from S3Object " " where cast(F0 as float) < {};".format(upper), format_, use_pandas, secure, use_native, 'scan_{}'.format( p), query_plan, False)), range(0, table_parts)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda (p, o): o.connect(collate), enumerate(scan)) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') tuples = collate.tuples() # collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop()
def sql_scan_part_select_all_where_partkey_op(sharded, shard, num_shards, use_pandas, secure, use_native, name, query_plan, sf): return SQLTableScan( get_file_key('part', sharded, shard, sf), "select " " * " "from " " S3Object " "where " " (" " p_partkey = '103853' or " " p_partkey = '104277' or " " p_partkey = '104744'" " ) {}".format(get_sql_suffix('part', num_shards, shard, sharded)), use_pandas, secure, use_native, name, query_plan, False)
def test_aggregate_sum(): """Tests a group by query with a sum aggregate :return: None """ query_plan = QueryPlan() # Query plan # select sum(float(s_acctbal)) from supplier.csv ts = query_plan.add_operator( SQLTableScan('supplier.csv', 'select * from S3Object;', False, 'ts', query_plan, False)) a = query_plan.add_operator( Aggregate([ AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['_5'])) ], 'a', query_plan, False)) c = query_plan.add_operator(Collate('c', query_plan, False)) ts.connect(a) a.connect(c) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() # Assert the results # num_rows = 0 # for t in c.tuples(): # num_rows += 1 # print("{}:{}".format(num_rows, t)) field_names = ['_0'] assert c.tuples()[0] == field_names assert round(IndexedTuple.build(c.tuples()[1], field_names)['_0'], 2) == 45103548.65 assert len(c.tuples()) == 1 + 1 # Write the metrics query_plan.print_metrics()
def test_group_sum(): """Tests a group by query with a sum aggregate :return: None """ query_plan = QueryPlan() # Query plan # select s_nationkey, sum(float(s_acctbal)) from supplier.csv group by s_nationkey ts = query_plan.add_operator( SQLTableScan('supplier.csv', 'select * from S3Object;', False, 'ts', query_plan, False)) g = query_plan.add_operator( Group(['_3'], [ AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['_5'])) ], 'g', query_plan, False)) c = query_plan.add_operator(Collate('c', query_plan, False)) ts.connect(g) g.connect(c) # Start the query query_plan.execute() # Assert the results # num_rows = 0 # for t in c.tuples(): # num_rows += 1 # print("{}:{}".format(num_rows, t_)) field_names = ['_0', '_1'] assert c.tuples()[0] == field_names assert len(c.tuples()) == 25 + 1 nation_24 = filter( lambda t_: IndexedTuple.build(t_, field_names)['_0'] == '24', c.tuples())[0] assert round(nation_24[1], 2) == 1833872.56 # Write the metrics query_plan.print_metrics()
def test_filter_2(): # Let's forget about the local filter for now. The pd._expr field of the PredicateExpression class is not well documented and it is needed for the Filter class (on line 102). """ :return: """ query_plan = QueryPlan(buffer_size=64, is_async=True, use_shared_mem=False) # Query plan ''' ts = query_plan.add_operator( SQLTableScan('lineitem.csv', 'select * from S3Object limit 3;' , False, 'ts', query_plan, False)) ''' # using a 'use_native=True' argument will result in a None object being returned ts = query_plan.add_operator( SQLTableScan('random_strings_2.csv', 'select f1,f2 from S3Object limit 5;', Format.CSV, True, False, False, 'ts', query_plan, False)) def fn(df): df.columns = ["first_col", "second_col"] #We can change the types of the columns #df["first_col"] = df["first_col"].astype(np.int64) return df p = query_plan.add_operator(Project([], 'project', query_plan, False, fn)) c = query_plan.add_operator(Collate('c', query_plan, False)) ts.connect(p) p.connect(c) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() #assert 2 + 1 == len(c.tuples()) print("Tuples:") print(c.tuples()) # Write the metrics query_plan.print_metrics() print(ROOT_DIR)
def sql_scan_part_partkey_where_brand12_operator_def(sharded, shard, num_shards, use_pandas, secure, use_native, name, query_plan, sf, format_): # key_lower = math.ceil((200000.0 / float(num_shards)) * shard) # key_upper = math.ceil((200000.0 / float(num_shards)) * (shard + 1)) return SQLTableScan( get_file_key('part', sharded, shard, sf), "select " " p_partkey " "from " " S3Object " "where " " p_brand = 'Brand#12' and " " p_partkey >= {} and p_partkey < {} " " ".format(key_lower, key_upper), format_, use_pandas, secure, use_native, name, query_plan, False)
def test_group_empty(): """Executes a group where no records are returned. We tst this as it's somewhat peculiar with s3 select, in so much as s3 does not return column names when selecting data, meaning, unlike a traditional DBMS, no field names tuple should be present in the results. :return: None """ query_plan = QueryPlan() # Query plan # select s_nationkey, sum(float(s_acctbal)) from supplier.csv group by s_nationkey ts = query_plan.add_operator( SQLTableScan('supplier.csv', 'select * from S3Object limit 0;', False, 'ts', query_plan, False)) g = query_plan.add_operator( Group(['_3'], [ AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['_5'])) ], 'g', query_plan, False)) c = query_plan.add_operator(Collate('c', query_plan, False)) ts.connect(g) g.connect(c) # Start the query query_plan.execute() # Assert the results # num_rows = 0 # for t in c.tuples(): # num_rows += 1 # print("{}:{}".format(num_rows, t)) field_names = ['_0', '_1'] assert c.tuples()[0] == field_names assert len(c.tuples()) == 0 + 1 # Write the metrics query_plan.print_metrics()
def run(use_pandas, secure, use_native, format_): profile_file_name = os.path.join(ROOT_DIR, "../benchmark-output/" + gen_test_id() + ".prof") os.remove(profile_file_name) if os.path.exists(profile_file_name) else None print("SQL Table Scan | Settings {}".format({'use_pandas': use_pandas, 'secure': secure, 'use_native': use_native})) query_plan = QueryPlan(is_async=True, buffer_size=0) # Query plan scan = query_plan.add_operator( SQLTableScan('lineitem.csv', "select " " * " "from " " S3Object limit 100000", format_, use_pandas, secure, use_native, 'scan', query_plan, True)) # scan.set_profiled(True, profile_file_name) null = query_plan.add_operator( Null('null', query_plan, True)) scan.connect(null) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id()) # Start the query query_plan.execute() query_plan.print_metrics() query_plan.stop() # Write the profile # s = pstats.Stats(profile_file_name) # s.strip_dirs().sort_stats("time").print_stats() print("SQL Table Scan | Done")
def test_project_empty(): """Executes an projection query with no results returned. We tst this as it's somewhat peculiar with s3 select, in so much as s3 does not return column names when selecting data, meaning, unlike a traditional DBMS, no field names tuple should be present in the results. :return: None """ query_plan = QueryPlan() # Query plan ts = query_plan.add_operator( SQLTableScan('nation.csv', "select * from S3Object " "limit 0;", False, 'ts', query_plan, False)) p = query_plan.add_operator( Project([ ProjectExpression(lambda t_: t_['_2'], 'n_regionkey'), ProjectExpression(lambda t_: t_['_0'], 'n_nationkey'), ProjectExpression(lambda t_: t_['_3'], 'n_comment') ], 'p', query_plan, False)) c = query_plan.add_operator(Collate('c', query_plan, False)) ts.connect(p) p.connect(c) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() # Assert the results # num_rows = 0 # for t in c.tuples(): # num_rows += 1 # print("{}:{}".format(num_rows, t)) assert len(c.tuples()) == 0 # Write the metrics query_plan.print_metrics()
def test_fast_scan_simple(): """Executes a scan. The results are then collated. :return: None """ query_plan = QueryPlan() # Query plan ts = query_plan.add_operator( SQLTableScan('nation.csv', "select * from s3object", True, True, True, 'ts', query_plan, False)) c = query_plan.add_operator(Collate('c', query_plan, False)) ts.connect(c) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() # Write the metrics query_plan.print_metrics() assert len(c.tuples()) == 25 + 1 assert c.tuples()[0] == ['_0', '_1', '_2', '_3'] assert c.tuples()[1] == [ '0', 'ALGERIA', '0', ' haggle. carefully final deposits detect slyly agai' ] assert c.tuples()[2] == [ '1', 'ARGENTINA', '1', 'al foxes promise slyly according to the regular accounts. bold requests alon' ] assert c.tuples()[3] == [ '2', 'BRAZIL', '1', 'y alongside of the pending deposits. carefully special packages are about ' 'the ironic forges. slyly special ' ]
def sql_scan_lineitem_partkey_extendedprice_discount_where_shipdate_partitioned_operator_def( min_shipped_date, max_shipped_date, part, parts, use_pandas, secure, use_native, name, query_plan, sf, format_): # key_lower = math.ceil((6000000.0 / float(parts)) * part) # key_upper = math.ceil((6000000.0 / float(parts)) * (part + 1)) return SQLTableScan( get_file_key('lineitem', sharded, shard, sf), "select " " l_partkey, l_extendedprice, l_discount " "from " " S3Object " "where " " l_shipdate >= \'{}\' and " " l_shipdate < \'{}\' and " " l_orderkey >= {} and l_orderkey < {} " ";".format(min_shipped_date.strftime('%Y-%m-%d'), max_shipped_date.strftime('%Y-%m-%d'), key_lower, key_upper), format_, use_pandas, secure, use_native, name, query_plan, False)
def test_filter_empty(): """Executes a filter where no records are returned. We tst this as it's somewhat peculiar with s3 select, in so much as s3 does not return column names when selecting data, meaning, unlike a traditional DBMS, no field names tuple should be present in the results. :return: None """ query_plan = QueryPlan() # Query plan ts = query_plan.add_operator( SQLTableScan('lineitem.csv', 'select * from S3Object limit 0;', False, 'ts', query_plan, False)) f = query_plan.add_operator( Filter( PredicateExpression(lambda t_: cast(t_['_10'], timestamp) >= cast( '1996-03-01', timestamp)), 'f', query_plan, False)) c = query_plan.add_operator(Collate('c', query_plan, False)) ts.connect(f) f.connect(c) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() # Assert the results # num_rows = 0 # for t in c.tuples(): # num_rows += 1 # print("{}:{}".format(num_rows, t)) assert len(c.tuples()) == 0 # Write the metrics query_plan.print_metrics()
def main(): parts = 32 query_plan = QueryPlan(is_async=True, buffer_size=0) # Query plan lineitem_scan = map(lambda p: query_plan.add_operator( SQLTableScan(get_file_key('lineitem', True, p), "select * from S3Object;", Format.CSV, use_pandas=True, secure=False, use_native=False, name='scan_' + str(p), query_plan=query_plan, log_enabled=False)), range(0, parts)) collate = query_plan.add_operator( Collate('collate', query_plan, False)) map(lambda (p, o): o.connect(collate), enumerate(lineitem_scan)) query_plan.execute()
def sql_scan_lineitem_partkey_extendedprice_discount_where_shipdate_sharded_operator_def( min_shipped_date, max_shipped_date, sharded, shard, num_parts, sf, use_pandas, secure, use_native, name, query_plan, format_): return SQLTableScan( get_file_key('lineitem', sharded, shard, sf, format_), "select " " l_partkey, l_extendedprice, l_discount " "from " " S3Object " "where " " l_shipdate >= \'{}\' and " " l_shipdate < \'{}\' " " {}" ";".format( min_shipped_date.strftime('%Y-%m-%d'), max_shipped_date.strftime('%Y-%m-%d'), get_sql_suffix('lineitem', num_parts, shard, sharded, add_where=False)), format_, use_pandas, secure, use_native, name, query_plan, False)