Example #1
0
def test_join_topk():
    """Tests a top k with a join

    :return: None
    """

    limit = 5

    query_plan = QueryPlan()

    # Query plan
    ts1 = query_plan.add_operator(SQLTableScan('supplier.csv',
                                               'select * from S3Object;', False, 'ts1', query_plan, False))
    ts1_project = query_plan.add_operator(
        Project([ProjectExpression(lambda t_: t_['_3'], 's_nationkey')], 'ts1_project', query_plan, False))
    ts2 = query_plan.add_operator(SQLTableScan('nation.csv',
                                               'select * from S3Object;', False, 'ts2', query_plan, False))
    ts2_project = query_plan.add_operator(
        Project([ProjectExpression(lambda t_: t_['_0'], 'n_nationkey')], 'ts2_project', query_plan, False))
    j = query_plan.add_operator(HashJoin(JoinExpression('s_nationkey', 'n_nationkey'), 'j', query_plan, False))
    t = query_plan.add_operator(Limit(limit, 't', query_plan, False))
    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts1.connect(ts1_project)
    ts2.connect(ts2_project)
    j.connect_left_producer(ts1_project)
    j.connect_right_producer(ts2_project)
    j.connect(t)
    t.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    c.print_tuples()

    field_names = ['s_nationkey', 'n_nationkey']

    assert len(c.tuples()) == limit + 1

    assert c.tuples()[0] == field_names

    num_rows = 0
    for t in c.tuples():
        num_rows += 1
        # Assert that the nation_key in table 1 has been joined with the record in table 2 with the same nation_key
        if num_rows > 1:
            lt = IndexedTuple.build(t, field_names)
            assert lt['s_nationkey'] == lt['n_nationkey']

    # Write the metrics
    query_plan.print_metrics()
Example #2
0
def test_join_empty():
    """Executes a join where no records are returned. We tst this as it's somewhat peculiar with s3 select, in so much
    as s3 does not return column names when selecting data, meaning, unlike a traditional DBMS, no field names tuple
    should be present in the results.

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    supplier_scan = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object limit 0;', False, 'supplier_scan', query_plan, False))

    supplier_project = query_plan.add_operator(
        Project([ProjectExpression(lambda t_: t_['_3'], 's_nationkey')], 'supplier_project', query_plan, False))

    nation_scan = query_plan.add_operator(
        SQLTableScan('nation.csv', 'select * from S3Object limit 0;', False, 'nation_scan', query_plan, False))

    nation_project = query_plan.add_operator(
        Project([ProjectExpression(lambda t_: t_['_0'], 'n_nationkey')], 'nation_project', query_plan, False))

    supplier_nation_join = query_plan.add_operator(
        HashJoin(JoinExpression('s_nationkey', 'n_nationkey'), 'supplier_nation_join', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    supplier_scan.connect(supplier_project)
    nation_scan.connect(nation_project)
    supplier_nation_join.connect_left_producer(supplier_project)
    supplier_nation_join.connect_right_producer(nation_project)
    supplier_nation_join.connect(collate)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in collate.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    assert len(collate.tuples()) == 0

    # Write the metrics
    query_plan.print_metrics()
Example #3
0
def test_scan_empty():
    """Executes a scan where no records are returned. We tst this as it's somewhat peculiar with s3 select, in so much
    as s3 does not return column names when selecting data, meaning, unlike a traditional DBMS, no field names tuple
    should be present in the results.

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    ts = query_plan.add_operator(
        SQLTableScan('nation.csv', "select * from s3object limit 0", False,
                     'ts', query_plan, False))
    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    assert len(c.tuples()) == 0

    # Write the metrics
    query_plan.print_metrics()
Example #4
0
def sql_scan_lineitem_select_partkey_quantity_extendedprice_discount_shipinstruct_shipmode_where_filtered_op(
        sharded, shard, num_shards, use_pandas, secure, use_native, name,
        query_plan, sf, format_):
    return SQLTableScan(
        get_file_key('lineitem', sharded, shard, sf, format_), "select "
        "  l_partkey, "
        "  l_quantity, "
        "  l_extendedprice, "
        "  l_discount, "
        "  l_shipinstruct, "
        "  l_shipmode "
        "from "
        "  S3Object "
        "where "
        "  ( "
        "      ( "
        "          cast(l_quantity as integer) >= 3 and cast(l_quantity as integer) <= 3 + 10 "
        "          and l_shipmode in ('AIR', 'AIR REG') "
        "          and l_shipinstruct = 'DELIVER IN PERSON' "
        "      ) "
        "      or "
        "      ( "
        "          cast(l_quantity as integer) >= 16 and cast(l_quantity as integer) <= 16 + 10 "
        "          and l_shipmode in ('AIR', 'AIR REG') "
        "          and l_shipinstruct = 'DELIVER IN PERSON' "
        "      ) "
        "      or "
        "      ( "
        "          cast(l_quantity as integer) >= 24 and cast(l_quantity as integer) <= 24 + 10 "
        "          and l_shipmode in ('AIR', 'AIR REG') "
        "          and l_shipinstruct = 'DELIVER IN PERSON' "
        "      ) "
        "  ) {}".format(get_sql_suffix('lineitem', num_shards, shard,
                                       sharded)), format_, use_pandas, secure,
        use_native, name, query_plan, False)
Example #5
0
def sql_scan_part_partkey_brand_size_container_where_extra_filtered_op(sharded,
                                                                       shard,
                                                                       num_shards,
                                                                       use_pandas, secure, use_native,
                                                                       name,
                                                                       query_plan, sf):
    return SQLTableScan(get_file_key('part', sharded, shard, sf),
                        "select "
                        "  p_partkey, "
                        "  p_brand, "
                        "  p_size, "
                        "  p_container "
                        "from "
                        "  S3Object "
                        "where "
                        "  ("
                        "    p_partkey = '103853' or "
                        "    p_partkey = '104277' or "
                        "    p_partkey = '104744' "
                        "  ) and "
                        "  ( "
                        "      ( "
                        "          p_brand = 'Brand#11' "
                        "          and p_container in ("
                        "              'SM CASE', "
                        "              'SM BOX', "
                        "              'SM PACK', "
                        "              'SM PKG'"
                        "          ) "
                        "          and cast(p_size as integer) between 1 and 5 "
                        "      ) "
                        "      or "
                        "      ( "
                        "          p_brand = 'Brand#44' "
                        "          and p_container in ("
                        "              'MED BAG', "
                        "              'MED BOX', "
                        "              'MED PKG', "
                        "              'MED PACK'"
                        "          ) "
                        "          and cast(p_size as integer) between 1 and 10 "
                        "      ) "
                        "      or "
                        "      ( "
                        "          p_brand = 'Brand#53' "
                        "          and p_container in ("
                        "              'LG CASE', "
                        "              'LG BOX', "
                        "              'LG PACK', "
                        "              'LG PKG'"
                        "          ) "
                        "          and cast(p_size as integer) between 1 and 15 "
                        "      ) "
                        "  ) {}"
                        .format(get_sql_suffix('part', num_shards, shard, sharded)),
                        use_pandas, secure, use_native,
                        name,
                        query_plan,
                        False)
Example #6
0
def test_pandas_project_simple():
    """Tests a projection

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    ts = query_plan.add_operator(
        SQLTableScan('nation.csv', 'select * from S3Object '
                     'limit 3;', True, 'ts', query_plan, False))

    p = query_plan.add_operator(
        Project([
            ProjectExpression(lambda t_: t_['_2'], 'n_regionkey'),
            ProjectExpression(lambda t_: t_['_0'], 'n_nationkey'),
            ProjectExpression(lambda t_: t_['_3'], 'n_comment')
        ], 'p', query_plan, False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(p)
    p.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    field_names = ['n_regionkey', 'n_nationkey', 'n_comment']

    assert len(c.tuples()) == 3 + 1

    assert c.tuples()[0] == field_names

    assert c.tuples()[1] == [
        '0', '0', ' haggle. carefully final deposits detect slyly agai'
    ]
    assert c.tuples()[2] == [
        '1', '1',
        'al foxes promise slyly according to the regular accounts. bold requests alon'
    ]
    assert c.tuples()[3] == [
        '1', '2',
        'y alongside of the pending deposits. carefully special packages '
        'are about the ironic forges. slyly special '
    ]

    # Write the metrics
    query_plan.print_metrics()
Example #7
0
def sql_filtered_scan_lineitem_operator_def(max_shipped_date, sharded, shard, sf, use_pandas, secure, use_native, name, query_plan, format_):
    return SQLTableScan(get_file_key('lineitem', sharded, shard, sf),
                        " select l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate "
                        " from S3Object "
                        " where cast(l_shipdate as timestamp) <= cast(\'{}\' as timestamp) "
                        ";".format(max_shipped_date.strftime('%Y-%m-%d')), format_,
                        use_pandas, secure, use_native, name, query_plan,
                        False)
Example #8
0
def test_group_count():
    """Tests a group by query with a count aggregate

    :return: None
    """

    num_rows = 0

    query_plan = QueryPlan()

    # Query plan
    # select s_nationkey, count(s_suppkey) from supplier.csv group by s_nationkey
    ts = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object;', False, 'ts',
                     query_plan, False))

    g = query_plan.add_operator(
        Group(
            ['_3'],
            [
                AggregateExpression(AggregateExpression.COUNT,
                                    lambda t_: t_['_0'])
                # count(s_suppkey)
            ],
            'g',
            query_plan,
            False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    ts.connect(g)
    g.connect(c)

    # Start the query
    query_plan.execute()

    # Assert the results
    for _ in c.tuples():
        num_rows += 1
        # print("{}:{}".format(num_rows, t))

    field_names = ['_0', '_1']

    assert c.tuples()[0] == field_names

    assert len(c.tuples()) == 25 + 1

    nation_24 = filter(
        lambda t: IndexedTuple.build(t, field_names)['_0'] == '24',
        c.tuples())[0]
    assert nation_24[1] == 393
    assert num_rows == 25 + 1

    # Write the metrics
    query_plan.print_metrics()
Example #9
0
    def start(self):
        """Executes the query and sends the tuples to consumers.

        :return: None
        """

        # NOTE: There should only ever be one bloom filter
        if self.__bloom_filters[0].fp_rate() < 1.0:
            bloom_filter_sql_predicates = []
            for bf in self.__bloom_filters:
                bloom_filter_sql_predicate = bf.build_bit_array_string_sql_predicate(
                    self.__bloom_filter_field_name)
                bloom_filter_sql_predicates.append(bloom_filter_sql_predicate)

            # Don't need this anymore
            del self.__bloom_filters

            sql_suffix = self.__build_sql_suffix(self.s3sql,
                                                 bloom_filter_sql_predicates)
            self.s3sql = self.s3sql + sql_suffix
        else:
            print(
                "{}('{}') | Bloom filter fp rate ({}) is 1.0, not using bloom filter"
                .format(self.__class__.__name__, self.name,
                        self.__bloom_filters[0].fp_rate()))

        if self.log_enabled:
            print("{}('{}') | sql length: {}".format(self.__class__.__name__,
                                                     self.name,
                                                     len(self.s3sql)))

        if self.use_pandas:
            cur = SQLTableScan.execute_pandas_query(self)
        else:
            cur = SQLTableScan.execute_py_query(self)

        self.op_metrics.bytes_scanned = cur.bytes_scanned
        self.op_metrics.bytes_processed = cur.bytes_processed
        self.op_metrics.bytes_returned = cur.bytes_returned

        self.op_metrics.time_to_first_record_response = cur.time_to_first_record_response
        self.op_metrics.time_to_last_record_response = cur.time_to_last_record_response

        if not self.is_completed():
            self.complete()
Example #10
0
def sql_scan_lineitem_operator_def(sharded, shard, parts, use_pandas, secure,
                                   use_native, name, query_plan, sf, format_):
    return SQLTableScan(
        get_file_key('lineitem', sharded, shard, sf, format_), "select "
        "  * "
        "from "
        "  S3Object "
        "  {} ".format(
            get_sql_suffix('lineitem', parts, shard, sharded, add_where=True)),
        format_, use_pandas, secure, use_native, name, query_plan, False)
Example #11
0
def test_aggregate_count():
    """Tests a group by query with a count aggregate

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    # select count(*) from supplier.csv
    ts = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object;', False, 'ts',
                     query_plan, False))

    a = query_plan.add_operator(
        Aggregate(
            [
                AggregateExpression(AggregateExpression.COUNT,
                                    lambda t_: t_['_0'])
                # count(s_suppkey)
            ],
            'a',
            query_plan,
            False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(a)
    a.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    c.print_tuples()

    field_names = ['_0']

    tuples = c.tuples()

    assert tuples[0] == field_names
    assert IndexedTuple.build(tuples[1], field_names)['_0'] == 10000
    assert len(tuples) == 1 + 1

    # Write the metrics
    query_plan.print_metrics()
Example #12
0
def sql_scan_part_select_all_op(sharded, shard, num_shards, use_pandas, secure,
                                use_native, name, query_plan, sf, format_):
    return SQLTableScan(
        get_file_key('part', sharded, shard, sf, format_), "select "
        "  * "
        "from "
        "  S3Object "
        "  {} ".format(
            get_sql_suffix('part', num_shards, shard, sharded,
                           add_where=True)), format_, use_pandas, secure,
        use_native, name, query_plan, False)
Example #13
0
def sql_scan_lineitem_where_shipdate_sharded_operator_def(
        min_shipped_date, max_shipped_date, shard, sharded, use_pandas, secure,
        use_native, name, query_plan, sf, format_):
    return SQLTableScan(
        get_file_key('lineitem', sharded, shard, sf), "select * from S3Object "
        "where "
        "  l_shipdate  >= \'{}\'  and "
        "  l_shipdate  < \'{}\'  "
        ";".format(min_shipped_date.strftime('%Y-%m-%d'),
                   max_shipped_date.strftime('%Y-%m-%d')), format_, use_pandas,
        secure, use_native, name, query_plan, False)
Example #14
0
def test_sort_desc():
    """Executes a sorted query. The results are collated.

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    ts = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object '
                     'limit 3;', False, 'ts', query_plan, False))

    s = query_plan.add_operator(
        Sort([SortExpression('_5', float, 'DESC')], 's', query_plan, False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    ts.connect(s)
    s.connect(c)

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    assert len(c.tuples()) == 3 + 1

    assert c.tuples()[0] == ['_0', '_1', '_2', '_3', '_4', '_5', '_6']

    assert c.tuples()[1] == [
        '1', 'Supplier#000000001', ' N kD4on9OM Ipw3,gf0JBoQDd7tgrzrddZ', '17',
        '27-918-335-1736', '5755.94', 'each slyly above the careful'
    ]
    assert c.tuples()[2] == [
        '3', 'Supplier#000000003', 'q1,G3Pj6OjIuUYfUoH18BFTKP5aU9bEV3', '1',
        '11-383-516-1199', '4192.40',
        'blithely silent requests after the express dependencies are sl'
    ]
    assert c.tuples()[3] == [
        '2', 'Supplier#000000002', '89eJ5ksX3ImxJQBvxObC,', '5',
        '15-679-861-2259', '4032.68', ' slyly bold instructions. idle dependen'
    ]

    # Write the metrics
    query_plan.print_metrics()
Example #15
0
def test_aggregate_empty():
    """Executes an aggregate query with no results returned. We tst this as it's somewhat peculiar with s3 select,
    in so much as s3 does not return column names when selecting data, though being an aggregate query we can generate
    the tuple field names based on the expressions supplied.

    TODO: Unsure whether the aggregate operator should return field names. It makes sense in one way, but is different
    to how all the other operators behave.

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    # select sum(float(s_acctbal)) from supplier.csv limit 0
    ts = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object limit 0;', False,
                     'ts', query_plan, False))

    a = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t_: float(t_['_5']))
        ], 'a', query_plan, False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(a)
    a.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    field_names = ['_0']

    assert c.tuples()[0] == field_names
    assert len(c.tuples()) == 0 + 1

    # Write the metrics
    query_plan.print_metrics()
Example #16
0
def sql_scan_part_partkey_type_part_where_brand12_partitioned_operator_def(
        sharded, part, parts, sf, use_pandas, secure, use_native, name,
        query_plan, format_):
    # key_lower = math.ceil((200000.0 / float(parts)) * part)
    # key_upper = math.ceil((200000.0 / float(parts)) * (part + 1))

    return SQLTableScan(
        get_file_key('part', sharded, part, sf, format_), "select "
        "  p_partkey, p_type "
        "from "
        "  S3Object "
        "  {} ".format(
            get_sql_suffix('part', parts, part, sharded, add_where=True)),
        format_, use_pandas, secure, use_native, name, query_plan, False)
Example #17
0
def run(parallel, use_pandas, buffer_size, table_parts, perc, path, format_):
    secure = False
    use_native = False
    print('')
    print("Indexing Benchmark")
    print("------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Scan Index Files
    upper = perc * 100
    scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                '{}/data_{}.csv'.format(path, p), "select * from S3Object "
                " where cast(F0 as float) < {};".format(upper), format_,
                use_pandas, secure, use_native, 'scan_{}'.format(
                    p), query_plan, False)), range(0, table_parts))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(collate), enumerate(scan))

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    # collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
Example #18
0
def sql_scan_part_select_all_where_partkey_op(sharded, shard, num_shards,
                                              use_pandas, secure, use_native,
                                              name, query_plan, sf):
    return SQLTableScan(
        get_file_key('part', sharded, shard, sf), "select "
        "  * "
        "from "
        "  S3Object "
        "where "
        "  ("
        "    p_partkey = '103853' or "
        "    p_partkey = '104277' or "
        "    p_partkey = '104744'"
        "  ) {}".format(get_sql_suffix('part', num_shards, shard, sharded)),
        use_pandas, secure, use_native, name, query_plan, False)
Example #19
0
def test_aggregate_sum():
    """Tests a group by query with a sum aggregate

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    # select sum(float(s_acctbal)) from supplier.csv
    ts = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object;', False, 'ts',
                     query_plan, False))

    a = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t_: float(t_['_5']))
        ], 'a', query_plan, False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(a)
    a.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    field_names = ['_0']

    assert c.tuples()[0] == field_names
    assert round(IndexedTuple.build(c.tuples()[1], field_names)['_0'],
                 2) == 45103548.65
    assert len(c.tuples()) == 1 + 1

    # Write the metrics
    query_plan.print_metrics()
Example #20
0
def test_group_sum():
    """Tests a group by query with a sum aggregate

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    # select s_nationkey, sum(float(s_acctbal)) from supplier.csv group by s_nationkey
    ts = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object;', False, 'ts',
                     query_plan, False))

    g = query_plan.add_operator(
        Group(['_3'], [
            AggregateExpression(AggregateExpression.SUM,
                                lambda t_: float(t_['_5']))
        ], 'g', query_plan, False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(g)
    g.connect(c)

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t_))

    field_names = ['_0', '_1']

    assert c.tuples()[0] == field_names

    assert len(c.tuples()) == 25 + 1

    nation_24 = filter(
        lambda t_: IndexedTuple.build(t_, field_names)['_0'] == '24',
        c.tuples())[0]
    assert round(nation_24[1], 2) == 1833872.56

    # Write the metrics
    query_plan.print_metrics()
Example #21
0
def test_filter_2():
    # Let's forget about the local filter for now. The pd._expr field of the PredicateExpression class is not well documented and it is needed for the Filter class (on line 102).
    """

    :return:
    """

    query_plan = QueryPlan(buffer_size=64, is_async=True, use_shared_mem=False)

    # Query plan
    '''
    ts = query_plan.add_operator(
        SQLTableScan('lineitem.csv', 'select * from S3Object limit 3;' , False, 'ts', query_plan, False))
    '''
    # using a 'use_native=True' argument will result in a None object being returned
    ts = query_plan.add_operator(
        SQLTableScan('random_strings_2.csv',
                     'select f1,f2  from S3Object limit 5;', Format.CSV, True,
                     False, False, 'ts', query_plan, False))

    def fn(df):
        df.columns = ["first_col", "second_col"]
        #We can change the types of the columns
        #df["first_col"] = df["first_col"].astype(np.int64)
        return df

    p = query_plan.add_operator(Project([], 'project', query_plan, False, fn))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(p)
    p.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    #assert 2 + 1 == len(c.tuples())
    print("Tuples:")
    print(c.tuples())
    # Write the metrics
    query_plan.print_metrics()
    print(ROOT_DIR)
Example #22
0
def sql_scan_part_partkey_where_brand12_operator_def(sharded, shard,
                                                     num_shards, use_pandas,
                                                     secure, use_native, name,
                                                     query_plan, sf, format_):
    # key_lower = math.ceil((200000.0 / float(num_shards)) * shard)
    # key_upper = math.ceil((200000.0 / float(num_shards)) * (shard + 1))

    return SQLTableScan(
        get_file_key('part', sharded, shard, sf), "select "
        "  p_partkey "
        "from "
        "  S3Object "
        "where "
        "  p_brand = 'Brand#12' and "
        "  p_partkey >= {} and p_partkey  < {} "
        " ".format(key_lower, key_upper), format_, use_pandas, secure,
        use_native, name, query_plan, False)
Example #23
0
def test_group_empty():
    """Executes a group where no records are returned. We tst this as it's somewhat peculiar with s3 select, in so much
    as s3 does not return column names when selecting data, meaning, unlike a traditional DBMS, no field names tuple
    should be present in the results.

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    # select s_nationkey, sum(float(s_acctbal)) from supplier.csv group by s_nationkey
    ts = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object limit 0;', False,
                     'ts', query_plan, False))

    g = query_plan.add_operator(
        Group(['_3'], [
            AggregateExpression(AggregateExpression.SUM,
                                lambda t_: float(t_['_5']))
        ], 'g', query_plan, False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(g)
    g.connect(c)

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    field_names = ['_0', '_1']

    assert c.tuples()[0] == field_names

    assert len(c.tuples()) == 0 + 1

    # Write the metrics
    query_plan.print_metrics()
Example #24
0
def run(use_pandas, secure, use_native, format_):
    profile_file_name = os.path.join(ROOT_DIR, "../benchmark-output/" + gen_test_id() + ".prof")
    os.remove(profile_file_name) if os.path.exists(profile_file_name) else None

    print("SQL Table Scan | Settings {}".format({'use_pandas': use_pandas, 'secure': secure, 'use_native': use_native}))

    query_plan = QueryPlan(is_async=True, buffer_size=0)

    # Query plan
    scan = query_plan.add_operator(
        SQLTableScan('lineitem.csv',
                     "select "
                     "  * "
                     "from "
                     "  S3Object limit 100000", format_,
                     use_pandas,
                     secure,
                     use_native,
                     'scan',
                     query_plan,
                     True))

    # scan.set_profiled(True, profile_file_name)

    null = query_plan.add_operator(
        Null('null', query_plan, True))

    scan.connect(null)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    query_plan.print_metrics()

    query_plan.stop()

    # Write the profile
    # s = pstats.Stats(profile_file_name)
    # s.strip_dirs().sort_stats("time").print_stats()

    print("SQL Table Scan | Done")
Example #25
0
def test_project_empty():
    """Executes an projection query with no results returned. We tst this as it's somewhat peculiar with s3 select,
     in so much as s3 does not return column names when selecting data, meaning, unlike a traditional DBMS,
     no field names tuple should be present in the results.

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    ts = query_plan.add_operator(
        SQLTableScan('nation.csv', "select * from S3Object "
                     "limit 0;", False, 'ts', query_plan, False))

    p = query_plan.add_operator(
        Project([
            ProjectExpression(lambda t_: t_['_2'], 'n_regionkey'),
            ProjectExpression(lambda t_: t_['_0'], 'n_nationkey'),
            ProjectExpression(lambda t_: t_['_3'], 'n_comment')
        ], 'p', query_plan, False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(p)
    p.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    assert len(c.tuples()) == 0

    # Write the metrics
    query_plan.print_metrics()
Example #26
0
def test_fast_scan_simple():
    """Executes a scan. The results are then collated.

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    ts = query_plan.add_operator(
        SQLTableScan('nation.csv', "select * from s3object", True, True, True,
                     'ts', query_plan, False))
    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    # Write the metrics
    query_plan.print_metrics()

    assert len(c.tuples()) == 25 + 1

    assert c.tuples()[0] == ['_0', '_1', '_2', '_3']

    assert c.tuples()[1] == [
        '0', 'ALGERIA', '0',
        ' haggle. carefully final deposits detect slyly agai'
    ]
    assert c.tuples()[2] == [
        '1', 'ARGENTINA', '1',
        'al foxes promise slyly according to the regular accounts. bold requests alon'
    ]
    assert c.tuples()[3] == [
        '2', 'BRAZIL', '1',
        'y alongside of the pending deposits. carefully special packages are about '
        'the ironic forges. slyly special '
    ]
Example #27
0
def sql_scan_lineitem_partkey_extendedprice_discount_where_shipdate_partitioned_operator_def(
        min_shipped_date, max_shipped_date, part, parts, use_pandas, secure,
        use_native, name, query_plan, sf, format_):
    # key_lower = math.ceil((6000000.0 / float(parts)) * part)
    # key_upper = math.ceil((6000000.0 / float(parts)) * (part + 1))

    return SQLTableScan(
        get_file_key('lineitem', sharded, shard, sf), "select "
        "  l_partkey, l_extendedprice, l_discount "
        "from "
        "  S3Object "
        "where "
        "  l_shipdate  >= \'{}\'  and "
        "  l_shipdate  < \'{}\'  and "
        "  l_orderkey  >= {} and l_orderkey  < {} "
        ";".format(min_shipped_date.strftime('%Y-%m-%d'),
                   max_shipped_date.strftime('%Y-%m-%d'), key_lower,
                   key_upper), format_, use_pandas, secure, use_native, name,
        query_plan, False)
Example #28
0
def test_filter_empty():
    """Executes a filter where no records are returned. We tst this as it's somewhat peculiar with s3 select, in so much
    as s3 does not return column names when selecting data, meaning, unlike a traditional DBMS, no field names tuple
    should be present in the results.

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    ts = query_plan.add_operator(
        SQLTableScan('lineitem.csv', 'select * from S3Object limit 0;', False,
                     'ts', query_plan, False))

    f = query_plan.add_operator(
        Filter(
            PredicateExpression(lambda t_: cast(t_['_10'], timestamp) >= cast(
                '1996-03-01', timestamp)), 'f', query_plan, False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(f)
    f.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    assert len(c.tuples()) == 0

    # Write the metrics
    query_plan.print_metrics()
Example #29
0
def main():
    parts = 32
    query_plan = QueryPlan(is_async=True, buffer_size=0)

    # Query plan
    lineitem_scan = map(lambda p:
                        query_plan.add_operator(
                            SQLTableScan(get_file_key('lineitem', True, p),
                                         "select * from S3Object;", Format.CSV,
                                         use_pandas=True, secure=False, use_native=False,
                                         name='scan_' + str(p), query_plan=query_plan,
                                         log_enabled=False)),
                        range(0, parts))

    collate = query_plan.add_operator(
        Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(collate), enumerate(lineitem_scan))

    query_plan.execute()
Example #30
0
def sql_scan_lineitem_partkey_extendedprice_discount_where_shipdate_sharded_operator_def(
        min_shipped_date, max_shipped_date, sharded, shard, num_parts, sf,
        use_pandas, secure, use_native, name, query_plan, format_):
    return SQLTableScan(
        get_file_key('lineitem', sharded, shard, sf, format_), "select "
        "  l_partkey, l_extendedprice, l_discount "
        "from "
        "  S3Object "
        "where "
        "  l_shipdate  >= \'{}\'  and "
        "  l_shipdate  < \'{}\'  "
        "  {}"
        ";".format(
            min_shipped_date.strftime('%Y-%m-%d'),
            max_shipped_date.strftime('%Y-%m-%d'),
            get_sql_suffix('lineitem',
                           num_parts,
                           shard,
                           sharded,
                           add_where=False)), format_, use_pandas, secure,
        use_native, name, query_plan, False)