Example #1
def group_partkey_avg_quantity_5_op(name, query_plan):
    """with lineitem_part_avg_group as (select avg(l_quantity) from part_lineitem_join group by l_partkey)

    :param query_plan:
    :param name:
    :return:
    """
    def groupby_fn(df):
        df['l_quantity'] = df['l_quantity'].astype(np.float)
        grouped = df.groupby('l_partkey')
        # agg_df = grouped['l_quantity'].sum()

        agg_df = pd.DataFrame({
            'sum_l_quantity_computed00':
            grouped['l_quantity'].sum(),
            'cnt_l_quantity_computed00':
            grouped['l_quantity'].size()
        })

        return agg_df.reset_index()

    return Group(
        ['l_partkey'],  # l_partkey
        [
            # avg(l_quantity)
            # AggregateExpression(AggregateExpression.AVG, lambda t_: float(t_['l_quantity']))
            AggregateExpression(AggregateExpression.AVG,
                                lambda t_: float(t_[5]))
        ],
        name,
        query_plan,
        False,
        groupby_fn)
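The groupby_fn above emits per-partition sums and counts rather than averages so that a downstream reducer can still compute an exact avg(l_quantity). A minimal, self-contained pandas sketch of that reduce step (the frames and values are hypothetical, not part of the s3filter API):

import pandas as pd

# Two hypothetical partial results produced by groupby_fn on different partitions.
part_a = pd.DataFrame({'l_partkey': [1, 2],
                       'sum_l_quantity_computed00': [10.0, 6.0],
                       'cnt_l_quantity_computed00': [4, 2]})
part_b = pd.DataFrame({'l_partkey': [1],
                       'sum_l_quantity_computed00': [5.0],
                       'cnt_l_quantity_computed00': [1]})

# Reduce: re-group on the key, add up the partial sums and counts, then divide.
merged = (pd.concat([part_a, part_b])
          .groupby('l_partkey', as_index=False)
          .sum())
merged['avg_l_quantity'] = (merged['sum_l_quantity_computed00'] /
                            merged['cnt_l_quantity_computed00'])
print(merged[['l_partkey', 'avg_l_quantity']])  # partkey 1 -> 3.0, partkey 2 -> 3.0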
Example #2
def groupby_reduce_returnflag_linestatus_operator_def(name, query_plan, use_pandas):
    agger = OrderedDict([
        ('sum_qty', np.sum),
        ('sum_base_price', np.sum),
        ('sum_disc_price', np.sum),
        ('sum_charge', np.mean),
        ('avg_qty', np.mean),
        ('avg_price', np.mean),
        ('avg_disc', np.mean),
        ('count_order', np.sum),
    ])

    def fn(df):
        grouped = df.groupby(['l_returnflag', 'l_linestatus'], as_index=False)  # type: DataFrameGroupBy

        agg_df = grouped.agg(agger)

        return agg_df

    return Group(
        ['l_returnflag', 'l_linestatus'] if use_pandas else ['_0', '_1'],
        [
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_[2]), 'sum_qty'),
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_[3]), 'sum_base_price'),
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_[4]), 'sum_disc_price'),
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_[5]), 'sum_charge'),
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_[6]), 'avg_qty'),
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_[7]), 'avg_price'),
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_[8]), 'avg_disc'),
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_[9]), '__count'),

        ],
        name, query_plan, False, fn)
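The reduce pattern in Example #2, isolated: group the partial results on the key columns and apply a column-to-function mapping with agg. A small pandas-only sketch with hypothetical values (only the sum columns are shown):

from collections import OrderedDict

import numpy as np
import pandas as pd

# Hypothetical partial aggregates keyed on the two group columns.
partials = pd.DataFrame({
    'l_returnflag': ['A', 'A', 'R'],
    'l_linestatus': ['F', 'F', 'F'],
    'sum_qty': [10.0, 20.0, 5.0],
    'count_order': [2, 3, 1],
})

agger = OrderedDict([('sum_qty', np.sum), ('count_order', np.sum)])

# Same shape as fn() above: group on the keys, then apply the aggregation map.
reduced = (partials
           .groupby(['l_returnflag', 'l_linestatus'], as_index=False)
           .agg(agger))
print(reduced)  # ('A', 'F') -> 30.0 / 5, ('R', 'F') -> 5.0 / 1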
Example #3
def test_group_count():
    """Tests a group by query with a count aggregate

    :return: None
    """

    num_rows = 0

    query_plan = QueryPlan()

    # Query plan
    # select s_nationkey, count(s_suppkey) from supplier.csv group by s_nationkey
    ts = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object;', False, 'ts',
                     query_plan, False))

    g = query_plan.add_operator(
        Group(
            ['_3'],
            [
                AggregateExpression(AggregateExpression.COUNT,
                                    lambda t_: t_['_0'])
                # count(s_suppkey)
            ],
            'g',
            query_plan,
            False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    ts.connect(g)
    g.connect(c)

    # Start the query
    query_plan.execute()

    # Assert the results
    for _ in c.tuples():
        num_rows += 1
        # print("{}:{}".format(num_rows, t))

    field_names = ['_0', '_1']

    assert c.tuples()[0] == field_names

    assert len(c.tuples()) == 25 + 1

    nation_24 = filter(
        lambda t: IndexedTuple.build(t, field_names)['_0'] == '24',
        c.tuples())[0]
    assert nation_24[1] == 393
    assert num_rows == 25 + 1

    # Write the metrics
    query_plan.print_metrics()
Example #4
def test_group_sum():
    """Tests a group by query with a sum aggregate

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    # select s_nationkey, sum(float(s_acctbal)) from supplier.csv group by s_nationkey
    ts = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object;', False, 'ts',
                     query_plan, False))

    g = query_plan.add_operator(
        Group(['_3'], [
            AggregateExpression(AggregateExpression.SUM,
                                lambda t_: float(t_['_5']))
        ], 'g', query_plan, False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(g)
    g.connect(c)

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t_))

    field_names = ['_0', '_1']

    assert c.tuples()[0] == field_names

    assert len(c.tuples()) == 25 + 1

    nation_24 = filter(
        lambda t_: IndexedTuple.build(t_, field_names)['_0'] == '24',
        c.tuples())[0]
    assert round(nation_24[1], 2) == 1833872.56

    # Write the metrics
    query_plan.print_metrics()
Example #5
def test_group_empty():
    """Executes a group where no records are returned. We tst this as it's somewhat peculiar with s3 select, in so much
    as s3 does not return column names when selecting data, meaning, unlike a traditional DBMS, no field names tuple
    should be present in the results.

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    # select s_nationkey, sum(float(s_acctbal)) from supplier.csv group by s_nationkey
    ts = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object limit 0;', False,
                     'ts', query_plan, False))

    g = query_plan.add_operator(
        Group(['_3'], [
            AggregateExpression(AggregateExpression.SUM,
                                lambda t_: float(t_['_5']))
        ], 'g', query_plan, False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(g)
    g.connect(c)

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    field_names = ['_0', '_1']

    assert c.tuples()[0] == field_names

    assert len(c.tuples()) == 0 + 1

    # Write the metrics
    query_plan.print_metrics()
Example #6
def run(parallel, use_pandas, secure, use_native, buffer_size, format_, customer_parts, order_parts, lineitem_parts,
        customer_sharded,
        order_sharded, lineitem_sharded, other_parts, fp_rate, sf, expected_result, customer_filter_sql=None,
        order_filter_sql=None, lineitem_filter_sql=None):
    """

    :return: None
    """

    print('')
    print("TPCH Q3 Bloom Join")
    print("------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    customer_scan = map(lambda p:
                        query_plan.add_operator(
                            SQLTableScan(get_file_key('customer', customer_sharded, p, sf, format_),
                                         "select "
                                         "  c_custkey "
                                         "from "
                                         "  S3Object "
                                         "where "
                                         "  c_mktsegment = 'BUILDING' "
                                         "  {} "
                                         "  {} "
                                         .format(
                                             ' and ' + customer_filter_sql if customer_filter_sql is not None else '',
                                             get_sql_suffix('customer', customer_parts, p, customer_sharded,
                                                            add_where=False)),
                                         format_,
                                         use_pandas, secure, use_native,
                                         'customer_scan' + '_{}'.format(p),
                                         query_plan,
                                         False)),
                        range(0, customer_parts))

    def customer_project_fn(df):
        df = df.filter(items=['_0'], axis=1)

        df.rename(columns={'_0': 'c_custkey'},
                  inplace=True)

        return df

    customer_project = map(lambda p:
                           query_plan.add_operator(
                               Project([],
                                       'customer_project' + '_{}'.format(p),
                                       query_plan,
                                       False, customer_project_fn)),
                           range(0, customer_parts))

    customer_bloom_create = query_plan.add_operator(
        BloomCreate('c_custkey', 'customer_bloom_create', query_plan, False,
                    fp_rate))

    customer_map = map(lambda p:
                       query_plan.add_operator(Map('c_custkey', 'customer_map' + '_' + str(p), query_plan, False)),
                       range(0, customer_parts))

    order_scan = map(lambda p:
                     query_plan.add_operator(
                         SQLTableScanBloomUse(get_file_key('orders', order_sharded, p, sf, format_),
                                              "select "
                                              "  o_custkey, o_orderkey, o_orderdate, o_shippriority "
                                              "from "
                                              "  S3Object "
                                              "where "
                                              "  cast(o_orderdate as timestamp) < cast('1995-03-01' as timestamp) "
                                              "  {} "
                                              "  {} "
                                              .format(
                                                  ' and ' + order_filter_sql if order_filter_sql is not None else '',
                                                  get_sql_suffix('orders', order_parts, p, order_sharded,
                                                                 add_where=False)),
                                              'o_custkey',
                                              format_,
                                              use_pandas, secure, use_native,
                                              'order_scan' + '_{}'.format(p),
                                              query_plan,
                                              False)),
                     range(0, order_parts))

    def order_project_fn(df):
        df = df.filter(items=['_0', '_1', '_2', '_3'], axis=1)

        df.rename(columns={'_0': 'o_custkey', '_1': 'o_orderkey', '_2': 'o_orderdate', '_3': 'o_shippriority'},
                  inplace=True)

        return df

    order_project = map(lambda p:
                        query_plan.add_operator(
                            Project([],
                                    'order_project' + '_{}'.format(p),
                                    query_plan,
                                    False, order_project_fn)),
                        range(0, order_parts))

    order_map_1 = map(lambda p:
                      query_plan.add_operator(Map('o_custkey', 'order_map_1' + '_' + str(p), query_plan, False)),
                      range(0, order_parts))

    customer_order_join_build = map(lambda p:
                                    query_plan.add_operator(
                                        HashJoinBuild('c_custkey',
                                                      'customer_order_join_build' + '_' + str(p), query_plan,
                                                      False)),
                                    range(0, other_parts))

    customer_order_join_probe = map(lambda p:
                                    query_plan.add_operator(
                                        HashJoinProbe(JoinExpression('c_custkey', 'o_custkey'),
                                                      'customer_order_join_probe' + '_' + str(p),
                                                      query_plan, False)),
                                    range(0, other_parts))

    order_bloom_create = query_plan.add_operator(
        BloomCreate('o_orderkey', 'order_bloom_create', query_plan, False,
                    fp_rate))

    lineitem_scan = map(lambda p:
                        query_plan.add_operator(
                            SQLTableScanBloomUse(get_file_key('lineitem', lineitem_sharded, p, sf, format_),
                                                 "select "
                                                 "  l_orderkey, l_extendedprice, l_discount "
                                                 "from "
                                                 "  S3Object "
                                                 "where "
                                                 "  cast(l_shipdate as timestamp) > cast('1995-03-01' as timestamp) "
                                                 "  {} "
                                                 "  {} "
                                                 .format(
                                                     ' and ' + lineitem_filter_sql if lineitem_filter_sql is not None else '',
                                                     get_sql_suffix('lineitem', lineitem_parts, p, lineitem_sharded,
                                                                    add_where=False)),
                                                 'l_orderkey', format_,
                                                 use_pandas, secure, use_native,
                                                 'lineitem_scan' + '_{}'.format(p),
                                                 query_plan,
                                                 False)),
                        range(0, lineitem_parts))

    def lineitem_project_fn(df):
        df = df.filter(items=['_0', '_1', '_2'], axis=1)

        df.rename(columns={'_0': 'l_orderkey', '_1': 'l_extendedprice', '_2': 'l_discount'},
                  inplace=True)

        return df

    lineitem_project = map(lambda p:
                           query_plan.add_operator(
                               Project([],
                                       'lineitem_project' + '_{}'.format(p),
                                       query_plan,
                                       False, lineitem_project_fn)),
                           range(0, lineitem_parts))

    lineitem_map = map(lambda p:
                       query_plan.add_operator(Map('l_orderkey', 'lineitem_map' + '_' + str(p), query_plan, False)),
                       range(0, lineitem_parts))

    order_map_2 = map(lambda p:
                      query_plan.add_operator(Map('o_orderkey', 'order_map_2' + '_' + str(p), query_plan, False)),
                      range(0, other_parts))

    customer_order_lineitem_join_build = map(lambda p:
                                             query_plan.add_operator(
                                                 HashJoinBuild('o_orderkey',
                                                               'customer_order_lineitem_join_build' + '_' + str(p),
                                                               query_plan,
                                                               False)),
                                             range(0, other_parts))

    customer_order_lineitem_join_probe = map(lambda p:
                                             query_plan.add_operator(
                                                 HashJoinProbe(JoinExpression('o_orderkey', 'l_orderkey'),
                                                               'customer_order_lineitem_join_probe' + '_' + str(p),
                                                               query_plan, False)),
                                             range(0, other_parts))

    def groupby_fn(df):
        df['l_extendedprice'] = df['l_extendedprice'].astype(np.float)
        df['l_discount'] = df['l_discount'].astype(np.float)
        df['revenue'] = df['l_extendedprice'] * (1 - df['l_discount'])
        grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])
        agg_df = grouped['revenue'].sum()
        return agg_df.reset_index()

    group = map(lambda p:
                query_plan.add_operator(
                    Group(
                        ['l_orderkey', 'o_orderdate', 'o_shippriority'],
                        [
                            AggregateExpression(AggregateExpression.SUM,
                                                lambda t_: float(t_['l_extendedprice'] * (1 - t_['l_discount'])))
                        ],
                        'group' + '_{}'.format(p), query_plan,
                        False, groupby_fn)),
                range(0, other_parts))

    def group_reduce_fn(df):
        grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])
        agg_df = grouped['revenue'].sum()
        return agg_df.reset_index()

    group_reduce = query_plan.add_operator(
        Group(
            ['l_orderkey', 'o_orderdate', 'o_shippriority'],
            [
                AggregateExpression(AggregateExpression.SUM,
                                    lambda t_: float(t_['l_extendedprice'] * (1 - t_['l_discount'])))
            ],
            'group_reduce', query_plan,
            False, group_reduce_fn))

    top = query_plan.add_operator(
        Top(10, [SortExpression('revenue', float, 'DESC'), SortExpression('o_orderdate', date, 'ASC')], use_pandas,
            'top', query_plan,
            False))

    collate = query_plan.add_operator(tpch_q19.collate_op('collate', query_plan))

    # Inline what we can
    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), customer_project)
    map(lambda o: o.set_async(False), order_project)
    map(lambda o: o.set_async(False), lineitem_map)
    map(lambda o: o.set_async(False), customer_map)
    map(lambda o: o.set_async(False), order_map_1)
    map(lambda o: o.set_async(False), order_map_2)

    # Connect the operators
    connect_many_to_many(customer_scan, customer_project)
    connect_many_to_many(customer_project, customer_map)

    connect_many_to_one(customer_project, customer_bloom_create)
    connect_one_to_many(customer_bloom_create, order_scan)

    connect_many_to_many(order_scan, order_project)
    connect_many_to_many(order_project, order_map_1)

    connect_all_to_all(customer_map, customer_order_join_build)
    connect_many_to_many(customer_order_join_build, customer_order_join_probe)
    connect_all_to_all(order_map_1, customer_order_join_probe)

    # connect_many_to_one(customer_order_join_probe, collate)

    connect_many_to_one(order_project, order_bloom_create)
    connect_one_to_many(order_bloom_create, lineitem_scan)

    connect_many_to_many(lineitem_scan, lineitem_project)
    connect_many_to_many(lineitem_project, lineitem_map)

    connect_many_to_many(customer_order_join_probe, order_map_2)
    connect_all_to_all(order_map_2, customer_order_lineitem_join_build)
    connect_many_to_many(customer_order_lineitem_join_build, customer_order_lineitem_join_probe)
    connect_all_to_all(lineitem_map, customer_order_lineitem_join_probe)

    # connect_many_to_one(customer_order_lineitem_join_probe, collate)

    connect_many_to_many(customer_order_lineitem_join_probe, group)
    # connect_many_to_one(group, collate)

    connect_many_to_one(group, group_reduce)
    connect_one_to_one(group_reduce, top)
    connect_one_to_one(top, collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("customer_parts: {}".format(customer_parts))
    print("order_parts: {}".format(order_parts))
    print("lineitem_parts: {}".format(lineitem_parts))
    print("customer_sharded: {}".format(customer_sharded))
    print("order_sharded: {}".format(order_sharded))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("other_parts: {}".format(other_parts))
    print("fp_rate: {}".format(fp_rate))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['l_orderkey', 'o_orderdate', 'o_shippriority', 'revenue']

    assert len(tuples) == 10 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    test_util.assert_tuples(expected_result, tuples)
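The groupby_fn in Example #6 implements the TPC-H Q3 aggregate, revenue = sum(l_extendedprice * (1 - l_discount)) per (l_orderkey, o_orderdate, o_shippriority). The same logic on a few hypothetical joined rows, as a standalone pandas sketch:

import pandas as pd

# Hypothetical joined rows as they would arrive at the group operator (strings, as from CSV).
df = pd.DataFrame({
    'l_orderkey': ['1', '1', '2'],
    'o_orderdate': ['1995-03-05', '1995-03-05', '1995-03-07'],
    'o_shippriority': ['0', '0', '0'],
    'l_extendedprice': ['100.0', '200.0', '50.0'],
    'l_discount': ['0.1', '0.0', '0.2'],
})

# Cast, derive the revenue expression, then group and sum, as in groupby_fn.
df['l_extendedprice'] = df['l_extendedprice'].astype(float)
df['l_discount'] = df['l_discount'].astype(float)
df['revenue'] = df['l_extendedprice'] * (1 - df['l_discount'])
agg_df = (df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])['revenue']
          .sum()
          .reset_index())
print(agg_df)  # order 1 -> 290.0, order 2 -> 40.0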
Example #7
def run(group_fields, agg_fields, parallel, use_pandas, buffer_size,
        table_parts, files):
    """
    
    :return: None
    """

    secure = False
    use_native = False
    print('')
    print("Groupby Benchmark, Baseline. Group Fields: {} Aggregate Fields: {}".
          format(group_fields, agg_fields))
    print("----------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    def fn(df):
        df.columns = [
            'G0',
            'G1',
            'G2',
            'G3',
            'G4',
            'G5',
            'G6',
            'G7',
            'G8',
            'G9',
            'F0',
            'F1',
            'F2',
            'F3',
            'F4',
            'F5',
            'F6',
            'F7',
            'F8',
            'F9',
        ]
        df = df.filter(items=group_fields + agg_fields, axis=1)
        return df

    # Scan
    scan = map(
        lambda p: query_plan.add_operator(
            TableScan(files.format(p),
                      use_pandas,
                      secure,
                      use_native,
                      'scan_{}'.format(p),
                      query_plan,
                      False,
                      fn=fn)), range(0, table_parts))

    # Groupby
    def groupby_fn(df):

        df[agg_fields] = df[agg_fields].astype(np.float)
        grouped = df.groupby(group_fields)
        agg_df = pd.DataFrame(
            {f: grouped[f].sum()
             for n, f in enumerate(agg_fields)})
        return agg_df.reset_index()

    groupby = map(
        lambda p: query_plan.add_operator(
            Group(group_fields, [], 'groupby_{}'.format(p), query_plan, False,
                  groupby_fn)), range(0, table_parts))

    # inlined
    map(lambda p: p.set_async(False), groupby)

    groupby_reduce = query_plan.add_operator(
        Group(group_fields, [], 'groupby_reduce', query_plan, False,
              groupby_fn))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    #profile_path = '../benchmark-output/groupby/'
    #scan[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_scan_0" + ".prof"))
    #project[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_project_0" + ".prof"))
    #groupby[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_0" + ".prof"))
    #groupby_reduce.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_reduce" + ".prof"))
    #collate.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_collate" + ".prof"))
    connect_many_to_many(scan, groupby)
    connect_many_to_one(groupby, groupby_reduce)
    connect_one_to_one(groupby_reduce, collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
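In Example #7 each partition applies groupby_fn to produce partial per-group sums, and groupby_reduce applies the same function again to the concatenation of those partials; this is only correct because SUM is associative. A short pandas sketch of that property (hypothetical G0/F0 values):

import pandas as pd

# Two hypothetical partial groupby results from different table parts.
p0 = pd.DataFrame({'G0': ['x', 'y'], 'F0': [3.0, 1.0]})
p1 = pd.DataFrame({'G0': ['x'], 'F0': [2.0]})

# The reduce re-runs the same sum-based groupby over the union of the partials.
reduced = pd.concat([p0, p1]).groupby('G0', as_index=False).sum()
print(reduced)  # x -> 5.0, y -> 1.0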
Example #8
def run(group_fields, agg_fields, parallel, use_pandas, buffer_size,
        table_parts, files, format_):
    """
    
    :return: None
    """

    secure = False
    use_native = False
    print('')
    print("Groupby Benchmark, Baseline. Group Fields: {} Aggregate Fields: {}".
          format(group_fields, agg_fields))
    print("----------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    ##########################
    ## Phase 1. Find out group names
    ##########################
    # Scan
    scan_phase1 = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                files.format(p), "select {} from S3Object;".format(','.join(
                    group_fields)), format_, use_pandas, secure, use_native,
                'scan_phase1_{}'.format(p), query_plan, False)),
        range(0, table_parts))

    # Project
    def project_fn(df):
        df.columns = group_fields
        return df

    project_exprs = [
        ProjectExpression(lambda t_: t_['_{}'.format(n)], v)
        for n, v in enumerate(group_fields)
    ]

    project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'project_{}'.format(p), query_plan, False,
                    project_fn)), range(0, table_parts))

    # Groupby
    def groupby_fn(df):
        return df.drop_duplicates()

    groupby = map(
        lambda p: query_plan.add_operator(
            Group(group_fields, [], 'groupby_{}'.format(p), query_plan, False,
                  groupby_fn)), range(0, table_parts))

    groupby_reduce = query_plan.add_operator(
        Group(group_fields, [], 'groupby_reduce', query_plan, False,
              groupby_fn))

    # GroupbyFilterBuild
    agg_exprs = [('SUM', 'CAST({} AS float)'.format(agg_field))
                 for agg_field in agg_fields]

    groupby_filter_build = query_plan.add_operator(
        GroupbyFilterBuild(group_fields, agg_fields, agg_exprs,
                           'groupby_filter_build', query_plan, False))

    ##########################
    ## Phase 2. Perform aggregation at S3.
    ##########################
    # Scan
    scan_phase2 = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                'groupby_benchmark/shards-10GB/groupby_data_{}.csv'.format(
                    p), "", format_, use_pandas, secure, use_native,
                'scan_phase2_{}'.format(p), query_plan, False)),
        range(0, table_parts))

    groupby_decoder = map(
        lambda p: query_plan.add_operator(
            GroupbyDecoder(agg_fields, 'groupby_decoder_{}'.format(p),
                           query_plan, False)), range(0, table_parts))

    def groupby_fn_phase2(df):
        #print df
        df[agg_fields] = df[agg_fields].astype(np.float)
        grouped = df.groupby(group_fields)
        agg_df = pd.DataFrame(
            {f: grouped[f].sum()
             for n, f in enumerate(agg_fields)})
        return agg_df.reset_index()

    groupby_reduce_phase2 = query_plan.add_operator(
        Group(group_fields, [], 'groupby_reduce_phase2', query_plan, False,
              groupby_fn_phase2))

    #scan_phase1[0].set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_scan_phase1_0" + ".prof"))
    #scan_phase2[0].set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_scan_phase2_0" + ".prof"))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # phase 1
    map(lambda (p, o): o.connect(project[p]), enumerate(scan_phase1))
    map(lambda (p, o): o.connect(groupby[p]), enumerate(project))
    map(lambda (p, o): o.connect(groupby_reduce), enumerate(groupby))
    groupby_reduce.connect(groupby_filter_build)

    # phase 2
    map(lambda (p, o): groupby_filter_build.connect(o, 0),
        enumerate(scan_phase2))
    map(lambda (p, o): groupby_filter_build.connect(o, 1),
        enumerate(groupby_decoder))
    map(lambda (p, o): o.connect(groupby_decoder[p]), enumerate(scan_phase2))
    map(lambda (p, o): o.connect(groupby_reduce_phase2),
        enumerate(groupby_decoder))
    # map(lambda (p, o): groupby_reduce.connect(o), enumerate(groupby_decoder))

    groupby_reduce_phase2.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
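Example #8 splits the query into two phases: phase 1 scans only the group columns and reduces them to the distinct group keys (drop_duplicates as the groupby_fn), while phase 2 uses GroupbyFilterBuild to drive per-group aggregation inside S3 Select (note the ('SUM', 'CAST({} AS float)') expressions and the empty query string passed to scan_phase2, presumably supplied at runtime by the filter build). A rough, hypothetical illustration of that idea, not the GroupbyFilterBuild implementation, building one aggregation query per discovered group:

# Hypothetical inputs: the phase-1 distinct group values and the aggregate fields.
group_fields = ['G0']
agg_fields = ['F0', 'F1']
distinct_groups = [('a',), ('b',)]

agg_exprs = ', '.join('SUM(CAST({} AS float))'.format(f) for f in agg_fields)
for g in distinct_groups:
    predicate = ' and '.join("{} = '{}'".format(k, v)
                             for k, v in zip(group_fields, g))
    print("select {} from S3Object where {};".format(agg_exprs, predicate))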
Example #9
def run(group_fields, agg_fields, parallel, use_pandas, buffer_size,
        table_parts, files, format_):
    """
    
    :return: None
    """

    secure = False
    use_native = False
    print('')
    print("Groupby Benchmark, Baseline. Group Fields: {} Aggregate Fields: {}".
          format(group_fields, agg_fields))
    print("----------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Scan
    scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                files.format(p), "select {} from S3Object;".format(','.join(
                    group_fields + agg_fields)), format_, use_pandas, secure,
                use_native, 'scan_{}'.format(p), query_plan, False)),
        range(0, table_parts))

    # Project
    def project_fn(df):
        df.columns = group_fields + agg_fields
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_{}'.format(n)], v) for n, v in enumerate(group_fields)] \
                  + [ProjectExpression(lambda t_: t_['_{}'.format(n + len(group_fields))], v) for n, v in enumerate(agg_fields)]

    project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'project_{}'.format(p), query_plan, False,
                    project_fn)), range(0, table_parts))

    # Groupby
    def groupby_fn(df):
        df[agg_fields] = df[agg_fields].astype(np.float)
        grouped = df.groupby(group_fields)
        agg_df = pd.DataFrame(
            {f: grouped[f].sum()
             for n, f in enumerate(agg_fields)})
        return agg_df.reset_index()

    groupby = map(
        lambda p: query_plan.add_operator(
            Group(group_fields, [], 'groupby_{}'.format(p), query_plan, False,
                  groupby_fn)), range(0, table_parts))

    groupby_reduce = query_plan.add_operator(
        Group(group_fields, [], 'groupby_reduce', query_plan, False,
              groupby_fn))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(groupby[p]), enumerate(project))
    map(lambda (p, o): o.connect(groupby_reduce), enumerate(groupby))
    groupby_reduce.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
Example #10
def groupby_returnflag_linestatus_operator_def(name, query_plan):
    def fn(df):
        convert_types(df)

        compute_expressions(df)

        grouped = group(df)

        agg_df = aggregate(grouped)

        return agg_df

    def aggregate(grouped):
        agg_df = pd.DataFrame({
            'sum_qty': grouped['l_quantity'].sum(),
            'avg_qty': grouped['l_quantity'].mean(),
            'count_order': grouped['l_quantity'].size(),
            'sum_base_price': grouped['l_extendedprice'].sum(),
            'avg_price': grouped['l_extendedprice'].mean(),
            'avg_disc': grouped['l_discount'].mean(),
            'sum_disc_price': grouped['c0'].sum(),
            'sum_charge': grouped['c1'].sum()
        })

        return agg_df.reset_index()

    def group(df):
        return df.groupby(['l_returnflag', 'l_linestatus'])

    def compute_expressions(df):
        df['c0'] = df['l_extendedprice'].mul(df['l_discount'].rsub(1))
        df['c1'] = df['l_extendedprice'].mul(df['l_discount'].rsub(1)).mul(df['l_tax'].radd(1))

    def convert_types(df):
        df['l_quantity'] = df['l_quantity'].astype(np.int)
        df[['l_extendedprice', 'l_discount', 'l_tax']] = \
            df[['l_extendedprice', 'l_discount', 'l_tax']].astype(np.float)

    return Group(
        [
            'l_returnflag',
            'l_linestatus'
        ],
        [
            #    0              1                  2             3        4               5               6
            #    ['l_quantity', 'l_extendedprice', 'l_discount', 'l_tax', 'l_returnflag', 'l_linestatus', 'l_shipdate']
            # sum(l_quantity)
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_[0]), 'sum_qty'),
            # sum(l_extendedprice) as sum_base_price
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_[1]), 'sum_base_price'),
            # sum(l_extendedprice * (1 - l_discount)) as sum_disc_price
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_[1]) * (1 - float(t_[2])),
                                'sum_disc_price'),
            # sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge
            AggregateExpression(AggregateExpression.SUM,
                                lambda t_: float(t_[1]) * (1 - float(t_[2])) * (1 + float(t_[3])), 'sum_charge'),
            # avg(l_quantity)
            AggregateExpression(AggregateExpression.AVG, lambda t_: float(t_[0]), 'avg_qty'),
            # avg(l_extendedprice)
            AggregateExpression(AggregateExpression.AVG, lambda t_: float(t_[1]), 'avg_price'),
            # avg(l_discount)
            AggregateExpression(AggregateExpression.AVG, lambda t_: float(t_[2]), 'avg_disc'),
            # count(*) as count_order
            AggregateExpression(AggregateExpression.COUNT, lambda t_: t_[4], 'count_order')
        ],
        name, query_plan, False, fn)
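The pandas path in Example #10 (convert_types, compute_expressions, group, aggregate) is the whole of TPC-H Q1's aggregation. The same steps on two hypothetical rows, as a self-contained sketch showing a few of the output columns:

import pandas as pd

# Hypothetical lineitem rows, values as strings as they arrive from a CSV scan.
df = pd.DataFrame({
    'l_quantity': ['10', '20'],
    'l_extendedprice': ['100.0', '200.0'],
    'l_discount': ['0.1', '0.0'],
    'l_tax': ['0.05', '0.0'],
    'l_returnflag': ['A', 'A'],
    'l_linestatus': ['F', 'F'],
})

# convert_types / compute_expressions: c0 = price * (1 - disc), c1 = c0 * (1 + tax).
df['l_quantity'] = df['l_quantity'].astype(int)
df[['l_extendedprice', 'l_discount', 'l_tax']] = \
    df[['l_extendedprice', 'l_discount', 'l_tax']].astype(float)
df['c0'] = df['l_extendedprice'].mul(df['l_discount'].rsub(1))
df['c1'] = df['c0'].mul(df['l_tax'].radd(1))

# group / aggregate, mirroring aggregate() above.
grouped = df.groupby(['l_returnflag', 'l_linestatus'])
agg_df = pd.DataFrame({
    'sum_qty': grouped['l_quantity'].sum(),
    'avg_price': grouped['l_extendedprice'].mean(),
    'sum_disc_price': grouped['c0'].sum(),
    'sum_charge': grouped['c1'].sum(),
    'count_order': grouped['l_quantity'].size(),
}).reset_index()
print(agg_df)  # sum_qty 30, avg_price 150.0, sum_disc_price 290.0, sum_charge 294.5, count_order 2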
Example #11
def test_group_baseline():
    """TPC-H Q1

    :return: None
    """

    # Query plan

    # select
    #   l_returnflag,
    #   l_linestatus,
    #   sum(l_quantity) as sum_qty,
    #   sum(l_extendedprice) as sum_base_price,
    #   sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
    #   sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
    #   avg(l_quantity) as avg_qty,
    #   avg(l_extendedprice) as avg_price,
    #   avg(l_discount) as avg_disc,
    #   count(*) as count_order
    # from
    #   lineitem
    # where
    #   l_shipdate <= date '1992-04-01' - interval '[DELTA]' day(3)
    # group by
    #   l_returnflag,
    #   l_linestatus
    # order by
    #   l_returnflag,
    #   l_linestatus;

    query_plan = QueryPlan()

    delta_days = 60  # TODO: This is supposed to be randomized I think
    shipped_date = datetime.strptime('1992-04-01', '%Y-%m-%d') - timedelta(days=delta_days)

    ts = query_plan.add_operator(SQLTableScan("lineitem.csv",
                                              "select * from S3Object "
                                              "where cast(l_shipdate as timestamp) <= cast(\'{}\' as timestamp)"
                                              .format(shipped_date.strftime('%Y-%m-%d')),
                                              False, 'lineitem', query_plan,
                                              False))
    g = query_plan.add_operator(Group(
        [
            '_8',  # l_returnflag
            '_9'  # l_linestatus
        ],
        [
            # sum(l_quantity)
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['_4'])),
            # sum(l_extendedprice) as sum_base_price
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['_5'])),
            # sum(l_extendedprice * (1 - l_discount)) as sum_disc_price
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['_5']) * (1 - float(t_['_6']))),
            # sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge
            AggregateExpression(AggregateExpression.SUM,
                                lambda t_: float(t_['_5']) * (1 - float(t_['_6'])) * (1 + float(t_['_7']))),
            # avg(l_quantity)
            AggregateExpression(AggregateExpression.AVG, lambda t_: float(t_['_4'])),
            # avg(l_extendedprice)
            AggregateExpression(AggregateExpression.AVG, lambda t_: float(t_['_5'])),
            # avg(l_discount)
            AggregateExpression(AggregateExpression.AVG, lambda t_: float(t_['_6'])),
            # count(*) as count_order
            AggregateExpression(AggregateExpression.COUNT, lambda t_: t_['_0'])
        ],
        'lineitem_grouped', query_plan,
        False))
    s = query_plan.add_operator(Sort(
        [
            SortExpression('_0', str, 'ASC'),
            SortExpression('_1', str, 'ASC')
        ],
        'lineitem_group_sorted', query_plan,
        False))
    c = query_plan.add_operator(Collate('collation', query_plan, False))

    ts.connect(g)
    g.connect(s)
    s.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    field_names = ['_0', '_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9']

    assert c.tuples()[0] == field_names

    assert len(c.tuples()) == 2 + 1

    # These have been verified in Postgres
    assert c.tuples()[1] == ["A", "F", 129850, 194216048.19000033, 184525343.78730044, 191943492.96455324,
                             25.445816186556925,
                             38059.19031746035, 0.050005878894768374, 5103]
    assert c.tuples()[2] == ["R", "F", 129740, 193438367.33999985, 183701990.7670003, 191045646.36937532,
                             25.509241053873353,
                             38033.49731419579, 0.05061541486433399, 5086]

    # Write the metrics
    query_plan.print_metrics()