Example 1: lineitem filter definition (quantity bands plus ship-mode and ship-instruction predicates)
def lineitem_filter_def(name, query_plan):
    def pd_expr(df):
        # the three quantity bands all share the ship-mode and ship-instruction terms
        quantity = df['l_quantity'].astype(int)
        in_quantity_band = (quantity.between(3, 3 + 10) |
                            quantity.between(16, 16 + 10) |
                            quantity.between(24, 24 + 10))
        return (in_quantity_band &
                df['l_shipmode'].isin(['AIR', 'AIR REG']) &
                (df['l_shipinstruct'] == 'DELIVER IN PERSON'))

    return Filter(
        PredicateExpression(
            lambda t_: (
                (3 <= int(t_['l_quantity']) <= 3 + 10 or
                 16 <= int(t_['l_quantity']) <= 16 + 10 or
                 24 <= int(t_['l_quantity']) <= 24 + 10) and
                t_['l_shipmode'] in ['AIR', 'AIR REG'] and
                t_['l_shipinstruct'] == 'DELIVER IN PERSON'),
            pd_expr),
        name, query_plan, False)
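
The pandas half of this predicate can be sanity-checked in isolation. A minimal sketch with an illustrative three-row DataFrame (not part of the original code); only the first row satisfies the disjunction:

import pandas as pd

df = pd.DataFrame({
    'l_quantity': ['5', '50', '20'],
    'l_shipmode': ['AIR', 'AIR REG', 'MAIL'],
    'l_shipinstruct': ['DELIVER IN PERSON'] * 3,
})
quantity = df['l_quantity'].astype(int)
mask = ((quantity.between(3, 13) |
         quantity.between(16, 26) |
         quantity.between(24, 34)) &
        df['l_shipmode'].isin(['AIR', 'AIR REG']) &
        (df['l_shipinstruct'] == 'DELIVER IN PERSON'))
print(df[mask])  # only the quantity-5 'AIR' row passes
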
Example 2: part filter definition (a brand, container class, and size band per branch)
def part_filter_def(name, query_plan):
    def pd_expr(df):
        # each brand is allowed only with its matching container class and size band
        size = df['p_size'].astype(int)
        return (((df['p_brand'] == 'Brand#11') &
                 df['p_container'].isin(['SM CASE', 'SM BOX', 'SM PACK', 'SM PKG']) &
                 size.between(1, 5)) |
                ((df['p_brand'] == 'Brand#44') &
                 df['p_container'].isin(['MED BAG', 'MED BOX', 'MED PACK', 'MED PKG']) &
                 size.between(1, 10)) |
                ((df['p_brand'] == 'Brand#53') &
                 df['p_container'].isin(['LG CASE', 'LG BOX', 'LG PACK', 'LG PKG']) &
                 size.between(1, 15)))

    return Filter(
        PredicateExpression(
            lambda t_: (
                (t_['p_brand'] == 'Brand#11' and
                 t_['p_container'] in ['SM CASE', 'SM BOX', 'SM PACK', 'SM PKG'] and
                 1 <= int(t_['p_size']) <= 5) or
                (t_['p_brand'] == 'Brand#44' and
                 t_['p_container'] in ['MED BAG', 'MED BOX', 'MED PACK', 'MED PKG'] and
                 1 <= int(t_['p_size']) <= 10) or
                (t_['p_brand'] == 'Brand#53' and
                 t_['p_container'] in ['LG CASE', 'LG BOX', 'LG PACK', 'LG PKG'] and
                 1 <= int(t_['p_size']) <= 15)),
            pd_expr),
        name, query_plan, False)
Example 3: combined part and lineitem filter covering all three brand bands
def filter_def(name, query_plan):
    def pd_expr(df):
        quantity = df['l_quantity'].astype(int)
        size = df['p_size'].astype(int)
        # ship mode and ship instruction are common to all three brand bands
        common = (df['l_shipmode'].isin(['AIR', 'AIR REG']) &
                  (df['l_shipinstruct'] == 'DELIVER IN PERSON'))
        return (((df['p_brand'] == 'Brand#11') &
                 df['p_container'].isin(['SM CASE', 'SM BOX', 'SM PACK', 'SM PKG']) &
                 quantity.between(3, 3 + 10) & size.between(1, 5) & common) |
                ((df['p_brand'] == 'Brand#44') &
                 df['p_container'].isin(['MED BAG', 'MED BOX', 'MED PACK', 'MED PKG']) &
                 quantity.between(16, 16 + 10) & size.between(1, 10) & common) |
                ((df['p_brand'] == 'Brand#53') &
                 df['p_container'].isin(['LG CASE', 'LG BOX', 'LG PACK', 'LG PKG']) &
                 quantity.between(24, 24 + 10) & size.between(1, 15) & common))

    return Filter(
        PredicateExpression(
            lambda t_: (
                (t_['p_brand'] == 'Brand#11' and
                 t_['p_container'] in ['SM CASE', 'SM BOX', 'SM PACK', 'SM PKG'] and
                 3 <= int(t_['l_quantity']) <= 3 + 10 and
                 1 <= int(t_['p_size']) <= 5 and
                 t_['l_shipmode'] in ['AIR', 'AIR REG'] and
                 t_['l_shipinstruct'] == 'DELIVER IN PERSON') or
                (t_['p_brand'] == 'Brand#44' and
                 t_['p_container'] in ['MED BAG', 'MED BOX', 'MED PACK', 'MED PKG'] and
                 16 <= int(t_['l_quantity']) <= 16 + 10 and
                 1 <= int(t_['p_size']) <= 10 and
                 t_['l_shipmode'] in ['AIR', 'AIR REG'] and
                 t_['l_shipinstruct'] == 'DELIVER IN PERSON') or
                (t_['p_brand'] == 'Brand#53' and
                 t_['p_container'] in ['LG CASE', 'LG BOX', 'LG PACK', 'LG PKG'] and
                 24 <= int(t_['l_quantity']) <= 24 + 10 and
                 1 <= int(t_['p_size']) <= 15 and
                 t_['l_shipmode'] in ['AIR', 'AIR REG'] and
                 t_['l_shipinstruct'] == 'DELIVER IN PERSON')),
            pd_expr),
        name, query_plan, False)
Example 4: filter on a single brand and container
def filter_brand_container_op(name, query_plan):
    def pd_expr(df):
        return (df['p_brand'] == 'Brand#41') & (df['p_container'] == 'SM PACK')

    return Filter(
        PredicateExpression(
            lambda t_: (t_['p_brand'] == 'Brand#41' and
                        t_['p_container'] == 'SM PACK'),
            pd_expr),
        name, query_plan, False)
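
Each definition hands PredicateExpression two equivalent forms of the same predicate: a per-tuple lambda and a vectorised pandas expression. A minimal sketch (the toy record is illustrative) checking that the two forms agree:

import pandas as pd

row = {'p_brand': 'Brand#41', 'p_container': 'SM PACK'}
tuple_result = (row['p_brand'] == 'Brand#41' and
                row['p_container'] == 'SM PACK')

df = pd.DataFrame([row])
pandas_result = bool(((df['p_brand'] == 'Brand#41') &
                      (df['p_container'] == 'SM PACK')).iloc[0])
assert tuple_result == pandas_result
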
Example 5: filter on brand equality only
def filter_brand12_operator_def(name, query_plan):
    # type: (str, QueryPlan) -> Filter

    def pd_expr(df):
        return df['p_brand'] == 'Brand#12'

    return Filter(
        PredicateExpression(lambda t_: t_['p_brand'] == 'Brand#12', pd_expr),
        name, query_plan, False)
Example 6: ship-date upper-bound filter
def filter_shipdate_operator_def(max_shipped_date, name, query_plan):
    def pd_expr(df):
        return pd.to_datetime(df['l_shipdate'], cache=True) <= max_shipped_date

    return Filter(
        PredicateExpression(lambda t_:
                            (cast(t_['l_shipdate'], timestamp) <= cast(max_shipped_date, timestamp)),
                            pd_expr),
        name, query_plan,
        False)
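
The pandas side coerces the string column with pd.to_datetime before comparing, so max_shipped_date must be comparable to a datetime64 series (e.g. a pandas Timestamp). A minimal sketch with illustrative values:

import pandas as pd

max_shipped_date = pd.Timestamp('1998-09-02')
df = pd.DataFrame({'l_shipdate': ['1998-09-01', '1998-12-01']})
mask = pd.to_datetime(df['l_shipdate'], cache=True) <= max_shipped_date
print(mask.tolist())  # [True, False]
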
Example 7: ship-date range filter
def filter_shipdate_operator_def(min_shipped_date, max_shipped_date, name,
                                 query_plan):
    def pd_expr(df):
        # df['_10'] = pd.to_datetime(df['_10'])
        return (pd.to_datetime(df['l_shipdate']) >= min_shipped_date) & (
            pd.to_datetime(df['l_shipdate']) < max_shipped_date)

    return Filter(
        PredicateExpression(
            lambda t_: (cast(t_['l_shipdate'], timestamp) >=
                        cast(min_shipped_date, timestamp)) and
                       (cast(t_['l_shipdate'], timestamp) <
                        cast(max_shipped_date, timestamp)),
            pd_expr),
        name, query_plan, False)
Example 8: filter comparing each row's quantity against a precomputed average column
def filter_lineitem_quantity_op(name, query_plan):
    """with filter_join_2 as (select * from part_lineitem_join_avg_group_join where l_quantity < avg_l_quantity_computed00)

    :param query_plan:
    :param name:
    :return:
    """
    def pd_expr(df):
        return (df['l_quantity'].astype(float) <
                df['avg_l_quantity_computed00'])

    return Filter(
        PredicateExpression(
            lambda t_: (float(t_['l_quantity']) <
                        t_['avg_l_quantity_computed00']),
            pd_expr),
        name, query_plan, False)
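
Unlike the earlier filters, this one compares two columns of the same row, so the threshold varies per tuple. A minimal sketch of the pandas expression with illustrative values:

import pandas as pd

df = pd.DataFrame({'l_quantity': ['10', '40'],
                   'avg_l_quantity_computed00': [25.0, 25.0]})
mask = df['l_quantity'].astype(float) < df['avg_l_quantity_computed00']
print(mask.tolist())  # [True, False]
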
Example 9: scan-filter-aggregate query plan with per-operator profiling
def separate_query_plan(shared_mem_buffer_size, parallel, use_pandas,
                        buffer_size, table_parts, lower, upper, sharded, sf,
                        use_shared_mem, inline_ops, merge_ops):
    secure = False
    use_native = False

    if use_shared_mem:
        system = WorkerSystem(shared_mem_buffer_size)
    else:
        system = None

    query_plan = QueryPlan(system,
                           is_async=parallel,
                           buffer_size=buffer_size,
                           use_shared_mem=use_shared_mem)

    # scan the file
    scan = map(
        lambda p: query_plan.add_operator(
            TableScan(get_file_key('lineitem', sharded, p, sf=sf),
                      use_pandas, secure, use_native,
                      'scan_{}'.format(p), query_plan, False)),
        range(0, table_parts))

    # filter on raw column _5 (l_extendedprice in the lineitem schema)
    def filter_fn(df):
        return ((df['_5'].astype(float) >= lower) &
                (df['_5'].astype(float) <= upper))

    filter_ = map(
        lambda p: query_plan.add_operator(
            Filter(PredicateExpression(None, filter_fn),
                   'filter_{}'.format(p), query_plan, False)),
        range(0, table_parts))

    # aggregation
    def agg_fun(df):
        return pd.DataFrame({'count': [len(df)]})

    aggregate = query_plan.add_operator(
        Aggregate([], use_pandas, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda p: p.set_async(not inline_ops), filter_)

    connect_many_to_many(scan, filter_)
    connect_many_to_one(filter_, aggregate)
    connect_one_to_one(aggregate, collate)

    profile_file_suffix = get_profile_file_suffix(inline_ops, merge_ops,
                                                  use_shared_mem)

    scan[0].set_profiled(
        True,
        os.path.join(ROOT_DIR, "../benchmark-output/",
                     gen_test_id() + "_scan_0" + profile_file_suffix + ".prof"))
    filter_[0].set_profiled(
        True,
        os.path.join(ROOT_DIR, "../benchmark-output/",
                     gen_test_id() + "_filter_0" + profile_file_suffix + ".prof"))
    aggregate.set_profiled(
        True,
        os.path.join(ROOT_DIR, "../benchmark-output/",
                     gen_test_id() + "_aggregate" + profile_file_suffix + ".prof"))
    collate.set_profiled(
        True,
        os.path.join(ROOT_DIR, "../benchmark-output/",
                     gen_test_id() + "_collate" + profile_file_suffix + ".prof"))
    return query_plan
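
A hypothetical driver for this plan, following the execute/print_metrics/stop pattern used in the later examples; every argument value below is illustrative:

query_plan = separate_query_plan(
    shared_mem_buffer_size=1024 * 1024, parallel=True, use_pandas=True,
    buffer_size=1024, table_parts=4, lower=1, upper=100, sharded=True,
    sf=1, use_shared_mem=False, inline_ops=False, merge_ops=False)
query_plan.execute()
query_plan.print_metrics()
query_plan.stop()
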
Example 10: parameterised baseline join plan over tables A and B, with an optional third table C
def query_plan(settings):
    # type: (SyntheticBaselineJoinSettings) -> QueryPlan
    """

    :type settings:
    :return: None
    """

    if settings.use_shared_mem:
        system = WorkerSystem(settings.shared_memory_size)
    else:
        system = None

    query_plan = QueryPlan(system, is_async=settings.parallel, buffer_size=settings.buffer_size,
                           use_shared_mem=settings.use_shared_mem)

    # Define the operators
    scan_A = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p, settings.sf),
                             "select "
                             "  * "
                             "from "
                             "  S3Object "
                             "{}"
                             .format(
                                 get_sql_suffix(settings.table_A_key, settings.table_A_parts, p,
                                                settings.table_A_sharded, add_where=True)), settings.format_,
                             settings.use_pandas,
                             settings.secure,
                             settings.use_native,
                             'scan_A_{}'.format(p),
                             query_plan,
                             False)),
            range(0, settings.table_A_parts))

    field_names_map_A = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_A_field_names)], settings.table_A_field_names))

    def project_fn_A(df):
        df = df.rename(columns=field_names_map_A, copy=False)
        return df

    project_A = map(lambda p:
                    query_plan.add_operator(Project(
                        [ProjectExpression(k, v) for k, v in field_names_map_A.items()],
                        'project_A_{}'.format(p),
                        query_plan,
                        False,
                        project_fn_A)),
                    range(0, settings.table_A_parts))

    if settings.table_A_filter_fn is not None:
        filter_A = map(lambda p:
                       query_plan.add_operator(Filter(
                           PredicateExpression(None, pd_expr=settings.table_A_filter_fn), 'filter_A_{}'.format(p), query_plan,
                           False)),
                       range(0, settings.table_A_parts))

    scan_B = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(get_file_key(settings.table_B_key, settings.table_B_sharded, p, settings.sf),
                             "select "
                             "  * "
                             "from "
                             "  S3Object "
                             "{}"
                             .format(
                                 get_sql_suffix(settings.table_B_key, settings.table_B_parts, p,
                                                settings.table_B_sharded, add_where=True)), settings.format_,
                             settings.use_pandas,
                             settings.secure,
                             settings.use_native,
                             'scan_B_{}'.format(p),
                             query_plan,
                             False)),
            range(0, settings.table_B_parts))

    field_names_map_B = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_B_field_names)], settings.table_B_field_names))

    def project_fn_B(df):
        df.rename(columns=field_names_map_B, inplace=True)
        return df

    project_B = map(lambda p:
                    query_plan.add_operator(Project(
                        [ProjectExpression(k, v) for k, v in field_names_map_B.items()],
                        'project_B_{}'.format(p),
                        query_plan,
                        False,
                        project_fn_B)),
                    range(0, settings.table_B_parts))

    if settings.table_B_filter_fn is not None:
        filter_b = map(lambda p:
                       query_plan.add_operator(Filter(
                           PredicateExpression(None, pd_expr=settings.table_B_filter_fn), 'filter_b' + '_{}'.format(p), query_plan,
                           False)),
                       range(0, settings.table_B_parts))

    if settings.table_C_key is not None:
        scan_C = \
            map(lambda p:
                query_plan.add_operator(
                    SQLTableScan(get_file_key(settings.table_C_key, settings.table_C_sharded, p, settings.sf),
                                 "select "
                                 "  * "
                                 "from "
                                 "  S3Object "
                                 "{}"
                                 .format(
                                     get_sql_suffix(settings.table_C_key, settings.table_C_parts, p,
                                                    settings.table_C_sharded, add_where=True)), settings.format_,
                                 settings.use_pandas,
                                 settings.secure,
                                 settings.use_native,
                                 'scan_C_{}'.format(p),
                                 query_plan,
                                 False)),
                range(0, settings.table_C_parts))

        field_names_map_C = OrderedDict(
            zip(['_{}'.format(i) for i, name in enumerate(settings.table_C_field_names)], settings.table_C_field_names))

        def project_fn_C(df):
            df = df.rename(columns=field_names_map_C, copy=False)
            return df

        project_C = map(lambda p:
                        query_plan.add_operator(Project(
                            [ProjectExpression(k, v) for k, v in field_names_map_C.items()],
                            'project_C_{}'.format(p),
                            query_plan,
                            False,
                            project_fn_C)),
                        range(0, settings.table_C_parts))

        filter_c = map(lambda p:
                       query_plan.add_operator(Filter(
                           PredicateExpression(None, pd_expr=settings.table_C_filter_fn), 'filter_c' + '_{}'.format(p), query_plan,
                           False)),
                       range(0, settings.table_C_parts))

        map_B_to_C = map(lambda p:
                         query_plan.add_operator(
                             Map(settings.table_B_BC_join_key, 'map_B_to_C_{}'.format(p), query_plan, False)),
                         range(0, settings.table_B_parts))

        map_C_to_C = map(lambda p:
                         query_plan.add_operator(
                             Map(settings.table_C_BC_join_key, 'map_C_to_C_{}'.format(p), query_plan, False)),
                         range(0, settings.table_C_parts))

        join_build_AB_C = map(lambda p:
                              query_plan.add_operator(
                                  HashJoinBuild(settings.table_B_BC_join_key, 'join_build_AB_C_{}'.format(p),
                                                query_plan,
                                                False)),
                              range(0, settings.table_C_parts))

        join_probe_AB_C = map(lambda p:
                              query_plan.add_operator(
                                  HashJoinProbe(
                                      JoinExpression(settings.table_B_BC_join_key, settings.table_C_BC_join_key),
                                      'join_probe_AB_C_{}'.format(p),
                                      query_plan, False)),
                              range(0, settings.table_C_parts))

    map_A_to_B = map(lambda p:
                     query_plan.add_operator(
                         Map(settings.table_A_AB_join_key, 'map_A_to_B_{}'.format(p), query_plan, False)),
                     range(0, settings.table_A_parts))

    map_B_to_B = map(lambda p:
                     query_plan.add_operator(
                         Map(settings.table_B_AB_join_key, 'map_B_to_B_{}'.format(p), query_plan, False)),
                     range(0, settings.table_B_parts))

    join_build_A_B = map(lambda p:
                         query_plan.add_operator(
                             HashJoinBuild(settings.table_A_AB_join_key, 'join_build_A_B_{}'.format(p), query_plan,
                                           False)),
                         range(0, settings.other_parts))

    join_probe_A_B = map(lambda p:
                         query_plan.add_operator(
                             HashJoinProbe(JoinExpression(settings.table_A_AB_join_key, settings.table_B_AB_join_key),
                                           'join_probe_A_B_{}'.format(p),
                                           query_plan, False)),
                         range(0, settings.other_parts))

    if settings.table_C_key is None:

        def part_aggregate_fn(df):
            sum_ = df[settings.table_B_detail_field_name].astype(float).sum()
            return pd.DataFrame({'_0': [sum_]})

        part_aggregate = map(lambda p:
                             query_plan.add_operator(Aggregate(
                                 [
                                     AggregateExpression(AggregateExpression.SUM,
                                                         lambda t: float(t[settings.table_B_detail_field_name]))
                                 ],
                                 settings.use_pandas,
                                 'part_aggregate_{}'.format(p), query_plan, False, part_aggregate_fn)),
                             range(0, settings.other_parts))

    else:
        def part_aggregate_fn(df):
            sum_ = df[settings.table_C_detail_field_name].astype(float).sum()
            return pd.DataFrame({'_0': [sum_]})

        part_aggregate = map(lambda p:
                             query_plan.add_operator(Aggregate(
                                 [
                                     AggregateExpression(AggregateExpression.SUM,
                                                         lambda t: float(t[settings.table_C_detail_field_name]))
                                 ],
                                 settings.use_pandas,
                                 'part_aggregate_{}'.format(p), query_plan, False, part_aggregate_fn)),
                             range(0, settings.table_C_parts))

    def aggregate_reduce_fn(df):
        sum_ = df['_0'].astype(float).sum()
        return pd.DataFrame({'_0': [sum_]})

    aggregate_reduce = query_plan.add_operator(Aggregate(
        [
            AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0']))
        ],
        settings.use_pandas,
        'aggregate_reduce', query_plan, False, aggregate_reduce_fn))

    aggregate_project = query_plan.add_operator(Project(
        [
            ProjectExpression(lambda t: t['_0'], 'total_balance')
        ],
        'aggregate_project', query_plan,
        False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # Inline some of the operators
    map(lambda o: o.set_async(False), project_A)
    if settings.table_A_filter_fn is not None:
        map(lambda o: o.set_async(False), filter_A)
    map(lambda o: o.set_async(False), project_B)
    if settings.table_B_filter_fn is not None:
        map(lambda o: o.set_async(False), filter_b)
    map(lambda o: o.set_async(False), map_A_to_B)
    map(lambda o: o.set_async(False), map_B_to_B)
    if settings.table_C_key is not None:
        map(lambda o: o.set_async(False), map_B_to_C)
        map(lambda o: o.set_async(False), map_C_to_C)
        map(lambda o: o.set_async(False), project_C)
        map(lambda o: o.set_async(False), filter_c)
    map(lambda o: o.set_async(False), part_aggregate)
    aggregate_project.set_async(False)

    # Connect the operators
    connect_many_to_many(scan_A, project_A)
    if settings.table_A_filter_fn is not None:
        connect_many_to_many(project_A, filter_A)
        connect_many_to_many(filter_A, map_A_to_B)
    else:
        connect_many_to_many(project_A, map_A_to_B)
    connect_all_to_all(map_A_to_B, join_build_A_B)
    connect_many_to_many(join_build_A_B, join_probe_A_B)

    connect_many_to_many(scan_B, project_B)
    if settings.table_B_filter_fn is not None:
        connect_many_to_many(project_B, filter_b)
        connect_many_to_many(filter_b, map_B_to_B)
    else:
        connect_many_to_many(project_B, map_B_to_B)

    connect_all_to_all(map_B_to_B, join_probe_A_B)

    if settings.table_C_key is None:
        connect_many_to_many(join_probe_A_B, part_aggregate)
    else:
        connect_many_to_many(join_probe_A_B, map_B_to_C)
        connect_all_to_all(map_B_to_C, join_build_AB_C)
        connect_many_to_many(join_build_AB_C, join_probe_AB_C)
        connect_many_to_many(scan_C, project_C)
        connect_many_to_many(project_C, filter_c)
        connect_many_to_many(filter_c, map_C_to_C)
        connect_all_to_all(map_C_to_C, join_probe_AB_C)
        connect_many_to_many(join_probe_AB_C, part_aggregate)

    connect_many_to_one(part_aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)
    connect_one_to_one(aggregate_project, collate)

    return query_plan
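
The settings object is only ever read for the fields below. A stand-in built with namedtuple makes the expected shape explicit; the class name Settings is hypothetical, but every field name is taken from the code above:

from collections import namedtuple

Settings = namedtuple('Settings', [
    'use_shared_mem', 'shared_memory_size', 'parallel', 'buffer_size',
    'sf', 'format_', 'use_pandas', 'secure', 'use_native', 'other_parts',
    'table_A_key', 'table_A_sharded', 'table_A_parts',
    'table_A_field_names', 'table_A_filter_fn', 'table_A_AB_join_key',
    'table_B_key', 'table_B_sharded', 'table_B_parts',
    'table_B_field_names', 'table_B_filter_fn', 'table_B_AB_join_key',
    'table_B_BC_join_key', 'table_B_detail_field_name',
    'table_C_key', 'table_C_sharded', 'table_C_parts',
    'table_C_field_names', 'table_C_filter_fn', 'table_C_BC_join_key',
    'table_C_detail_field_name',
])
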
Example 11: TPC-H Q3 baseline join benchmark runner
def run(parallel,
        use_pandas,
        secure,
        use_native,
        buffer_size,
        format_,
        customer_parts,
        order_parts,
        lineitem_parts,
        customer_sharded,
        order_sharded,
        lineitem_sharded,
        other_parts,
        sf,
        expected_result,
        customer_filter_sql=None,
        order_filter_sql=None,
        lineitem_filter_sql=None):
    """

    :return: None
    """

    print('')
    print("TPCH Q3 Baseline Join")
    print("---------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    customer_scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                get_file_key('customer', customer_sharded, p, sf, format_),
                "select "
                " * "
                "from "
                "  S3Object "
                "  {} "
                "  {} ".format(
                    ' where ' + customer_filter_sql
                    if customer_filter_sql is not None else '',
                    get_sql_suffix('customer', customer_parts, p,
                                   customer_sharded,
                                   add_where=customer_filter_sql is None)),
                format_, use_pandas, secure, use_native,
                'customer_scan' + '_{}'.format(p), query_plan, False)),
        range(0, customer_parts))

    def customer_project_fn(df):
        df = df.filter(items=['_0', '_6'], axis=1)

        df.rename(columns={
            '_0': 'c_custkey',
            '_6': 'c_mktsegment'
        },
                  inplace=True)

        return df

    customer_project = map(
        lambda p: query_plan.add_operator(
            Project([], 'customer_project' + '_{}'.format(p), query_plan,
                    False, customer_project_fn)), range(0, customer_parts))

    def customer_filter_fn(df):
        return df['c_mktsegment'] == 'BUILDING'

    customer_filter = map(
        lambda p: query_plan.add_operator(
            Filter(PredicateExpression(pd_expr=customer_filter_fn),
                   'customer_filter' + '_{}'.format(p), query_plan, False)),
        range(0, customer_parts))

    customer_map = map(
        lambda p: query_plan.add_operator(
            Map('c_custkey', 'customer_map' + '_' + str(p), query_plan, False)
        ), range(0, customer_parts))

    order_scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                get_file_key('orders', order_sharded, p, sf, format_),
                "select "
                "  * "
                "from "
                "  S3Object "
                "  {} "
                "  {} ".format(
                    ' where ' + order_filter_sql
                    if order_filter_sql is not None else '',
                    get_sql_suffix('orders', order_parts, p, order_sharded,
                                   add_where=order_filter_sql is None)),
                format_, use_pandas, secure, use_native,
                'order_scan' + '_{}'.format(p), query_plan, False)),
        range(0, order_parts))

    def order_project_fn(df):
        df = df.filter(items=['_0', '_1', '_4', '_7'], axis=1)

        df.rename(columns={
            '_1': 'o_custkey',
            '_0': 'o_orderkey',
            '_4': 'o_orderdate',
            '_7': 'o_shippriority'
        },
                  inplace=True)

        return df

    order_project = map(
        lambda p: query_plan.add_operator(
            Project([], 'order_project' + '_{}'.format(p), query_plan, False,
                    order_project_fn)), range(0, order_parts))

    def order_filter_fn(df):
        return pd.to_datetime(df['o_orderdate']) < pd.Timestamp(
            datetime.strptime('1995-03-01', '%Y-%m-%d').date())

    order_filter = map(
        lambda p: query_plan.add_operator(
            Filter(PredicateExpression(pd_expr=order_filter_fn),
                   'order_filter' + '_{}'.format(p), query_plan, False)),
        range(0, order_parts))

    order_map_1 = map(
        lambda p: query_plan.add_operator(
            Map('o_custkey', 'order_map_1' + '_' + str(p), query_plan, False)),
        range(0, order_parts))

    customer_order_join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('c_custkey', 'customer_order_join_build' + '_' + str(
                p), query_plan, False)), range(0, other_parts))

    customer_order_join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('c_custkey', 'o_custkey'),
                          'customer_order_join_probe' + '_' + str(
                              p), query_plan, False)), range(0, other_parts))

    lineitem_scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                get_file_key('lineitem', lineitem_sharded, p, sf, format_),
                "select "
                "  * "
                "from "
                "  S3Object "
                "  {} "
                "  {} ".format(
                    ' where ' + lineitem_filter_sql
                    if lineitem_filter_sql is not None else "",
                    get_sql_suffix('lineitem', lineitem_parts, p,
                                   lineitem_sharded,
                                   add_where=lineitem_filter_sql is None)),
                format_, use_pandas, secure, use_native,
                'lineitem_scan' + '_{}'.format(p), query_plan, False)),
        range(0, lineitem_parts))

    def lineitem_project_fn(df):
        df = df.filter(items=['_0', '_5', '_6', '_10'], axis=1)

        df.rename(columns={
            '_0': 'l_orderkey',
            '_5': 'l_extendedprice',
            '_6': 'l_discount',
            '_10': 'l_shipdate'
        },
                  inplace=True)

        return df

    lineitem_project = map(
        lambda p: query_plan.add_operator(
            Project([], 'lineitem_project' + '_{}'.format(p), query_plan,
                    False, lineitem_project_fn)), range(0, lineitem_parts))

    def lineitem_filter_fn(df):
        return pd.to_datetime(df['l_shipdate']) > pd.Timestamp(
            datetime.strptime('1995-03-01', '%Y-%m-%d').date())

    lineitem_filter = map(
        lambda p: query_plan.add_operator(
            Filter(PredicateExpression(pd_expr=lineitem_filter_fn),
                   'lineitem_filter' + '_{}'.format(p), query_plan, False)),
        range(0, lineitem_parts))

    lineitem_map = map(
        lambda p: query_plan.add_operator(
            Map('l_orderkey', 'lineitem_map' + '_' + str(p), query_plan, False)
        ), range(0, lineitem_parts))

    order_map_2 = map(
        lambda p: query_plan.add_operator(
            Map('o_orderkey', 'order_map_2' + '_' + str(p), query_plan, False)
        ), range(0, other_parts))

    customer_order_lineitem_join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('o_orderkey', 'customer_order_lineitem_join_build' +
                          '_' + str(p), query_plan, False)),
        range(0, other_parts))

    customer_order_lineitem_join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('o_orderkey', 'l_orderkey'),
                          'customer_order_lineitem_join_probe' + '_' + str(
                              p), query_plan, False)), range(0, other_parts))

    def groupby_fn(df):
        df['l_extendedprice'] = df['l_extendedprice'].astype(float)
        df['l_discount'] = df['l_discount'].astype(float)
        df['revenue'] = df['l_extendedprice'] * (1 - df['l_discount'])
        grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])
        agg_df = grouped['revenue'].sum()
        return agg_df.reset_index()

    group = map(
        lambda p: query_plan.add_operator(
            Group(
                ['l_orderkey', 'o_orderdate', 'o_shippriority'],
                [
                    AggregateExpression(
                        AggregateExpression.SUM, lambda t_: float(t_[
                            'l_extendedprice'] * (1 - t_['l_discount'])))
                ],
                'group' + '_{}'.format(p),
                query_plan,
                False,
                groupby_fn)),
        range(0, other_parts))

    def group_reduce_fn(df):
        grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])
        agg_df = grouped['revenue'].sum()
        return agg_df.reset_index()

    group_reduce = query_plan.add_operator(
        Group(
            ['l_orderkey', 'o_orderdate', 'o_shippriority'],
            [
                AggregateExpression(
                    AggregateExpression.SUM,
                    lambda t_: float(t_['l_extendedprice'] *
                                     (1 - t_['l_discount'])))
            ],
            'group_reduce',
            query_plan,
            False,
            group_reduce_fn))

    top = query_plan.add_operator(
        Top(10, [
            SortExpression('revenue', float, 'DESC'),
            SortExpression('o_orderdate', date, 'ASC')
        ], use_pandas, 'top', query_plan, False))

    collate = query_plan.add_operator(
        tpch_q19.collate_op('collate', query_plan))

    # Inline what we can
    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), customer_project)
    map(lambda o: o.set_async(False), order_project)
    map(lambda o: o.set_async(False), lineitem_filter)
    map(lambda o: o.set_async(False), customer_filter)
    map(lambda o: o.set_async(False), order_filter)
    map(lambda o: o.set_async(False), lineitem_map)
    map(lambda o: o.set_async(False), customer_map)
    map(lambda o: o.set_async(False), order_map_1)
    map(lambda o: o.set_async(False), order_map_2)

    # Connect the operators
    connect_many_to_many(customer_scan, customer_project)
    connect_many_to_many(customer_project, customer_filter)
    connect_many_to_many(customer_filter, customer_map)
    connect_many_to_many(order_scan, order_project)
    connect_many_to_many(order_project, order_filter)
    connect_many_to_many(order_filter, order_map_1)

    connect_all_to_all(customer_map, customer_order_join_build)
    connect_many_to_many(customer_order_join_build, customer_order_join_probe)
    connect_all_to_all(order_map_1, customer_order_join_probe)

    # connect_many_to_one(customer_order_join_probe, collate)

    connect_many_to_many(lineitem_scan, lineitem_project)
    connect_many_to_many(lineitem_project, lineitem_filter)
    connect_many_to_many(lineitem_filter, lineitem_map)

    connect_many_to_many(customer_order_join_probe, order_map_2)
    connect_all_to_all(order_map_2, customer_order_lineitem_join_build)
    connect_many_to_many(customer_order_lineitem_join_build,
                         customer_order_lineitem_join_probe)
    connect_all_to_all(lineitem_map, customer_order_lineitem_join_probe)

    # connect_many_to_one(customer_order_lineitem_join_probe, collate)

    connect_many_to_many(customer_order_lineitem_join_probe, group)
    # connect_many_to_one(group, collate)

    connect_many_to_one(group, group_reduce)
    connect_one_to_one(group_reduce, top)
    connect_one_to_one(top, collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("customer_parts: {}".format(customer_parts))
    print("order_parts: {}".format(order_parts))
    print("lineitem_parts: {}".format(lineitem_parts))
    print("customer_sharded: {}".format(customer_sharded))
    print("order_sharded: {}".format(order_sharded))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("other_parts: {}".format(other_parts))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['l_orderkey', 'o_orderdate', 'o_shippriority', 'revenue']

    assert len(tuples) == 10 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    test_util.assert_tuples(expected_result, tuples)
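
The revenue computation inside groupby_fn can be checked standalone. A worked toy example (values illustrative): order 1 yields 100 * 0.9 + 200 * 1.0 = 290.0 and order 2 yields 50 * 0.5 = 25.0:

import pandas as pd

df = pd.DataFrame({
    'l_orderkey': [1, 1, 2],
    'o_orderdate': ['1995-02-01'] * 3,
    'o_shippriority': [0, 0, 0],
    'l_extendedprice': ['100.0', '200.0', '50.0'],
    'l_discount': ['0.1', '0.0', '0.5'],
})
df['revenue'] = (df['l_extendedprice'].astype(float) *
                 (1 - df['l_discount'].astype(float)))
out = (df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])['revenue']
       .sum().reset_index())
print(out)  # revenue 290.0 for order 1, 25.0 for order 2
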
Example 12: indexing benchmark with manually wired operators
def run(parallel, use_pandas, buffer_size, table_parts, lower, upper, sf):
    secure = False
    use_native = False
    print('')
    print("Indexing Benchmark")
    print("------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # scan the file
    scan = map(
        lambda p: query_plan.add_operator(
            TableScan(get_file_key('lineitem', True, p, sf=sf),
                      use_pandas, secure, use_native,
                      'scan_{}'.format(p), query_plan, False)),
        range(0, table_parts))

    # project
    def fn(df):
        df.columns = [
            'l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber',
            'l_quantity', 'l_extendedprice', 'l_discount', 'l_tax',
            'l_returnflag', 'l_linestatus', 'l_shipdate', 'l_commitdate',
            'l_receiptdate', 'l_shipinstruct', 'l_shipmode', 'l_comment'
        ]
        df[['l_extendedprice']] = df[['l_extendedprice']].astype(float)
        return df

    project = map(
        lambda p: query_plan.add_operator(
            Project([], 'project_{}'.format(p), query_plan, False, fn)),
        range(0, table_parts))

    # perform the filter locally
    def pd_expr(df):
        return ((df['l_extendedprice'] >= lower) &
                (df['l_extendedprice'] <= upper))

    filters = map(
        lambda p: query_plan.add_operator(
            Filter(
                PredicateExpression(
                    lambda t_: (t_['l_extendedprice'] >= lower) and
                               (t_['l_extendedprice'] <= upper),
                    pd_expr),
                'filter_{}'.format(p), query_plan, False)),
        range(0, table_parts))

    # aggregation
    def agg_fun(df):
        return pd.DataFrame({'count': [len(df)]})

    aggregate = query_plan.add_operator(
        Aggregate([], use_pandas, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    for p, o in enumerate(scan):
        o.connect(project[p])
    for p, o in enumerate(project):
        o.connect(filters[p])
    for o in filters:
        o.connect(aggregate)
    aggregate.connect(collate)

    # scan[0].set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_scan_0" + ".prof"))
    # project[0].set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_profile_0" + ".prof"))
    # filters[0].set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_filters_0" + ".prof"))
    # aggregate.set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_aggregate" + ".prof"))
    # collate.set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_collate" + ".prof"))

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
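
The hand-wired connect calls above pair operator p with operator p and fan the filters into the single aggregate; assuming the connect_* helpers from the earlier examples implement exactly those patterns, the same wiring could be written as:

connect_many_to_many(scan, project)
connect_many_to_many(project, filters)
connect_many_to_one(filters, aggregate)
connect_one_to_one(aggregate, collate)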