Example #1
def sql_scan_lineitem_select_partkey_quantity_extendedprice_discount_shipinstruct_shipmode_where_filtered_op(
        sharded, shard, num_shards, use_pandas, secure, use_native, name,
        query_plan, sf, format_):
    return SQLTableScan(
        get_file_key('lineitem', sharded, shard, sf, format_), "select "
        "  l_partkey, "
        "  l_quantity, "
        "  l_extendedprice, "
        "  l_discount, "
        "  l_shipinstruct, "
        "  l_shipmode "
        "from "
        "  S3Object "
        "where "
        "  ( "
        "      ( "
        "          cast(l_quantity as integer) >= 3 and cast(l_quantity as integer) <= 3 + 10 "
        "          and l_shipmode in ('AIR', 'AIR REG') "
        "          and l_shipinstruct = 'DELIVER IN PERSON' "
        "      ) "
        "      or "
        "      ( "
        "          cast(l_quantity as integer) >= 16 and cast(l_quantity as integer) <= 16 + 10 "
        "          and l_shipmode in ('AIR', 'AIR REG') "
        "          and l_shipinstruct = 'DELIVER IN PERSON' "
        "      ) "
        "      or "
        "      ( "
        "          cast(l_quantity as integer) >= 24 and cast(l_quantity as integer) <= 24 + 10 "
        "          and l_shipmode in ('AIR', 'AIR REG') "
        "          and l_shipinstruct = 'DELIVER IN PERSON' "
        "      ) "
        "  ) {}".format(get_sql_suffix('lineitem', num_shards, shard,
                                       sharded)), format_, use_pandas, secure,
        use_native, name, query_plan, False)
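
A minimal usage sketch for the operator above, wired into a plan the way Example #8 does; the argument values are hypothetical:

query_plan = QueryPlan(is_async=False, buffer_size=0)
scan = query_plan.add_operator(
    sql_scan_lineitem_select_partkey_quantity_extendedprice_discount_shipinstruct_shipmode_where_filtered_op(
        sharded=False, shard=0, num_shards=1, use_pandas=True, secure=False,
        use_native=False, name='lineitem_scan', query_plan=query_plan, sf=1,
        format_=Format.CSV))
collate = query_plan.add_operator(Collate('collate', query_plan, False))
scan.connect(collate)  # stream the scan output into the collector
query_plan.execute()
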
Example #2
def sql_scan_part_partkey_brand_size_container_where_extra_filtered_op(sharded,
                                                                       shard,
                                                                       num_shards,
                                                                       use_pandas, secure, use_native,
                                                                       name,
                                                                       query_plan, sf):
    return SQLTableScan(get_file_key('part', sharded, shard, sf),
                        "select "
                        "  p_partkey, "
                        "  p_brand, "
                        "  p_size, "
                        "  p_container "
                        "from "
                        "  S3Object "
                        "where "
                        "  ("
                        "    p_partkey = '103853' or "
                        "    p_partkey = '104277' or "
                        "    p_partkey = '104744' "
                        "  ) and "
                        "  ( "
                        "      ( "
                        "          p_brand = 'Brand#11' "
                        "          and p_container in ("
                        "              'SM CASE', "
                        "              'SM BOX', "
                        "              'SM PACK', "
                        "              'SM PKG'"
                        "          ) "
                        "          and cast(p_size as integer) between 1 and 5 "
                        "      ) "
                        "      or "
                        "      ( "
                        "          p_brand = 'Brand#44' "
                        "          and p_container in ("
                        "              'MED BAG', "
                        "              'MED BOX', "
                        "              'MED PKG', "
                        "              'MED PACK'"
                        "          ) "
                        "          and cast(p_size as integer) between 1 and 10 "
                        "      ) "
                        "      or "
                        "      ( "
                        "          p_brand = 'Brand#53' "
                        "          and p_container in ("
                        "              'LG CASE', "
                        "              'LG BOX', "
                        "              'LG PACK', "
                        "              'LG PKG'"
                        "          ) "
                        "          and cast(p_size as integer) between 1 and 15 "
                        "      ) "
                        "  ) {}"
                        .format(get_sql_suffix('part', num_shards, shard, sharded)),
                        use_pandas, secure, use_native,
                        name,
                        query_plan,
                        False)
Example #3
def sql_filtered_scan_lineitem_operator_def(max_shipped_date, sharded, shard, sf, use_pandas, secure, use_native, name, query_plan, format_):
    return SQLTableScan(get_file_key('lineitem', sharded, shard, sf),
                        " select l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate "
                        " from S3Object "
                        " where cast(l_shipdate as timestamp) <= cast(\'{}\' as timestamp) "
                        ";".format(max_shipped_date.strftime('%Y-%m-%d')), format_,
                        use_pandas, secure, use_native, name, query_plan,
                        False)
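
A hedged calling sketch: max_shipped_date must be datetime-like, since the function formats it with strftime('%Y-%m-%d'); the cutoff date and operator name below are illustrative:

from datetime import datetime

scan_op = query_plan.add_operator(  # query_plan: an existing QueryPlan instance
    sql_filtered_scan_lineitem_operator_def(
        max_shipped_date=datetime(1998, 9, 2), sharded=False, shard=0, sf=1,
        use_pandas=True, secure=False, use_native=False,
        name='filtered_lineitem_scan', query_plan=query_plan,
        format_=Format.CSV))
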
Example #4
def sql_scan_part_select_all_op(sharded, shard, num_shards, use_pandas, secure,
                                use_native, name, query_plan, sf, format_):
    return SQLTableScan(
        get_file_key('part', sharded, shard, sf, format_), "select "
        "  * "
        "from "
        "  S3Object "
        "  {} ".format(
            get_sql_suffix('part', num_shards, shard, sharded,
                           add_where=True)), format_, use_pandas, secure,
        use_native, name, query_plan, False)
Example #5
def bloom_scan_partkey_quantity_extendedprice_discount_shipinstruct_shipmode_where_extra_filtered_op(
        sharded,
        shard,
        num_shards,
        use_pandas, secure, use_native,
        name,
        query_plan, sf):
    return SQLTableScanBloomUse(get_file_key('lineitem', sharded, shard, sf),
                                "select "
                                "  l_partkey, "
                                "  l_quantity, "
                                "  l_extendedprice, "
                                "  l_discount, "
                                "  l_shipinstruct, "
                                "  l_shipmode "
                                "from "
                                "  S3Object "
                                "where "
                                "  ("
                                "    l_partkey = '103853' or "
                                "    l_partkey = '104277' or "
                                "    l_partkey = '104744' "
                                "  ) and "
                                "  ( "
                                "      ( "
                                "          cast(l_quantity as integer) >= 3 "
                                "          and cast(l_quantity as integer) <= 3 + 10 "
                                "          and l_shipmode in ('AIR', 'AIR REG') "
                                "          and l_shipinstruct = 'DELIVER IN PERSON' "
                                "      ) "
                                "      or "
                                "      ( "
                                "          cast(l_quantity as integer) >= 16 "
                                "          and cast(l_quantity as integer) <= 16 + 10 "
                                "          and l_shipmode in ('AIR', 'AIR REG') "
                                "          and l_shipinstruct = 'DELIVER IN PERSON' "
                                "      ) "
                                "      or "
                                "      ( "
                                "          cast(l_quantity as integer) >= 24 "
                                "          and cast(l_quantity as integer) <= 24 + 10 "
                                "          and l_shipmode in ('AIR', 'AIR REG') "
                                "          and l_shipinstruct = 'DELIVER IN PERSON' "
                                "      ) "
                                "  ) {}"
                                .format(get_sql_suffix('lineitem', num_shards, shard, sharded)),
                                'l_partkey',
                                use_pandas, secure, use_native,
                                name,
                                query_plan,
                                False)
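
SQLTableScanBloomUse consumes a bloom filter built by an upstream BloomCreate, so the operator above only works once that edge is connected. A minimal sketch of the pairing, following the BloomCreate-to-bloom-scan pattern in Example #13 (operator names are hypothetical, and query_plan is an existing QueryPlan):

bloom_create = query_plan.add_operator(
    BloomCreate('p_partkey', 'bloom_create_p_partkey', query_plan, False))
bloom_scan = query_plan.add_operator(
    bloom_scan_partkey_quantity_extendedprice_discount_shipinstruct_shipmode_where_extra_filtered_op(
        False, 0, 1, True, False, False, 'bloom_lineitem_scan', query_plan, 1))
# bloom_create would itself be fed by a part-table scan (not shown); the
# filter it builds narrows which l_partkey values the S3 scan reads.
bloom_create.connect(bloom_scan)
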
Example #6
def sql_scan_part_select_all_where_partkey_op(sharded, shard, num_shards,
                                              use_pandas, secure, use_native,
                                              name, query_plan, sf):
    return SQLTableScan(
        get_file_key('part', sharded, shard, sf), "select "
        "  * "
        "from "
        "  S3Object "
        "where "
        "  ("
        "    p_partkey = '103853' or "
        "    p_partkey = '104277' or "
        "    p_partkey = '104744'"
        "  ) {}".format(get_sql_suffix('part', num_shards, shard, sharded)),
        use_pandas, secure, use_native, name, query_plan, False)
Example #7
def bloom_scan_lineitem_select_orderkey_partkey_quantity_extendedprice_where_partkey_bloom_partkey_op(
        sharded, shard, num_shards, use_pandas, secure, use_native, name,
        query_plan, sf):
    return SQLTableScanBloomUse(
        get_file_key('lineitem', sharded, shard, sf), "select "
        "  l_orderkey, l_partkey, l_quantity, l_extendedprice "
        "from "
        "  S3Object "
        "where "
        "  l_partkey in ('181726', '182405') "
        "  {}".format(
            get_sql_suffix('lineitem',
                           num_shards,
                           shard,
                           sharded,
                           add_where=False)), 'l_partkey', use_pandas, secure,
        use_native, name, query_plan, False)
Example #8
def main():
    parts = 32
    query_plan = QueryPlan(is_async=True, buffer_size=0)

    # Query plan
    lineitem_scan = map(lambda p:
                        query_plan.add_operator(
                            SQLTableScan(get_file_key('lineitem', True, p),
                                         "select * from S3Object;", Format.CSV,
                                         use_pandas=True, secure=False, use_native=False,
                                         name='scan_' + str(p), query_plan=query_plan,
                                         log_enabled=False)),
                        range(0, parts))

    collate = query_plan.add_operator(
        Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(collate), enumerate(lineitem_scan))

    query_plan.execute()
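
If main() also needs the results, it could continue past execute() in the style of Example #14; a sketch of that tail (not part of the original function):

    tuples = collate.tuples()
    collate.print_tuples(tuples)
    query_plan.print_metrics()
    query_plan.stop()
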
Example #9
def sql_scan_part_select_all_where_brand_and_container_op(
        sharded, shard, num_shards, use_pandas, secure, use_native, name,
        query_plan, sf):
    """with part_scan as (select * from part)

    :param query_plan:
    :param name:
    :return: the SQLTableScan operator
    """

    return SQLTableScan(
        get_file_key('part', sharded, shard, sf), "select "
        "  * "
        "from "
        "  S3Object "
        "where "
        "  p_brand = 'Brand#41' and "
        "  p_container = 'SM PACK' "
        "  {} ".format(
            get_sql_suffix('part', num_shards, shard, sharded,
                           add_where=False)), use_pandas, secure, use_native,
        name, query_plan, False)
Example #10
def sql_scan_lineitem_select_orderkey_partkey_quantity_extendedprice(
        sharded, shard, num_shards, use_pandas, secure, use_native, name,
        query_plan, sf, format_):
    """with lineitem_scan as (select * from lineitem)

    :param query_plan:
    :param name:
    :return: the SQLTableScan operator
    """

    return SQLTableScan(
        get_file_key('lineitem', sharded, shard, sf, format_), "select "
        "  l_orderkey, l_partkey, l_quantity, l_extendedprice "
        "from "
        "  S3Object "
        "  {} ".format(
            get_sql_suffix('lineitem',
                           num_shards,
                           shard,
                           sharded,
                           add_where=True)), format_, use_pandas, secure,
        use_native, name, query_plan, False)
Example #11
def bloom_scan_lineitem_select_orderkey_partkey_quantity_extendedprice_bloom_partkey_op(
        sharded, shard, num_shards, use_pandas, secure, use_native, name,
        query_plan, sf, format_):
    return SQLTableScanBloomUse(
        get_file_key('lineitem', sharded, shard, sf, format_),
        "select "
        "  l_orderkey, l_partkey, l_quantity, l_extendedprice "
        "from "
        "  S3Object "
        "  {}".format(
            get_sql_suffix('lineitem',
                           num_shards,
                           shard,
                           sharded,
                           add_where=True)),
        'l_partkey',
        format_=format_,
        use_pandas=use_pandas,
        secure=secure,
        use_native=use_native,
        name=name,
        query_plan=query_plan,
        log_enabled=False)
Example #12
def sql_scan_lineitem_select_all_where_partkey_op(sharded, shard, num_shards,
                                                  use_pandas, secure,
                                                  use_native, name, query_plan,
                                                  sf):
    """with lineitem_scan as (select * from lineitem)

    :param query_plan:
    :param name:
    :return: the SQLTableScan operator
    """
    return SQLTableScan(
        get_file_key('lineitem', sharded, shard, sf), "select "
        "  * "
        "from "
        "  S3Object "
        "where "
        "  l_partkey in ('181726', '182405') "
        "  {}".format(
            get_sql_suffix('lineitem',
                           num_shards,
                           shard,
                           sharded,
                           add_where=False)), use_pandas, secure, use_native,
        name, query_plan, False)
Example #13
def query_plan(settings):
    # type: (SyntheticSemiJoinSettings) -> QueryPlan
    """

    :return: the constructed QueryPlan
    """

    if settings.use_shared_mem:
        system = WorkerSystem(settings.shared_memory_size)
    else:
        system = None

    query_plan = QueryPlan(system,
                           is_async=settings.parallel,
                           buffer_size=settings.buffer_size,
                           use_shared_mem=settings.use_shared_mem)

    # Define the operators
    scan_a = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p, settings.sf),
                             "select "
                             "  {} "
                             "from "
                             "  S3Object "
                             "where "
                             "  {} "
                             "  {} "
                             .format(settings.table_A_AB_join_key,
                                     settings.table_A_filter_sql,
                                     get_sql_suffix(settings.table_A_key, settings.table_A_parts, p,
                                                    settings.table_A_sharded)), settings.format_,
                             settings.use_pandas,
                             settings.secure,
                             settings.use_native,
                             'scan_a' + '_{}'.format(p),
                             query_plan,
                             False)),
            range(0, settings.table_A_parts))

    field_names_map_a = OrderedDict(
        zip([
            '_{}'.format(i)
            for i, name in enumerate(settings.table_A_field_names)
        ], settings.table_A_field_names))

    def project_fn_a(df):
        df = df.rename(columns=field_names_map_a, copy=False)
        return df

    project_a = map(
        lambda p: query_plan.add_operator(
            Project([
                ProjectExpression(k, v)
                for k, v in field_names_map_a.iteritems()
            ], 'project_a' + '_{}'.format(p), query_plan, False, project_fn_a)
        ), range(0, settings.table_A_parts))

    bloom_create_ab_join_key = map(
        lambda p: query_plan.add_operator(
            BloomCreate(settings.table_A_AB_join_key,
                        'bloom_create_ab_join_key' + '_{}'.format(p),
                        query_plan,
                        False,
                        fp_rate=settings.fp_rate)),
        range(0, settings.table_A_parts))

    scan_b_on_ab_join_key = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScanBloomUse(get_file_key(settings.table_B_key, settings.table_B_sharded, p, settings.sf),
                                     "select "
                                     "  {},{} "
                                     "from "
                                     "  S3Object "
                                     "where "
                                     "  {} "
                                     "  {} "
                                     .format(settings.table_B_BC_join_key,
                                             settings.table_B_AB_join_key,
                                             settings.table_B_filter_sql,
                                             get_sql_suffix(settings.table_B_key, settings.table_B_parts, p,
                                                            settings.table_B_sharded, add_where=False)), settings.format_,
                                     settings.table_B_AB_join_key,
                                     settings.use_pandas,
                                     settings.secure,
                                     settings.use_native,
                                     'scan_b_on_ab_join_key' + '_{}'.format(p),
                                     query_plan,
                                     False)),
            range(0, settings.table_B_parts))

    if settings.table_C_key is None:

        scan_b_detail_on_b_pk = \
            map(lambda p:
                query_plan.add_operator(
                    SQLTableScanBloomUse(get_file_key(settings.table_B_key, settings.table_B_sharded, p, settings.sf),
                                         "select "
                                         "  {},{} "
                                         "from "
                                         "  S3Object "
                                         "where "
                                         "  {} "
                                         "  {} "
                                         .format(settings.table_B_primary_key,
                                                 settings.table_B_detail_field_name,
                                                 settings.table_B_filter_sql,
                                                 get_sql_suffix(settings.table_B_key, settings.table_B_parts, p,
                                                                settings.table_B_sharded, add_where=False)), settings.format_,
                                         settings.table_B_primary_key,
                                         settings.use_pandas,
                                         settings.secure,
                                         settings.use_native,
                                         'scan_b_detail_on_b_pk' + '_{}'.format(p),
                                         query_plan,
                                         False)),
                range(0, settings.table_B_parts))

        field_names_map_b_detail = OrderedDict([
            ('_0', settings.table_B_primary_key),
            ('_1', settings.table_B_detail_field_name)
        ])

        def project_fn_b_detail(df):
            df.rename(columns=field_names_map_b_detail, inplace=True)
            return df

        project_b_detail = map(
            lambda p: query_plan.add_operator(
                Project([
                    ProjectExpression(k, v)
                    for k, v in field_names_map_b_detail.iteritems()
                ], 'project_b_detail' + '_{}'.format(p), query_plan,
                        False, project_fn_b_detail)),
            range(0, settings.table_B_parts))

        map_b_pk_1 = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_B_primary_key, 'map_b_pk_1' + '_{}'.format(
                    p), query_plan, False)), range(0, settings.table_B_parts))

        map_b_pk_2 = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_B_primary_key, 'map_b_pk_2' + '_{}'.format(
                    p), query_plan, False)), range(0, settings.table_B_parts))

        bloom_create_b_pk = map(
            lambda p: query_plan.add_operator(
                BloomCreate(settings.table_B_primary_key,
                            'bloom_create_b_pk' + '_{}'.format(p),
                            query_plan,
                            False,
                            fp_rate=settings.fp_rate)),
            range(0, settings.table_B_parts))

        join_probe_ab_and_b_on_b_pk = map(
            lambda p: query_plan.add_operator(
                HashJoinProbe(
                    JoinExpression(settings.table_B_primary_key, settings.
                                   table_B_primary_key),
                    'join_probe_ab_and_b_on_b_pk' + '_{}'.format(
                        p), query_plan, False)),
            range(0, settings.table_B_parts))

        join_build_ab_and_b_on_b_pk = map(
            lambda p: query_plan.add_operator(
                HashJoinBuild(settings.table_B_primary_key,
                              'join_build_ab_and_b_on_b_pk' + '_{}'.format(
                                  p), query_plan, False)),
            range(0, settings.table_B_parts))

    else:
        scan_c_on_bc_join_key = \
            map(lambda p:
                query_plan.add_operator(
                    SQLTableScanBloomUse(get_file_key(settings.table_C_key, settings.table_C_sharded, p, settings.sf),
                                         "select "
                                         "  {}, {} "
                                         "from "
                                         "  S3Object "
                                         "where "
                                         "  {} "
                                         "  {} "
                                         .format(settings.table_C_primary_key,
                                                 settings.table_C_BC_join_key,
                                                 settings.table_C_filter_sql,
                                                 get_sql_suffix(settings.table_C_key, settings.table_C_parts, p,
                                                                settings.table_C_sharded, add_where=False)),
                                         settings.table_C_BC_join_key, settings.format_,
                                         settings.use_pandas,
                                         settings.secure,
                                         settings.use_native,
                                         'scan_c_on_bc_join_key' + '_{}'.format(p),
                                         query_plan,
                                         False)),
                range(0, settings.table_C_parts))

        field_names_map_c = OrderedDict(
            zip([
                '_{}'.format(i)
                for i, name in enumerate(settings.table_C_field_names)
            ], settings.table_C_field_names))

        def project_fn_c(df):
            df.rename(columns=field_names_map_c, inplace=True)
            return df

        project_c = map(
            lambda p: query_plan.add_operator(
                Project([
                    ProjectExpression(k, v)
                    for k, v in field_names_map_c.iteritems()
                ], 'project_c' + '_{}'.format(p), query_plan, False,
                        project_fn_c)), range(0, settings.table_C_parts))

        scan_c_detail_on_c_pk = \
            map(lambda p:
                query_plan.add_operator(
                    SQLTableScanBloomUse(get_file_key(settings.table_C_key, settings.table_C_sharded, p, settings.sf),
                                         "select "
                                         "  {},{} "
                                         "from "
                                         "  S3Object "
                                         "where "
                                         "  {} "
                                         "  {} "
                                         .format(settings.table_C_primary_key,
                                                 settings.table_C_detail_field_name,
                                                 settings.table_C_filter_sql,
                                                 get_sql_suffix(settings.table_C_key, settings.table_C_parts, p,
                                                                settings.table_C_sharded, add_where=False)),
                                         settings.table_C_primary_key, settings.format_,
                                         settings.use_pandas,
                                         settings.secure,
                                         settings.use_native,
                                         'scan_c_detail_on_c_pk' + '_{}'.format(p),
                                         query_plan,
                                         False)),
                range(0, settings.table_C_parts))

        field_names_map_c_detail = OrderedDict([
            ('_0', settings.table_C_primary_key),
            ('_1', settings.table_C_detail_field_name)
        ])

        def project_fn_c_detail(df):
            df.rename(columns=field_names_map_c_detail, inplace=True)
            return df

        project_c_detail = map(
            lambda p: query_plan.add_operator(
                Project([
                    ProjectExpression(k, v)
                    for k, v in field_names_map_c_detail.iteritems()
                ], 'project_c_detail' + '_{}'.format(p), query_plan,
                        False, project_fn_c_detail)),
            range(0, settings.table_C_parts))

        map_bc_b_join_key = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_B_BC_join_key, 'map_bc_b_join_key' + '_{}'.
                    format(p), query_plan, False)),
            range(0, settings.table_C_parts))

        map_c_pk_1 = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_C_primary_key, 'map_c_pk_1' + '_{}'.format(
                    p), query_plan, False)), range(0, settings.table_C_parts))

        map_c_pk_2 = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_C_primary_key, 'map_c_pk_2' + '_{}'.format(
                    p), query_plan, False)), range(0, settings.table_C_parts))

        bloom_create_c_pk = map(
            lambda p: query_plan.add_operator(
                BloomCreate(settings.table_C_primary_key,
                            'bloom_create_c_pk' + '_{}'.format(p),
                            query_plan,
                            False,
                            fp_rate=settings.fp_rate)),
            range(0, settings.table_C_parts))

        join_build_ab_and_c_on_bc_join_key = map(
            lambda p: query_plan.add_operator(
                HashJoinBuild(
                    settings.table_B_BC_join_key,
                    'join_build_ab_and_c_on_bc_join_key' + '_{}'.format(
                        p), query_plan, False)),
            range(0, settings.table_C_parts))

        join_probe_ab_and_c_on_bc_join_key = map(
            lambda p: query_plan.add_operator(
                HashJoinProbe(
                    JoinExpression(settings.table_B_BC_join_key, settings.
                                   table_C_BC_join_key),
                    'join_probe_ab_and_c_on_bc_join_key' + '_{}'.format(
                        p), query_plan, False)),
            range(0, settings.table_C_parts))

        join_build_abc_and_c_on_c_pk = map(
            lambda p: query_plan.add_operator(
                HashJoinBuild(settings.table_C_primary_key,
                              'join_build_abc_and_c_on_c_pk' + '_{}'.format(
                                  p), query_plan, False)),
            range(0, settings.table_C_parts))

        join_probe_abc_and_c_on_c_pk = map(
            lambda p: query_plan.add_operator(
                HashJoinProbe(
                    JoinExpression(settings.table_C_primary_key, settings.
                                   table_C_primary_key),
                    'join_probe_abc_and_c_on_c_pk' + '_{}'.format(
                        p), query_plan, False)),
            range(0, settings.table_C_parts))

        bloom_create_bc_join_key = map(
            lambda p: query_plan.add_operator(
                BloomCreate(settings.table_B_BC_join_key,
                            'bloom_create_bc_join_key' + '_{}'.format(
                                p), query_plan, False)),
            range(0, settings.table_B_parts))

        map_bc_c_join_key = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_C_BC_join_key, 'map_bc_c_join_key' + '_{}'.
                    format(p), query_plan, False)),
            range(0, settings.table_B_parts))

    field_names_map_b = OrderedDict(
        zip([
            '_{}'.format(i)
            for i, name in enumerate(settings.table_B_field_names)
        ], settings.table_B_field_names))

    def project_fn_b(df):
        df.rename(columns=field_names_map_b, inplace=True)
        return df

    project_b = map(
        lambda p: query_plan.add_operator(
            Project([
                ProjectExpression(k, v)
                for k, v in field_names_map_b.iteritems()
            ], 'project_b' + '_{}'.format(p), query_plan, False, project_fn_b)
        ), range(0, settings.table_B_parts))

    map_ab_a_join_key = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_A_AB_join_key, 'map_ab_a_join_key' + '_{}'
                .format(p), query_plan, False)),
        range(0, settings.table_A_parts))

    map_ab_b_join_key = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_AB_join_key, 'map_ab_b_join_key' + '_{}'
                .format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    join_build_a_and_b_on_ab_join_key = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(
                settings.table_A_AB_join_key,
                'join_build_a_and_b_on_ab_join_key' + '_{}'.format(
                    p), query_plan, False)), range(0, settings.table_B_parts))

    join_probe_a_and_b_on_ab_join_key = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(
                JoinExpression(settings.table_A_AB_join_key, settings.
                               table_B_AB_join_key),
                'join_probe_a_and_b_on_ab_join_key' + '_{}'.format(
                    p), query_plan, False)), range(0, settings.table_B_parts))

    if settings.table_C_key is None:

        def part_aggregate_fn(df):
            sum_ = df[settings.table_B_detail_field_name].astype(
                np.float).sum()
            return pd.DataFrame({'_0': [sum_]})

        part_aggregate = map(
            lambda p: query_plan.add_operator(
                Aggregate([
                    AggregateExpression(
                        AggregateExpression.SUM, lambda t: float(t[
                            settings.table_B_detail_field_name]))
                ], settings.use_pandas, 'part_aggregate_{}'.format(p),
                          query_plan, False, part_aggregate_fn)),
            range(0, settings.table_B_parts))

    else:

        def part_aggregate_fn(df):
            sum_ = df[settings.table_C_detail_field_name].astype(
                np.float).sum()
            return pd.DataFrame({'_0': [sum_]})

        part_aggregate = map(
            lambda p: query_plan.add_operator(
                Aggregate([
                    AggregateExpression(
                        AggregateExpression.SUM, lambda t: float(t[
                            settings.table_C_detail_field_name]))
                ], settings.use_pandas, 'part_aggregate_{}'.format(p),
                          query_plan, False, part_aggregate_fn)),
            range(0, settings.table_C_parts))

    def aggregate_reduce_fn(df):
        sum_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0']))
        ], settings.use_pandas, 'aggregate_reduce', query_plan, False,
                  aggregate_reduce_fn))

    aggregate_project = query_plan.add_operator(
        Project([ProjectExpression(lambda t: t['_0'], 'total_balance')],
                'aggregate_project', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # Inline some of the operators
    map(lambda o: o.set_async(False), project_a)
    map(lambda o: o.set_async(False), project_b)
    map(lambda o: o.set_async(False), map_ab_a_join_key)
    map(lambda o: o.set_async(False), map_ab_b_join_key)
    if settings.table_C_key is None:
        map(lambda o: o.set_async(False), map_b_pk_1)
        map(lambda o: o.set_async(False), map_b_pk_2)
        map(lambda o: o.set_async(False), project_b_detail)
    else:
        map(lambda o: o.set_async(False), map_bc_b_join_key)
        map(lambda o: o.set_async(False), map_bc_c_join_key)
        map(lambda o: o.set_async(False), map_c_pk_1)
        map(lambda o: o.set_async(False), map_c_pk_2)
        map(lambda o: o.set_async(False), project_c)
        map(lambda o: o.set_async(False), project_c_detail)
    aggregate_project.set_async(False)

    # Connect the operators
    connect_many_to_many(scan_a, project_a)

    connect_many_to_many(project_a, map_ab_a_join_key)

    connect_all_to_all(map_ab_a_join_key, join_build_a_and_b_on_ab_join_key)
    connect_all_to_all(project_a, bloom_create_ab_join_key)
    # connect_all_to_all(map_A_to_B, join_build_a_and_b_on_ab_join_key)
    connect_many_to_many(join_build_a_and_b_on_ab_join_key,
                         join_probe_a_and_b_on_ab_join_key)

    # connect_all_to_all(map_bloom_A_to_B, bloom_create_ab_join_key)
    connect_many_to_many(bloom_create_ab_join_key, scan_b_on_ab_join_key)
    connect_many_to_many(scan_b_on_ab_join_key, project_b)
    # connect_many_to_many(project_b, join_probe_a_and_b_on_ab_join_key)
    # connect_all_to_all(map_B_to_B, join_probe_a_and_b_on_ab_join_key)

    connect_many_to_many(project_b, map_ab_b_join_key)
    connect_all_to_all(map_ab_b_join_key, join_probe_a_and_b_on_ab_join_key)

    # connect_many_to_many(join_probe_a_and_b_on_ab_join_key, map_bloom_B_to_B)

    if settings.table_C_key is None:
        # connect_all_to_all(join_probe_a_and_b_on_ab_join_key, part_aggregate)
        connect_many_to_many(scan_b_detail_on_b_pk, project_b_detail)
        connect_many_to_many(project_b_detail, map_b_pk_2)
        connect_many_to_many(bloom_create_b_pk, scan_b_detail_on_b_pk)
        connect_all_to_all(join_probe_a_and_b_on_ab_join_key,
                           bloom_create_b_pk)
        connect_all_to_all(map_b_pk_2, join_probe_ab_and_b_on_b_pk)
        connect_many_to_many(join_probe_ab_and_b_on_b_pk, part_aggregate)
        connect_many_to_many(join_build_ab_and_b_on_b_pk,
                             join_probe_ab_and_b_on_b_pk)
        connect_many_to_many(join_probe_a_and_b_on_ab_join_key, map_b_pk_1)
        connect_all_to_all(map_b_pk_1, join_build_ab_and_b_on_b_pk)

    else:
        connect_all_to_all(join_probe_a_and_b_on_ab_join_key,
                           bloom_create_bc_join_key)
        connect_many_to_many(bloom_create_bc_join_key, scan_c_on_bc_join_key)
        connect_many_to_many(scan_c_on_bc_join_key, project_c)
        # connect_many_to_many(project_c, join_probe_ab_and_c_on_bc_join_key)
        connect_all_to_all(map_bc_c_join_key,
                           join_probe_ab_and_c_on_bc_join_key)
        # connect_many_to_many(join_probe_a_and_b_on_ab_join_key, join_build_ab_and_c_on_bc_join_key)
        connect_many_to_many(join_probe_a_and_b_on_ab_join_key,
                             map_bc_b_join_key)
        connect_all_to_all(map_bc_b_join_key,
                           join_build_ab_and_c_on_bc_join_key)
        connect_all_to_all(join_probe_ab_and_c_on_bc_join_key,
                           bloom_create_c_pk)
        # connect_many_to_many(join_probe_ab_and_c_on_bc_join_key, join_build_abc_and_c_on_c_pk)
        connect_many_to_many(join_probe_ab_and_c_on_bc_join_key, map_c_pk_1)
        connect_all_to_all(map_c_pk_1, join_build_abc_and_c_on_c_pk)
        connect_many_to_many(bloom_create_c_pk, scan_c_detail_on_c_pk)
        # connect_all_to_all(bloom_create_bc_join_key, scan_c_detail_on_c_pk)
        connect_many_to_many(join_build_abc_and_c_on_c_pk,
                             join_probe_abc_and_c_on_c_pk)
        # connect_many_to_many(join_probe_a_and_b_on_ab_join_key, map_B_to_C)
        # connect_all_to_all(join_probe_a_and_b_on_ab_join_key, join_build_abc_and_c_on_c_pk)
        connect_many_to_many(scan_c_detail_on_c_pk, project_c_detail)
        # connect_many_to_many(project_c_detail, map_C_to_C)
        # connect_all_to_all(project_c_detail, join_probe_abc_and_c_on_c_pk)
        connect_many_to_many(project_c_detail, map_c_pk_2)

        connect_many_to_many(project_c, map_bc_c_join_key)
        connect_many_to_many(join_build_ab_and_c_on_bc_join_key,
                             join_probe_ab_and_c_on_bc_join_key)
        connect_all_to_all(map_c_pk_2, join_probe_abc_and_c_on_c_pk)

        connect_many_to_many(join_probe_abc_and_c_on_c_pk, part_aggregate)

    connect_many_to_one(part_aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)
    connect_one_to_one(aggregate_project, collate)

    return query_plan
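
A driver sketch for this plan builder, reusing the execution tail of Example #14; constructing SyntheticSemiJoinSettings is elided because its full field list is not shown here:

plan = query_plan(settings)  # settings: a populated SyntheticSemiJoinSettings
plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id())
plan.execute()
plan.print_metrics()
# Fetching result tuples would need a handle on the collate operator,
# which this builder does not return.
plan.stop()
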
Example #14
def run(parallel,
        use_pandas,
        buffer_size,
        table_parts,
        lower,
        upper,
        sf,
        nthreads=16,
        format_=Format.CSV):
    secure = False
    use_native = False
    print('')
    print("Indexing Benchmark")
    print("------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)
    assert sf == 1 or sf == 10

    # Scan Index Files
    index_scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                'tpch-sf{}/lineitem_sharded/index/index_l_extendedprice.csv.{}'
                .format(sf, p if sf == 1 else p + 1),
                "select first_byte, last_byte "
                " from S3Object "
                " where col_values >= {} and col_values <= {};".format(
                    lower, upper), format_, use_pandas, secure, use_native,
                'index_scan_{}'.format(p), query_plan, False)),
        range(0, table_parts))

    # Range accesses
    range_access = map(
        lambda p: query_plan.add_operator(
            TableRangeAccess(
                get_file_key('lineitem', True, p, sf=sf, format_=Format.CSV),
                use_pandas, secure, use_native, 'range_access_{}'.format(
                    p), query_plan, False)), range(0, table_parts))

    map(lambda o: o.set_nthreads(nthreads), range_access)

    # Aggregation
    def agg_fun(df):
        return pd.DataFrame({'count': [len(df)]})

    aggregate = query_plan.add_operator(
        Aggregate([], True, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(range_access[p]), enumerate(index_scan))
    map(lambda (p, o): o.connect(aggregate), enumerate(range_access))
    aggregate.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
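
An illustrative invocation of run(); the l_extendedprice bounds are hypothetical:

if __name__ == '__main__':
    run(parallel=True,
        use_pandas=True,
        buffer_size=0,
        table_parts=32,
        lower=0,
        upper=910,
        sf=1)
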
Example #15
def query_plan(settings):
    # type: (SyntheticBaselineJoinSettings) -> QueryPlan
    """

    :type settings: SyntheticBaselineJoinSettings
    :return: the constructed QueryPlan
    """

    query_plan = QueryPlan(is_async=settings.parallel,
                           buffer_size=settings.buffer_size)

    def scan_A_fun(df):
        df.columns = settings.table_A_field_names
        criterion = settings.table_A_filter_fn(df)
        df = df[criterion]
        return df

    # Define the operators
    scan_A = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p),
                             "select "
                             "  * "
                             "from "
                             "  S3Object "
                             "{}"
                             .format(
                                 get_sql_suffix(settings.table_A_key, settings.table_A_parts, p,
                                                settings.table_A_sharded, add_where=True)), settings.format_,
                             settings.use_pandas,
                             settings.secure,
                             settings.use_native,
                             'scan_A_{}'.format(p),
                             query_plan,
                             False, fn=scan_A_fun)),
            range(0, settings.table_A_parts))

    def scan_B_fun(df):
        df.columns = settings.table_B_field_names
        criterion = settings.table_B_filter_fn(df)
        return df[criterion]

    scan_B = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(get_file_key(settings.table_B_key, settings.table_B_sharded, p),
                             "select "
                             "  * "
                             "from "
                             "  S3Object "
                             "{}"
                             .format(
                                 get_sql_suffix(settings.table_B_key, settings.table_B_parts, p,
                                                settings.table_B_sharded, add_where=True)), settings.format_,
                             settings.use_pandas,
                             settings.secure,
                             settings.use_native,
                             'scan_B_{}'.format(p),
                             query_plan,
                             False, fn=scan_B_fun)),
            range(0, settings.table_B_parts))
    """
    field_names_map_B = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_B_field_names)], settings.table_B_field_names))

    def project_fn_B(df):
        df.rename(columns=field_names_map_B, inplace=True)
        return df

    project_B = map(lambda p:
                    query_plan.add_operator(Project(
                        [ProjectExpression(k, v) for k, v in field_names_map_B.iteritems()],
                        'project_B_{}'.format(p),
                        query_plan,
                        True,
                        project_fn_B)),
                    range(0, settings.table_B_parts))

    filter_b = map(lambda p:
                   query_plan.add_operator(Filter(
                       PredicateExpression(None, pd_expr=settings.table_B_filter_fn), 'filter_b' + '_{}'.format(p), query_plan,
                       False)),
                   range(0, settings.table_B_parts))
    """

    def scan_C_fun(df):
        df.columns = settings.table_C_field_names
        criterion = settings.table_C_filter_fn(df)
        return df[criterion]

    scan_C = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(get_file_key(settings.table_C_key, settings.table_C_sharded, p),
                             "select "
                             "  * "
                             "from "
                             "  S3Object "
                             "{}"
                             .format(
                                 get_sql_suffix(settings.table_C_key, settings.table_C_parts, p,
                                                settings.table_C_sharded, add_where=True)), settings.format_,
                             settings.use_pandas,
                             settings.secure,
                             settings.use_native,
                             'scan_C_{}'.format(p),
                             query_plan,
                             False, fn=scan_C_fun)),
            range(0, settings.table_C_parts))
    """
    field_names_map_C = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_C_field_names)], settings.table_C_field_names))

    def project_fn_C(df):
        df.rename(columns=field_names_map_C, inplace=True)
        return df

    project_C = map(lambda p:
                    query_plan.add_operator(Project(
                        [ProjectExpression(k, v) for k, v in field_names_map_C.iteritems()],
                        'project_C_{}'.format(p),
                        query_plan,
                        True,
                        project_fn_C)),
                    range(0, settings.table_C_parts))

    filter_c = map(lambda p:
                   query_plan.add_operator(Filter(
                       PredicateExpression(None, pd_expr=settings.table_C_filter_fn), 'filter_c' + '_{}'.format(p), query_plan,
                       False)),
                   range(0, settings.table_C_parts))
    """

    map_A_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_A_AB_join_key, 'map_A_to_B_{}'.format(p),
                query_plan, False)), range(0, settings.table_A_parts))

    map_B_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_AB_join_key, 'map_B_to_B_{}'.format(p),
                query_plan, False)), range(0, settings.table_B_parts))

    map_B_to_C = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_BC_join_key, 'map_B_to_C_{}'.format(p),
                query_plan, False)), range(0, settings.table_B_parts))

    map_C_to_C = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_C_BC_join_key, 'map_C_to_C_{}'.format(p),
                query_plan, False)), range(0, settings.table_C_parts))

    join_build_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(settings.table_A_AB_join_key, 'join_build_A_B_{}'.
                          format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    join_probe_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(
                JoinExpression(settings.table_A_AB_join_key, settings.
                               table_B_AB_join_key), 'join_probe_A_B_{}'
                .format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    join_build_AB_C = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(settings.table_B_BC_join_key, 'join_build_AB_C_{}'.
                          format(p), query_plan, False)),
        range(0, settings.table_C_parts))

    join_probe_AB_C = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(
                JoinExpression(settings.table_B_BC_join_key, settings.
                               table_C_BC_join_key), 'join_probe_AB_C_{}'
                .format(p), query_plan, False)),
        range(0, settings.table_C_parts))

    def agg_fun(df):
        return pd.DataFrame({
            'sum':
            [df[settings.table_C_detail_field_name].astype(float).sum()]
        })
        #return pd.DataFrame( { 'sum' : [ len(df) ] } )

    part_aggregate = map(
        lambda p: query_plan.add_operator(
            Aggregate([
                AggregateExpression(
                    AggregateExpression.SUM, lambda t: float(t[
                        settings.table_C_detail_field_name]))
            ], settings.use_pandas, 'part_aggregate_{}'.format(p), query_plan,
                      False, agg_fun)), range(0, settings.table_C_parts))

    def agg_reduce_fun(df):
        return pd.DataFrame({'sum': [df['sum'].sum()]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0']))
        ], settings.use_pandas, 'aggregate_reduce', query_plan, False,
                  agg_reduce_fun))
    """
    aggregate_project = query_plan.add_operator(Project(
        [
            ProjectExpression(lambda t: t['_0'], 'total_balance')
        ],
        'aggregate_project', query_plan,
        False))
    """

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # Connect the operators
    connect_many_to_many(scan_A, map_A_to_B)
    #connect_many_to_many(project_A, filter_A)
    #connect_many_to_many(filter_A, map_A_to_B)
    connect_all_to_all(map_A_to_B, join_build_A_B)
    connect_many_to_many(join_build_A_B, join_probe_A_B)

    connect_many_to_many(scan_B, map_B_to_B)
    #connect_many_to_many(project_B, filter_b)
    #connect_many_to_many(filter_b, map_B_to_B)
    connect_all_to_all(map_B_to_B, join_probe_A_B)

    #connect_many_to_many(join_probe_A_B, part_aggregate)
    #connect_many_to_one(part_aggregate, aggregate_reduce)
    #connect_one_to_one(aggregate_reduce, collate)

    connect_many_to_many(join_build_AB_C, join_probe_AB_C)

    connect_many_to_many(join_probe_A_B, map_B_to_C)
    connect_all_to_all(map_B_to_C, join_build_AB_C)

    connect_many_to_many(scan_C, map_C_to_C)
    #connect_many_to_many(project_C, filter_c)
    #connect_many_to_many(filter_c, map_C_to_C)
    connect_all_to_all(map_C_to_C, join_probe_AB_C)

    connect_many_to_many(join_probe_AB_C, part_aggregate)

    connect_many_to_one(part_aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, collate)
    #connect_one_to_one(aggregate_project, collate)

    return query_plan
Example #16
def query_plan(settings):
    # type: (SyntheticFilteredJoinSettings) -> QueryPlan
    """

    :return: the constructed QueryPlan
    """

    if settings.use_shared_mem:
        system = WorkerSystem(settings.shared_memory_size)
    else:
        system = None

    query_plan = QueryPlan(system,
                           is_async=settings.parallel,
                           buffer_size=settings.buffer_size,
                           use_shared_mem=settings.use_shared_mem)

    # Define the operators
    scan_A = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p, settings.sf),
                             "select "
                             "  {} "
                             "from "
                             "  S3Object "
                             "  {} "
                             "  {} "
                             .format(','.join(settings.table_A_field_names),
                                     ' where {} '.format(
                                         settings.table_A_filter_sql) if settings.table_A_filter_sql is not None else '',
                                     get_sql_suffix(settings.table_A_key, settings.table_A_parts, p,
                                                    settings.table_A_sharded,
                                                    add_where=settings.table_A_filter_sql is None)), settings.format_,
                             settings.use_pandas,
                             settings.secure,
                             settings.use_native,
                             'scan_A_{}'.format(p),
                             query_plan,
                             False)),
            range(0, settings.table_A_parts))

    field_names_map_A = OrderedDict(
        zip([
            '_{}'.format(i)
            for i, name in enumerate(settings.table_A_field_names)
        ], settings.table_A_field_names))

    def project_fn_A(df):
        df = df.rename(columns=field_names_map_A, copy=False)
        return df

    project_A = map(
        lambda p: query_plan.add_operator(
            Project([
                ProjectExpression(k, v)
                for k, v in field_names_map_A.iteritems()
            ], 'project_A_{}'.format(p), query_plan, False, project_fn_A)),
        range(0, settings.table_A_parts))

    scan_B = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(
                    get_file_key(settings.table_B_key, settings.table_B_sharded, p, settings.sf),
                    "select "
                    "  {} "
                    "from "
                    "  S3Object "
                    "  {} "
                    "  {} "
                        .format(','.join(settings.table_B_field_names),
                                ' where {} '.format(
                                    settings.table_B_filter_sql) if settings.table_B_filter_sql is not None else '',
                                get_sql_suffix(settings.table_B_key, settings.table_B_parts, p,
                                               settings.table_B_sharded,
                                               add_where=settings.table_B_filter_sql is None)), settings.format_,
                    settings.use_pandas,
                    settings.secure,
                    settings.use_native,
                    'scan_B_{}'.format(p),
                    query_plan,
                    False)),
            range(0, settings.table_B_parts))

    field_names_map_B = OrderedDict(
        zip([
            '_{}'.format(i)
            for i, name in enumerate(settings.table_B_field_names)
        ], settings.table_B_field_names))

    def project_fn_B(df):
        df.rename(columns=field_names_map_B, inplace=True)
        return df

    project_B = map(
        lambda p: query_plan.add_operator(
            Project([
                ProjectExpression(k, v)
                for k, v in field_names_map_B.iteritems()
            ], 'project_B_{}'.format(p), query_plan, False, project_fn_B)),
        range(0, settings.table_B_parts))

    if settings.table_C_key is not None:
        scan_C = \
            map(lambda p:
                query_plan.add_operator(
                    SQLTableScan(
                        get_file_key(settings.table_C_key, settings.table_C_sharded, p, settings.sf),
                        "select "
                        "  {} "
                        "from "
                        "  S3Object "
                        "where "
                        "  {} "
                        "  {} "
                            .format(','.join(settings.table_C_field_names),
                                    settings.table_C_filter_sql,
                                    get_sql_suffix(settings.table_C_key, settings.table_C_parts, p,
                                                   settings.table_C_sharded, add_where=False)), settings.format_,
                        settings.use_pandas,
                        settings.secure,
                        settings.use_native,
                        'scan_C_{}'.format(p),
                        query_plan,
                        False)),
                range(0, settings.table_C_parts))

        field_names_map_C = OrderedDict(
            zip([
                '_{}'.format(i)
                for i, name in enumerate(settings.table_C_field_names)
            ], settings.table_C_field_names))

        def project_fn_C(df):
            df = df.rename(columns=field_names_map_C, copy=False)
            return df

        project_C = map(
            lambda p: query_plan.add_operator(
                Project([
                    ProjectExpression(k, v)
                    for k, v in field_names_map_C.iteritems()
                ], 'project_C_{}'.format(p), query_plan, False, project_fn_C)),
            range(0, settings.table_C_parts))

        map_B_to_C = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_B_BC_join_key, 'map_B_to_C_{}'.format(p),
                    query_plan, False)), range(0, settings.table_B_parts))

        map_C_to_C = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_C_BC_join_key, 'map_C_to_C_{}'.format(p),
                    query_plan, False)), range(0, settings.table_C_parts))

        join_build_AB_C = map(
            lambda p: query_plan.add_operator(
                HashJoinBuild(settings.table_B_BC_join_key,
                              'join_build_AB_C_{}'.format(p),
                              query_plan, False)),
            range(0, settings.table_C_parts))

        join_probe_AB_C = map(
            lambda p: query_plan.add_operator(
                HashJoinProbe(
                    JoinExpression(settings.table_B_BC_join_key,
                                   settings.table_C_BC_join_key),
                    'join_probe_AB_C_{}'.format(p), query_plan, False)),
            range(0, settings.table_C_parts))

    map_A_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_A_AB_join_key, 'map_A_to_B_{}'.format(p),
                query_plan, False)), range(0, settings.table_A_parts))

    map_B_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_AB_join_key, 'map_B_to_B_{}'.format(p),
                query_plan, False)), range(0, settings.table_B_parts))

    join_build_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(settings.table_A_AB_join_key,
                          'join_build_A_B_{}'.format(p), query_plan, False)),
        range(0, settings.other_parts))

    join_probe_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(
                JoinExpression(settings.table_A_AB_join_key,
                               settings.table_B_AB_join_key),
                'join_probe_A_B_{}'.format(p), query_plan, False)),
        range(0, settings.other_parts))

    if settings.table_C_key is None:

        def part_aggregate_fn(df):
            sum_ = df[settings.table_B_detail_field_name].astype(
                np.float).sum()
            return pd.DataFrame({'_0': [sum_]})

        part_aggregate = map(
            lambda p: query_plan.add_operator(
                Aggregate([
                    AggregateExpression(
                        AggregateExpression.SUM, lambda t: float(t[
                            settings.table_B_detail_field_name]))
                ], settings.use_pandas, 'part_aggregate_{}'.format(p),
                          query_plan, False, part_aggregate_fn)),
            range(0, settings.other_parts))

    else:

        def part_aggregate_fn(df):
            sum_ = df[settings.table_C_detail_field_name].astype(
                np.float).sum()
            return pd.DataFrame({'_0': [sum_]})

        part_aggregate = map(
            lambda p: query_plan.add_operator(
                Aggregate([
                    AggregateExpression(
                        AggregateExpression.SUM, lambda t: float(t[
                            settings.table_C_detail_field_name]))
                ], settings.use_pandas, 'part_aggregate_{}'.format(p),
                          query_plan, False, part_aggregate_fn)),
            range(0, settings.table_C_parts))

    def aggregate_reduce_fn(df):
        sum_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0']))
        ], settings.use_pandas, 'aggregate_reduce', query_plan, False,
                  aggregate_reduce_fn))

    aggregate_project = query_plan.add_operator(
        Project([ProjectExpression(lambda t: t['_0'], 'total_balance')],
                'aggregate_project', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # Inline some of the operators
    map(lambda o: o.set_async(False), project_A)
    map(lambda o: o.set_async(False), project_B)
    map(lambda o: o.set_async(False), map_A_to_B)
    map(lambda o: o.set_async(False), map_B_to_B)
    if settings.table_C_key is not None:
        map(lambda o: o.set_async(False), map_B_to_C)
        map(lambda o: o.set_async(False), map_C_to_C)
        map(lambda o: o.set_async(False), project_C)
    map(lambda o: o.set_async(False), part_aggregate)
    aggregate_project.set_async(False)

    # Connect the operators
    connect_many_to_many(scan_A, project_A)
    connect_many_to_many(project_A, map_A_to_B)
    connect_all_to_all(map_A_to_B, join_build_A_B)
    connect_many_to_many(join_build_A_B, join_probe_A_B)

    connect_many_to_many(scan_B, project_B)
    connect_many_to_many(project_B, map_B_to_B)
    connect_all_to_all(map_B_to_B, join_probe_A_B)

    if settings.table_C_key is None:
        connect_many_to_many(join_probe_A_B, part_aggregate)
    else:
        connect_many_to_many(join_probe_A_B, map_B_to_C)
        connect_all_to_all(map_B_to_C, join_build_AB_C)
        connect_many_to_many(join_build_AB_C, join_probe_AB_C)
        connect_many_to_many(scan_C, project_C)
        connect_many_to_many(project_C, map_C_to_C)
        connect_all_to_all(map_C_to_C, join_probe_AB_C)
        connect_many_to_many(join_probe_AB_C, part_aggregate)

    connect_many_to_one(part_aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)
    connect_one_to_one(aggregate_project, collate)

    return query_plan
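
The plan above is wired together with connect_one_to_one, connect_many_to_one, connect_many_to_many and connect_all_to_all. Their implementations are not shown in this document, so the following is only a minimal sketch of the assumed semantics, where every operator exposes a connect(consumer) method:

# Minimal sketch of the assumed connection semantics (hypothetical,
# not necessarily the library's actual implementation).

def connect_one_to_one(producer, consumer):
    # A single producer feeds a single consumer.
    producer.connect(consumer)


def connect_many_to_one(producers, consumer):
    # Every producer feeds the same consumer, e.g. partial aggregates
    # flowing into one reducer.
    for producer in producers:
        producer.connect(consumer)


def connect_many_to_many(producers, consumers):
    # Pairwise wiring: partition i of the producers feeds partition i
    # of the consumers.
    for producer, consumer in zip(producers, consumers):
        producer.connect(consumer)


def connect_all_to_all(producers, consumers):
    # Cross wiring: every producer feeds every consumer. Used after a
    # Map operator, which hash-partitions tuples on a join key, so each
    # consumer receives every tuple whose key hashes to its partition.
    for producer in producers:
        for consumer in consumers:
            producer.connect(consumer)
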
Example #17
0
def run(parallel, use_pandas, secure, use_native, buffer_size, format_, customer_parts, order_parts, lineitem_parts,
        customer_sharded,
        order_sharded, lineitem_sharded, other_parts, fp_rate, sf, expected_result, customer_filter_sql=None,
        order_filter_sql=None, lineitem_filter_sql=None):
    """

    :return: None
    """

    print('')
    print("TPCH Q3 Bloom Join")
    print("------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    customer_scan = map(lambda p:
                        query_plan.add_operator(
                            SQLTableScan(get_file_key('customer', customer_sharded, p, sf, format_),
                                         "select "
                                         "  c_custkey "
                                         "from "
                                         "  S3Object "
                                         "where "
                                         "  c_mktsegment = 'BUILDING' "
                                         "  {} "
                                         "  {} "
                                         .format(
                                             ' and ' + customer_filter_sql if customer_filter_sql is not None else '',
                                             get_sql_suffix('customer', customer_parts, p, customer_sharded,
                                                            add_where=False)),
                                         format_,
                                         use_pandas, secure, use_native,
                                         'customer_scan' + '_{}'.format(p),
                                         query_plan,
                                         False)),
                        range(0, customer_parts))

    def customer_project_fn(df):
        df = df.filter(items=['_0'], axis=1)

        df.rename(columns={'_0': 'c_custkey'},
                  inplace=True)

        return df

    customer_project = map(lambda p:
                           query_plan.add_operator(
                               Project([],
                                       'customer_project' + '_{}'.format(p),
                                       query_plan,
                                       False, customer_project_fn)),
                           range(0, customer_parts))

    customer_bloom_create = query_plan.add_operator(
        BloomCreate('c_custkey', 'customer_bloom_create', query_plan, False,
                    fp_rate))

    customer_map = map(lambda p:
                       query_plan.add_operator(Map('c_custkey', 'customer_map' + '_' + str(p), query_plan, False)),
                       range(0, customer_parts))

    order_scan = map(lambda p:
                     query_plan.add_operator(
                         SQLTableScanBloomUse(get_file_key('orders', order_sharded, p, sf, format_),
                                              "select "
                                              "  o_custkey, o_orderkey, o_orderdate, o_shippriority "
                                              "from "
                                              "  S3Object "
                                              "where "
                                              "  cast(o_orderdate as timestamp) < cast('1995-03-01' as timestamp) "
                                              "  {} "
                                              "  {} "
                                              .format(
                                                  ' and ' + order_filter_sql if order_filter_sql is not None else '',
                                                  get_sql_suffix('orders', order_parts, p, order_sharded,
                                                                 add_where=False)),
                                              'o_custkey',
                                              format_,
                                              use_pandas, secure, use_native,
                                              'order_scan' + '_{}'.format(p),
                                              query_plan,
                                              False)),
                     range(0, order_parts))

    def order_project_fn(df):
        df = df.filter(items=['_0', '_1', '_2', '_3'], axis=1)

        df.rename(columns={'_0': 'o_custkey', '_1': 'o_orderkey', '_2': 'o_orderdate', '_3': 'o_shippriority'},
                  inplace=True)

        return df

    order_project = map(lambda p:
                        query_plan.add_operator(
                            Project([],
                                    'order_project' + '_{}'.format(p),
                                    query_plan,
                                    False, order_project_fn)),
                        range(0, order_parts))

    order_map_1 = map(lambda p:
                      query_plan.add_operator(Map('o_custkey', 'order_map_1' + '_' + str(p), query_plan, False)),
                      range(0, order_parts))

    customer_order_join_build = map(lambda p:
                                    query_plan.add_operator(
                                        HashJoinBuild('c_custkey',
                                                      'customer_order_join_build' + '_' + str(p), query_plan,
                                                      False)),
                                    range(0, other_parts))

    customer_order_join_probe = map(lambda p:
                                    query_plan.add_operator(
                                        HashJoinProbe(JoinExpression('c_custkey', 'o_custkey'),
                                                      'customer_order_join_probe' + '_' + str(p),
                                                      query_plan, False)),
                                    range(0, other_parts))

    order_bloom_create = query_plan.add_operator(
        BloomCreate('o_orderkey', 'order_bloom_create', query_plan, False,
                    fp_rate))

    lineitem_scan = map(lambda p:
                        query_plan.add_operator(
                            SQLTableScanBloomUse(get_file_key('lineitem', lineitem_sharded, p, sf, format_),
                                                 "select "
                                                 "  l_orderkey, l_extendedprice, l_discount "
                                                 "from "
                                                 "  S3Object "
                                                 "where "
                                                 "  cast(l_shipdate as timestamp) > cast('1995-03-01' as timestamp) "
                                                 "  {} "
                                                 "  {} "
                                                 .format(
                                                     ' and ' + lineitem_filter_sql if lineitem_filter_sql is not None else '',
                                                     get_sql_suffix('lineitem', lineitem_parts, p, lineitem_sharded,
                                                                    add_where=False)),
                                                 'l_orderkey', format_,
                                                 use_pandas, secure, use_native,
                                                 'lineitem_scan' + '_{}'.format(p),
                                                 query_plan,
                                                 False)),
                        range(0, lineitem_parts))

    def lineitem_project_fn(df):
        df = df.filter(items=['_0', '_1', '_2'], axis=1)

        df.rename(columns={'_0': 'l_orderkey', '_1': 'l_extendedprice', '_2': 'l_discount'},
                  inplace=True)

        return df

    lineitem_project = map(lambda p:
                           query_plan.add_operator(
                               Project([],
                                       'lineitem_project' + '_{}'.format(p),
                                       query_plan,
                                       False, lineitem_project_fn)),
                           range(0, lineitem_parts))

    lineitem_map = map(lambda p:
                       query_plan.add_operator(Map('l_orderkey', 'lineitem_map' + '_' + str(p), query_plan, False)),
                       range(0, lineitem_parts))

    order_map_2 = map(lambda p:
                      query_plan.add_operator(Map('o_orderkey', 'order_map_2' + '_' + str(p), query_plan, False)),
                      range(0, other_parts))

    customer_order_lineitem_join_build = map(lambda p:
                                             query_plan.add_operator(
                                                 HashJoinBuild('o_orderkey',
                                                               'customer_order_lineitem_join_build' + '_' + str(p),
                                                               query_plan,
                                                               False)),
                                             range(0, other_parts))

    customer_order_lineitem_join_probe = map(lambda p:
                                             query_plan.add_operator(
                                                 HashJoinProbe(JoinExpression('o_orderkey', 'l_orderkey'),
                                                               'customer_order_lineitem_join_probe' + '_' + str(p),
                                                               query_plan, False)),
                                             range(0, other_parts))

    def groupby_fn(df):
        df['l_extendedprice'] = df['l_extendedprice'].astype(np.float)
        df['l_discount'] = df['l_discount'].astype(np.float)
        df['revenue'] = df['l_extendedprice'] * (1 - df['l_discount'])
        grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])
        agg_df = grouped['revenue'].sum()
        return agg_df.reset_index()

    group = map(lambda p:
                query_plan.add_operator(
                    Group(
                        ['l_orderkey', 'o_orderdate', 'o_shippriority'],
                        [
                            AggregateExpression(AggregateExpression.SUM,
                                                lambda t_: float(t_['l_extendedprice'] * (1 - t_['l_discount'])))
                        ],
                        'group' + '_{}'.format(p), query_plan,
                        False, groupby_fn)),
                range(0, other_parts))

    def group_reduce_fn(df):
        grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])
        agg_df = grouped['revenue'].sum()
        return agg_df.reset_index()

    group_reduce = query_plan.add_operator(
        Group(
            ['l_orderkey', 'o_orderdate', 'o_shippriority'],
            [
                AggregateExpression(AggregateExpression.SUM,
                                    lambda t_: float(t_['l_extendedprice'] * (1 - t_['l_discount'])))
            ],
            'group_reduce', query_plan,
            False, group_reduce_fn))

    top = query_plan.add_operator(
        Top(10, [SortExpression('revenue', float, 'DESC'), SortExpression('o_orderdate', date, 'ASC')], use_pandas,
            'top', query_plan,
            False))

    collate = query_plan.add_operator(tpch_q19.collate_op('collate', query_plan))

    # Inline what we can
    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), customer_project)
    map(lambda o: o.set_async(False), order_project)
    map(lambda o: o.set_async(False), lineitem_map)
    map(lambda o: o.set_async(False), customer_map)
    map(lambda o: o.set_async(False), order_map_1)
    map(lambda o: o.set_async(False), order_map_2)

    # Connect the operators
    connect_many_to_many(customer_scan, customer_project)
    connect_many_to_many(customer_project, customer_map)

    connect_many_to_one(customer_project, customer_bloom_create)
    connect_one_to_many(customer_bloom_create, order_scan)

    connect_many_to_many(order_scan, order_project)
    connect_many_to_many(order_project, order_map_1)

    connect_all_to_all(customer_map, customer_order_join_build)
    connect_many_to_many(customer_order_join_build, customer_order_join_probe)
    connect_all_to_all(order_map_1, customer_order_join_probe)

    # connect_many_to_one(customer_order_join_probe, collate)

    connect_many_to_one(order_project, order_bloom_create)
    connect_one_to_many(order_bloom_create, lineitem_scan)

    connect_many_to_many(lineitem_scan, lineitem_project)
    connect_many_to_many(lineitem_project, lineitem_map)

    connect_many_to_many(customer_order_join_probe, order_map_2)
    connect_all_to_all(order_map_2, customer_order_lineitem_join_build)
    connect_many_to_many(customer_order_lineitem_join_build, customer_order_lineitem_join_probe)
    connect_all_to_all(lineitem_map, customer_order_lineitem_join_probe)

    # connect_many_to_one(customer_order_lineitem_join_probe, collate)

    connect_many_to_many(customer_order_lineitem_join_probe, group)
    # connect_many_to_one(group, collate)

    connect_many_to_one(group, group_reduce)
    connect_one_to_one(group_reduce, top)
    connect_one_to_one(top, collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("customer_parts: {}".format(customer_parts))
    print("order_parts: {}".format(order_parts))
    print("lineitem_parts: {}".format(lineitem_parts))
    print("customer_sharded: {}".format(customer_sharded))
    print("order_sharded: {}".format(order_sharded))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("other_parts: {}".format(other_parts))
    print("fp_rate: {}".format(fp_rate))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['l_orderkey', 'o_orderdate', 'o_shippriority', 'revenue']

    assert len(tuples) == 10 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    test_util.assert_tuples(expected_result, tuples)
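
Example #17 prunes the orders and lineitem scans with Bloom filters: BloomCreate builds a filter over the join keys produced upstream, and SQLTableScanBloomUse folds that filter into the S3 scan so non-matching rows are dropped at the source. As a refresher on the data structure itself, here is a self-contained Bloom filter sketch; it is illustrative only, not the BloomCreate implementation:

import hashlib

class SimpleBloomFilter(object):
    """A minimal Bloom filter: k hash positions over an m-bit array.
    False positives are possible; false negatives are not."""

    def __init__(self, num_bits=1024, num_hashes=3):
        self.num_bits = num_bits
        self.num_hashes = num_hashes
        self.bits = [False] * num_bits

    def _positions(self, item):
        # Derive k positions from slices of a single digest.
        digest = hashlib.md5(str(item).encode('utf-8')).hexdigest()
        for i in range(self.num_hashes):
            chunk = digest[i * 8:(i + 1) * 8]
            yield int(chunk, 16) % self.num_bits

    def add(self, item):
        for pos in self._positions(item):
            self.bits[pos] = True

    def might_contain(self, item):
        return all(self.bits[pos] for pos in self._positions(item))


bloom = SimpleBloomFilter()
for custkey in ['1', '7', '42']:
    bloom.add(custkey)

assert bloom.might_contain('42')     # definitely added
print(bloom.might_contain('99999'))  # usually False; may be a false positive
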
Example #18
0
def run(parallel,
        use_pandas,
        buffer_size,
        table_parts,
        lower,
        upper,
        sf,
        format_=Format.CSV):
    secure = False
    use_native = False
    print('')
    print("Indexing Benchmark")
    print("------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # SQL scan the file

    scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                get_file_key('lineitem', True, p, sf=sf, format_=format_
                             ), "select * from S3Object "
                "where cast(l_extendedprice as float) >= {} and cast(l_extendedprice as float) <= {};"
                .format(lower, upper), format_, use_pandas, secure,
                use_native, 'scan_{}'.format(p), query_plan, False)),
        range(0, table_parts))
    '''
    scan = map(lambda p:
               query_plan.add_operator(
                   SQLTableScan(get_file_key('lineitem', True, p, sf=sf, format_=format_),
                                "select * from S3Object "
                                "where l_extendedprice  >= {} and l_extendedprice  <= {};".format(
                                    lower, upper), format_,
                                use_pandas, secure, use_native,
                                'scan_{}'.format(p), query_plan,
                                False)),
               range(0, table_parts))
    '''

    # project
    def fn(df):
        df.columns = [
            'l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber',
            'l_quantity', 'l_extendedprice', 'l_discount', 'l_tax',
            'l_returnflag', 'l_linestatus', 'l_shipdate', 'l_commitdate',
            'l_receiptdate', 'l_shipinstruct', 'l_shipmode', 'l_comment'
        ]
        df[['l_extendedprice']] = df[['l_extendedprice']].astype(np.float)
        return df

    project = map(
        lambda p: query_plan.add_operator(
            Project([], 'project_{}'.format(p), query_plan, False, fn)),
        range(0, table_parts))

    # aggregation
    def agg_fun(df):
        return pd.DataFrame({'count': [len(df)]})

    aggregate = query_plan.add_operator(
        Aggregate([], True, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(aggregate), enumerate(project))
    aggregate.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
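
A portability note that applies to Example #18 and to most plans in this document: the code relies on Python 2 semantics. map() must return a list (in Python 3 it is lazy, so the operators would never be created), dict.iteritems() is gone, and tuple-unpacking lambdas such as lambda (p, o): ... are a syntax error in Python 3. A small self-contained illustration of the Python 3 equivalents, using stand-in operator objects rather than the real classes:

# Stand-in operator class for illustration only.
class StubOperator(object):
    def __init__(self, name):
        self.name = name
        self.consumers = []

    def connect(self, consumer):
        self.consumers.append(consumer)


table_parts = 4

# Python 2: scan = map(lambda p: ..., range(0, table_parts))
# Python 3: map() is lazy, so build the list explicitly.
scan = [StubOperator('scan_{}'.format(p)) for p in range(table_parts)]
project = [StubOperator('project_{}'.format(p)) for p in range(table_parts)]

# Python 2: map(lambda (p, o): o.connect(project[p]), enumerate(scan))
# Python 3: tuple-unpacking lambdas are gone; use a plain loop.
for p, o in enumerate(scan):
    o.connect(project[p])

print([op.consumers[0].name for op in scan])
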
Example #19
0
def sql_scan_lineitem_operator_def(sharded, shard, sf, use_pandas, secure, use_native, name, query_plan, format_):
    print(format_)
    return SQLTableScan(get_file_key('lineitem', sharded, shard, sf, format_),
                        "select * from S3Object;", format_, use_pandas, secure, use_native,
                        name, query_plan,
                        False)
Example #20
0
def run(parallel, use_pandas, buffer_size, table_parts, lower, upper, sf):
    secure = False
    use_native = False
    print('')
    print("Indexing Benchmark")
    print("------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    def fn(df):
        # df.columns = ['l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber', 'l_quantity', 'l_extendedprice', 'l_discount', 'l_tax', 'l_returnflag', 'l_linestatus', 'l_shipdate', 'l_commitdate', 'l_receiptdate', 'l_shipinstruct', 'l_shipmode', 'l_comment']
        # df[ ['l_extendedprice'] ] = df[ ['l_extendedprice'] ].astype(np.float)
        # criterion = (df['l_extendedprice'] >= lower) & (
        #        df['l_extendedprice'] <= upper)
        df[['_5']] = df[['_5']].astype(np.float)
        criterion = (df['_5'] >= lower) & (df['_5'] <= upper)
        return df[criterion]

    # scan the file
    scan_filter = map(
        lambda p: query_plan.add_operator(
            TableScan(get_file_key('lineitem', True, p, sf=sf),
                      use_pandas,
                      secure,
                      use_native,
                      'scan_{}'.format(p),
                      query_plan,
                      False,
                      fn=fn)), range(0, table_parts))

    # aggregation
    def agg_fun(df):
        return pd.DataFrame({'count': [len(df)]})

    aggregate = query_plan.add_operator(
        Aggregate([], True, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(aggregate), enumerate(scan_filter))
    aggregate.connect(collate)

    # scan_filter[0].set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_scan_0" + ".prof"))
    # aggregate.set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_aggregate" + ".prof"))
    # collate.set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_collate" + ".prof"))

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
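
Examples #18 and #20 form a pair: #18 pushes the l_extendedprice range predicate into the S3-side SQL scan, while #20 fetches whole shards with a plain TableScan and filters client-side in pandas. A hedged invocation of either run function might look as follows; the argument values are illustrative assumptions, not taken from the source:

# Illustrative invocation (parameter values are assumptions).
run(parallel=True,
    use_pandas=True,
    buffer_size=8192,   # operator queue size
    table_parts=32,     # number of lineitem shards to scan
    lower=31000,        # l_extendedprice lower bound
    upper=31500,        # l_extendedprice upper bound
    sf=1)               # TPC-H scale factor
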
Example #21
0
def query_plan(settings):
    # type: (SyntheticBloomJoinSettings) -> QueryPlan
    """

    :return: None
    """

    query_plan = QueryPlan(is_async=settings.parallel,
                           buffer_size=settings.buffer_size)

    def scan_A_fn(df):
        df.columns = settings.table_A_field_names
        return df

    # Define the operators
    scan_A = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p),
                             "select "
                             "  {} "
                             "from "
                             "  S3Object "
                             "where "
                             "  {} "
                             "  {} "
                             .format(','.join(settings.table_A_field_names),
                                     settings.table_A_filter_sql,
                                     get_sql_suffix(settings.table_A_key, settings.table_A_parts, p,
                                                    settings.table_A_sharded)), settings.format_,
                             settings.use_pandas,
                             settings.secure,
                             settings.use_native,
                             'scan_A_{}'.format(p),
                             query_plan,
                             False,
                             fn=scan_A_fn)),
            range(0, settings.table_A_parts))
    """
    field_names_map_A = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_A_field_names)], settings.table_A_field_names))

    def project_fn_A(df):
        df.rename(columns=field_names_map_A, inplace=True)
        return df

    project_A = map(lambda p:
                    query_plan.add_operator(Project(
                        [ProjectExpression(k, v) for k, v in field_names_map_A.iteritems()],
                        'project_A_{}'.format(p),
                        query_plan,
                        True,
                        project_fn_A)),
                    range(0, settings.table_A_parts))
    """

    bloom_create_a = map(
        lambda p: query_plan.add_operator(
            BloomCreate(settings.table_A_AB_join_key, 'bloom_create_a_{}'.
                        format(p), query_plan, False)),
        range(0, settings.table_A_parts))

    def scan_B_fn(df):
        df.columns = settings.table_B_field_names
        return df

    scan_B = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScanBloomUse(get_file_key(settings.table_B_key, settings.table_B_sharded, p),
                                     "select "
                                     "  {} "
                                     "from "
                                     "  S3Object "
                                     "where "
                                     "  {} "
                                     "  {} "
                                     .format(','.join(settings.table_B_field_names),
                                             settings.table_B_filter_sql,
                                             get_sql_suffix(settings.table_B_key, settings.table_B_parts, p,
                                                            settings.table_B_sharded, add_where=False)),
                                     settings.table_B_AB_join_key, settings.format_,
                                     settings.use_pandas,
                                     settings.secure,
                                     settings.use_native,
                                     'scan_B_{}'.format(p),
                                     query_plan,
                                     False,
                                     fn=scan_B_fn)),
            range(0, settings.table_B_parts))
    """
    field_names_map_B = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_B_field_names)], settings.table_B_field_names))

    def project_fn_B(df):
        df.rename(columns=field_names_map_B, inplace=True)
        return df

    project_B = map(lambda p:
                    query_plan.add_operator(Project(
                        [ProjectExpression(k, v) for k, v in field_names_map_B.iteritems()],
                        'project_B_{}'.format(p),
                        query_plan,
                        True,
                        project_fn_B)),
                    range(0, settings.table_B_parts))
    """

    def scan_C_fn(df):
        df.columns = settings.table_C_field_names
        return df

    scan_C = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScanBloomUse(get_file_key(settings.table_C_key, settings.table_C_sharded, p),
                                     "select "
                                     "  {} "
                                     "from "
                                     "  S3Object "
                                     "where "
                                     "  {} "
                                     "  {} "
                                     .format(','.join(settings.table_C_field_names),
                                             settings.table_C_filter_sql,
                                             get_sql_suffix(settings.table_C_key, settings.table_C_parts, p,
                                                            settings.table_C_sharded, add_where=False)),
                                     settings.table_C_BC_join_key, settings.format_,
                                     settings.use_pandas,
                                     settings.secure,
                                     settings.use_native,
                                     'scan_C_{}'.format(p),
                                     query_plan,
                                     False,
                                     fn=scan_C_fn)),
            range(0, settings.table_C_parts))
    """
    field_names_map_C = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_C_field_names)], settings.table_C_field_names))

    def project_fn_C(df):
        df.rename(columns=field_names_map_C, inplace=True)
        return df

    project_C = map(lambda p:
                    query_plan.add_operator(Project(
                        [ProjectExpression(k, v) for k, v in field_names_map_C.iteritems()],
                        'project_C_{}'.format(p),
                        query_plan,
                        True,
                        project_fn_C)),
                    range(0, settings.table_C_parts))
    """

    map_A_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_A_AB_join_key, 'map_A_to_B_{}'.format(p),
                query_plan, False)), range(0, settings.table_A_parts))

    map_B_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_AB_join_key, 'map_B_to_B_{}'.format(p),
                query_plan, False)), range(0, settings.table_B_parts))

    map_B_to_C = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_BC_join_key, 'map_B_to_C_{}'.format(p),
                query_plan, False)), range(0, settings.table_B_parts))

    map_C_to_C = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_C_BC_join_key, 'map_C_to_C_{}'.format(p),
                query_plan, False)), range(0, settings.table_C_parts))

    join_build_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(settings.table_A_AB_join_key, 'join_build_A_B_{}'.
                          format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    join_probe_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(
                JoinExpression(settings.table_A_AB_join_key,
                               settings.table_B_AB_join_key),
                'join_probe_A_B_{}'.format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    bloom_create_ab = map(
        lambda p: query_plan.add_operator(
            BloomCreate(settings.table_B_BC_join_key, 'bloom_create_ab_{}'.
                        format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    join_build_AB_C = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(settings.table_B_BC_join_key, 'join_build_AB_C_{}'.
                          format(p), query_plan, False)),
        range(0, settings.table_C_parts))

    join_probe_AB_C = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(
                JoinExpression(settings.table_B_BC_join_key,
                               settings.table_C_BC_join_key),
                'join_probe_AB_C_{}'.format(p), query_plan, False)),
        range(0, settings.table_C_parts))

    def part_aggregate_fn(df):
        sum_ = df[settings.table_C_detail_field_name].astype(float).sum()
        return pd.DataFrame({'_0': [sum_]})

    part_aggregate = map(
        lambda p: query_plan.add_operator(
            Aggregate([
                AggregateExpression(
                    AggregateExpression.SUM, lambda t: float(t[
                        settings.table_C_detail_field_name]))
            ], settings.use_pandas, 'part_aggregate_{}'.format(p), query_plan,
                      False, part_aggregate_fn)),
        range(0, settings.table_C_parts))

    def aggregate_reduce_fn(df):
        sum_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0']))
        ], settings.use_pandas, 'aggregate_reduce', query_plan, False,
                  aggregate_reduce_fn))

    aggregate_project = query_plan.add_operator(
        Project([ProjectExpression(lambda t: t['_0'], 'total_balance')],
                'aggregate_project', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # Connect the operators
    connect_many_to_many(scan_A, map_A_to_B)
    #connect_many_to_many(scan_A, project_A)
    #connect_many_to_many(project_A, map_A_to_B)
    connect_all_to_all(map_A_to_B, join_build_A_B)
    connect_many_to_many(join_build_A_B, join_probe_A_B)

    #connect_many_to_many(project_A, bloom_create_a)
    connect_many_to_many(scan_A, bloom_create_a)
    connect_all_to_all(bloom_create_a, scan_B)
    connect_many_to_many(scan_B, map_B_to_B)
    #connect_many_to_many(scan_B, project_B)
    #connect_many_to_many(project_B, map_B_to_B)
    connect_all_to_all(map_B_to_B, join_probe_A_B)
    connect_many_to_many(join_probe_A_B, bloom_create_ab)
    connect_all_to_all(bloom_create_ab, scan_C)
    connect_many_to_many(join_build_AB_C, join_probe_AB_C)

    connect_many_to_many(join_probe_A_B, map_B_to_C)
    connect_all_to_all(map_B_to_C, join_build_AB_C)

    connect_many_to_many(scan_C, map_C_to_C)
    #connect_many_to_many(scan_C, project_C)
    #connect_many_to_many(project_C, map_C_to_C)
    connect_all_to_all(map_C_to_C, join_probe_AB_C)

    connect_many_to_many(join_probe_AB_C, part_aggregate)

    connect_many_to_one(part_aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)
    connect_one_to_one(aggregate_project, collate)

    return query_plan
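
Example #21 ends with the same two-phase aggregation used elsewhere in this document: each part_aggregate_p computes a partial SUM over its partition, and aggregate_reduce sums the partials into the final total. The sketch below reproduces that pattern with plain pandas on made-up data:

import pandas as pd

# Three partitions of a detail column, as each part_aggregate_p sees them.
partitions = [
    pd.DataFrame({'detail': ['1.5', '2.5']}),
    pd.DataFrame({'detail': ['10.0']}),
    pd.DataFrame({'detail': ['0.25', '0.75']}),
]

# Phase 1: per-partition partial sums (mirrors part_aggregate_fn).
partials = [pd.DataFrame({'_0': [df['detail'].astype(float).sum()]})
            for df in partitions]

# Phase 2: sum the partials into one value (mirrors aggregate_reduce_fn).
total = pd.concat(partials, ignore_index=True)['_0'].sum()
print(total)  # 15.0
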
Example #22
0
def separate_query_plan(shared_mem_buffer_size, parallel, use_pandas,
                        buffer_size, table_parts, lower, upper, sharded, sf,
                        use_shared_mem, inline_ops, merge_ops):
    secure = False
    use_native = False

    if use_shared_mem:
        system = WorkerSystem(shared_mem_buffer_size)
    else:
        system = None

    query_plan = QueryPlan(system,
                           is_async=parallel,
                           buffer_size=buffer_size,
                           use_shared_mem=use_shared_mem)

    # scan the file
    scan = map(
        lambda p: query_plan.add_operator(
            TableScan(get_file_key('lineitem', sharded, p, sf=sf), use_pandas,
                      secure, use_native, 'scan_{}'.format(
                          p), query_plan, False)), range(0, table_parts))

    def filter_fn(df):
        return (df['_5'].astype(np.float) >= lower) & (df['_5'].astype(
            np.float) <= upper)

    filter_ = map(
        lambda p: query_plan.add_operator(
            Filter(PredicateExpression(None, filter_fn), 'filter_{}'.format(p),
                   query_plan, False)), range(0, table_parts))

    # aggregation
    def agg_fun(df):
        return pd.DataFrame({'count': [len(df)]})

    aggregate = query_plan.add_operator(
        Aggregate([], use_pandas, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda o: o.set_async(not inline_ops), filter_)

    connect_many_to_many(scan, filter_)
    connect_many_to_one(filter_, aggregate)
    connect_one_to_one(aggregate, collate)

    profile_file_suffix = get_profile_file_suffix(inline_ops, merge_ops,
                                                  use_shared_mem)

    scan[0].set_profiled(
        True,
        os.path.join(ROOT_DIR, "../benchmark-output/",
                     gen_test_id() + "_scan_0" + profile_file_suffix +
                     ".prof"))
    filter_[0].set_profiled(
        True,
        os.path.join(
            ROOT_DIR, "../benchmark-output/",
            gen_test_id() + "_filter_0" + profile_file_suffix + ".prof"))
    aggregate.set_profiled(
        True,
        os.path.join(
            ROOT_DIR, "../benchmark-output/",
            gen_test_id() + "_aggregate" + profile_file_suffix + ".prof"))
    collate.set_profiled(
        True,
        os.path.join(
            ROOT_DIR, "../benchmark-output/",
            gen_test_id() + "_collate" + profile_file_suffix + ".prof"))
    return query_plan
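
Example #22 only constructs the plan; a caller is expected to execute it, report metrics and shut it down, as the earlier run functions do. A hedged driver sketch (argument values are illustrative assumptions, not taken from the source):

# Illustrative driver (parameter values are assumptions).
plan = separate_query_plan(shared_mem_buffer_size=1024,
                           parallel=True,
                           use_pandas=True,
                           buffer_size=8192,
                           table_parts=32,
                           lower=31000,
                           upper=31500,
                           sharded=True,
                           sf=1,
                           use_shared_mem=False,
                           inline_ops=False,
                           merge_ops=False)
plan.execute()
plan.print_metrics()
plan.stop()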