コード例 #1
0
ファイル: tpcx_bb_query_19.py プロジェクト: eyal0/tpcx-bb
def read_tables(config):
    """Load the four tables used by this query.

    Args:
        config: Mapping that provides the "file_format" and "data_dir"
            entries forwarded to ``build_reader``.

    Returns:
        A 4-tuple ``(date_dim, store_returns, web_returns, product_reviews)``
        holding the result of each ``read`` call, in that order.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
    )
    dates = reader.read(
        "date_dim",
        relevant_cols=["d_week_seq", "d_date_sk", "d_date"],
    )
    store_returns = reader.read(
        "store_returns",
        relevant_cols=["sr_returned_date_sk", "sr_item_sk", "sr_return_quantity"],
    )
    web_returns = reader.read(
        "web_returns",
        relevant_cols=["wr_returned_date_sk", "wr_item_sk", "wr_return_quantity"],
    )

    # product_reviews is read through a second reader that always splits by
    # row groups; the original author notes this is for better parallelism.
    reviews_reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=True,
    )
    reviews = reviews_reader.read(
        "product_reviews",
        relevant_cols=["pr_item_sk", "pr_review_content", "pr_review_sk"],
    )

    return dates, store_returns, web_returns, reviews
コード例 #2
0
ファイル: tpcx_bb_query_18.py プロジェクト: eyal0/tpcx-bb
def read_tables(config):
    """Read store_sales, date_dim, store and product_reviews.

    Args:
        config: Mapping that provides the "file_format" and "data_dir"
            entries forwarded to ``build_reader``.

    Returns:
        A 4-tuple ``(store_sales, date_dim, store, product_reviews)`` of the
        objects returned by the individual ``read`` calls.
    """
    load = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
    ).read

    sales = load(
        "store_sales",
        relevant_cols=["ss_store_sk", "ss_sold_date_sk", "ss_net_paid"],
    )
    dates = load("date_dim", relevant_cols=["d_date_sk", "d_date"])
    stores = load("store", relevant_cols=["s_store_sk", "s_store_name"])

    # A separate reader with split_row_groups=True is used for the reviews
    # table (better parallelism, per the original author's note).
    load_reviews = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=True,
    ).read
    reviews = load_reviews(
        "product_reviews",
        relevant_cols=["pr_review_date", "pr_review_content", "pr_review_sk"],
    )

    return sales, dates, stores, reviews
コード例 #3
0
def read_tables(config):
    """Read the seven tables this query needs.

    Args:
        config: Mapping that provides "file_format", "data_dir" and
            "split_row_groups", all forwarded to ``build_reader``.

    Returns:
        A 7-tuple of ``read`` results ordered as store_sales, item, customer,
        store, date_dim, customer_address, promotion.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    # (table name, column list) pairs in return order. The *_cols names are
    # not defined inside this function; they come from the enclosing scope.
    requests = (
        ("store_sales", store_sales_cols),
        ("item", item_cols),
        ("customer", customer_cols),
        ("store", store_cols),
        ("date_dim", date_cols),
        ("customer_address", customer_address_cols),
        ("promotion", promotion_cols),
    )
    return tuple(
        reader.read(table, relevant_cols=columns) for table, columns in requests
    )
コード例 #4
0
ファイル: tpcx_bb_query_07.py プロジェクト: terryjx/tpcx-bb
def read_tables():
    """Read the six tables this query needs.

    The data directory is taken from the ``cli_args`` mapping defined outside
    this function.

    Returns:
        A 6-tuple of ``read`` results ordered as item, store_sales, store,
        date_dim, customer, customer_address.
    """
    read = build_reader(basepath=cli_args["data_dir"]).read

    # Tuple elements are evaluated left to right, so the tables are requested
    # in the same order in which they are returned.
    return (
        read("item", relevant_cols=["i_item_sk", "i_current_price", "i_category"]),
        read(
            "store_sales",
            relevant_cols=["ss_item_sk", "ss_customer_sk", "ss_sold_date_sk"],
        ),
        read("store", relevant_cols=["s_store_sk"]),
        read("date_dim", relevant_cols=["d_date_sk", "d_year", "d_moy"]),
        read("customer", relevant_cols=["c_customer_sk", "c_current_addr_sk"]),
        read("customer_address", relevant_cols=["ca_address_sk", "ca_state"]),
    )
コード例 #5
0
def read_tables():
    """Read product_reviews, web_sales and date_dim.

    Reader options come from the ``cli_args`` mapping defined outside this
    function.

    Returns:
        A 3-tuple ``(product_reviews, web_sales, date_dim)``; the reviews
        entry has been repartitioned to a single partition.
    """
    reader_options = {
        "data_format": cli_args["file_format"],
        "basepath": cli_args["data_dir"],
        "split_row_groups": cli_args["split_row_groups"],
    }
    reader = build_reader(**reader_options)

    # Original author's note: only int columns are read here, so collapsing
    # to one partition should scale up to sf-10k (just 26M rows).
    reviews = reader.read(
        "product_reviews",
        relevant_cols=["pr_review_rating", "pr_item_sk"],
    ).repartition(npartitions=1)

    web_sales = reader.read(
        "web_sales",
        relevant_cols=["ws_sold_date_sk", "ws_net_paid", "ws_item_sk"],
    )
    dates = reader.read("date_dim", relevant_cols=["d_date_sk", "d_date"])

    return reviews, web_sales, dates
コード例 #6
0
def read_tables():
    """Read the store_sales and store_returns tables.

    Reader options (file format, data directory, row-group splitting) come
    from the ``cli_args`` mapping defined outside this function.

    Returns:
        A 2-tuple ``(store_sales, store_returns)`` holding the results of the
        two ``read`` calls.
    """
    reader = build_reader(
        data_format=cli_args["file_format"],
        basepath=cli_args["data_dir"],
        split_row_groups=cli_args["split_row_groups"],
    )

    sales = reader.read(
        "store_sales",
        relevant_cols=[
            "ss_customer_sk",
            "ss_ticket_number",
            "ss_item_sk",
            "ss_net_paid",
        ],
    )
    returns = reader.read(
        "store_returns",
        relevant_cols=[
            "sr_item_sk",
            "sr_customer_sk",
            "sr_ticket_number",
            "sr_return_amt",
        ],
    )
    return sales, returns
コード例 #7
0
ファイル: tpcx_bb_query_09.py プロジェクト: eyal0/tpcx-bb
def read_tables(config):
    """Read the five tables this query needs.

    Args:
        config: Mapping that provides "file_format", "data_dir" and
            "split_row_groups", all forwarded to ``build_reader``.

    Returns:
        A 5-tuple of ``read`` results ordered as store_sales,
        customer_address, customer_demographics, date_dim, store.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    # Each entry is (table name, columns requested); order matches the return.
    wanted = (
        (
            "store_sales",
            [
                "ss_quantity",
                "ss_sold_date_sk",
                "ss_addr_sk",
                "ss_store_sk",
                "ss_cdemo_sk",
                "ss_sales_price",
                "ss_net_profit",
            ],
        ),
        ("customer_address", ["ca_address_sk", "ca_country", "ca_state"]),
        (
            "customer_demographics",
            ["cd_demo_sk", "cd_marital_status", "cd_education_status"],
        ),
        ("date_dim", ["d_year", "d_date_sk"]),
        ("store", ["s_store_sk"]),
    )
    sales, addresses, demographics, dates, stores = (
        reader.read(table, relevant_cols=columns) for table, columns in wanted
    )

    return sales, addresses, demographics, dates, stores
コード例 #8
0
def read_tables():
    """Read store_sales, web_sales and date_dim, passing ``index=False``.

    Reader options come from the ``cli_args`` mapping defined outside this
    function.

    Returns:
        A 3-tuple ``(store_sales, web_sales, date_dim)`` of ``read`` results.
    """
    reader = build_reader(
        data_format=cli_args["file_format"],
        basepath=cli_args["data_dir"],
        split_row_groups=cli_args["split_row_groups"],
    )

    def load(table, columns):
        # Every table in this query is read with index=False.
        return reader.read(table, relevant_cols=columns, index=False)

    store_sales = load(
        "store_sales",
        ["ss_customer_sk", "ss_sold_date_sk", "ss_ticket_number", "ss_net_paid"],
    )
    web_sales = load(
        "web_sales",
        ["ws_bill_customer_sk", "ws_sold_date_sk", "ws_order_number", "ws_net_paid"],
    )
    dates = load("date_dim", ["d_date_sk", "d_date"])

    return store_sales, web_sales, dates
コード例 #9
0
ファイル: tpcx_bb_query_07.py プロジェクト: eyal0/tpcx-bb
def read_tables(config):
    """Read the six tables this query needs.

    Args:
        config: Mapping that provides "file_format", "data_dir" and
            "split_row_groups", all forwarded to ``build_reader``.

    Returns:
        A 6-tuple of ``read`` results ordered as item, store_sales, store,
        date_dim, customer, customer_address.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    # Columns requested from each table, keyed by table name.
    columns = {
        "item": ["i_item_sk", "i_current_price", "i_category"],
        "store_sales": ["ss_item_sk", "ss_customer_sk", "ss_sold_date_sk"],
        "store": ["s_store_sk"],
        "date_dim": ["d_date_sk", "d_year", "d_moy"],
        "customer": ["c_customer_sk", "c_current_addr_sk"],
        "customer_address": ["ca_address_sk", "ca_state"],
    }

    items = reader.read("item", relevant_cols=columns["item"])
    sales = reader.read("store_sales", relevant_cols=columns["store_sales"])
    stores = reader.read("store", relevant_cols=columns["store"])
    dates = reader.read("date_dim", relevant_cols=columns["date_dim"])
    customers = reader.read("customer", relevant_cols=columns["customer"])
    addresses = reader.read(
        "customer_address", relevant_cols=columns["customer_address"]
    )

    return items, sales, stores, dates, customers, addresses
コード例 #10
0
ファイル: tpcx_bb_query_27.py プロジェクト: terryjx/tpcx-bb
def read_tables():
    """Read the product_reviews table.

    The data directory is taken from the ``cli_args`` mapping defined outside
    this function.

    Returns:
        The result of reading "product_reviews" with the item key, review
        content and review key columns requested.
    """
    # split_row_groups=True: splitting by row groups for better parallelism
    # (original author's note).
    reader = build_reader(
        basepath=cli_args["data_dir"],
        split_row_groups=True,
    )
    return reader.read(
        "product_reviews",
        relevant_cols=["pr_item_sk", "pr_review_content", "pr_review_sk"],
    )
コード例 #11
0
def read_tables(config):
    """Read the web_clickstreams table.

    Args:
        config: Mapping that provides "file_format", "data_dir" and
            "split_row_groups", all forwarded to ``build_reader``.

    Returns:
        The result of reading "web_clickstreams" with the user, item, click
        date and click time key columns requested.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )
    return reader.read(
        "web_clickstreams",
        relevant_cols=[
            "wcs_user_sk",
            "wcs_item_sk",
            "wcs_click_date_sk",
            "wcs_click_time_sk",
        ],
    )
コード例 #12
0
ファイル: tpcx_bb_query_24.py プロジェクト: terryjx/tpcx-bb
def read_tables():
    """Read web_sales, item, item_marketprices and store_sales.

    The data directory comes from ``cli_args`` and the column lists from
    ``ws_cols``, ``item_cols``, ``imp_cols`` and ``ss_cols``; all of these are
    defined outside this function.

    Returns:
        A 4-tuple ``(web_sales, item, item_marketprices, store_sales)`` of
        ``read`` results.
    """
    reader = build_reader(basepath=cli_args["data_dir"])

    # (table name, column list) pairs, listed in return order.
    requests = (
        ("web_sales", ws_cols),
        ("item", item_cols),
        ("item_marketprices", imp_cols),
        ("store_sales", ss_cols),
    )
    return tuple(
        reader.read(table, relevant_cols=columns) for table, columns in requests
    )
コード例 #13
0
def read_tables():
    """Read store_sales, date_dim and item.

    ``cli_args`` and the ``store_sales_cols``, ``date_cols`` and ``item_cols``
    column lists are defined outside this function.

    Returns:
        A 3-tuple ``(store_sales, date_dim, item)`` of ``read`` results.
    """
    read = build_reader(basepath=cli_args["data_dir"]).read

    # Elements are evaluated left to right: store_sales, date_dim, then item.
    return (
        read("store_sales", relevant_cols=store_sales_cols),
        read("date_dim", relevant_cols=date_cols),
        read("item", relevant_cols=item_cols),
    )
コード例 #14
0
ファイル: tpcx_bb_query_26.py プロジェクト: terryjx/tpcx-bb
def read_tables():
    """Read store_sales and item, passing ``index=False`` to each read.

    The data directory is taken from the ``cli_args`` mapping defined outside
    this function.

    Returns:
        A 2-tuple ``(store_sales, item)`` of ``read`` results.
    """
    reader = build_reader(basepath=cli_args["data_dir"])

    def load(table, columns):
        # Both tables are read with index=False.
        return reader.read(table, relevant_cols=columns, index=False)

    sales = load("store_sales", ["ss_customer_sk", "ss_item_sk"])
    items = load("item", ["i_item_sk", "i_category", "i_class_id"])

    return sales, items
コード例 #15
0
ファイル: tpcx_bb_query_01.py プロジェクト: eyal0/tpcx-bb
def read_tables(config):
    """Read the item and store_sales tables.

    Args:
        config: Mapping that provides "file_format", "data_dir" and
            "split_row_groups", all forwarded to ``build_reader``.

    Returns:
        A 2-tuple ``(item, store_sales)`` of ``read`` results. The column
        lists ``item_cols`` and ``ss_cols`` are defined outside this function.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )
    # item is requested first, then store_sales (left-to-right evaluation).
    return (
        reader.read("item", relevant_cols=item_cols),
        reader.read("store_sales", relevant_cols=ss_cols),
    )
コード例 #16
0
def read_tables(config):
    """Read the item table.

    Args:
        config: Mapping that provides "file_format", "data_dir" and
            "split_row_groups", all forwarded to ``build_reader``.

    Returns:
        The result of reading "item" with the category id and item key
        columns requested.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )
    return reader.read("item", relevant_cols=["i_category_id", "i_item_sk"])
コード例 #17
0
ファイル: tpcx_bb_query_16.py プロジェクト: terryjx/tpcx-bb
def read_tables():
    """Read web_sales, web_returns, date_dim, item and warehouse.

    ``cli_args`` and the column lists (``websale_cols``, ``web_returns_cols``,
    ``date_cols``, ``item_cols``, ``warehouse_cols``) are defined outside this
    function.

    Returns:
        A 5-tuple of ``read`` results ordered as web_sales, web_returns,
        date_dim, item, warehouse.
    """
    reader = build_reader(basepath=cli_args["data_dir"])

    # (table name, column list) pairs, listed in return order.
    requests = (
        ("web_sales", websale_cols),
        ("web_returns", web_returns_cols),
        ("date_dim", date_cols),
        ("item", item_cols),
        ("warehouse", warehouse_cols),
    )
    return tuple(
        reader.read(table, relevant_cols=columns) for table, columns in requests
    )
コード例 #18
0
def read_tables():
    """Read item, store_sales and web_clickstreams.

    ``cli_args`` and the ``item_cols``, ``store_sales_cols`` and
    ``web_clickstreams_cols`` column lists are defined outside this function.

    Returns:
        A 3-tuple ``(item, store_sales, web_clickstreams)`` of ``read``
        results.
    """
    reader = build_reader(basepath=cli_args["data_dir"])

    items = reader.read("item", relevant_cols=item_cols)
    sales = reader.read("store_sales", relevant_cols=store_sales_cols)
    clicks = reader.read("web_clickstreams", relevant_cols=web_clickstreams_cols)

    return items, sales, clicks
コード例 #19
0
def read_tables(config):
    """Read the item and web_sales tables.

    Args:
        config: Mapping that provides the "file_format" and "data_dir"
            entries forwarded to ``build_reader``.

    Returns:
        A 2-tuple ``(item, web_sales)`` of ``read`` results.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
    )

    items = reader.read("item", relevant_cols=["i_item_sk", "i_category_id"])
    web_sales = reader.read(
        "web_sales",
        relevant_cols=["ws_order_number", "ws_item_sk"],
    )

    return items, web_sales
コード例 #20
0
def read_tables():
    """Read the product_reviews table.

    The data directory is taken from the ``cli_args`` mapping defined outside
    this function.

    Returns:
        The result of reading "product_reviews" with the review content,
        rating and review key columns requested.
    """
    # split_row_groups=True: splitting by row groups for better parallelism
    # (original author's note).
    reader = build_reader(
        basepath=cli_args["data_dir"],
        split_row_groups=True,
    )
    return reader.read(
        "product_reviews",
        relevant_cols=["pr_review_content", "pr_review_rating", "pr_review_sk"],
    )
コード例 #21
0
def read_tables():
    """Read date_dim, web_page and web_sales.

    The data directory is taken from the ``cli_args`` mapping defined outside
    this function.

    Returns:
        A 3-tuple ``(date_dim, web_page, web_sales)`` of ``read`` results.
    """
    read = build_reader(basepath=cli_args["data_dir"]).read

    # Elements are evaluated left to right, matching the return order.
    return (
        read("date_dim", relevant_cols=["d_date_sk", "d_date"]),
        read("web_page", relevant_cols=["wp_web_page_sk", "wp_type"]),
        read(
            "web_sales",
            relevant_cols=["ws_net_paid", "ws_order_number", "ws_sold_date_sk"],
        ),
    )
コード例 #22
0
ファイル: tpcx_bb_query_05.py プロジェクト: terryjx/tpcx-bb
def read_tables():
    """Read item, customer and customer_demographics with ``index=False``.

    ``cli_args`` and the ``items_columns``, ``customer_columns`` and
    ``customer_dem_columns`` column lists are defined outside this function.

    Returns:
        A 3-tuple ``(item, customer, customer_demographics)`` of ``read``
        results.
    """
    reader = build_reader(basepath=cli_args["data_dir"])

    # (table name, column list) pairs, listed in return order.
    requests = (
        ("item", items_columns),
        ("customer", customer_columns),
        ("customer_demographics", customer_dem_columns),
    )
    return tuple(
        reader.read(table, relevant_cols=columns, index=False)
        for table, columns in requests
    )
コード例 #23
0
def read_tables():
    """Read web_sales, item, item_marketprices and store_sales.

    Reader options come from ``cli_args`` and the column lists from
    ``ws_cols``, ``item_cols``, ``imp_cols`` and ``ss_cols``; all of these are
    defined outside this function.

    Returns:
        A 4-tuple ``(web_sales, item, item_marketprices, store_sales)`` of
        ``read`` results.
    """
    reader_options = {
        "data_format": cli_args["file_format"],
        "basepath": cli_args["data_dir"],
        "split_row_groups": cli_args["split_row_groups"],
    }
    reader = build_reader(**reader_options)

    web_sales = reader.read("web_sales", relevant_cols=ws_cols)
    items = reader.read("item", relevant_cols=item_cols)
    market_prices = reader.read("item_marketprices", relevant_cols=imp_cols)
    store_sales = reader.read("store_sales", relevant_cols=ss_cols)

    return web_sales, items, market_prices, store_sales
コード例 #24
0
ファイル: tpcx_bb_query_16.py プロジェクト: eyal0/tpcx-bb
def read_tables(config):
    """Read web_sales, web_returns, date_dim, item and warehouse.

    Args:
        config: Mapping that provides "file_format", "data_dir" and
            "split_row_groups", all forwarded to ``build_reader``.

    Returns:
        A 5-tuple of ``read`` results ordered as web_sales, web_returns,
        date_dim, item, warehouse. The ``*_cols`` column lists are defined
        outside this function.
    """
    read = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    ).read

    sales = read("web_sales", relevant_cols=websale_cols)
    returns = read("web_returns", relevant_cols=web_returns_cols)
    dates = read("date_dim", relevant_cols=date_cols)
    items = read("item", relevant_cols=item_cols)
    warehouses = read("warehouse", relevant_cols=warehouse_cols)

    return sales, returns, dates, items, warehouses
コード例 #25
0
def read_tables():
    """Read store_sales, date_dim and item.

    Reader options come from ``cli_args``; the ``store_sales_cols``,
    ``date_cols`` and ``item_cols`` column lists are likewise defined outside
    this function.

    Returns:
        A 3-tuple ``(store_sales, date_dim, item)`` of ``read`` results.
    """
    reader = build_reader(
        data_format=cli_args["file_format"],
        basepath=cli_args["data_dir"],
        split_row_groups=cli_args["split_row_groups"],
    )

    # (table name, column list) pairs, listed in return order.
    requests = (
        ("store_sales", store_sales_cols),
        ("date_dim", date_cols),
        ("item", item_cols),
    )
    sales, dates, items = (
        reader.read(table, relevant_cols=columns) for table, columns in requests
    )

    return sales, dates, items
コード例 #26
0
ファイル: tpcx_bb_query_23.py プロジェクト: terryjx/tpcx-bb
def read_tables():
    """Read the date_dim and inventory tables.

    The data directory is taken from the ``cli_args`` mapping defined outside
    this function.

    Returns:
        A 2-tuple ``(date_dim, inventory)`` of ``read`` results.
    """
    reader = build_reader(basepath=cli_args["data_dir"])

    dates = reader.read(
        "date_dim",
        relevant_cols=["d_date_sk", "d_year", "d_moy"],
    )
    inventory = reader.read(
        "inventory",
        relevant_cols=[
            "inv_warehouse_sk",
            "inv_item_sk",
            "inv_date_sk",
            "inv_quantity_on_hand",
        ],
    )

    return dates, inventory
コード例 #27
0
def read_tables(config):
    """Read the product_reviews table.

    Args:
        config: Mapping that provides the "file_format" and "data_dir"
            entries forwarded to ``build_reader``.

    Returns:
        The result of reading "product_reviews" with the item key, review
        content and review key columns requested.
    """
    # split_row_groups is fixed to True here rather than read from config:
    # splitting by row groups for better parallelism (original author's note).
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=True,
    )
    return reader.read(
        "product_reviews",
        relevant_cols=["pr_item_sk", "pr_review_content", "pr_review_sk"],
    )
コード例 #28
0
ファイル: tpcx_bb_query_08.py プロジェクト: eyal0/tpcx-bb
def read_tables(config):
    """Read date_dim, web_page and web_sales.

    Args:
        config: Mapping that provides "file_format", "data_dir" and
            "split_row_groups", all forwarded to ``build_reader``.

    Returns:
        A 3-tuple ``(date_dim, web_page, web_sales)`` of ``read`` results.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    # Columns requested from each table, keyed by table name.
    columns = {
        "date_dim": ["d_date_sk", "d_date"],
        "web_page": ["wp_web_page_sk", "wp_type"],
        "web_sales": ["ws_net_paid", "ws_order_number", "ws_sold_date_sk"],
    }

    dates = reader.read("date_dim", relevant_cols=columns["date_dim"])
    pages = reader.read("web_page", relevant_cols=columns["web_page"])
    sales = reader.read("web_sales", relevant_cols=columns["web_sales"])

    return dates, pages, sales
コード例 #29
0
ファイル: tpcx_bb_query_04.py プロジェクト: terryjx/tpcx-bb
def read_tables():
    """Read the web_page and web_clickstreams tables.

    The data directory is taken from the ``cli_args`` mapping defined outside
    this function.

    Returns:
        A 2-tuple ``(web_page, web_clickstreams)`` of ``read`` results.
    """
    reader = build_reader(basepath=cli_args["data_dir"])

    web_pages = reader.read(
        "web_page",
        relevant_cols=["wp_type", "wp_web_page_sk"],
    )
    clickstreams = reader.read(
        "web_clickstreams",
        relevant_cols=[
            "wcs_user_sk",
            "wcs_click_date_sk",
            "wcs_click_time_sk",
            "wcs_web_page_sk",
            "wcs_sales_sk",
        ],
    )

    return web_pages, clickstreams
コード例 #30
0
def read_tables(config):
    """Read store_sales and item, passing ``index=False`` to each read.

    Args:
        config: Mapping that provides "file_format", "data_dir" and
            "split_row_groups", all forwarded to ``build_reader``.

    Returns:
        A 2-tuple ``(store_sales, item)`` of ``read`` results.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    def load(table, columns):
        # Both tables are read with index=False.
        return reader.read(table, relevant_cols=columns, index=False)

    sales = load("store_sales", ["ss_customer_sk", "ss_item_sk"])
    items = load("item", ["i_item_sk", "i_category", "i_class_id"])

    return sales, items