コード例 #1
0
ファイル: helper.py プロジェクト: mpdevs/mplib
def execute_hive_predict_data(category_id,
                              is_event=False,
                              show_sql=False,
                              udf="udf_item_pricing13.py"):
    from mplib.IO import Hive
    sql = """
        USE elengjing_price;
        ADD FILE /home/script/normal_servers/serverudf/elengjing/{udf};
        ADD FILE /home/script/normal_servers/serverudf/elengjing/data/pricing_model/sklearn_randomforest_{category_id}_{model_type}.pickle;

        DROP TABLE IF EXISTS tmp_{category_id};
        CREATE TABLE tmp_{category_id} (
            itemid STRING,
            data STRING
        )CLUSTERED BY (itemid) INTO 1017 BUCKETS
        STORED AS ORC;

        INSERT INTO tmp_{category_id}
        SELECT
            i.itemid,
            CONCAT_WS(
                ',',
                NVL(CAST(i.avg_price AS STRING), '0'),
                NVL(CAST((UNIX_TIMESTAMP() - TO_UNIX_TIMESTAMP(i.listeddate)) / 86400.0 AS STRING), '0'),
                NVL(CAST(shop.level AS STRING), '0'),
                NVL(CAST(shop.favor AS STRING), '0'),
                NVL(CAST(s.all_spu AS STRING), '0'),
                NVL(CAST(s.all_sales_rank AS STRING), '0'),
                NVL(CAST(sc.category_spu AS STRING), '0'),
                NVL(CAST(sc.category_sales_rank AS STRING), '0'),
                NVL(CAST(s.shop_avg_price AS STRING), '0'),
                NVL(CAST(s.shop_avg_price_rank AS STRING), '0'),
                NVL(CAST(sc.category_avg_price AS STRING), '0'),
                NVL(CAST(sc.category_avg_price_rank AS STRING), '0'),
                v.data
            ) AS data
        FROM (
            SELECT
                shopid, itemid,
                MAX(listeddate) AS listeddate,
                CASE WHEN SUM(salesqty) = 0 THEN 0 ELSE SUM(salesamt) / SUM(salesqty) END AS avg_price
            FROM transforms.women_clothing_item_new_dict
            WHERE categoryid = {category_id}
            GROUP BY
                shopid, itemid
        ) AS i
        JOIN (
            SELECT
                shopid,
                COUNT(DISTINCT itemid) AS all_spu,
                CASE WHEN SUM(salesqty) = 0 THEN 0 ELSE SUM(salesamt) / SUM(salesqty) END AS shop_avg_price,
                ROW_NUMBER() OVER(ORDER BY SUM(salesamt) DESC) AS all_sales_rank,
                ROW_NUMBER() OVER(ORDER BY CASE WHEN SUM(salesqty) = 0 THEN 0 ELSE SUM(salesamt) / SUM(salesqty) END DESC) AS shop_avg_price_rank
            FROM transforms.women_clothing_item_new_dict
            GROUP BY
                shopid
        ) AS s
        ON i.shopid = s.shopid
        JOIN (
            SELECT
                shopid,
                COUNT(DISTINCT itemid) AS category_spu,
                ROW_NUMBER() OVER(ORDER BY SUM(salesamt) DESC) AS category_sales_rank,
                ROW_NUMBER() OVER(ORDER BY CASE WHEN SUM(salesqty) = 0 THEN 0 ELSE SUM(salesamt) / SUM(salesqty) END DESC) AS category_avg_price_rank,
                CASE WHEN SUM(salesqty) = 0 THEN 0 ELSE SUM(salesamt) / SUM(salesqty) END AS category_avg_price
            FROM transforms.women_clothing_item_new_dict
            WHERE categoryid = {category_id}
            GROUP BY
                shopid
            ) AS sc
        ON i.shopid = sc.shopid
        JOIN elengjing_base.shop AS shop
        ON i.shopid = shop.shopid
        JOIN elengjing_price.attr_itemid_value_matrix_{category_id} AS v
        ON i.itemid = v.itemid;

        DROP TABLE IF EXISTS predict_{category_id}_{model_type};
        CREATE TABLE predict_{category_id}_{model_type} (
            itemid STRING,
            price DECIMAL(20,2)
        ) STORED AS ORC;

        INSERT INTO predict_{category_id}_{model_type}
        SELECT TRANSFORM(itemid, data)
        USING 'python {udf} {category_id} {model_type}' AS (itemid, price)
        FROM tmp_{category_id};

        DROP TABLE IF EXISTS tmp_{category_id};
        """.format(category_id=category_id,
                   model_type="event" if is_event else "daily",
                   udf=udf)
    if show_sql:
        print(sql)
    else:
        Hive("idc").execute(sql)
コード例 #2
0
ファイル: rawdata_preprocess.py プロジェクト: mpdevs/mplib
def create_attrvalue(category_id):
    sql = """
    -------- 1 产生中间表
    DROP TABLE IF EXISTS mpintranet.attrvalue;
    CREATE TABLE IF NOT EXISTS mpintranet.attrvalue(attrvalue STRING)
    CLUSTERED BY (attrvalue) INTO 113 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE mpintranet.attrvalue
    SELECT CONCAT(attrname, '_', attrvalue) AS attrvalue
    FROM elengjing.women_clothing_item_attr
    WHERE categoryid = {0}
    GROUP BY CONCAT(attrname, '_', attrvalue);

    DROP TABLE IF EXISTS mpintranet.itemid;
    CREATE TABLE IF NOT EXISTS mpintranet.itemid(itemid  BIGINT)
    CLUSTERED BY (itemid) INTO 113 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE mpintranet.itemid
    SELECT itemid
    FROM elengjing.women_clothing_item_attr
    WHERE categoryid = {0}
    GROUP BY itemid;

    DROP TABLE IF EXISTS mpintranet.item_attr;
    CREATE TABLE IF NOT EXISTS mpintranet.item_attr(
        itemid BIGINT,
        attrvalue STRING
    )
    CLUSTERED BY (itemid) INTO 581 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE mpintranet.item_attr
    SELECT
        itemid,
        CONCAT(attrname, '_', attrvalue) AS attrvalue
    FROM elengjing.women_clothing_item_attr
    WHERE categoryid = {0}
    GROUP BY itemid, CONCAT(attrname, '_', attrvalue);

    ------- 产生全量属性表
    DROP TABLE IF EXISTS mpintranet.itemid_attrvalue;
    CREATE TABLE IF NOT EXISTS mpintranet.itemid_attrvalue(
        itemid  BIGINT,
        attrvalue  STRING
    )
    CLUSTERED BY (itemid) INTO 581 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE mpintranet.itemid_attrvalue
    SELECT
        b.itemid,
        a.attrvalue
    FROM mpintranet.attrvalue AS a
    CROSS JOIN mpintranet.itemid AS b;

    ---------JOIN 全量表, 产出结果
    DROP TABLE IF EXISTS mpintranet.itemid_attrvalue_tag;
    CREATE TABLE IF NOT EXISTS mpintranet.itemid_attrvalue_tag(
        itemid BIGINT,
        attrvalue STRING,
        tag STRING
    )
    CLUSTERED BY (itemid) INTO 783 BUCKETS
    STORED AS ORC;

    SET mapred.reduce.tasks=500;
    INSERT INTO TABLE mpintranet.itemid_attrvalue_tag
    SELECT
        a.itemid,
        a.attrvalue,
        IF(b.itemid IS NULL, '0', '1') AS tag
    FROM mpintranet.itemid_attrvalue a
    LEFT JOIN mpintranet.item_attr b
    ON a.itemid = b.itemid
    AND a.attrvalue = b.attrvalue;
    ---------- JOIN 结果表
    DROP TABLE IF EXISTS mpintranet.attrvalue_result;
    CREATE TABLE IF NOT EXISTS mpintranet.attrvalue_result(
        itemid BIGINT,
        attrvalue STRING,
        tag STRING
    )
    CLUSTERED BY (itemid) INTO 113 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE mpintranet.attrvalue_result
    SELECT
        t.itemid,
        GROUP_CONCAT(t.attrvalue, ",") AS attrvalue,
        GROUP_CONCAT(t.tag, ",") AS tag
    FROM
    (
        SELECT
            itemid,
            attrvalue,
            tag,
            SPLIT(attrvalue, ',')[0] AS attrname
        FROM mpintranet.itemid_attrvalue_tag
        DISTRIBUTE BY itemid
        SORT BY attrname DESC
    ) AS t
    GROUP BY t.itemid;
    ------------end-------------
    DROP TABLE IF EXISTS mpintranet.attrvalue_export;
    CREATE TABLE IF NOT EXISTS mpintranet.attrvalue_export(
        itemid BIGINT,
        tag STRING
    )
    CLUSTERED BY (itemid) INTO 113 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE mpintranet.attrvalue_export
    SELECT
        itemid,
        tag
    FROM mpintranet.attrvalue_result;
    """
    # print(sql.format(category_id))
    Hive(env=ENV).execute(sql.format(category_id))
コード例 #3
0
def get_gold_from_hive():
    sql = "SELECT * FROM t_elengjing.competitive_item_train_stage_3"
    pickle_dump("train_data", Hive(env="idc").query(sql, to_dict=False))
コード例 #4
0
def insert_by_category(**kwargs):
    sql = """
        USE elengjing_tj;
        DROP TABLE IF EXISTS t_shop_{shop_id}_case_{case}_customer;
        CREATE TABLE t_shop_{shop_id}_case_{case}_customer (
            shopid BIGINT,
            categoryid BIGINT,
            itemid BIGINT,
            platformid BIGINT,
            avg_price FLOAT
        )
        CLUSTERED BY (itemid) INTO 113 BUCKETS
        STORED AS ORC;


        INSERT INTO TABLE t_shop_{shop_id}_case_{case}_customer
        SELECT
                shopid,
                categoryid,
                itemid,
                platformid,
                CASE
                    WHEN NVL(SUM(salesqty), 0) = 0
                    THEN 0
                    ELSE 1.0 * NVL(SUM(salesamt), 0) / SUM(salesqty)
                END AS avg_price
            FROM elengjing.women_clothing_item
            WHERE shopid = {shop_id}
            AND categoryid = {category_id}
            GROUP BY
                shopid,
                categoryid,
                itemid,
                platformid;


        DROP TABLE IF EXISTS t_shop_{shop_id}_case_{case}_target;
        CREATE TABLE t_shop_{shop_id}_case_{case}_target (
            shopid BIGINT,
            categoryid BIGINT,
            itemid BIGINT,
            platformid BIGINT,
            avg_price FLOAT
        )
        CLUSTERED BY (itemid) INTO 113 BUCKETS
        STORED AS ORC;

        INSERT INTO TABLE t_shop_{shop_id}_case_{case}_target
        SELECT
            b.shopid,
            b.categoryid,
            b.itemid,
            b.platformid,
            b.avg_price
        FROM (
            SELECT
                a.shopid,
                a.categoryid,
                a.itemid,
                a.platformid,
                a.avg_price,
                1.0 * SUM(a.qty) OVER (ORDER BY a.qty DESC, a.qty ROWS UNBOUNDED PRECEDING) / t.qty AS c_ratio
            FROM (
                SELECT
                    shopid,
                    categoryid,
                    itemid,
                    platformid,
                    CASE
                        WHEN NVL(SUM(salesqty), 0) = 0
                        THEN 0
                        ELSE 1.0 * NVL(SUM(salesamt), 0) / SUM(salesqty)
                    END AS avg_price,
                    SUM(salesqty) AS qty
                FROM elengjing.women_clothing_item
                WHERE shopid != {shop_id}
                AND categoryid = {category_id}
                GROUP BY
                    shopid,
                    categoryid,
                    itemid,
                    platformid
                ORDER BY qty DESC
            ) AS a
            CROSS JOIN (
                SELECT
                    SUM(salesqty) AS qty
                FROM elengjing.women_clothing_item
                WHERE shopid != {shop_id}
                AND categoryid = {category_id}
            ) AS t
        ) AS b
        WHERE b.c_ratio <= 0.8;


        INSERT INTO elengjing_tj.t_shop_{shop_id}_case_{case}
        SELECT
            c.categoryid AS category_id,
            c.shopid AS customer_shop_id,
            c.itemid AS customer_item_id,
            c.platformid AS customer_platform_id,
            t.shopid AS target_shop_id,
            t.itemid AS target_item_id,
            t.platformid AS target_platform_id
        FROM t_shop_{shop_id}_case_{case}_customer AS c
        CROSS JOIN t_shop_{shop_id}_case_{case}_target AS t
        WHERE t.avg_price BETWEEN 0.9 * c.avg_price AND 1.1 * c.avg_price;
        """.format(**kwargs)
    Hive("idc").execute(sql)
コード例 #5
0
ファイル: rawdata_preprocess.py プロジェクト: mpdevs/mplib
def create_attrname(category_id):
    sql = """
    DROP TABLE IF EXISTS mpintranet.attrname;
    CREATE TABLE IF NOT EXISTS mpintranet.attrname(attrname STRING)
    CLUSTERED BY (attrname) INTO 113 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE mpintranet.attrname
    SELECT attrname
    FROM elengjing.women_clothing_item_attr
    WHERE categoryid = {0}
    GROUP BY attrname;

    DROP TABLE IF EXISTS mpintranet.itemid;
    CREATE TABLE IF NOT EXISTS mpintranet.itemid(itemid BIGINT)
    CLUSTERED BY (itemid) INTO 113 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE mpintranet.itemid
    SELECT itemid
    FROM elengjing.women_clothing_item_attr
    WHERE categoryid = {0}
    GROUP BY itemid;

    DROP TABLE IF EXISTS mpintranet.item_attr;
    CREATE TABLE IF NOT EXISTS mpintranet.item_attr(
        itemid BIGINT,
        attrvalue STRING
    )
    CLUSTERED BY (itemid) INTO 581 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE mpintranet.item_attr
    SELECT
        itemid,
        CONCAT(attrname, '_', attrvalue) AS attrvalue
    FROM elengjing.women_clothing_item_attr
    WHERE categoryid = {0}
    GROUP BY itemid, CONCAT(attrname, '_', attrvalue);

    DROP TABLE IF EXISTS mpintranet.itemid_attrname;
    CREATE TABLE IF NOT EXISTS mpintranet.itemid_attrname(
        itemid  BIGINT,
        attrname  STRING
    )
    CLUSTERED BY (itemid) INTO 581 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE mpintranet.itemid_attrname
    SELECT
        b.itemid,
        a.attrname
    FROM mpintranet.attrname a
    CROSS JOIN mpintranet.itemid b;

    DROP TABLE IF EXISTS mpintranet.itemid_attrname_attrvalue;
    CREATE TABLE IF NOT EXISTS mpintranet.itemid_attrname_attrvalue(
        itemid  BIGINT,
        attrname  STRING,
        attrvalue  STRING
    )
    CLUSTERED BY (itemid) INTO 783 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE mpintranet.itemid_attrname_attrvalue
    SELECT
        a.itemid,
        a.attrname,
        NVL(b.attrvalue, '') AS attrvalue
    FROM mpintranet.itemid_attrname a
    LEFT JOIN mpintranet.item_attr b
    ON a.itemid = b.itemid
    AND a.attrname = b.attrname;

    DROP TABLE IF EXISTS mpintranet.attrname_result;
    CREATE TABLE IF NOT EXISTS mpintranet.attrname_result(
        itemid  BIGINT,
        attrname  STRING,
        attrvalue  STRING
    )
    CLUSTERED BY (itemid) INTO 113 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE mpintranet.attrname_result
    SELECT
        t.itemid,
        GROUP_CONCAT(t.attrname, ",") AS attrname,
        GROUP_CONCAT(t.attrvalue, ",") AS attrvalue
    FROM
    (
        SELECT
            itemid,
            attrname,
            attrvalue
        FROM mpintranet.itemid_attrname_attrvalue
        DISTRIBUTE BY itemid
        SORT BY attrname DESC
    ) AS t
    GROUP BY t.itemid;

    DROP TABLE IF EXISTS mpintranet.attrname_export;
    CREATE TABLE IF NOT EXISTS mpintranet.attrname_export(
        itemid  BIGINT,
        attrvalue  STRING
    )
    CLUSTERED BY (itemid) INTO 113 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE mpintranet.attrname_export
    SELECT
        itemid,
        attrvalue
    FROM mpintranet.attrname_result;
    """
    Hive(env=ENV).execute(sql.format(category_id))
コード例 #6
0
def insert_by_category(**kwargs):
    sql = """
        USE elengjing_tj;
        DROP TABLE IF EXISTS t_shop_{shop_id}_customer;
        CREATE TABLE t_shop_{shop_id}_customer (
            shopid BIGINT,
            categoryid BIGINT,
            itemid BIGINT,
            platformid BIGINT,
            avg_price FLOAT
        )
        CLUSTERED BY (itemid) INTO 791 BUCKETS
        STORED AS ORC;

        INSERT INTO TABLE t_shop_{shop_id}_customer
        SELECT
                shopid,
                categoryid,
                itemid,
                platformid,
                CASE
                    WHEN NVL(SUM(salesqty), 0) = 0
                    THEN 0
                    ELSE 1.0 * NVL(SUM(salesamt), 0) / SUM(salesqty)
                END AS avg_price
            FROM elengjing.women_clothing_item
            WHERE shopid = {shop_id}
            AND categoryid = {category_id}
            GROUP BY
                shopid,
                categoryid,
                itemid,
                platformid;

        DROP TABLE IF EXISTS t_shop_{shop_id}_target;
        CREATE TABLE t_shop_{shop_id}_target (
            shopid BIGINT,
            categoryid BIGINT,
            itemid BIGINT,
            platformid BIGINT,
            avg_price FLOAT
        )
        CLUSTERED BY (itemid) INTO 791 BUCKETS
        STORED AS ORC;

        INSERT INTO t_shop_{shop_id}_target
        SELECT
            shopid,
            categoryid,
            itemid,
            platformid,
            CASE
                WHEN NVL(SUM(salesqty), 0) = 0
                THEN 0
                ELSE 1.0 * NVL(SUM(salesamt), 0) / SUM(salesqty)
            END AS avg_price
        FROM elengjing.women_clothing_item
        WHERE shopid != {shop_id}
        AND categoryid = {category_id}
        GROUP BY
            shopid,
            categoryid,
            itemid,
            platformid;

        INSERT INTO elengjing_tj.t_shop_{shop_id}
        SELECT
            c.categoryid AS category_id,
            c.shopid AS customer_shop_id,
            c.itemid AS customer_item_id,
            c.platformid AS customer_platform_id,
            t.shopid AS target_shop_id,
            t.itemid AS target_item_id,
            t.platformid AS target_platform_id
        FROM t_shop_{shop_id}_customer AS c
        CROSS JOIN t_shop_{shop_id}_target AS t
        WHERE t.avg_price >= 0.9 * c.avg_price AND t.avg_price <= 1.1 * c.avg_price;
        """.format(**kwargs)
    if kwargs.get("debug"):
        print(sql)
    else:
        Hive("idc").execute(sql)
コード例 #7
0
ファイル: patch.py プロジェクト: mpdevs/mplib
def get_table_count(table_name):
    sql = "SELECT COUNT(*) AS cnt FROM das.{0}".format(table_name)
    return Hive("local").query(sql)[0].get("cnt")
コード例 #8
0
def get_data():
    sql = "SELECT COUNT(*) AS cnt FROM das.ci_diaper2014to2016_0406_allplatform_distinct AS d WHERE d.year = '2016'"
    data = Hive(env="local").query(sql)
    pickle_dump("raw", data)
コード例 #9
0
def show_tables():
    print(Hive().query("show tables"))
コード例 #10
0
# coding: utf-8
# __author__: u"John"
from __future__ import unicode_literals, absolute_import, print_function, division
from mplib.IO import Hive, pickle_dump, pickle_load
from mplib.common import smart_encode
import editdistance


def get_data():
    sql = "SELECT COUNT(*) AS cnt FROM das.ci_diaper2014to2016_0406_allplatform_distinct AS d WHERE d.year = '2016'"
    data = Hive(env="local").query(sql)
    pickle_dump("raw", data)


def show_tables():
    print(Hive().query("show tables"))


def transform():
    pickle_dump("id_content",
                {line[0]: line[1]
                 for line in pickle_load("raw")})


if __name__ == "__main__":
    Hive.show_env_dict()
    print(Hive().query("show tables"))