def execute_hive_predict_data(category_id, is_event=False, show_sql=False, udf="udf_item_pricing13.py"): from mplib.IO import Hive sql = """ USE elengjing_price; ADD FILE /home/script/normal_servers/serverudf/elengjing/{udf}; ADD FILE /home/script/normal_servers/serverudf/elengjing/data/pricing_model/sklearn_randomforest_{category_id}_{model_type}.pickle; DROP TABLE IF EXISTS tmp_{category_id}; CREATE TABLE tmp_{category_id} ( itemid STRING, data STRING )CLUSTERED BY (itemid) INTO 1017 BUCKETS STORED AS ORC; INSERT INTO tmp_{category_id} SELECT i.itemid, CONCAT_WS( ',', NVL(CAST(i.avg_price AS STRING), '0'), NVL(CAST((UNIX_TIMESTAMP() - TO_UNIX_TIMESTAMP(i.listeddate)) / 86400.0 AS STRING), '0'), NVL(CAST(shop.level AS STRING), '0'), NVL(CAST(shop.favor AS STRING), '0'), NVL(CAST(s.all_spu AS STRING), '0'), NVL(CAST(s.all_sales_rank AS STRING), '0'), NVL(CAST(sc.category_spu AS STRING), '0'), NVL(CAST(sc.category_sales_rank AS STRING), '0'), NVL(CAST(s.shop_avg_price AS STRING), '0'), NVL(CAST(s.shop_avg_price_rank AS STRING), '0'), NVL(CAST(sc.category_avg_price AS STRING), '0'), NVL(CAST(sc.category_avg_price_rank AS STRING), '0'), v.data ) AS data FROM ( SELECT shopid, itemid, MAX(listeddate) AS listeddate, CASE WHEN SUM(salesqty) = 0 THEN 0 ELSE SUM(salesamt) / SUM(salesqty) END AS avg_price FROM transforms.women_clothing_item_new_dict WHERE categoryid = {category_id} GROUP BY shopid, itemid ) AS i JOIN ( SELECT shopid, COUNT(DISTINCT itemid) AS all_spu, CASE WHEN SUM(salesqty) = 0 THEN 0 ELSE SUM(salesamt) / SUM(salesqty) END AS shop_avg_price, ROW_NUMBER() OVER(ORDER BY SUM(salesamt) DESC) AS all_sales_rank, ROW_NUMBER() OVER(ORDER BY CASE WHEN SUM(salesqty) = 0 THEN 0 ELSE SUM(salesamt) / SUM(salesqty) END DESC) AS shop_avg_price_rank FROM transforms.women_clothing_item_new_dict GROUP BY shopid ) AS s ON i.shopid = s.shopid JOIN ( SELECT shopid, COUNT(DISTINCT itemid) AS category_spu, ROW_NUMBER() OVER(ORDER BY SUM(salesamt) DESC) AS category_sales_rank, ROW_NUMBER() OVER(ORDER BY CASE WHEN SUM(salesqty) = 0 THEN 0 ELSE SUM(salesamt) / SUM(salesqty) END DESC) AS category_avg_price_rank, CASE WHEN SUM(salesqty) = 0 THEN 0 ELSE SUM(salesamt) / SUM(salesqty) END AS category_avg_price FROM transforms.women_clothing_item_new_dict WHERE categoryid = {category_id} GROUP BY shopid ) AS sc ON i.shopid = sc.shopid JOIN elengjing_base.shop AS shop ON i.shopid = shop.shopid JOIN elengjing_price.attr_itemid_value_matrix_{category_id} AS v ON i.itemid = v.itemid; DROP TABLE IF EXISTS predict_{category_id}_{model_type}; CREATE TABLE predict_{category_id}_{model_type} ( itemid STRING, price DECIMAL(20,2) ) STORED AS ORC; INSERT INTO predict_{category_id}_{model_type} SELECT TRANSFORM(itemid, data) USING 'python {udf} {category_id} {model_type}' AS (itemid, price) FROM tmp_{category_id}; DROP TABLE IF EXISTS tmp_{category_id}; """.format(category_id=category_id, model_type="event" if is_event else "daily", udf=udf) if show_sql: print(sql) else: Hive("idc").execute(sql)
def create_attrvalue(category_id): sql = """ -------- 1 产生中间表 DROP TABLE IF EXISTS mpintranet.attrvalue; CREATE TABLE IF NOT EXISTS mpintranet.attrvalue(attrvalue STRING) CLUSTERED BY (attrvalue) INTO 113 BUCKETS STORED AS ORC; INSERT INTO TABLE mpintranet.attrvalue SELECT CONCAT(attrname, '_', attrvalue) AS attrvalue FROM elengjing.women_clothing_item_attr WHERE categoryid = {0} GROUP BY CONCAT(attrname, '_', attrvalue); DROP TABLE IF EXISTS mpintranet.itemid; CREATE TABLE IF NOT EXISTS mpintranet.itemid(itemid BIGINT) CLUSTERED BY (itemid) INTO 113 BUCKETS STORED AS ORC; INSERT INTO TABLE mpintranet.itemid SELECT itemid FROM elengjing.women_clothing_item_attr WHERE categoryid = {0} GROUP BY itemid; DROP TABLE IF EXISTS mpintranet.item_attr; CREATE TABLE IF NOT EXISTS mpintranet.item_attr( itemid BIGINT, attrvalue STRING ) CLUSTERED BY (itemid) INTO 581 BUCKETS STORED AS ORC; INSERT INTO TABLE mpintranet.item_attr SELECT itemid, CONCAT(attrname, '_', attrvalue) AS attrvalue FROM elengjing.women_clothing_item_attr WHERE categoryid = {0} GROUP BY itemid, CONCAT(attrname, '_', attrvalue); ------- 产生全量属性表 DROP TABLE IF EXISTS mpintranet.itemid_attrvalue; CREATE TABLE IF NOT EXISTS mpintranet.itemid_attrvalue( itemid BIGINT, attrvalue STRING ) CLUSTERED BY (itemid) INTO 581 BUCKETS STORED AS ORC; INSERT INTO TABLE mpintranet.itemid_attrvalue SELECT b.itemid, a.attrvalue FROM mpintranet.attrvalue AS a CROSS JOIN mpintranet.itemid AS b; ---------JOIN 全量表, 产出结果 DROP TABLE IF EXISTS mpintranet.itemid_attrvalue_tag; CREATE TABLE IF NOT EXISTS mpintranet.itemid_attrvalue_tag( itemid BIGINT, attrvalue STRING, tag STRING ) CLUSTERED BY (itemid) INTO 783 BUCKETS STORED AS ORC; SET mapred.reduce.tasks=500; INSERT INTO TABLE mpintranet.itemid_attrvalue_tag SELECT a.itemid, a.attrvalue, IF(b.itemid IS NULL, '0', '1') AS tag FROM mpintranet.itemid_attrvalue a LEFT JOIN mpintranet.item_attr b ON a.itemid = b.itemid AND a.attrvalue = b.attrvalue; ---------- JOIN 结果表 DROP TABLE IF EXISTS mpintranet.attrvalue_result; CREATE TABLE IF NOT EXISTS mpintranet.attrvalue_result( itemid BIGINT, attrvalue STRING, tag STRING ) CLUSTERED BY (itemid) INTO 113 BUCKETS STORED AS ORC; INSERT INTO TABLE mpintranet.attrvalue_result SELECT t.itemid, GROUP_CONCAT(t.attrvalue, ",") AS attrvalue, GROUP_CONCAT(t.tag, ",") AS tag FROM ( SELECT itemid, attrvalue, tag, SPLIT(attrvalue, ',')[0] AS attrname FROM mpintranet.itemid_attrvalue_tag DISTRIBUTE BY itemid SORT BY attrname DESC ) AS t GROUP BY t.itemid; ------------end------------- DROP TABLE IF EXISTS mpintranet.attrvalue_export; CREATE TABLE IF NOT EXISTS mpintranet.attrvalue_export( itemid BIGINT, tag STRING ) CLUSTERED BY (itemid) INTO 113 BUCKETS STORED AS ORC; INSERT INTO TABLE mpintranet.attrvalue_export SELECT itemid, tag FROM mpintranet.attrvalue_result; """ # print(sql.format(category_id)) Hive(env=ENV).execute(sql.format(category_id))
def get_gold_from_hive(): sql = "SELECT * FROM t_elengjing.competitive_item_train_stage_3" pickle_dump("train_data", Hive(env="idc").query(sql, to_dict=False))
def insert_by_category(**kwargs): sql = """ USE elengjing_tj; DROP TABLE IF EXISTS t_shop_{shop_id}_case_{case}_customer; CREATE TABLE t_shop_{shop_id}_case_{case}_customer ( shopid BIGINT, categoryid BIGINT, itemid BIGINT, platformid BIGINT, avg_price FLOAT ) CLUSTERED BY (itemid) INTO 113 BUCKETS STORED AS ORC; INSERT INTO TABLE t_shop_{shop_id}_case_{case}_customer SELECT shopid, categoryid, itemid, platformid, CASE WHEN NVL(SUM(salesqty), 0) = 0 THEN 0 ELSE 1.0 * NVL(SUM(salesamt), 0) / SUM(salesqty) END AS avg_price FROM elengjing.women_clothing_item WHERE shopid = {shop_id} AND categoryid = {category_id} GROUP BY shopid, categoryid, itemid, platformid; DROP TABLE IF EXISTS t_shop_{shop_id}_case_{case}_target; CREATE TABLE t_shop_{shop_id}_case_{case}_target ( shopid BIGINT, categoryid BIGINT, itemid BIGINT, platformid BIGINT, avg_price FLOAT ) CLUSTERED BY (itemid) INTO 113 BUCKETS STORED AS ORC; INSERT INTO TABLE t_shop_{shop_id}_case_{case}_target SELECT b.shopid, b.categoryid, b.itemid, b.platformid, b.avg_price FROM ( SELECT a.shopid, a.categoryid, a.itemid, a.platformid, a.avg_price, 1.0 * SUM(a.qty) OVER (ORDER BY a.qty DESC, a.qty ROWS UNBOUNDED PRECEDING) / t.qty AS c_ratio FROM ( SELECT shopid, categoryid, itemid, platformid, CASE WHEN NVL(SUM(salesqty), 0) = 0 THEN 0 ELSE 1.0 * NVL(SUM(salesamt), 0) / SUM(salesqty) END AS avg_price, SUM(salesqty) AS qty FROM elengjing.women_clothing_item WHERE shopid != {shop_id} AND categoryid = {category_id} GROUP BY shopid, categoryid, itemid, platformid ORDER BY qty DESC ) AS a CROSS JOIN ( SELECT SUM(salesqty) AS qty FROM elengjing.women_clothing_item WHERE shopid != {shop_id} AND categoryid = {category_id} ) AS t ) AS b WHERE b.c_ratio <= 0.8; INSERT INTO elengjing_tj.t_shop_{shop_id}_case_{case} SELECT c.categoryid AS category_id, c.shopid AS customer_shop_id, c.itemid AS customer_item_id, c.platformid AS customer_platform_id, t.shopid AS target_shop_id, t.itemid AS target_item_id, t.platformid AS target_platform_id FROM t_shop_{shop_id}_case_{case}_customer AS c CROSS JOIN t_shop_{shop_id}_case_{case}_target AS t WHERE t.avg_price BETWEEN 0.9 * c.avg_price AND 1.1 * c.avg_price; """.format(**kwargs) Hive("idc").execute(sql)
def create_attrname(category_id): sql = """ DROP TABLE IF EXISTS mpintranet.attrname; CREATE TABLE IF NOT EXISTS mpintranet.attrname(attrname STRING) CLUSTERED BY (attrname) INTO 113 BUCKETS STORED AS ORC; INSERT INTO TABLE mpintranet.attrname SELECT attrname FROM elengjing.women_clothing_item_attr WHERE categoryid = {0} GROUP BY attrname; DROP TABLE IF EXISTS mpintranet.itemid; CREATE TABLE IF NOT EXISTS mpintranet.itemid(itemid BIGINT) CLUSTERED BY (itemid) INTO 113 BUCKETS STORED AS ORC; INSERT INTO TABLE mpintranet.itemid SELECT itemid FROM elengjing.women_clothing_item_attr WHERE categoryid = {0} GROUP BY itemid; DROP TABLE IF EXISTS mpintranet.item_attr; CREATE TABLE IF NOT EXISTS mpintranet.item_attr( itemid BIGINT, attrvalue STRING ) CLUSTERED BY (itemid) INTO 581 BUCKETS STORED AS ORC; INSERT INTO TABLE mpintranet.item_attr SELECT itemid, CONCAT(attrname, '_', attrvalue) AS attrvalue FROM elengjing.women_clothing_item_attr WHERE categoryid = {0} GROUP BY itemid, CONCAT(attrname, '_', attrvalue); DROP TABLE IF EXISTS mpintranet.itemid_attrname; CREATE TABLE IF NOT EXISTS mpintranet.itemid_attrname( itemid BIGINT, attrname STRING ) CLUSTERED BY (itemid) INTO 581 BUCKETS STORED AS ORC; INSERT INTO TABLE mpintranet.itemid_attrname SELECT b.itemid, a.attrname FROM mpintranet.attrname a CROSS JOIN mpintranet.itemid b; DROP TABLE IF EXISTS mpintranet.itemid_attrname_attrvalue; CREATE TABLE IF NOT EXISTS mpintranet.itemid_attrname_attrvalue( itemid BIGINT, attrname STRING, attrvalue STRING ) CLUSTERED BY (itemid) INTO 783 BUCKETS STORED AS ORC; INSERT INTO TABLE mpintranet.itemid_attrname_attrvalue SELECT a.itemid, a.attrname, NVL(b.attrvalue, '') AS attrvalue FROM mpintranet.itemid_attrname a LEFT JOIN mpintranet.item_attr b ON a.itemid = b.itemid AND a.attrname = b.attrname; DROP TABLE IF EXISTS mpintranet.attrname_result; CREATE TABLE IF NOT EXISTS mpintranet.attrname_result( itemid BIGINT, attrname STRING, attrvalue STRING ) CLUSTERED BY (itemid) INTO 113 BUCKETS STORED AS ORC; INSERT INTO TABLE mpintranet.attrname_result SELECT t.itemid, GROUP_CONCAT(t.attrname, ",") AS attrname, GROUP_CONCAT(t.attrvalue, ",") AS attrvalue FROM ( SELECT itemid, attrname, attrvalue FROM mpintranet.itemid_attrname_attrvalue DISTRIBUTE BY itemid SORT BY attrname DESC ) AS t GROUP BY t.itemid; DROP TABLE IF EXISTS mpintranet.attrname_export; CREATE TABLE IF NOT EXISTS mpintranet.attrname_export( itemid BIGINT, attrvalue STRING ) CLUSTERED BY (itemid) INTO 113 BUCKETS STORED AS ORC; INSERT INTO TABLE mpintranet.attrname_export SELECT itemid, attrvalue FROM mpintranet.attrname_result; """ Hive(env=ENV).execute(sql.format(category_id))
def insert_by_category(**kwargs): sql = """ USE elengjing_tj; DROP TABLE IF EXISTS t_shop_{shop_id}_customer; CREATE TABLE t_shop_{shop_id}_customer ( shopid BIGINT, categoryid BIGINT, itemid BIGINT, platformid BIGINT, avg_price FLOAT ) CLUSTERED BY (itemid) INTO 791 BUCKETS STORED AS ORC; INSERT INTO TABLE t_shop_{shop_id}_customer SELECT shopid, categoryid, itemid, platformid, CASE WHEN NVL(SUM(salesqty), 0) = 0 THEN 0 ELSE 1.0 * NVL(SUM(salesamt), 0) / SUM(salesqty) END AS avg_price FROM elengjing.women_clothing_item WHERE shopid = {shop_id} AND categoryid = {category_id} GROUP BY shopid, categoryid, itemid, platformid; DROP TABLE IF EXISTS t_shop_{shop_id}_target; CREATE TABLE t_shop_{shop_id}_target ( shopid BIGINT, categoryid BIGINT, itemid BIGINT, platformid BIGINT, avg_price FLOAT ) CLUSTERED BY (itemid) INTO 791 BUCKETS STORED AS ORC; INSERT INTO t_shop_{shop_id}_target SELECT shopid, categoryid, itemid, platformid, CASE WHEN NVL(SUM(salesqty), 0) = 0 THEN 0 ELSE 1.0 * NVL(SUM(salesamt), 0) / SUM(salesqty) END AS avg_price FROM elengjing.women_clothing_item WHERE shopid != {shop_id} AND categoryid = {category_id} GROUP BY shopid, categoryid, itemid, platformid; INSERT INTO elengjing_tj.t_shop_{shop_id} SELECT c.categoryid AS category_id, c.shopid AS customer_shop_id, c.itemid AS customer_item_id, c.platformid AS customer_platform_id, t.shopid AS target_shop_id, t.itemid AS target_item_id, t.platformid AS target_platform_id FROM t_shop_{shop_id}_customer AS c CROSS JOIN t_shop_{shop_id}_target AS t WHERE t.avg_price >= 0.9 * c.avg_price AND t.avg_price <= 1.1 * c.avg_price; """.format(**kwargs) if kwargs.get("debug"): print(sql) else: Hive("idc").execute(sql)
def get_table_count(table_name): sql = "SELECT COUNT(*) AS cnt FROM das.{0}".format(table_name) return Hive("local").query(sql)[0].get("cnt")
def get_data(): sql = "SELECT COUNT(*) AS cnt FROM das.ci_diaper2014to2016_0406_allplatform_distinct AS d WHERE d.year = '2016'" data = Hive(env="local").query(sql) pickle_dump("raw", data)
def show_tables(): print(Hive().query("show tables"))
# coding: utf-8 # __author__: u"John" from __future__ import unicode_literals, absolute_import, print_function, division from mplib.IO import Hive, pickle_dump, pickle_load from mplib.common import smart_encode import editdistance def get_data(): sql = "SELECT COUNT(*) AS cnt FROM das.ci_diaper2014to2016_0406_allplatform_distinct AS d WHERE d.year = '2016'" data = Hive(env="local").query(sql) pickle_dump("raw", data) def show_tables(): print(Hive().query("show tables")) def transform(): pickle_dump("id_content", {line[0]: line[1] for line in pickle_load("raw")}) if __name__ == "__main__": Hive.show_env_dict() print(Hive().query("show tables"))