from datetime import datetime
from math import ceil


def rebuild_table(table_name):
    # Rebuild a das.* table as a bucketed ORC copy, then swap it into place.
    count = get_table_count(table_name)
    if not count:
        print("no rows in das.{0}".format(table_name))
        return
    definition = ",\n".join([
        "{0} {1}".format(d[0], d[1]) for d in get_table_definition(table_name)
    ])
    # 113 buckets per started block of 10M rows.
    clusters = int(ceil(count / 1e7)) * 113
    time_stamp = datetime.now().strftime("%Y%m%d%H%M%S")
    select_columns = ",".join([d[0] for d in get_table_definition(table_name)])
    print(datetime.now())
    # Qualify with das. so the DROP/CREATE do not depend on the session's
    # current database (the INSERT and RENAME below already do).
    sql = """
    DROP TABLE IF EXISTS das.{0}_{1};
    CREATE TABLE das.{0}_{1} (
        {2}
    )
    CLUSTERED BY (id) INTO {3} BUCKETS
    STORED AS ORC;
    """.format(table_name, time_stamp, definition, clusters)
    Hive("local").execute(sql)
    print(datetime.now())
    sql = """
    INSERT INTO TABLE das.{0}_{1}
    SELECT {2}
    FROM das.{0};
    """.format(table_name, time_stamp, select_columns)
    Hive("local").execute(sql)
    print(datetime.now())
    sql = """
    DROP TABLE das.{0};
    ALTER TABLE das.{0}_{1} RENAME TO das.{0};
    """.format(table_name, time_stamp)
    Hive("local").execute(sql)
    print(datetime.now())
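As a side note, the bucket sizing above steps up by 113 buckets per started block of ten million rows; a minimal, self-contained sketch of that arithmetic (the helper name is ours):

from math import ceil


def bucket_count(row_count, rows_per_step=1e7, buckets_per_step=113):
    # One step of 113 buckets per started block of 10M rows.
    return int(ceil(row_count / rows_per_step)) * buckets_per_step


assert bucket_count(1) == 113
assert bucket_count(10 ** 7) == 113
assert bucket_count(10 ** 7 + 1) == 226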
def transform(**kwargs):
    sql = """
    USE elengjing_tj;

    DROP TABLE IF EXISTS shop_{shop_id};
    ALTER TABLE t_shop_{shop_id} RENAME TO shop_{shop_id};

    DROP TABLE IF EXISTS shop_{shop_id}_item;
    CREATE TABLE shop_{shop_id}_item AS
    SELECT category_id, customer_item_id AS item_id, COUNT(*) AS cnt
    FROM elengjing_tj.shop_{shop_id}
    GROUP BY category_id, customer_item_id
    UNION ALL
    SELECT category_id, target_item_id AS item_id, COUNT(*) AS cnt
    FROM elengjing_tj.shop_{shop_id}
    GROUP BY category_id, target_item_id;

    DROP TABLE IF EXISTS t_shop_{shop_id}_target;
    DROP TABLE IF EXISTS t_shop_{shop_id}_customer;
    DROP TABLE IF EXISTS t_shop_{shop_id};
    DROP TABLE IF EXISTS t_shop_{shop_id}_scope;
    """.format(**kwargs)
    if kwargs.get("debug"):
        print(sql)
    else:
        Hive("idc").execute(sql)
def transform(**kwargs):
    sql = """
    USE elengjing_tj;

    DROP TABLE IF EXISTS shop_{shop_id}_case_{case};
    ALTER TABLE t_shop_{shop_id}_case_{case} RENAME TO shop_{shop_id}_case_{case};

    DROP TABLE IF EXISTS shop_{shop_id}_case_{case}_item;
    -- Group by the source columns: Hive does not allow grouping by a SELECT alias.
    CREATE TABLE shop_{shop_id}_case_{case}_item AS
    SELECT category_id, customer_item_id AS item_id, COUNT(*) AS cnt
    FROM elengjing_tj.shop_{shop_id}_case_{case}
    GROUP BY category_id, customer_item_id
    UNION ALL
    SELECT category_id, target_item_id AS item_id, COUNT(*) AS cnt
    FROM elengjing_tj.shop_{shop_id}_case_{case}
    GROUP BY category_id, target_item_id;

    DROP TABLE IF EXISTS t_shop_{shop_id}_case_{case}_target;
    DROP TABLE IF EXISTS t_shop_{shop_id}_case_{case}_customer;
    DROP TABLE IF EXISTS t_shop_{shop_id}_case_{case};
    DROP TABLE IF EXISTS t_shop_{shop_id}_case_{case}_scope;
    """.format(**kwargs)
    Hive("idc").execute(sql)
def get_shop_category(**kwargs):
    sql = """
    SELECT DISTINCT categoryid
    FROM elengjing.women_clothing_item
    WHERE shopid = {shop_id}
    """.format(**kwargs)
    return list(map(lambda x: x[0], Hive("idc").query(sql, to_dict=False)))
def drop(is_event, show_sql=False):
    sql = """
    USE elengjing_price;
    DROP TABLE IF EXISTS predict_{0};
    """.format("event" if is_event else "daily")
    if show_sql:
        print(sql)
    else:
        Hive("idc").execute(sql)
def create_table_item_tagged():
    sql = """
    USE elengjing;
    DROP TABLE IF EXISTS women_clothing_item_attr_t;
    CREATE TABLE women_clothing_item_attr_t AS
    SELECT itemid,
           CONCAT_WS(',', COLLECT_SET(CONCAT(attrname, ':', attrvalue))) AS data
    FROM women_clothing_item_attr
    GROUP BY itemid;
    """
    Hive(env=ENV).execute(sql)
def insert_attrvalue_to_pg():
    ret = Hive(env=ENV).query(
        "SELECT attrvalue FROM mpintranet.attrvalue_result LIMIT 1")
    if len(ret) > 0:
        attrvalues = ret[0]["attrvalue"]
        rows = PostgreSQL().query(
            "SELECT * FROM text_value WHERE name = 'attrname_attrvalue_columns'")
        if len(rows) > 0:
            sql = ("UPDATE text_value SET value = '{0}' "
                   "WHERE name = 'attrname_attrvalue_columns'").format(attrvalues)
        else:
            sql = ("INSERT INTO text_value (name, value) "
                   "VALUES ('attrname_attrvalue_columns', '{0}')").format(attrvalues)
        PostgreSQL().execute(sql)
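Formatting attrvalues straight into the SQL breaks as soon as a value contains a single quote. Where direct driver access is available, parameter binding sidesteps that; a minimal sketch assuming psycopg2 and a unique constraint on text_value.name (both assumptions, as is the DSN):

import psycopg2


def upsert_attrvalue_columns(attrvalues, dsn="dbname=mp user=mp"):
    # Parameter binding escapes any quotes inside attrvalues; ON CONFLICT
    # requires a unique constraint on text_value.name (assumed here).
    conn = psycopg2.connect(dsn)
    with conn:
        with conn.cursor() as cur:
            cur.execute(
                """
                INSERT INTO text_value (name, value)
                VALUES ('attrname_attrvalue_columns', %s)
                ON CONFLICT (name) DO UPDATE SET value = EXCLUDED.value
                """,
                (attrvalues,),
            )
    conn.close()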
def get_minerals_from_hive():
    sql = """
    SELECT customer_attr, target_attr, customer_item_id, target_item_id
    FROM t_elengjing.competitive_filtering_stage_1
    LIMIT 10000;
    """
    pickle_dump("raw_data", Hive(env="idc").query(sql=sql, to_dict=False))
def create(is_event=True, show_sql=False):
    sql = """
    USE elengjing_price;
    CREATE TABLE predict_{0} (
        itemid BIGINT,
        categoryid BIGINT,
        price DECIMAL(20, 2)
    )
    STORED AS ORC;
    """.format("event" if is_event else "daily")
    if show_sql:
        print(sql)
    else:
        Hive("idc").execute(sql)
def create_table(**kwargs):
    sql = """
    USE elengjing_tj;
    DROP TABLE IF EXISTS t_shop_{shop_id}_case_{case};
    CREATE TABLE t_shop_{shop_id}_case_{case} (
        category_id BIGINT,
        customer_shop_id BIGINT,
        customer_item_id BIGINT,
        customer_platform_id BIGINT,
        target_shop_id BIGINT,
        target_item_id BIGINT,
        target_platform_id BIGINT
    )
    CLUSTERED BY (customer_item_id) INTO 113 BUCKETS
    STORED AS ORC;
    """.format(**kwargs)
    Hive("idc").execute(sql)
import numpy


def get_hive_train_data(category_id, is_event=False):
    from mplib.IO import Hive
    sql = """
    SELECT data AS data
    FROM elengjing_price.train_{category_id}_{model_type}
    ORDER BY RAND(1000000)
    LIMIT 100000
    """.format(category_id=category_id,
               model_type="event" if is_event else "daily")
    lines = [line.get("data") for line in Hive("idc").query(sql)]
    # Each row is "label,feature1,feature2,...": the first field is the
    # target price, the rest are the features.
    train_x = []
    train_y = []
    for line in lines:
        line = [float(num) for num in line.split(",")]
        train_x.append(line[1:])
        train_y.append(line[0])
    # vrm is a label transform defined elsewhere in the surrounding module.
    return numpy.nan_to_num(numpy.array(train_x)), vrm(
        numpy.nan_to_num(numpy.array(train_y)))
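The training rows are plain "label,feature1,feature2,..." strings; a self-contained sketch of the same split on invented rows:

import numpy


def split_label_features(lines):
    # First comma-separated field is the label, the rest are features.
    rows = [[float(v) for v in line.split(",")] for line in lines]
    x = numpy.nan_to_num(numpy.array([r[1:] for r in rows]))
    y = numpy.nan_to_num(numpy.array([r[0] for r in rows]))
    return x, y


x, y = split_label_features(["99.0,1.5,0,1", "120.0,2.0,1,0"])
print(x.shape, y.shape)  # (2, 3) (2,)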
def get_features(**kwargs):
    sql = """
    USE t_elengjing;
    ADD FILE /home/udflib/serverudf/elengjing/{udf};

    DROP TABLE IF EXISTS competitive_item_train_stage_1;
    CREATE TABLE competitive_item_train_stage_1 (
        customer_item_id STRING,
        target_item_id STRING,
        similarity STRING,
        score STRING,
        category_id STRING
    )
    STORED AS ORC;

    INSERT INTO competitive_item_train_stage_1
    SELECT TRANSFORM(ct.data, tt.data, ct.itemid, tt.itemid, l.score, l.categoryid)
           USING 'python {udf} {category_id}'
           AS (customer_item_id, target_item_id, similarity, score, category_id)
    FROM competitive_manual_label AS l
    JOIN elengjing.women_clothing_item_attr_t AS ct ON ct.itemid = l.sourceitemid
    JOIN elengjing.women_clothing_item_attr_t AS tt ON tt.itemid = l.targetitemid;

    DROP TABLE IF EXISTS competitive_item_train_stage_2;
    CREATE TABLE competitive_item_train_stage_2 AS
    SELECT a.customer_item_id,
           a.target_item_id,
           a.category_id,
           cv.tag AS customer_item_dummy,
           tv.tag AS target_item_dummy,
           cn.attrvalue AS customer_item_attr,
           tn.attrvalue AS target_item_attr,
           CASE WHEN a.score < 0.5 THEN 0 ELSE 1 END AS score
    FROM competitive_item_train_stage_1 AS a
    JOIN mpintranet.attrname_export AS cn ON a.customer_item_id = cn.itemid
    JOIN mpintranet.attrname_export AS tn ON a.target_item_id = tn.itemid
    JOIN mpintranet.attrvalue_export AS cv ON a.customer_item_id = cv.itemid
    JOIN mpintranet.attrvalue_export AS tv ON a.target_item_id = tv.itemid
    WHERE a.category_id = {category_id};
    """.format(**kwargs)
    Hive(env="idc").execute(sql)
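Hive's TRANSFORM streams the selected columns to the script as tab-separated lines on stdin and reads tab-separated rows back from stdout. A minimal sketch of a UDF honoring that contract; the edit-distance similarity is our assumption, not necessarily what the real {udf} computes:

#!/usr/bin/env python
# Reads: customer_data \t target_data \t customer_id \t target_id \t score \t categoryid
# Writes: customer_id \t target_id \t similarity \t score \t categoryid
import sys

import editdistance

category_id = sys.argv[1]  # passed through from the query; unused in this sketch

for raw in sys.stdin:
    c_data, t_data, c_id, t_id, score, cat = raw.rstrip("\n").split("\t")
    dist = editdistance.eval(c_data, t_data)
    similarity = 1.0 / (1.0 + dist)  # map distance into (0, 1]
    print("\t".join([c_id, t_id, str(similarity), score, cat]))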
def create_table(**kwargs):
    sql = """
    USE elengjing_tj;
    DROP TABLE IF EXISTS t_shop_{shop_id};
    CREATE TABLE t_shop_{shop_id} (
        category_id BIGINT,
        customer_shop_id BIGINT,
        customer_item_id BIGINT,
        customer_platform_id BIGINT,
        target_shop_id BIGINT,
        target_item_id BIGINT,
        target_platform_id BIGINT
    )
    CLUSTERED BY (customer_item_id) INTO 791 BUCKETS
    STORED AS ORC;
    """.format(**kwargs)
    if kwargs.get("debug"):
        print(sql)
    else:
        Hive("idc").execute(sql)
def insert(category_id, is_event, show_sql=False):
    raw_sql = """
    USE elengjing_price;
    INSERT INTO predict_{0}
    SELECT itemid,
           {1} AS categoryid,
           CASE WHEN price IS NULL OR price <= 0 THEN -1 ELSE price END AS price
    FROM predict_{1}_{0};
    """
    if isinstance(category_id, list):
        sql = ";\n".join([raw_sql.format("event" if is_event else "daily", c)
                          for c in category_id])
    else:
        sql = raw_sql.format("event" if is_event else "daily", category_id)
    if show_sql:
        print(sql)
    else:
        Hive("idc").execute(sql)
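insert() takes either one category id or a list, batching one INSERT per category into a single script; a quick sanity check of the generated SQL (category ids invented):

# Preview the batched statements without touching Hive:
insert([162104, 162116], is_event=False, show_sql=True)

# Run for one category against the event table:
insert(162104, is_event=True)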
def gen_attribute_dictionary(out=False):
    sql = """
    DROP TABLE IF EXISTS elengjing_price.attr_value_text;
    CREATE TABLE IF NOT EXISTS elengjing_price.attr_value_text (
        categoryid STRING,
        attr STRING
    )
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

    LOAD DATA LOCAL INPATH '/home/data/attr_value.csv'
    OVERWRITE INTO TABLE elengjing_price.attr_value_text;

    DROP TABLE IF EXISTS elengjing_price.attr_value;
    CREATE TABLE elengjing_price.attr_value STORED AS ORC AS
    SELECT * FROM elengjing_price.attr_value_text;
    """
    if out:
        print(sql)
    else:
        Hive("idc").execute(sql)
def execute_hive_predict_data(category_id, is_event=False, show_sql=False,
                              udf="udf_item_pricing13.py"):
    from mplib.IO import Hive
    sql = """
    USE elengjing_price;
    ADD FILE /home/script/normal_servers/serverudf/elengjing/{udf};
    ADD FILE /home/script/normal_servers/serverudf/elengjing/data/pricing_model/sklearn_randomforest_{category_id}_{model_type}.pickle;

    DROP TABLE IF EXISTS tmp_{category_id};
    CREATE TABLE tmp_{category_id} (
        itemid STRING,
        data STRING
    )
    CLUSTERED BY (itemid) INTO 1017 BUCKETS
    STORED AS ORC;

    INSERT INTO tmp_{category_id}
    SELECT i.itemid,
           CONCAT_WS(
               ',',
               NVL(CAST(i.avg_price AS STRING), '0'),
               NVL(CAST((UNIX_TIMESTAMP() - TO_UNIX_TIMESTAMP(i.listeddate)) / 86400.0 AS STRING), '0'),
               NVL(CAST(shop.level AS STRING), '0'),
               NVL(CAST(shop.favor AS STRING), '0'),
               NVL(CAST(s.all_spu AS STRING), '0'),
               NVL(CAST(s.all_sales_rank AS STRING), '0'),
               NVL(CAST(sc.category_spu AS STRING), '0'),
               NVL(CAST(sc.category_sales_rank AS STRING), '0'),
               NVL(CAST(s.shop_avg_price AS STRING), '0'),
               NVL(CAST(s.shop_avg_price_rank AS STRING), '0'),
               NVL(CAST(sc.category_avg_price AS STRING), '0'),
               NVL(CAST(sc.category_avg_price_rank AS STRING), '0'),
               v.data
           ) AS data
    FROM (
        SELECT shopid,
               itemid,
               MAX(listeddate) AS listeddate,
               CASE WHEN SUM(salesqty) = 0 THEN 0
                    ELSE SUM(salesamt) / SUM(salesqty) END AS avg_price
        FROM transforms.women_clothing_item_new_dict
        WHERE categoryid = {category_id}
        GROUP BY shopid, itemid
    ) AS i
    JOIN (
        SELECT shopid,
               COUNT(DISTINCT itemid) AS all_spu,
               CASE WHEN SUM(salesqty) = 0 THEN 0
                    ELSE SUM(salesamt) / SUM(salesqty) END AS shop_avg_price,
               ROW_NUMBER() OVER (ORDER BY SUM(salesamt) DESC) AS all_sales_rank,
               ROW_NUMBER() OVER (ORDER BY CASE WHEN SUM(salesqty) = 0 THEN 0
                    ELSE SUM(salesamt) / SUM(salesqty) END DESC) AS shop_avg_price_rank
        FROM transforms.women_clothing_item_new_dict
        GROUP BY shopid
    ) AS s ON i.shopid = s.shopid
    JOIN (
        SELECT shopid,
               COUNT(DISTINCT itemid) AS category_spu,
               ROW_NUMBER() OVER (ORDER BY SUM(salesamt) DESC) AS category_sales_rank,
               ROW_NUMBER() OVER (ORDER BY CASE WHEN SUM(salesqty) = 0 THEN 0
                    ELSE SUM(salesamt) / SUM(salesqty) END DESC) AS category_avg_price_rank,
               CASE WHEN SUM(salesqty) = 0 THEN 0
                    ELSE SUM(salesamt) / SUM(salesqty) END AS category_avg_price
        FROM transforms.women_clothing_item_new_dict
        WHERE categoryid = {category_id}
        GROUP BY shopid
    ) AS sc ON i.shopid = sc.shopid
    JOIN elengjing_base.shop AS shop ON i.shopid = shop.shopid
    JOIN elengjing_price.attr_itemid_value_matrix_{category_id} AS v ON i.itemid = v.itemid;

    DROP TABLE IF EXISTS predict_{category_id}_{model_type};
    CREATE TABLE predict_{category_id}_{model_type} (
        itemid STRING,
        price DECIMAL(20, 2)
    )
    STORED AS ORC;

    INSERT INTO predict_{category_id}_{model_type}
    SELECT TRANSFORM(itemid, data)
           USING 'python {udf} {category_id} {model_type}'
           AS (itemid, price)
    FROM tmp_{category_id};

    DROP TABLE IF EXISTS tmp_{category_id};
    """.format(category_id=category_id,
               model_type="event" if is_event else "daily",
               udf=udf)
    if show_sql:
        print(sql)
    else:
        Hive("idc").execute(sql)
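The pricing UDF named above is shipped via ADD FILE together with a pickled scikit-learn model. A minimal sketch of the streaming script it could be, assuming the model is a fitted regressor and the filename layout matches the second ADD FILE line (the parsing details are our guess):

#!/usr/bin/env python
# Hive streams "itemid \t data" rows on stdin; we answer "itemid \t price".
import pickle
import sys

import numpy

category_id, model_type = sys.argv[1], sys.argv[2]
# ADD FILE places the pickle in the task's working directory.
with open("sklearn_randomforest_{0}_{1}.pickle".format(
        category_id, model_type), "rb") as f:
    model = pickle.load(f)

for raw in sys.stdin:
    itemid, data = raw.rstrip("\n").split("\t")
    features = numpy.nan_to_num(
        numpy.array([[float(v) for v in data.split(",")]]))
    price = model.predict(features)[0]
    print("{0}\t{1:.2f}".format(itemid, price))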
def get_gold_from_hive():
    sql = "SELECT * FROM t_elengjing.competitive_item_train_stage_3"
    pickle_dump("train_data", Hive(env="idc").query(sql, to_dict=False))
def get_table_count(table_name):
    sql = "SELECT COUNT(*) AS cnt FROM das.{0}".format(table_name)
    return Hive("local").query(sql)[0].get("cnt")
from mplib.IO import Hive
from mplib import *
import traceback
import sys

if __name__ == "__main__":
    reload(sys)  # Python 2 only
    sys.setdefaultencoding("utf8")
    try:
        a = SKAssess()
        a.category_id = sys.argv[1]
        a.interval = smart_decode(sys.argv[2])
        sql = ("SELECT itemid AS itemid, data AS data "
               "FROM elengjing_price.tmp_{0} LIMIT 10").format(a.category_id)
        data = Hive("idc").query(sql)
        data = ["\t".join([str(line.get("itemid")), str(line.get("data"))])
                for line in data]
        data = list(map(
            lambda x: smart_decode(x).replace("\n", "").replace("\r", "").split("\t"),
            data))
        items, data = split_id_feature(data)
        a.x_predict, a.y_predict = split_x_y(data)
        print(a.y_predict.shape)
        print(a.x_predict.shape)
        a.path = ("/Users/panjunjun/PycharmProjects/Lab/IO/"
                  "sklearn_randomforest_{0}_daily.pickle").format(a.category_id)
        a.load_model()
    except Exception:
        # Log the full stack on any failure.
        traceback.print_exc()
def gen_attribute_matrix(category_id, keep_tmp=False, out=False):
    sql = """
    -- '品牌' (brand) is excluded from the attribute matrix throughout.
    DROP TABLE IF EXISTS elengjing_price.attr_unique_itemid_{category_id};
    CREATE TABLE IF NOT EXISTS elengjing_price.attr_unique_itemid_{category_id} (
        itemid BIGINT
    )
    CLUSTERED BY (itemid) INTO 997 BUCKETS
    STORED AS ORC;
    INSERT INTO TABLE elengjing_price.attr_unique_itemid_{category_id}
    SELECT itemid
    FROM transforms.women_clothing_item_attr
    WHERE categoryid = {category_id} AND attrname != '品牌'
    GROUP BY itemid;

    DROP TABLE IF EXISTS elengjing_price.item_attr_{category_id};
    CREATE TABLE IF NOT EXISTS elengjing_price.item_attr_{category_id} (
        itemid BIGINT,
        attr STRING
    )
    CLUSTERED BY (itemid) INTO 997 BUCKETS
    STORED AS ORC;
    INSERT INTO TABLE elengjing_price.item_attr_{category_id}
    SELECT itemid, CONCAT(attrname, ":", attrvalue)
    FROM transforms.women_clothing_item_attr
    WHERE categoryid = {category_id} AND attrname != '品牌';

    DROP TABLE IF EXISTS elengjing_price.item_attr_1_{category_id};
    CREATE TABLE IF NOT EXISTS elengjing_price.item_attr_1_{category_id} (
        itemid BIGINT,
        attr STRING
    )
    CLUSTERED BY (itemid) INTO 997 BUCKETS
    STORED AS ORC;
    INSERT INTO TABLE elengjing_price.item_attr_1_{category_id}
    SELECT itemid, attr
    FROM elengjing_price.item_attr_{category_id}
    GROUP BY itemid, attr;

    DROP TABLE IF EXISTS elengjing_price.attr_itemid_value_{category_id};
    CREATE TABLE IF NOT EXISTS elengjing_price.attr_itemid_value_{category_id} (
        itemid BIGINT,
        attr STRING
    )
    CLUSTERED BY (itemid) INTO 997 BUCKETS
    STORED AS ORC;
    INSERT INTO TABLE elengjing_price.attr_itemid_value_{category_id}
    SELECT t1.itemid, t2.attr
    FROM elengjing_price.attr_unique_itemid_{category_id} AS t1
    CROSS JOIN elengjing_price.attr_value AS t2
    WHERE t2.categoryid = {category_id};

    DROP TABLE IF EXISTS elengjing_price.attr_itemid_value_data_{category_id};
    CREATE TABLE IF NOT EXISTS elengjing_price.attr_itemid_value_data_{category_id} (
        itemid BIGINT,
        attr STRING,
        data INT
    )
    CLUSTERED BY (itemid) INTO 997 BUCKETS
    STORED AS ORC;
    SET mapred.reduce.tasks = 1000;
    INSERT INTO TABLE elengjing_price.attr_itemid_value_data_{category_id}
    SELECT t1.*, IF(NVL(t2.itemid, 0) = 0, 0, 1)
    FROM elengjing_price.attr_itemid_value_{category_id} AS t1
    LEFT JOIN elengjing_price.item_attr_1_{category_id} AS t2
        ON t1.itemid = t2.itemid AND t1.attr = t2.attr;

    DROP TABLE IF EXISTS elengjing_price.attr_itemid_value_matrix_{category_id};
    CREATE TABLE IF NOT EXISTS elengjing_price.attr_itemid_value_matrix_{category_id} (
        itemid BIGINT,
        attr STRING,
        data STRING
    )
    CLUSTERED BY (itemid) INTO 997 BUCKETS
    STORED AS ORC;
    INSERT INTO TABLE elengjing_price.attr_itemid_value_matrix_{category_id}
    SELECT itemid,
           CONCAT_WS(',', COLLECT_LIST(attr)) AS attr,
           CONCAT_WS(',', COLLECT_LIST(CAST(data AS STRING))) AS data
    FROM (
        SELECT itemid,
               attr,
               data,
               row_number() OVER (PARTITION BY itemid ORDER BY attr DESC) AS rank
        FROM elengjing_price.attr_itemid_value_data_{category_id}
    ) AS a
    GROUP BY itemid;
    """.format(category_id=category_id)
    if out:
        print(sql)
    else:
        Hive("idc").execute(sql)
    if not keep_tmp:
        sql = """
        USE elengjing_price;
        DROP TABLE attr_itemid_value_{category_id};
        DROP TABLE attr_itemid_value_data_{category_id};
        DROP TABLE attr_unique_itemid_{category_id};
        DROP TABLE item_attr_1_{category_id};
        DROP TABLE item_attr_{category_id};
        """.format(category_id=category_id)
        if out:
            print(sql)
        else:
            Hive("idc").execute(sql)
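The chain above produces, per item, a fixed-order 0/1 vector over every attribute seen in the category. The same idea in memory, on invented data:

def one_hot_items(item_attrs, all_attrs):
    # Fixed column order: attribute names sorted descending, mirroring
    # the ORDER BY attr DESC in the Hive query.
    columns = sorted(all_attrs, reverse=True)
    return {item: [1 if a in attrs else 0 for a in columns]
            for item, attrs in item_attrs.items()}, columns


vectors, columns = one_hot_items(
    {101: {"color:red", "size:M"}, 102: {"color:blue"}},
    {"color:red", "color:blue", "size:M"},
)
print(columns)       # ['size:M', 'color:red', 'color:blue']
print(vectors[101])  # [1, 1, 0]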
def insert_by_category(**kwargs):
    sql = """
    USE elengjing_tj;

    DROP TABLE IF EXISTS t_shop_{shop_id}_case_{case}_customer;
    CREATE TABLE t_shop_{shop_id}_case_{case}_customer (
        shopid BIGINT,
        categoryid BIGINT,
        itemid BIGINT,
        platformid BIGINT,
        avg_price FLOAT
    )
    CLUSTERED BY (itemid) INTO 113 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE t_shop_{shop_id}_case_{case}_customer
    SELECT shopid,
           categoryid,
           itemid,
           platformid,
           CASE WHEN NVL(SUM(salesqty), 0) = 0 THEN 0
                ELSE 1.0 * NVL(SUM(salesamt), 0) / SUM(salesqty) END AS avg_price
    FROM elengjing.women_clothing_item
    WHERE shopid = {shop_id} AND categoryid = {category_id}
    GROUP BY shopid, categoryid, itemid, platformid;

    DROP TABLE IF EXISTS t_shop_{shop_id}_case_{case}_target;
    CREATE TABLE t_shop_{shop_id}_case_{case}_target (
        shopid BIGINT,
        categoryid BIGINT,
        itemid BIGINT,
        platformid BIGINT,
        avg_price FLOAT
    )
    CLUSTERED BY (itemid) INTO 113 BUCKETS
    STORED AS ORC;

    INSERT INTO t_shop_{shop_id}_case_{case}_target
    SELECT shopid,
           categoryid,
           itemid,
           platformid,
           CASE WHEN NVL(SUM(salesqty), 0) = 0 THEN 0
                ELSE 1.0 * NVL(SUM(salesamt), 0) / SUM(salesqty) END AS avg_price
    FROM elengjing.women_clothing_item
    WHERE shopid != {shop_id} AND categoryid = {category_id}
    GROUP BY shopid, categoryid, itemid, platformid;

    INSERT INTO elengjing_tj.t_shop_{shop_id}_case_{case}
    SELECT c.categoryid AS category_id,
           c.shopid AS customer_shop_id,
           c.itemid AS customer_item_id,
           c.platformid AS customer_platform_id,
           t.shopid AS target_shop_id,
           t.itemid AS target_item_id,
           t.platformid AS target_platform_id
    FROM t_shop_{shop_id}_case_{case}_customer AS c
    CROSS JOIN t_shop_{shop_id}_case_{case}_target AS t
    WHERE t.avg_price BETWEEN 0.9 * c.avg_price AND 1.1 * c.avg_price;
    """.format(**kwargs)
    Hive("idc").execute(sql)
def insert_by_category(**kwargs):
    sql = """
    USE elengjing_tj;

    DROP TABLE IF EXISTS t_shop_{shop_id}_case_{case}_customer;
    CREATE TABLE t_shop_{shop_id}_case_{case}_customer (
        shopid BIGINT,
        categoryid BIGINT,
        itemid BIGINT,
        platformid BIGINT,
        avg_price FLOAT
    )
    CLUSTERED BY (itemid) INTO 113 BUCKETS
    STORED AS ORC;

    INSERT INTO TABLE t_shop_{shop_id}_case_{case}_customer
    SELECT shopid,
           categoryid,
           itemid,
           platformid,
           CASE WHEN NVL(SUM(salesqty), 0) = 0 THEN 0
                ELSE 1.0 * NVL(SUM(salesamt), 0) / SUM(salesqty) END AS avg_price
    FROM elengjing.women_clothing_item
    WHERE shopid = {shop_id} AND categoryid = {category_id}
    GROUP BY shopid, categoryid, itemid, platformid;

    DROP TABLE IF EXISTS t_shop_{shop_id}_case_{case}_target;
    CREATE TABLE t_shop_{shop_id}_case_{case}_target (
        shopid BIGINT,
        categoryid BIGINT,
        itemid BIGINT,
        platformid BIGINT,
        avg_price FLOAT
    )
    CLUSTERED BY (itemid) INTO 113 BUCKETS
    STORED AS ORC;

    -- c_ratio is the running share of category sales; keeping c_ratio <= 0.8
    -- restricts targets to the items making up the top 80% of sales.
    INSERT INTO TABLE t_shop_{shop_id}_case_{case}_target
    SELECT b.shopid, b.categoryid, b.itemid, b.platformid, b.avg_price
    FROM (
        SELECT a.shopid,
               a.categoryid,
               a.itemid,
               a.platformid,
               a.avg_price,
               1.0 * SUM(a.qty) OVER (ORDER BY a.qty DESC
                                      ROWS UNBOUNDED PRECEDING) / t.qty AS c_ratio
        FROM (
            SELECT shopid,
                   categoryid,
                   itemid,
                   platformid,
                   CASE WHEN NVL(SUM(salesqty), 0) = 0 THEN 0
                        ELSE 1.0 * NVL(SUM(salesamt), 0) / SUM(salesqty) END AS avg_price,
                   SUM(salesqty) AS qty
            FROM elengjing.women_clothing_item
            WHERE shopid != {shop_id} AND categoryid = {category_id}
            GROUP BY shopid, categoryid, itemid, platformid
            ORDER BY qty DESC
        ) AS a
        CROSS JOIN (
            SELECT SUM(salesqty) AS qty
            FROM elengjing.women_clothing_item
            WHERE shopid != {shop_id} AND categoryid = {category_id}
        ) AS t
    ) AS b
    WHERE b.c_ratio <= 0.8;

    INSERT INTO elengjing_tj.t_shop_{shop_id}_case_{case}
    SELECT c.categoryid AS category_id,
           c.shopid AS customer_shop_id,
           c.itemid AS customer_item_id,
           c.platformid AS customer_platform_id,
           t.shopid AS target_shop_id,
           t.itemid AS target_item_id,
           t.platformid AS target_platform_id
    FROM t_shop_{shop_id}_case_{case}_customer AS c
    CROSS JOIN t_shop_{shop_id}_case_{case}_target AS t
    WHERE t.avg_price BETWEEN 0.9 * c.avg_price AND 1.1 * c.avg_price;
    """.format(**kwargs)
    Hive("idc").execute(sql)
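The window in the target query keeps only items inside the top 80% of category sales, a cumulative-share cutoff. The same logic in plain Python, on invented quantities:

def top_share(rows, cutoff=0.8):
    # rows: (item_id, qty); keep items until the cumulative qty share
    # (including the current row) exceeds the cutoff.
    total = float(sum(q for _, q in rows))
    kept, running = [], 0.0
    for item_id, qty in sorted(rows, key=lambda r: r[1], reverse=True):
        running += qty
        if running / total <= cutoff:
            kept.append(item_id)
    return kept


print(top_share([("a", 50), ("b", 30), ("c", 15), ("d", 5)]))  # ['a', 'b']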
def gen_train_features(
        category_id,
        is_event=False,
        date="WHERE daterange BETWEEN '2016-05-01' AND '2016-05-24'",
        out=False):
    event = "event" if is_event else "daily"
    sql = """
    USE elengjing_price;

    DROP TABLE IF EXISTS train_{category_id}_{event};
    CREATE TABLE train_{category_id}_{event} (
        itemid STRING,
        data STRING
    )
    CLUSTERED BY (itemid) INTO 113 BUCKETS;

    INSERT INTO train_{category_id}_{event}
    SELECT i.itemid,
           CONCAT_WS(
               ',',
               NVL(CAST(i.avg_price AS STRING), '0'),
               NVL(CAST((UNIX_TIMESTAMP() - TO_UNIX_TIMESTAMP(i.listeddate)) / 86400.0 AS STRING), '0'),
               NVL(CAST(shop.level AS STRING), '0'),
               NVL(CAST(shop.favor AS STRING), '0'),
               NVL(CAST(s.all_spu AS STRING), '0'),
               NVL(CAST(s.all_sales_rank AS STRING), '0'),
               NVL(CAST(sc.category_spu AS STRING), '0'),
               NVL(CAST(sc.category_sales_rank AS STRING), '0'),
               NVL(CAST(s.shop_avg_price AS STRING), '0'),
               NVL(CAST(s.shop_avg_price_rank AS STRING), '0'),
               NVL(CAST(sc.category_avg_price AS STRING), '0'),
               NVL(CAST(sc.category_avg_price_rank AS STRING), '0'),
               v.data
           ) AS data
    FROM (
        SELECT shopid,
               itemid,
               MAX(listeddate) AS listeddate,
               CASE WHEN SUM(salesqty) = 0 THEN 0
                    ELSE SUM(salesamt) / SUM(salesqty) END AS avg_price
        FROM transforms.women_clothing_item_new_dict
        {date_filter} AND categoryid = {category_id}
        GROUP BY shopid, itemid
    ) AS i
    JOIN (
        SELECT shopid,
               COUNT(DISTINCT itemid) AS all_spu,
               CASE WHEN SUM(salesqty) = 0 THEN 0
                    ELSE SUM(salesamt) / SUM(salesqty) END AS shop_avg_price,
               ROW_NUMBER() OVER (ORDER BY SUM(salesamt) DESC) AS all_sales_rank,
               ROW_NUMBER() OVER (ORDER BY CASE WHEN SUM(salesqty) = 0 THEN 0
                    ELSE SUM(salesamt) / SUM(salesqty) END DESC) AS shop_avg_price_rank
        FROM transforms.women_clothing_item_new_dict
        {date_filter}
        GROUP BY shopid
    ) AS s ON i.shopid = s.shopid
    JOIN (
        SELECT shopid,
               COUNT(DISTINCT itemid) AS category_spu,
               ROW_NUMBER() OVER (ORDER BY SUM(salesamt) DESC) AS category_sales_rank,
               ROW_NUMBER() OVER (ORDER BY CASE WHEN SUM(salesqty) = 0 THEN 0
                    ELSE SUM(salesamt) / SUM(salesqty) END DESC) AS category_avg_price_rank,
               CASE WHEN SUM(salesqty) = 0 THEN 0
                    ELSE SUM(salesamt) / SUM(salesqty) END AS category_avg_price
        FROM transforms.women_clothing_item_new_dict
        {date_filter} AND categoryid = {category_id}
        GROUP BY shopid
    ) AS sc ON i.shopid = sc.shopid
    JOIN elengjing_base.shop AS shop ON i.shopid = shop.shopid
    JOIN elengjing_price.attr_itemid_value_matrix_{category_id} AS v ON i.itemid = v.itemid
    WHERE shop.level IS NOT NULL
      AND shop.favor IS NOT NULL
      AND i.avg_price BETWEEN 10 AND 10000;
    """.format(category_id=category_id, date_filter=date, event=event)
    if out:
        print(sql)
    else:
        Hive("idc").execute(sql)
def create_attrname(category_id):
    sql = """
    DROP TABLE IF EXISTS mpintranet.attrname;
    CREATE TABLE IF NOT EXISTS mpintranet.attrname (attrname STRING)
    CLUSTERED BY (attrname) INTO 113 BUCKETS
    STORED AS ORC;
    INSERT INTO TABLE mpintranet.attrname
    SELECT attrname
    FROM elengjing.women_clothing_item_attr
    WHERE categoryid = {0}
    GROUP BY attrname;

    DROP TABLE IF EXISTS mpintranet.itemid;
    CREATE TABLE IF NOT EXISTS mpintranet.itemid (itemid BIGINT)
    CLUSTERED BY (itemid) INTO 113 BUCKETS
    STORED AS ORC;
    INSERT INTO TABLE mpintranet.itemid
    SELECT itemid
    FROM elengjing.women_clothing_item_attr
    WHERE categoryid = {0}
    GROUP BY itemid;

    DROP TABLE IF EXISTS mpintranet.item_attr;
    CREATE TABLE IF NOT EXISTS mpintranet.item_attr (
        itemid BIGINT,
        attrvalue STRING
    )
    CLUSTERED BY (itemid) INTO 581 BUCKETS
    STORED AS ORC;
    INSERT INTO TABLE mpintranet.item_attr
    SELECT itemid, CONCAT(attrname, '_', attrvalue) AS attrvalue
    FROM elengjing.women_clothing_item_attr
    WHERE categoryid = {0}
    GROUP BY itemid, CONCAT(attrname, '_', attrvalue);

    DROP TABLE IF EXISTS mpintranet.itemid_attrname;
    CREATE TABLE IF NOT EXISTS mpintranet.itemid_attrname (
        itemid BIGINT,
        attrname STRING
    )
    CLUSTERED BY (itemid) INTO 581 BUCKETS
    STORED AS ORC;
    INSERT INTO TABLE mpintranet.itemid_attrname
    SELECT b.itemid, a.attrname
    FROM mpintranet.attrname a
    CROSS JOIN mpintranet.itemid b;

    DROP TABLE IF EXISTS mpintranet.itemid_attrname_attrvalue;
    CREATE TABLE IF NOT EXISTS mpintranet.itemid_attrname_attrvalue (
        itemid BIGINT,
        attrname STRING,
        attrvalue STRING
    )
    CLUSTERED BY (itemid) INTO 783 BUCKETS
    STORED AS ORC;
    -- item_attr has no attrname column; its attrvalue is 'attrname_attrvalue',
    -- so match on the name prefix (the original joined on a nonexistent
    -- b.attrname).
    INSERT INTO TABLE mpintranet.itemid_attrname_attrvalue
    SELECT a.itemid, a.attrname, NVL(b.attrvalue, '') AS attrvalue
    FROM mpintranet.itemid_attrname a
    LEFT JOIN mpintranet.item_attr b
        ON a.itemid = b.itemid AND a.attrname = SPLIT(b.attrvalue, '_')[0];

    DROP TABLE IF EXISTS mpintranet.attrname_result;
    CREATE TABLE IF NOT EXISTS mpintranet.attrname_result (
        itemid BIGINT,
        attrname STRING,
        attrvalue STRING
    )
    CLUSTERED BY (itemid) INTO 113 BUCKETS
    STORED AS ORC;
    -- GROUP_CONCAT is not a Hive built-in and is assumed to be a registered
    -- UDF here; CONCAT_WS(',', COLLECT_LIST(...)) is the stock equivalent.
    INSERT INTO TABLE mpintranet.attrname_result
    SELECT t.itemid,
           GROUP_CONCAT(t.attrname, ",") AS attrname,
           GROUP_CONCAT(t.attrvalue, ",") AS attrvalue
    FROM (
        SELECT itemid, attrname, attrvalue
        FROM mpintranet.itemid_attrname_attrvalue
        DISTRIBUTE BY itemid
        SORT BY attrname DESC
    ) AS t
    GROUP BY t.itemid;

    DROP TABLE IF EXISTS mpintranet.attrname_export;
    CREATE TABLE IF NOT EXISTS mpintranet.attrname_export (
        itemid BIGINT,
        attrvalue STRING
    )
    CLUSTERED BY (itemid) INTO 113 BUCKETS
    STORED AS ORC;
    INSERT INTO TABLE mpintranet.attrname_export
    SELECT itemid, attrvalue
    FROM mpintranet.attrname_result;
    """
    Hive(env=ENV).execute(sql.format(category_id))
def create_attrvalue(category_id):
    sql = """
    -------- 1. build the intermediate tables
    DROP TABLE IF EXISTS mpintranet.attrvalue;
    CREATE TABLE IF NOT EXISTS mpintranet.attrvalue (attrvalue STRING)
    CLUSTERED BY (attrvalue) INTO 113 BUCKETS
    STORED AS ORC;
    INSERT INTO TABLE mpintranet.attrvalue
    SELECT CONCAT(attrname, '_', attrvalue) AS attrvalue
    FROM elengjing.women_clothing_item_attr
    WHERE categoryid = {0}
    GROUP BY CONCAT(attrname, '_', attrvalue);

    DROP TABLE IF EXISTS mpintranet.itemid;
    CREATE TABLE IF NOT EXISTS mpintranet.itemid (itemid BIGINT)
    CLUSTERED BY (itemid) INTO 113 BUCKETS
    STORED AS ORC;
    INSERT INTO TABLE mpintranet.itemid
    SELECT itemid
    FROM elengjing.women_clothing_item_attr
    WHERE categoryid = {0}
    GROUP BY itemid;

    DROP TABLE IF EXISTS mpintranet.item_attr;
    CREATE TABLE IF NOT EXISTS mpintranet.item_attr (
        itemid BIGINT,
        attrvalue STRING
    )
    CLUSTERED BY (itemid) INTO 581 BUCKETS
    STORED AS ORC;
    INSERT INTO TABLE mpintranet.item_attr
    SELECT itemid, CONCAT(attrname, '_', attrvalue) AS attrvalue
    FROM elengjing.women_clothing_item_attr
    WHERE categoryid = {0}
    GROUP BY itemid, CONCAT(attrname, '_', attrvalue);

    ------- build the full item x attribute table
    DROP TABLE IF EXISTS mpintranet.itemid_attrvalue;
    CREATE TABLE IF NOT EXISTS mpintranet.itemid_attrvalue (
        itemid BIGINT,
        attrvalue STRING
    )
    CLUSTERED BY (itemid) INTO 581 BUCKETS
    STORED AS ORC;
    INSERT INTO TABLE mpintranet.itemid_attrvalue
    SELECT b.itemid, a.attrvalue
    FROM mpintranet.attrvalue AS a
    CROSS JOIN mpintranet.itemid AS b;

    --------- JOIN against the full table to tag each combination
    DROP TABLE IF EXISTS mpintranet.itemid_attrvalue_tag;
    CREATE TABLE IF NOT EXISTS mpintranet.itemid_attrvalue_tag (
        itemid BIGINT,
        attrvalue STRING,
        tag STRING
    )
    CLUSTERED BY (itemid) INTO 783 BUCKETS
    STORED AS ORC;
    SET mapred.reduce.tasks = 500;
    INSERT INTO TABLE mpintranet.itemid_attrvalue_tag
    SELECT a.itemid,
           a.attrvalue,
           IF(b.itemid IS NULL, '0', '1') AS tag
    FROM mpintranet.itemid_attrvalue a
    LEFT JOIN mpintranet.item_attr b
        ON a.itemid = b.itemid AND a.attrvalue = b.attrvalue;

    ---------- collapse into the result table
    DROP TABLE IF EXISTS mpintranet.attrvalue_result;
    CREATE TABLE IF NOT EXISTS mpintranet.attrvalue_result (
        itemid BIGINT,
        attrvalue STRING,
        tag STRING
    )
    CLUSTERED BY (itemid) INTO 113 BUCKETS
    STORED AS ORC;
    INSERT INTO TABLE mpintranet.attrvalue_result
    SELECT t.itemid,
           GROUP_CONCAT(t.attrvalue, ",") AS attrvalue,
           GROUP_CONCAT(t.tag, ",") AS tag
    FROM (
        -- attrvalue is 'attrname_attrvalue', so split on '_' (not ',')
        -- to sort by the attribute name.
        SELECT itemid,
               attrvalue,
               tag,
               SPLIT(attrvalue, '_')[0] AS attrname
        FROM mpintranet.itemid_attrvalue_tag
        DISTRIBUTE BY itemid
        SORT BY attrname DESC
    ) AS t
    GROUP BY t.itemid;

    ------------ end: export
    DROP TABLE IF EXISTS mpintranet.attrvalue_export;
    CREATE TABLE IF NOT EXISTS mpintranet.attrvalue_export (
        itemid BIGINT,
        tag STRING
    )
    CLUSTERED BY (itemid) INTO 113 BUCKETS
    STORED AS ORC;
    INSERT INTO TABLE mpintranet.attrvalue_export
    SELECT itemid, tag
    FROM mpintranet.attrvalue_result;
    """
    # print(sql.format(category_id))
    Hive(env=ENV).execute(sql.format(category_id))
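GROUP_CONCAT is not a Hive built-in (CONCAT_WS(',', COLLECT_LIST(...)) is the stock equivalent), so both scripts above presumably rely on a registered UDF. Its intended effect, one row per item with columns comma-joined in a fixed descending order, sketched in Python:

from itertools import groupby


def group_concat_by_item(rows):
    # rows: (itemid, attrvalue, tag) triples; returns one row per itemid
    # with attrvalue and tag comma-joined, ordered by attrvalue descending
    # (standing in for DISTRIBUTE BY itemid SORT BY attrname DESC).
    out = []
    for itemid, grp in groupby(sorted(rows), key=lambda r: r[0]):
        grp = sorted(grp, key=lambda r: r[1], reverse=True)
        out.append((itemid,
                    ",".join(r[1] for r in grp),
                    ",".join(r[2] for r in grp)))
    return out


print(group_concat_by_item([
    (1, "size_M", "1"), (1, "color_red", "0"), (2, "color_red", "1"),
]))
# [(1, 'size_M,color_red', '1,0'), (2, 'color_red', '1')]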
# coding: utf-8
# __author__: u"John"
from __future__ import unicode_literals, absolute_import, print_function, division

from mplib.IO import Hive, pickle_dump, pickle_load
from mplib.common import smart_encode

import editdistance


def get_data():
    sql = ("SELECT COUNT(*) AS cnt "
           "FROM das.ci_diaper2014to2016_0406_allplatform_distinct AS d "
           "WHERE d.year = '2016'")
    data = Hive(env="local").query(sql)
    pickle_dump("raw", data)


def show_tables():
    print(Hive().query("show tables"))


def transform():
    # Expects two-column tuple rows (e.g. from query(..., to_dict=False));
    # note get_data() above dumps a one-column count instead.
    pickle_dump("id_content", {line[0]: line[1] for line in pickle_load("raw")})


if __name__ == "__main__":
    Hive.show_env_dict()
    print(Hive().query("show tables"))