def get_mon_order_except():
    try:
        co_cust = get_valid_co_cust(spark).select("cust_id", "cust_seg", "com_id")

        # ----------------------- fetch co_co_01
        co_co_01 = get_co_co_01(spark, scope=[1, 1], filter="month") \
            .select("qty_sum", "amt_sum", "cust_id")

        # total order quantity and total order amount per retailer
        qty_amt_sum = co_co_01.groupBy("cust_id") \
            .agg(f.sum("qty_sum").alias("order_sum"),
                 f.sum("amt_sum").alias("amt_sum")) \
            .join(co_cust, "cust_id")

        # ----- anomaly: retailer's total order quantity for the month
        try:
            order_sum = qty_amt_sum.select("com_id", "cust_seg", "cust_id", "order_sum")
            cols = {
                "value": "order_sum",
                "abnormal": "sum_abno_month",
                "mean_plus_3std": "grade_sum_plus3",
                "mean_minus_3std": "grade_sum_minu3",
                "mean": "grade_sum"
            }
            values = list(cols.values())
            values.remove(cols["value"])

            print(f"{str(dt.now())} monthly order-quantity anomalies per retailer")
            result = is_except(order_sum, cols, ["com_id", "cust_seg"])
            result.foreachPartition(lambda x: write_hbase1(x, values, hbase))
        except Exception:
            tb.print_exc()

        # ----- anomaly: retailer's average carton price for the month
        try:
            avg_price = qty_amt_sum.withColumn("avg_price", col("amt_sum") / col("order_sum")) \
                .select("com_id", "cust_seg", "cust_id", "avg_price")
            cols = {
                "value": "avg_price",
                "abnormal": "price_abno_month",
                "mean_plus_3std": "grade_price_plus3",
                "mean_minus_3std": "grade_price_minu3",
                "mean": "grade_price"
            }
            values = list(cols.values())
            values.remove(cols["value"])

            print(f"{str(dt.now())} monthly average-carton-price anomalies per retailer")
            result = is_except(avg_price, cols, ["com_id", "cust_seg"])
            result.foreachPartition(lambda x: write_hbase1(x, values, hbase))
        except Exception:
            tb.print_exc()
    except Exception:
        tb.print_exc()

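# `is_except` is defined elsewhere in this repo; judging by the `cols`
# mappings above, it flags a value as abnormal when it falls outside
# mean +/- 3 standard deviations within each group. A minimal sketch of
# that contract (the 0/1 flag convention is an assumption):
def is_except_sketch(df, cols, group_keys):
    # per-group mean and population stddev of the value column
    stats = df.groupBy(group_keys).agg(
        f.mean(cols["value"]).alias(cols["mean"]),
        f.stddev_pop(cols["value"]).alias("_std"))
    return df.join(stats, group_keys) \
        .withColumn(cols["mean_plus_3std"], col(cols["mean"]) + 3 * col("_std")) \
        .withColumn(cols["mean_minus_3std"], col(cols["mean"]) - 3 * col("_std")) \
        .withColumn(cols["abnormal"],
                    f.when((col(cols["value"]) > col(cols["mean_plus_3std"])) |
                           (col(cols["value"]) < col(cols["mean_minus_3std"])), 1)
                     .otherwise(0)) \
        .drop("_std")
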
def get_last_order_except():
    try:
        co_co_01 = get_co_co_01(spark, [0, 30]) \
            .select("qty_sum", "amt_sum", "cust_id", "born_date")
        co_cust = get_valid_co_cust(spark).select("cust_id", "cust_seg", "com_id")

        # each retailer's latest order: keep the order with the max date per retailer
        qty_amt_sum = co_co_01.groupBy("cust_id").agg(f.max("born_date").alias("born_date")) \
            .join(co_co_01, ["cust_id", "born_date"]) \
            .join(co_cust, "cust_id")

        # ----- anomaly: retailer's total quantity on the latest order
        # 1. if the latest order was a return, the quantity is negative
        try:
            order_sum = qty_amt_sum.select("com_id", "cust_seg", "cust_id", "qty_sum")
            cols = {
                "value": "qty_sum",
                "abnormal": "sum_abno_time",
                "mean_plus_3std": "last_sum_plus3",
                "mean_minus_3std": "last_sum_minu3",
                "mean": "last_sum_mean"
            }
            values = list(cols.values())

            print(f"{str(dt.now())} latest-order quantity anomalies per retailer")
            result = is_except(order_sum, cols, ["com_id", "cust_seg"])
            result.foreachPartition(lambda x: write_hbase1(x, values, hbase))
        except Exception:
            tb.print_exc()

        # ----- anomaly: retailer's average carton price on the latest order
        try:
            avg_price = qty_amt_sum.withColumn("avg_price", col("amt_sum") / col("qty_sum")) \
                .select("com_id", "cust_seg", "cust_id", "avg_price")
            cols = {
                "value": "avg_price",
                "abnormal": "avg_abno_time",
                "mean_plus_3std": "last_avg_plus3",
                "mean_minus_3std": "last_avg_minu3",
                "mean": "last_avg_mean"
            }
            values = list(cols.values())

            print(f"{str(dt.now())} latest-order average-carton-price anomalies per retailer")
            result = is_except(avg_price, cols, ["com_id", "cust_seg"])
            result.foreachPartition(lambda x: write_hbase1(x, values, hbase))
        except Exception:
            tb.print_exc()
    except Exception:
        tb.print_exc()

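# `write_hbase1` is the shared HBase sink used by every job in this module;
# it is defined elsewhere. From the call sites it receives a partition
# iterator of Rows, the column names to persist, and a config dict like
# {"table": ..., "families": ["0"], "row": "cust_id"}. A minimal sketch,
# assuming happybase and a single column family (the connection host and
# batch size are placeholders, not the repo's actual values):
import happybase

def write_hbase1_sketch(rows, columns, hbase_conf, host="localhost"):
    conn = happybase.Connection(host)
    table = conn.table(hbase_conf["table"])
    family = hbase_conf["families"][0]
    with table.batch(batch_size=1000) as batch:
        for row in rows:
            row_key = str(row[hbase_conf["row"]])
            # one HBase put per Row, one cell per requested column
            batch.put(row_key,
                      {f"{family}:{c}".encode(): str(row[c]).encode()
                       for c in columns})
    conn.close()
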
def generate_brand_data():
    try:
        hbase = {"table": brand_table, "families": ["0"], "row": "brand_id"}
        print(f"{dt.now()} generating brand names {brand_table}")

        # write the brand table
        plm_item = get_plm_item(spark).select("item_id", "item_name", "item_kind", "is_mrb")

        # item_kind != 4 (4 = confiscated goods); is_mrb = 1 (1 = in use);
        # clean item_name: strip spaces, drop '*', replace Chinese parentheses with English ones
        # ??? could also keep all rows and then drop the ones without parentheses
        plm_item_brand = plm_item \
            .where(col("item_kind") != "4") \
            .where(col("is_mrb") == "1") \
            .withColumn("brand_name", item_name_udf(col("item_name"))) \
            .dropDuplicates(["brand_name"]) \
            .where(col("brand_name") != "")

        from pyspark.sql import Window
        win = Window.orderBy("item_id")
        brand = plm_item_brand.select("item_id", "item_name", "brand_name") \
            .withColumn("brand_id", f.row_number().over(win))

        brand.foreachPartition(lambda x: write_hbase1(x, ["brand_name"], hbase))
    except Exception:
        tb.print_exc()

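# `item_name_udf` is defined elsewhere; the comments above describe it as a
# string cleaner that strips spaces, removes '*', and normalizes Chinese
# parentheses before extracting a brand name. A plausible sketch -- the exact
# extraction rule (brand = text before the first parenthesis, empty string
# when there is none) is an assumption:
from pyspark.sql.types import StringType

def _clean_item_name(name):
    if name is None:
        return ""
    name = name.replace(" ", "").replace("*", "") \
               .replace("(", "(").replace(")", ")")
    # rows without parentheses yield "" and are dropped by the caller
    return name.split("(")[0] if "(" in name else ""

item_name_udf_sketch = f.udf(_clean_item_name, StringType())
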
def get_warning():
    try:
        print("deleting expired warning data")
        levels = ["YJFL004", "YJFL012", "YJFL003", "YJFL001"]
        for level in levels:
            print(f"warning level: {level}")
            delete_all(hbase["table"], row_prefix=level)

        print(f"{str(dt.now())} warnings")
        result = get_warning_result(white_list, city='岳阳市',
                                    com_id='011114306', day='20190601',
                                    cluster_dir=cluster_path + "/")

        # the values inside highprice_30days_order must be float
        result["highprice_30days_order"] = result["highprice_30days_order"].apply(
            lambda x: json.dumps(x, ensure_ascii=False))

        df = spark.createDataFrame(result) \
            .withColumn("classify_id", f.concat_ws("_", col("classify_level1_code"), col("cust_id")))
        cols = df.columns
        cols.remove("classify_id")
        df.foreachPartition(lambda x: write_hbase1(x, cols, hbase))
    except Exception:
        tb.print_exc()

def get_city():
    print(f"{str(dt.now())} city county")
    try:
        get_area(spark).select("city", "county", "sale_center_id") \
            .groupBy("city", "sale_center_id").agg(f.collect_list("county").alias("county")) \
            .foreachPartition(lambda x: write_hbase1(x, ["city", "county", "sale_center_id"], hbase))
    except Exception:
        tb.print_exc()

def get_area_info():
    print(f"{str(dt.now())} gdp ...")
    try:
        area_code = get_area(spark).select("county", "sale_center_id")
        population = get_city_ppl(spark).select("区县", "城镇人口", "农村人口")

        cols = [
            "county", "gdp", "gdp_add", "consumption", "consumption_add",
            "farmer", "farmer_add", "urban", "urban_add",
            "primary_industry", "primary_industry_add",
            "second_industry", "second_industry_add",
            "tertiary_industry", "tertiary_industry_add"
        ]
        city_info = get_city_info(spark).where(col("mtime") == f"{dt.now().year - 1}-12-01") \
            .select(cols) \
            .join(population, col("county") == col("区县")) \
            .join(area_code, "county")

        # total urban and rural population per sales center
        total_ppl = city_info.groupBy("sale_center_id") \
            .agg(f.sum("城镇人口").alias("urban_total_ppl"),
                 f.sum("农村人口").alias("farmer_total_ppl"))

        # each county's urban population as a share of its sales center's urban total,
        # and likewise its rural population as a share of the rural total
        weight = city_info.join(total_ppl, "sale_center_id") \
            .withColumn("urban_weight", col("城镇人口") / col("urban_total_ppl")) \
            .withColumn("farmer_weight", col("农村人口") / col("farmer_total_ppl"))

        # urban / urban_add / farmer / farmer_add are population-weighted averages
        result = weight.groupBy("sale_center_id") \
            .agg(
                f.sum("gdp").alias("gdp"),
                f.avg("gdp_add").alias("gdp_add"),
                f.sum("consumption").alias("consumption"),
                f.avg("consumption_add").alias("consumption_add"),
                f.sum(col("urban") * col("urban_weight")).alias("urban"),
                f.sum(col("urban_add") * col("urban_weight")).alias("urban_add"),
                f.sum(col("farmer") * col("farmer_weight")).alias("farmer"),
                f.sum(col("farmer_add") * col("farmer_weight")).alias("farmer_add"),
                f.sum("primary_industry").alias("primary_industry"),
                f.avg("primary_industry_add").alias("primary_industry_add"),
                f.sum("second_industry").alias("second_industry"),
                f.avg("second_industry_add").alias("second_industry_add"),
                f.sum("tertiary_industry").alias("tertiary_industry"),
                f.avg("tertiary_industry_add").alias("tertiary_industry_add")
            )

        cols.remove("county")
        result.foreachPartition(lambda x: write_hbase1(x, cols, hbase))
    except Exception:
        tb.print_exc()

def county_reality_supply():
    # actual allocation of each item per county and grade over recent weeks
    try:
        print(f"{str(dt.now())} weekly actual allocation per county/grade/item")

        plm_item = get_plm_item(spark).select("item_id", "item_name")
        co_cust = get_co_cust(spark).select("cust_id", "sale_center_id", "cust_seg")

        area = get_area(spark)
        # com_id -> city mapping
        city = area.dropDuplicates(["com_id"]).select("com_id", "city")
        # sale_center_id -> list of counties
        county = area.groupBy("sale_center_id") \
            .agg(f.collect_list("county").alias("county")) \
            .select("sale_center_id", "county")

        # actual allocation; the commented block is the original single-week
        # version, the active query widens the window to the last four weeks
        # cust_item_spw = spark.sql(
        #     "select com_id,cust_id,item_id,qty_allocco,begin_date,end_date from DB2_DB2INST1_SGP_CUST_ITEM_SPW") \
        #     .withColumn("begin_date", f.to_date(col("begin_date"), "yyyyMMdd")) \
        #     .withColumn("end_date", f.to_date(col("end_date"), "yyyyMMdd")) \
        #     .withColumn("last_mon", f.date_sub(f.date_trunc("week", f.current_date()), 7)) \
        #     .withColumn("last_sun", f.date_add(col("last_mon"), 6)) \
        #     .where((col("begin_date") == col("last_mon")) & (col("end_date") == col("last_sun"))) \
        #     .join(co_cust, "cust_id")
        cust_item_spw = spark.sql(
            "select com_id,cust_id,item_id,qty_allocco,begin_date,end_date from DB2_DB2INST1_SGP_CUST_ITEM_SPW") \
            .withColumn("begin_date", f.to_date(col("begin_date"), "yyyyMMdd")) \
            .withColumn("end_date", f.to_date(col("end_date"), "yyyyMMdd")) \
            .withColumn("last_mon", f.date_sub(f.date_trunc("week", f.current_date()), 7 * 4)) \
            .withColumn("last_sun", f.date_add(col("last_mon"), 6 + 7 * 3)) \
            .where((col("begin_date") >= col("last_mon")) & (col("end_date") <= col("last_sun"))) \
            .join(co_cust, "cust_id")

        # name of the computed metric column
        colName = "county_gauge_week_volume"
        result = cust_item_spw.groupBy("com_id", "sale_center_id", "cust_seg", "item_id") \
            .agg(f.sum("qty_allocco").alias(colName))

        columns = [
            "com_id", "city", "sale_center_id", "county", "gears",
            "gauge_id", "gauge_name", "gears_data_marker", colName
        ]
        result.withColumn("row", f.concat_ws("_", col("sale_center_id"), col("cust_seg"), col("item_id"))) \
            .withColumn("gears_data_marker", f.lit("4")) \
            .join(plm_item, "item_id") \
            .join(city, "com_id") \
            .join(county, "sale_center_id") \
            .withColumnRenamed("item_id", "gauge_id") \
            .withColumnRenamed("item_name", "gauge_name") \
            .withColumnRenamed("cust_seg", "gears") \
            .foreachPartition(lambda x: write_hbase1(x, columns, hbase))
    except Exception:
        tb.print_exc()

def source_supply():
    # score of each item per county and grade
    try:
        print(f"{str(dt.now())} per-county, per-grade score for each item")

        co_cust = get_valid_co_cust(spark).select("com_id", "cust_id", "sale_center_id", "cust_seg") \
            .where(col("cust_seg") != "ZZ")
        co_co_line = get_co_co_line(spark, scope=[0, 30]) \
            .where((col("qty_ord") > 0) & (col("qty_rsn") > 0)) \
            .select("cust_id", "item_id", "qty_ord")

        # order quantity of each item per retailer
        qty_ord = co_co_line.groupBy("cust_id", "item_id").agg(f.sum("qty_ord").alias("amount"))

        # each retailer's rating for each item
        item_rating = get_cigar_rating(spark)

        # number of retailers per county and grade
        cust_num = co_cust.groupBy("com_id", "sale_center_id", "cust_seg") \
            .agg(f.count("cust_id").alias("county_gears_volume_num"))

        # result: county, grade, item, score, and retailer count per county/grade
        result = qty_ord.join(item_rating, ["cust_id", "item_id"]) \
            .withColumn("rating_amount", col("rating") * col("amount")) \
            .join(co_cust, "cust_id") \
            .groupBy("com_id", "sale_center_id", "cust_seg", "item_id") \
            .agg(f.sum(col("rating_amount")).alias("county_gears_score")) \
            .join(cust_num, ["com_id", "sale_center_id", "cust_seg"])

        plm_item = get_plm_item(spark).select("item_id", "item_name")

        area = get_area(spark)
        # com_id -> city mapping
        city = area.dropDuplicates(["com_id"]).select("com_id", "city")
        # sale_center_id -> list of counties
        county = area.groupBy("sale_center_id") \
            .agg(f.collect_list("county").alias("county")) \
            .select("sale_center_id", "county")

        columns = [
            "com_id", "city", "sale_center_id", "county", "gears",
            "gauge_id", "gauge_name", "county_gears_score",
            "county_gears_volume_num", "gears_data_marker"
        ]
        result.join(plm_item, "item_id") \
            .withColumn("row", f.concat_ws("_", col("sale_center_id"), col("cust_seg"), col("item_id"))) \
            .join(city, "com_id") \
            .join(county, "sale_center_id") \
            .withColumn("gears_data_marker", f.lit("4")) \
            .withColumnRenamed("item_id", "gauge_id") \
            .withColumnRenamed("item_name", "gauge_name") \
            .withColumnRenamed("cust_seg", "gears") \
            .foreachPartition(lambda x: write_hbase1(x, columns, hbase))
    except Exception:
        tb.print_exc()

def get_item_stats_info_daily():
    # month-to-date order quantity per item, city-wide and per county
    # month-to-date order amount per item, city-wide and per county

    # item id and name
    plm_item = get_plm_item(spark).select("item_id", "item_name")

    area = get_area(spark)
    # com_id -> city mapping
    city = area.dropDuplicates(["com_id"]).select("com_id", "city")
    # sale_center_id -> list of counties
    county = area.groupBy("sale_center_id") \
        .agg(f.collect_list("county").alias("county")) \
        .select("sale_center_id", "county")

    # marker column values
    markers = ["1", "3"]
    # aggregate by city or by county
    groups = ["com_id", "sale_center_id"]
    joins = [city, county]
    # columns written alongside the computed metrics
    cols_comm = [
        ["city", "gauge_id", "gauge_name", "ciga_data_marker"],
        ["county", "sale_center_id", "gauge_id", "gauge_name", "ciga_data_marker"]
    ]
    # names of the computed metric columns
    cols = [["gauge_city_orders", "gauge_city_order_amount"],
            ["gauge_county_orders", "gauge_county_order_amount"]]

    co_co_line = get_co_co_line(spark, scope=[0, 0], filter="month") \
        .select("item_id", "qty_ord", "amt", "com_id", "sale_center_id")
    co_co_line.cache()

    for i in range(len(groups)):
        group = groups[i]
        join = joins[i]
        c = cols[i]
        marker = markers[i]
        try:
            # month-to-date order quantity and amount per item for this grouping
            print(f"{str(dt.now())} month-to-date order quantity/amount per item by {group}")
            qty_amt = co_co_line.groupBy([group, "item_id"]) \
                .agg(f.sum(col("qty_ord")).alias(c[0]), f.sum(col("amt")).alias(c[1]))

            columns = cols_comm[i] + c
            qty_amt.withColumn("row", f.concat_ws("_", col(group), col("item_id"))) \
                .join(plm_item, "item_id") \
                .join(join, group) \
                .withColumnRenamed("item_id", "gauge_id") \
                .withColumnRenamed("item_name", "gauge_name") \
                .withColumn("ciga_data_marker", f.lit(marker)) \
                .foreachPartition(lambda x: write_hbase1(x, columns, hbase))
        except Exception:
            tb.print_exc()

    co_co_line.unpersist()

def get_ppl_areaCpt_index():
    population = get_city_ppl(spark).select("区县", "总人口")
    area_code = get_area(spark).where(col("city").rlike("株洲市|邵阳市|岳阳市")) \
        .select("city", "county", "sale_center_id")
    area_plt = population.join(area_code, col("county") == col("区县"))

    # population of each sales center
    sale_center_ppl = area_plt.groupBy("city", "sale_center_id") \
        .agg(f.sum("总人口").alias("ppl"))

    # ----- population index
    print(f"{str(dt.now())} population index")
    try:
        colName = "population_index"
        # population of the most populous sales center in each city
        max_df = sale_center_ppl.groupBy("city") \
            .agg(f.max("ppl").alias("max"))

        sale_center_ppl.join(max_df, "city") \
            .withColumn(colName, col("ppl") / col("max") * 5) \
            .foreachPartition(lambda x: write_hbase1(x, [colName], hbase))
    except Exception:
        tb.print_exc()

    # ----- area competitiveness index
    try:
        # number of retailers per sales center
        cust_num = spark.sql(
            "select * from DB2_DB2INST1_CO_CUST where dt=(select max(dt) from DB2_DB2INST1_CO_CUST)") \
            .select("sale_center_id") \
            .groupBy("sale_center_id").count()

        # population / retailer count
        print(f"{str(dt.now())} area competitiveness index (population / retailer count)")
        ppl_cust = sale_center_ppl.join(cust_num, "sale_center_id") \
            .withColumn("ppl_cust", col("ppl") / col("count"))
        max_df = ppl_cust.groupBy("city").agg(f.max("ppl_cust").alias("max"))

        colName = "area_competitive_index"
        ppl_cust.join(max_df, "city") \
            .withColumn(colName, col("ppl_cust") / col("max") * 5) \
            .select("sale_center_id", colName) \
            .foreachPartition(lambda x: write_hbase1(x, [colName], hbase))
    except Exception:
        tb.print_exc()

def get_acc():
    # acc is the debit account, not the bank account kept in co_cust
    print(f"{str(dt.now())} debit accounts")
    try:
        co_cust = get_co_cust(spark).select("cust_id")
        get_co_debit_acc(spark).select("cust_id", "acc") \
            .join(co_cust, "cust_id") \
            .foreachPartition(lambda x: write_hbase1(x, ["acc"], hbase))
    except Exception:
        tb.print_exc()

def get_grade_except():
    try:
        co_cust = get_valid_co_cust(spark).select("cust_id", "com_id")

        # grade-change log
        crm_cust_log = get_crm_cust_log(spark) \
            .where((col("change_type") == "CO_CUST.CUST_SEG") & (col("day_diff") <= 90)) \
            .select("cust_id", "change_frm", "change_to", "audit_date")

        # grade-change spread per retailer
        grade_diff_df = crm_cust_log.groupBy("cust_id") \
            .agg(grade_diff_udf(f.max("change_frm"), f.min("change_frm"),
                                f.max("change_to"), f.min("change_to")).alias("grade_diff")) \
            .join(co_cust, "cust_id")
        grade_diff_df.createOrReplaceTempView("grade_diff_df")

        colName = "grade_abno"
        print(f"{str(dt.now())} abnormal grade-change spread")
        result = spark.sql(
            "select com_id,percentile_approx(grade_diff,0.25) as percent_25,"
            "percentile_approx(grade_diff,0.75) as percent_75 "
            "from grade_diff_df group by com_id") \
            .join(grade_diff_df, "com_id") \
            .withColumn(colName, box_plots_filter_udf(col("grade_diff"), col("percent_25"), col("percent_75"))) \
            .where(col(colName) >= 0)
        result.foreachPartition(lambda x: write_hbase1(x, [colName], hbase))

        print(f"{str(dt.now())} grade-change spread")
        colName = "grade_abs_three_month"
        grade_diff_df.join(result, "cust_id") \
            .withColumnRenamed("grade_diff", colName) \
            .foreachPartition(lambda x: write_hbase1(x, [colName], hbase))

        print(f"{str(dt.now())} number of grade changes")
        colName = "grade_change_times"
        crm_cust_log.join(result, "cust_id") \
            .groupBy("cust_id").agg(f.count("cust_id").alias(colName)) \
            .foreachPartition(lambda x: write_hbase1(x, [colName], hbase))

        print(f"{str(dt.now())} grade 90 days ago")
        colName = "grade_last_three_month"
        crm_cust_log.groupBy("cust_id") \
            .agg(f.min("audit_date").alias("audit_date")) \
            .join(crm_cust_log.join(result, "cust_id"), ["cust_id", "audit_date"]) \
            .withColumnRenamed("change_frm", colName) \
            .foreachPartition(lambda x: write_hbase1(x, [colName], hbase))
    except Exception:
        tb.print_exc()

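# `box_plots_filter_udf` and `grade_diff_udf` are defined elsewhere. From the
# usage above, the former looks like a box-plot (IQR) outlier test on the
# grade-change spread whose flag is >= 0 only for outliers; the exact return
# convention is an assumption. A sketch:
from pyspark.sql.types import IntegerType

def _box_plots_filter(value, p25, p75):
    iqr = p75 - p25
    lower, upper = p25 - 1.5 * iqr, p75 + 1.5 * iqr
    if value < lower:
        return 0   # assumed: abnormally low
    if value > upper:
        return 1   # assumed: abnormally high
    return -1      # inside the whiskers: not abnormal

box_plots_filter_udf_sketch = f.udf(_box_plots_filter, IntegerType())
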
def get_around_class_except():
    try:
        print(f"{str(dt.now())} anomalies in the item mix a retailer orders")

        co_cust = get_valid_co_cust(spark).select("cust_id")
        co_co_line = get_co_co_line(spark, scope=[1, 1], filter="month") \
            .select("cust_id", "item_id", "qty_ord")
        plm_item = get_plm_item(spark).select("item_id", "item_name")

        # 1. quantity of each item per retailer
        cust_item_sum = co_co_line.join(plm_item, "item_id") \
            .groupBy("cust_id", "item_name") \
            .agg(f.sum("qty_ord").alias("cust_item_sum"))

        # 2. each retailer's top-3 items by order quantity
        win = Window.partitionBy("cust_id").orderBy(col("cust_item_sum").desc())
        rank3 = cust_item_sum.withColumn("rank", f.row_number().over(win)) \
            .where(col("rank") <= 3) \
            .groupBy("cust_id") \
            .agg(f.collect_list("item_name").alias("items"))

        win = Window.partitionBy("cust_id1").orderBy(col("one_km_item_sum").desc())

        # cust_id0 are the retailers around each retailer cust_id1
        around_cust = get_around_cust(spark, 1).select("cust_id1", "cust_id0")
        """
        retailer cust_id1's neighborhood contains the retailers cust_id0
        1. first join: find the retailers cust_id0 within 1 km of each cust_id1
        2. second join: the per-item quantities ordered by those cust_id0
           (cust_id0 against cust_item_sum's cust_id)
        3. aggregate by cust_id1 and item_name for per-item quantities within 1 km
        """
        # 3. top-3 items ordered within 1 km of each retailer
        one_km_rank3 = around_cust.join(cust_item_sum, col("cust_id0") == col("cust_id")) \
            .select("cust_id1", "item_name", "cust_item_sum") \
            .groupBy("cust_id1", "item_name") \
            .agg(f.sum("cust_item_sum").alias("one_km_item_sum")) \
            .withColumn("rank", f.row_number().over(win)) \
            .where(col("rank") <= 3) \
            .groupBy("cust_id1") \
            .agg(f.collect_list("item_name").alias("one_km_items"))

        colName = ["regulation_abno", "ciga_top3_last_month", "ciga_top3_km"]
        # 4. intersect the two top-3 lists; an empty intersection is abnormal
        rank3.join(one_km_rank3, col("cust_id") == col("cust_id1")) \
            .where(f.size(f.array_intersect(col("items"), col("one_km_items"))) == 0) \
            .withColumn(colName[0], f.lit(1)) \
            .withColumnRenamed("items", colName[1]) \
            .withColumnRenamed("one_km_items", colName[2]) \
            .join(co_cust, "cust_id") \
            .foreachPartition(lambda x: write_hbase1(x, colName, hbase))
    except Exception:
        tb.print_exc()

def get_vfr_index():
    try:
        print(f"{str(dt.now())} foot-traffic index around each retailer")

        # retailers with foot-traffic data
        vfr = get_around_vfr(spark)
        vfr.cache()

        # retailers
        co_cust = get_co_cust(spark).select("cust_id")
        # retailers with coordinates
        cust_lng_lat = get_cust_lng_lat(spark) \
            .select("city", "cust_id", "lng", "lat") \
            .join(co_cust, "cust_id")
        cust_lng_lat.cache()

        # retailers without foot-traffic data nearby
        not_df = cust_lng_lat.select("cust_id") \
            .exceptAll(vfr.select("cust_id")) \
            .join(cust_lng_lat, "cust_id")
        exist_df = vfr.join(cust_lng_lat, ["city", "cust_id"])

        if not_df.count() > 0:
            # fill the gaps with KNN
            fill_df = fillWithKNN(exist_df.toPandas(), not_df.toPandas(), "avg_vfr")
            all_df = spark.createDataFrame(fill_df) \
                .unionByName(exist_df)
        else:
            all_df = exist_df

        # truncation threshold: mean + 3 std per city
        threshold = all_df.groupBy("city") \
            .agg((f.mean("avg_vfr") + 3 * f.stddev_pop("avg_vfr")).alias("threshold"))
        truncate_df = all_df.join(threshold, "city") \
            .withColumn("avg_vfr",
                        f.when(col("avg_vfr") > col("threshold"), col("threshold"))
                         .otherwise(col("avg_vfr")))

        log_df = truncate_df.withColumn("log", f.log(col("avg_vfr") + 1))
        log_max = log_df.groupBy("city").agg(f.max("log").alias("log_max"))

        colName = "people_count"
        log_df.join(log_max, "city") \
            .withColumn(colName, col("log") / col("log_max") * 5) \
            .foreachPartition(lambda x: write_hbase1(x, [colName], hbase))

        vfr.unpersist()
        cust_lng_lat.unpersist()
    except Exception:
        tb.print_exc()

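# `fillWithKNN` is defined elsewhere; from the call above it takes a pandas
# frame with known avg_vfr values, a frame of retailers missing them, and the
# target column name, and must return the second frame with the target filled
# in (same schema, so unionByName works). A minimal sketch, assuming
# scikit-learn and lng/lat as the feature space:
from sklearn.neighbors import KNeighborsRegressor

def fillWithKNN_sketch(exist_pdf, not_pdf, target, k=5):
    # learn the target from the coordinates of retailers that have it
    model = KNeighborsRegressor(n_neighbors=k)
    model.fit(exist_pdf[["lng", "lat"]], exist_pdf[target])
    # predict it for the retailers that do not
    not_pdf = not_pdf.copy()
    not_pdf[target] = model.predict(not_pdf[["lng", "lat"]])
    return not_pdf
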
def card_passId():
    try:
        co_cust = get_co_cust(spark).select("cust_id", "identity_card_id")
        co_debit_acc = get_co_debit_acc(spark).select("cust_id", "pass_id")

        colName = "license_not_match"
        result = co_cust.join(co_debit_acc, "cust_id") \
            .withColumn(colName, isMatch(col("identity_card_id"), col("pass_id")))
        result.foreachPartition(lambda x: write_hbase1(x, [colName], hbase))
    except Exception:
        tb.print_exc()

def get_lng_lat():
    co_cust = get_valid_co_cust(spark).select("cust_id")

    # ----- coordinates from crm_cust
    print(f"{str(dt.now())} coordinates from crm_cust")
    try:
        crm_cust = get_crm_cust(spark).select("cust_id", "longitude", "latitude")
        co_cust.join(crm_cust, "cust_id") \
            .foreachPartition(lambda x: write_hbase1(x, ["longitude", "latitude"], hbase))
    except Exception:
        tb.print_exc()

    # ----- coordinates crawled from the web
    print(f"{str(dt.now())} crawled coordinates")
    try:
        cust_lng_lat = get_cust_lng_lat(spark).select("cust_id", "lng", "lat")
        co_cust.join(cust_lng_lat, "cust_id") \
            .withColumnRenamed("lng", "longitude") \
            .withColumnRenamed("lat", "latitude") \
            .foreachPartition(lambda x: write_hbase1(x, ["longitude", "latitude"], hbase))
    except Exception:
        tb.print_exc()

def get_city_county():
    print(f"{str(dt.now())} city county abcode")
    try:
        co_cust = get_valid_co_cust(spark).select("cust_id", "com_id", "sale_center_id")
        area_code = get_area(spark)
        co_cust.join(area_code, ["com_id", "sale_center_id"]) \
            .withColumnRenamed("城市", "city") \
            .withColumnRenamed("区", "county") \
            .foreachPartition(lambda x: write_hbase1(x, ["sale_center_id", "city", "county"], hbase))
    except Exception:
        tb.print_exc()

def get_cust_info():
    # retailer profile table co_cust
    co_cust_cols = ["cust_id", "cust_name", "cust_seg", "status", "pay_type", "license_code",
                    "manager", "identity_card_id", "order_tel", "inv_type", "order_way",
                    "periods", "busi_addr", "work_port", "base_type", "sale_scope",
                    "scope", "com_chara", "is_tor_tax", "is_sale_large", "is_rail_cust",
                    "area_type", "is_sefl_cust", "is_func_cust", "sale_center_id",
                    "sale_dept_id", "slsman_id", "slsman_mobile"]
    # columns renamed to match the HBase column names
    co_cust_renamed = {"is_tor_tax": "tor_tax",
                       "is_sale_large": "sale_large",
                       "is_rail_cust": "rail_cust",
                       "is_sefl_cust": "sefl_cust",
                       "is_func_cust": "func_cust",
                       "cust_seg": "grade"}

    # retailer profile table crm_cust
    crm_cust_cols = ["cust_id", "longitude", "latitude", "is_multiple_shop", "org_model",
                     "is_night_shop", "busi_time_type", "consumer_group", "consumer_attr",
                     "busi_type", "compliance_grade", "rim_type", "info_terminal"]
    crm_cust_renamed = {"is_multiple_shop": "multiple_shop",
                        "is_night_shop": "night_shop"}

    # delivery address
    ldm_cust = spark.sql(
        "select cust_id,dist_addr from DB2_DB2INST1_LDM_CUST where dt=(select max(dt) from DB2_DB2INST1_LDM_CUST)")
    # whether the retailer belongs to a delivery point
    ldm_cust_dist = spark.sql(
        "select cust_id,is_in_point from DB2_DB2INST1_LDM_CUST_DIST where dt=(select max(dt) from DB2_DB2INST1_LDM_CUST_DIST)")

    print(f"{str(dt.now())} co_cust/crm_cust retailer profiles")
    try:
        co_cust = spark.sql(
            "select * from DB2_DB2INST1_CO_CUST where dt=(select max(dt) from DB2_DB2INST1_CO_CUST)") \
            .where(col("com_id").isin(cities)).select(co_cust_cols)
        # rename columns
        for key in co_cust_renamed.keys():
            co_cust = co_cust.withColumnRenamed(key, co_cust_renamed[key])

        crm_cust = get_crm_cust(spark).select(crm_cust_cols)
        # rename columns
        for key in crm_cust_renamed.keys():
            crm_cust = crm_cust.withColumnRenamed(key, crm_cust_renamed[key])

        cust_info = co_cust.join(crm_cust, "cust_id", "left") \
            .join(ldm_cust, "cust_id", "left") \
            .join(ldm_cust_dist, "cust_id", "left")

        cols = cust_info.columns
        cust_info.foreachPartition(lambda x: write_hbase1(x, cols, hbase))
    except Exception:
        tb.print_exc()

def get_city_county():
    # ----- city and county
    print(f"{str(dt.now())} city county abcode")
    try:
        co_cust = get_co_cust(spark).select("cust_id", "com_id", "sale_center_id")
        area_code = get_area(spark).groupBy("com_id", "city", "sale_center_id") \
            .agg(f.collect_list("county").alias("county"))
        co_cust.join(area_code, ["com_id", "sale_center_id"]) \
            .foreachPartition(lambda x: write_hbase1(x, ["city", "county"], hbase))
    except Exception:
        tb.print_exc()

def city_reality_supply():
    # city-wide planned allocation per item last week
    # city-wide actual allocation per item last week
    # city-wide remaining allocation per item last week
    try:
        print(f"{str(dt.now())} city-wide planned/actual/remaining allocation per item last week")

        plm_item = get_plm_item(spark).select("item_id", "item_name")

        area = get_area(spark)
        # com_id -> city mapping
        city = area.dropDuplicates(["com_id"]).select("com_id", "city")

        # last week's window; the commented block is the original dynamic
        # version, the active query pins a hard-coded week
        # cust_item_spw = spark.sql(
        #     "select com_id,item_id,qty_plan,qty_remain,qty_allocco,begin_date,end_date from DB2_DB2INST1_SGP_CUST_ITEM_SPW") \
        #     .where(col("com_id").isin(cities)) \
        #     .withColumn("begin_date", f.to_date(col("begin_date"), "yyyyMMdd")) \
        #     .withColumn("end_date", f.to_date(col("end_date"), "yyyyMMdd")) \
        #     .withColumn("last_mon", f.date_sub(f.date_trunc("week", f.current_date()), 7)) \
        #     .withColumn("last_sun", f.date_add(col("last_mon"), 6)) \
        #     .where((col("begin_date") == col("last_mon")) & (col("end_date") == col("last_sun")))
        cust_item_spw = spark.sql(
            "select com_id,item_id,qty_plan,qty_remain,qty_allocco,begin_date,end_date from DB2_DB2INST1_SGP_CUST_ITEM_SPW") \
            .where(col("com_id").isin(cities)) \
            .where((col("begin_date") == "20190708") & (col("end_date") <= "20190714"))

        # names of the computed metric columns
        cols = [
            "gauge_week_planned_volume",
            "gauge_week_reality_volume",
            "gauge_week_residue_volume"
        ]
        result = cust_item_spw.groupBy("com_id", "item_id") \
            .agg(f.sum("qty_plan").alias(cols[0]),
                 f.sum("qty_allocco").alias(cols[1]),
                 f.sum("qty_remain").alias(cols[2]))

        columns = ["gauge_id", "gauge_name", "city", "gears_data_marker"] + cols
        result.withColumn("row", f.concat_ws("_", col("com_id"), col("item_id"))) \
            .withColumn("gears_data_marker", f.lit("0")) \
            .join(plm_item, "item_id") \
            .join(city, "com_id") \
            .withColumnRenamed("item_id", "gauge_id") \
            .withColumnRenamed("item_name", "gauge_name") \
            .foreachPartition(lambda x: write_hbase1(x, columns, hbase))
    except Exception:
        tb.print_exc()

def get_gdp_index():
    try:
        print(f"{str(dt.now())} gdp index")
        area_code = get_area(spark).select("sale_center_id", "county")

        gdp_sum = get_city_info(spark).select("city", "county", "gdp", "mtime") \
            .where(col("mtime") == f"{dt.now().year - 1}-12-01") \
            .join(area_code, "county") \
            .groupBy("city", "sale_center_id") \
            .agg(f.sum("gdp").alias("gdp_sum"))

        max_df = gdp_sum.groupBy("city").agg(f.max("gdp_sum").alias("max"))

        colName = "GDP"
        gdp_sum.join(max_df, "city") \
            .withColumn(colName, col("gdp_sum") / col("max") * 5) \
            .foreachPartition(lambda x: write_hbase1(x, [colName], hbase))
    except Exception:
        tb.print_exc()

def get_high_cons_level():
    try:
        co_cust = get_valid_co_cust(spark).select("cust_id")
        print(f"{str(dt.now())} retailer high-priced cigarette share / consumption level")

        # consumption level
        consume_level_df = get_consume_level(spark).select("cust_id", "consume_level")

        # ----------------------- fetch co_co_line
        co_co_line = get_co_co_line(spark, scope=[1, 1], filter="month") \
            .select("cust_id", "qty_ord", "price", "com_id")

        # total quantity ordered per retailer
        item_num = co_co_line.groupBy("com_id", "cust_id").agg(f.sum("qty_ord").alias("item_num"))

        # quantity of high-priced cigarettes per retailer
        high_price_num = co_co_line.where(col("price") >= 500) \
            .groupBy("cust_id") \
            .agg(f.sum("qty_ord").alias("high_price_num"))

        # share of high-priced cigarettes per retailer
        high_price_ratio = item_num.join(high_price_num, "cust_id") \
            .withColumn("high_price_ratio", col("high_price_num") / col("item_num")) \
            .select("com_id", "cust_id", "high_price_ratio")

        high_consume_level = high_price_ratio.join(consume_level_df, "cust_id") \
            .withColumn("retail_high_cons", col("high_price_ratio") / col("consume_level"))

        cols = {
            "value": "retail_high_cons",
            "abnormal": "high_cons",
            "mean_plus_3std": "high_std_plus3",
            "mean_minus_3std": "high_std_minu3",
            "mean": "city_high_cons"
        }
        values = list(cols.values())
        is_except(high_consume_level, cols, ["com_id"]) \
            .join(co_cust, "cust_id") \
            .foreachPartition(lambda x: write_hbase1(x, values, hbase))
    except Exception:
        tb.print_exc()

def get_cust_index():
    types = {"catering_cons_count": "(.*餐厅.*)|(.*咖啡厅.*)|(.*茶艺馆.*)|(.*甜品.*)",
             "convenient_trans_count": "(.*地铁站.*)|(.*公交.*)|(.*机场.*)|(.*港口码头.*)",
             "shopping_cons_count": "(.*购物中心.*)|(.*普通商场.*)|(.*免税品店.*)|(.*市场.*)|(.*特色商业街.*)|(.*步行街.*)|(.*农贸市场.*)",
             "entertainment_count": "(.*运动场馆.*)|(.*娱乐场所.*)|(.*休闲场所.*)"}
    try:
        # coordinates of restaurants, transport, malls, entertainment venues, etc.
        coordinate = get_coordinate(spark).select("cityname", "lng", "lat", "types")
        coordinate.cache()

        # retailer coordinates
        co_cust = get_co_cust(spark).select("cust_id")
        cust_lng_lat = get_cust_lng_lat(spark).select("cust_id", "city", "lng", "lat") \
            .join(co_cust, "cust_id")

        # the 1 km longitude and latitude window around each retailer
        cust_lng_lat0 = cust_lng_lat.withColumn("scope", f.lit(1)) \
            .withColumn("lng_l", lng_l(col("lng"), col("lat"), col("scope"))) \
            .withColumn("lng_r", lng_r(col("lng"), col("lat"), col("scope"))) \
            .withColumn("lat_d", lat_d(col("lat"), col("scope"))) \
            .withColumn("lat_u", lat_u(col("lat"), col("scope")))
        cust_lng_lat0.cache()

        for y in range(len(types)):
            try:
                colName = list(types.keys())[y]
                regex = types[colName]
                print(f"{str(dt.now())} {colName} index")
                result = poi_index(spark, cust_lng_lat0, coordinate, regex, is_fill=True) \
                    .withColumnRenamed("poi_index", colName)
                result.foreachPartition(lambda x: write_hbase1(x, [colName], hbase))
            except Exception:
                tb.print_exc()

        coordinate.unpersist()
        cust_lng_lat0.unpersist()
    except Exception:
        tb.print_exc()

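# `lng_l` / `lng_r` / `lat_d` / `lat_u` are UDFs defined elsewhere that turn a
# radius in kilometres into a bounding box around a point. A sketch of the
# usual degree-offset arithmetic (one degree of latitude is roughly 111 km;
# a degree of longitude shrinks by cos(latitude)):
import math
from pyspark.sql.types import DoubleType

lng_l_sketch = f.udf(lambda lng, lat, km: lng - km / (111.0 * math.cos(math.radians(lat))), DoubleType())
lng_r_sketch = f.udf(lambda lng, lat, km: lng + km / (111.0 * math.cos(math.radians(lat))), DoubleType())
lat_d_sketch = f.udf(lambda lat, km: lat - km / 111.0, DoubleType())
lat_u_sketch = f.udf(lambda lat, km: lat + km / 111.0, DoubleType())
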
def recommend_supply_value():
    # recommended allocation volume
    try:
        print(f"{str(dt.now())} recommended allocation volume")
        spark.conf.set("spark.sql.execution.arrow.enabled", "true")

        co_co_line = get_co_co_line(spark, scope=[1, 365]) \
            .where(col("com_id") == "011114302") \
            .select("co_num", "line_num", "cust_id", "item_id", "sale_center_id",
                    "qty_need", "qty_ord", "qty_rsn", "price", "amt", "born_date") \
            .withColumn("born_date", col("born_date").cast("string"))
        pd_df = co_co_line.toPandas()

        """
        predict() returns one forecast column keyed by date, e.g.:
        item_id,2019-06-03 00:00:00
        1130309,51
        1130316,26
        """
        sub_df = predict(pd_df, 1)
        df = spark.createDataFrame(sub_df)

        value = df.columns[1]
        result = df.withColumnRenamed(value, "gauge_advise_volume") \
            .withColumn("date", f.lit(value))

        plm_item = get_plm_item(spark).select("item_id", "item_name")
        columns = [
            "com_id", "city", "gauge_id", "gauge_name",
            "gauge_advise_volume", "date", "gears_data_marker"
        ]
        result.join(plm_item, "item_id") \
            .withColumn("com_id", f.lit("011114302")) \
            .withColumn("city", f.lit("株洲市")) \
            .withColumn("row", f.concat_ws("_", col("com_id"), col("item_id"))) \
            .withColumn("gears_data_marker", f.lit("0")) \
            .withColumnRenamed("item_id", "gauge_id") \
            .withColumnRenamed("item_name", "gauge_name") \
            .foreachPartition(lambda x: write_hbase1(x, columns, hbase))
    except Exception:
        tb.print_exc()

def get_item_rating():
    # regional preference distribution per item
    hbase = {"table": als_table + "_TEMP", "families": ["0"], "row": "row"}
    try:
        # item id and name
        plm_item = get_plm_item(spark).select("item_id", "item_name")

        area = get_area(spark)
        # com_id -> city mapping
        city = area.dropDuplicates(["com_id"]).select("com_id", "city")
        # sale_center_id -> list of counties
        county = area.groupBy("sale_center_id") \
            .agg(f.collect_list("county").alias("county")) \
            .select("sale_center_id", "county")

        # com_id, sale_center_id, city, county, cust_id, cust_name, longitude, latitude
        co_cust = get_co_cust(spark).select("cust_id", "cust_name", "com_id", "sale_center_id") \
            .join(get_cust_lng_lat(spark), "cust_id") \
            .withColumnRenamed("lng", "longitude") \
            .withColumnRenamed("lat", "latitude") \
            .join(county, "sale_center_id") \
            .join(city, "com_id")

        print(f"{str(dt.now())} each retailer's rating for each item")
        columns = [
            "city", "sale_center_id", "county", "cust_id", "cust_name",
            "longitude", "latitude", "gauge_id", "gauge_name",
            "grade_data_marker", "gauge_grade"
        ]
        get_cigar_rating(spark).join(co_cust, "cust_id") \
            .join(plm_item, "item_id") \
            .withColumn("row", f.concat_ws("_", col("item_id"), col("cust_id"))) \
            .withColumn("grade_data_marker", f.lit("1")) \
            .withColumnRenamed("item_id", "gauge_id") \
            .withColumnRenamed("item_name", "gauge_name") \
            .withColumnRenamed("rating", "gauge_grade") \
            .foreachPartition(lambda x: write_hbase1(x, columns, hbase))
    except Exception:
        tb.print_exc()

def supply_analysis_county():
    # per-county order-fulfilment rate for each item
    plm_item = get_plm_item(spark).select("item_id", "item_name")

    area = get_area(spark)
    # sale_center_id -> list of counties
    county = area.groupBy("sale_center_id") \
        .agg(f.collect_list("county").alias("county")) \
        .select("sale_center_id", "county")

    co_co_line = get_co_co_line(spark, scope=[1, 1], filter="week") \
        .where(col("qty_rsn") > 0) \
        .select("sale_center_id", "item_id", "qty_ord", "qty_rsn", "cust_id")

    # filter only on grade (drop ZZ); do not filter on status
    co_cust = spark.sql(
        "select cust_id,com_id from DB2_DB2INST1_CO_CUST where dt=(select max(dt) from DB2_DB2INST1_CO_CUST) and cust_seg!='ZZ'") \
        .where(col("com_id").isin(cities)) \
        .select("cust_id")

    try:
        print(f"{str(dt.now())} per-county order-fulfilment rate for each item")
        colName = "county_gauge_reserve"
        reserve_ratio = co_co_line.join(co_cust, "cust_id") \
            .groupBy("sale_center_id", "item_id") \
            .agg(f.sum("qty_ord").alias("qty_ord"), f.sum("qty_rsn").alias("qty_rsn")) \
            .withColumn(colName, col("qty_ord") / col("qty_rsn"))

        columns = [
            "sale_center_id", "county", "gauge_id", "gauge_name",
            "gears_data_marker", colName
        ]
        reserve_ratio.join(county, "sale_center_id") \
            .join(plm_item, "item_id") \
            .withColumn("row", f.concat_ws("_", col("sale_center_id"), col("item_id"))) \
            .withColumn("gears_data_marker", f.lit("1")) \
            .withColumnRenamed("item_id", "gauge_id") \
            .withColumnRenamed("item_name", "gauge_name") \
            .foreachPartition(lambda x: write_hbase1(x, columns, hbase))
    except Exception:
        tb.print_exc()

def get_order_similar_cigar():
    # similar cigarettes ordered by clients
    try:
        print(f"{str(dt.now())} similar cigarettes ordered by clients")
        # origin is the item being looked up, nearest its most similar items
        similar_cigar = get_client_similar_cigar(spark)
        plm_item = get_plm_item(spark).select("item_id", "item_name")

        columns = [
            "gauge_id", "gauge_name", "gauge_client_like_ciga", "ciga_data_marker"
        ]
        json_udf = f.udf(lambda x, y, z: json.dumps(
            {
                "gaugeId": x,
                "gaugeName": y,
                "gaugeSimilarity": z
            }, ensure_ascii=False))

        df_map = similar_cigar.join(plm_item, col("origin") == col("item_id")) \
            .withColumnRenamed("item_id", "gauge_id") \
            .withColumnRenamed("item_name", "gauge_name") \
            .join(plm_item, col("other") == col("item_id")) \
            .withColumn("item_map", json_udf(col("item_id"), col("item_name"), col("distance")))

        # sort by similarity, descending
        map_list_sort_udf = f.udf(lambda l, k: sorted(
            l, key=lambda x: json.loads(x)[k], reverse=True))

        df_map.groupBy("gauge_id", "gauge_name") \
            .agg(f.collect_list(col("item_map")).alias("item_list")) \
            .withColumn("gauge_client_like_ciga",
                        map_list_sort_udf(col("item_list"), f.lit("gaugeSimilarity"))) \
            .withColumn("row", col("gauge_id")) \
            .withColumn("ciga_data_marker", f.lit("4")) \
            .select("row", "gauge_id", "gauge_name", "ciga_data_marker", "gauge_client_like_ciga") \
            .foreachPartition(lambda x: write_hbase1(x, columns, hbase))
    except Exception:
        tb.print_exc()

def get_area_ppl():
    print(f"{str(dt.now())} population counts")
    try:
        population = get_city_ppl(spark).select("区县", "总人口", "常住人口", "城镇人口", "农村人口")
        area_code = get_area(spark).where(col("city").rlike("株洲市|邵阳市|岳阳市")) \
            .select("city", "county", "sale_center_id")
        area_plt = population.join(area_code, col("county") == col("区县"))

        cols = [
            "total_number", "permanent_number", "urban_number", "country_number"
        ]
        # population of each sales center
        sale_center_ppl = area_plt.groupBy("city", "sale_center_id") \
            .agg(f.sum("总人口").alias(cols[0]),
                 f.sum("常住人口").alias(cols[1]),
                 f.sum("城镇人口").alias(cols[2]),
                 f.sum("农村人口").alias(cols[3]))

        sale_center_ppl.foreachPartition(lambda x: write_hbase1(x, cols, hbase))
    except Exception:
        tb.print_exc()

def get_avg_cons_level():
    try:
        co_cust = get_valid_co_cust(spark).select("cust_id")
        print(f"{str(dt.now())} retailer average carton price / consumption level")

        consume_level_df = get_consume_level(spark).select("cust_id", "consume_level")

        # ----------------------- fetch co_co_01
        co_co_01 = get_co_co_01(spark, scope=[1, 1], filter="month") \
            .select("com_id", "qty_sum", "amt_sum", "cust_id")

        # total order quantity and total order amount per retailer
        qty_amt_sum = co_co_01.groupBy("com_id", "cust_id") \
            .agg(f.sum("qty_sum").alias("order_sum"),
                 f.sum("amt_sum").alias("amt_sum"))

        # average carton price per retailer
        avg_price = qty_amt_sum.withColumn("avg_price", col("amt_sum") / col("order_sum")) \
            .select("com_id", "cust_id", "avg_price")

        avg_consume_level = avg_price.join(consume_level_df, "cust_id") \
            .withColumn("retail_price_cons", col("avg_price") / col("consume_level"))

        cols = {
            "value": "retail_price_cons",
            "abnormal": "price_cons",
            "mean_plus_3std": "price_std_plus3",
            "mean_minus_3std": "price_std_minu3",
            "mean": "city_price_cons"
        }
        values = list(cols.values())
        is_except(avg_consume_level, cols, ["com_id"]) \
            .join(co_cust, "cust_id") \
            .foreachPartition(lambda x: write_hbase1(x, values, hbase))
    except Exception:
        tb.print_exc()

def get_trans_busiArea_index():
    try:
        coordinate = get_coordinate(spark).select("lng", "lat", "types", "adcode", "cityname")
        area_code = get_area(spark).withColumnRenamed("区域编码", "adcode") \
            .select("adcode", "county", "sale_center_id")

        types = {
            "trans_index": "(.*地铁站.*)|(.*公交.*)|(.*机场.*)|(.*港口码头.*)",
            "area_index": "(.*餐厅.*)|(.*咖啡厅.*)|(.*茶艺馆.*)|(.*甜品店.*)|(.*购物中心.*)|(.*运动场馆.*)|(.*娱乐场所.*)|(.*休闲场所.*)",
        }
        area_coor = coordinate.join(area_code, "adcode")

        for colName in types.keys():
            print(f"{str(dt.now())} {colName}")
            regex = types[colName]
            try:
                # keep only the POI types matching this index
                count_df = area_coor.where(col("types").rlike(regex)) \
                    .groupBy("cityname", "sale_center_id") \
                    .count()
                # the maximum count per city
                max_df = count_df.groupBy("cityname").agg(f.max("count").alias("max"))

                count_df.join(max_df, "cityname") \
                    .withColumn(colName, f.round(col("count") / col("max") * 5, 4)) \
                    .foreachPartition(lambda x: write_hbase1(x, [colName], hbase))
            except Exception:
                tb.print_exc()
    except Exception:
        tb.print_exc()