# result = df.rdd.map(lambda row: ((row['item_third_cate_cd']), row)).groupByKey().flatMap(calculate_baseline_cid3)
fi_df = spark.sql(''' SELECT * FROM %s ''' % (spa_utils.rename('app.app_pa_festival_information', params)))
fi_pd = fi_df.toPandas()

# Group rows by third-level category (cid3) and compute the baseline per category
result = df.rdd \
    .map(lambda row: (row['item_third_cate_cd'], row)) \
    .groupByKey() \
    .flatMap(lambda row: calculate_baseline_cid3(row, 'self', fi_pd))

# Save the results as a Spark DataFrame
result_df = spark.createDataFrame(result.map(format_result_cid3), schema=SCHEMA_OUTPUT_CID3)
result_df = result_df.na.drop()
result_df = result_df \
    .withColumn('dt', F.lit(update_end)) \
    .select('date', 'item_third_cate_cd', 'final_baseline', 'dt')

spark.sql("set hive.exec.dynamic.partition=true")
spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
logger.info('Saving results...')
logger.info('inserting app.app_pa_baseline_cid3...')
spa_utils.save_result(result_df, 'app.app_pa_baseline_cid3',
                      partitioning_columns=['dt'],
                      write_mode=params['write_mode'],
                      spark=spark, params=params)
logger.info('insert table done')
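# --- Illustrative sketch (not part of the pipeline above) ---------------------
# SCHEMA_OUTPUT_CID3 and format_result_cid3 are defined elsewhere in this
# project; the sketch below only shows one plausible shape consistent with the
# columns selected above ('date', 'item_third_cate_cd', 'final_baseline').
# Names suffixed with _sketch are hypothetical.
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

SCHEMA_OUTPUT_CID3_SKETCH = StructType([
    StructField('date', StringType(), True),
    StructField('item_third_cate_cd', StringType(), True),
    StructField('final_baseline', DoubleType(), True),
])

def format_result_cid3_sketch(record):
    # Assumes calculate_baseline_cid3 yields (date, cid3, baseline) tuples;
    # cast each field so it matches the schema above.
    date, cid3, baseline = record
    return str(date), str(cid3), float(baseline)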
''' % (spa_utils.rename('app.app_pa_sales_duration', params)))

# Clip each sku's start date to the update window
df_sales_dtsku_start_end_date = df_sales_dtsku_start_end_date \
    .withColumn('start_date',
                F.when(F.col('start_date') >= update_start, F.col('start_date'))
                .otherwise(update_start))
df_sales_dtsku_start_end_date.cache()

# Fill in the calendar/date features over each sku's active date span
df_sku_duration_time = df_sales_dtsku_start_end_date \
    .join(df_time, df_time['dt'] >= df_sales_dtsku_start_end_date['start_date'], 'left') \
    .drop('start_date') \
    .filter(F.col('dt').isNotNull())
df_sku_duration_time.cache()

# Merge sales and stock features onto the (dt, sku) grid
df_complete = df_sku_duration_time \
    .join(df_sales_dtsku, ['dt', 'item_sku_id'], 'left') \
    .join(df_stock, ['dt', 'item_sku_id'], 'left') \
    .fillna(0)

df_complete = df_complete.select([
    'item_sku_id',
    # festival and calendar features
    'newyear', 'springfestival', 'tombsweepingfestival', 'labourday',
    'dragonboatfestival', 'midautumnfestival', 'nationalday',
    'h1111mark', 'h618mark', 'h1212mark',
    'week_of_year', 'day_of_year', 'day_of_week',
    # promotion flags
    'free_gift_flag', 'ghost_offer_flag', 'dq_and_jq_pay_flag', 'jq_pay_flag',
    'dq_pay_flag', 'full_minus_offer_flag', 'suit_offer_flag', 'sku_offer_flag',
    'non_promo_flag',
    # sales amounts
    'sale_qtty', 'after_prefr_amount', 'before_prefr_amount',
    'synthetic_before_prefr_amount',
    # participation and discount rates
    'participation_rate_full_minus_and_suit_offer',
    'participation_rate_dq_and_jq_pay',
    'sku_offer_discount_rate', 'full_minus_offer_discount_rate',
    'suit_offer_discount_rate', 'ghost_offer_discount_rate',
    'dq_and_jq_pay_discount_rate', 'jq_pay_discount_rate',
    'dq_pay_discount_rate', 'free_gift_discount_rate',
    # stock flag and partition column
    'out_of_stock_flag',
    'dt'])

spark.sql("set hive.exec.dynamic.partition=true")
spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
logger.info('inserting app.app_pa_features_dtsku...')
spa_utils.save_result(df_complete, 'app.app_pa_features_dtsku',
                      partitioning_columns=['dt'],
                      write_mode=params['write_mode'],
                      spark=spark, params=params)
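# --- Illustrative sketch (not part of the pipeline above) ---------------------
# A toy example of the span-expansion join used above: joining a per-sku
# start_date against a calendar table on dt >= start_date turns one row per
# sku into one row per (sku, active day). Names prefixed with toy_ are
# hypothetical; only the join pattern mirrors the real code.
def _span_expansion_example(spark):
    toy_calendar = spark.createDataFrame(
        [('2020-01-01',), ('2020-01-02',), ('2020-01-03',)], ['dt'])
    toy_skus = spark.createDataFrame(
        [('sku_a', '2020-01-02'), ('sku_b', '2020-01-01')],
        ['item_sku_id', 'start_date'])
    # sku_a expands to 2 rows (01-02, 01-03); sku_b to 3 rows (01-01 .. 01-03)
    return toy_skus \
        .join(toy_calendar, toy_calendar['dt'] >= toy_skus['start_date'], 'left') \
        .drop('start_date')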
# Read the traffic table
traffic_df = spa_utils.read_table('app_cmo_ol_client_sku_3_to_bjmart_di',
                                  start=update_start, end=update_end,
                                  spark=spark, params=params,
                                  sep='\t', header=True, schema=traffic_schema)

# app.app_pa_traffic_dtsku
# SKU traffic model, at (dt, sku) granularity
df_sku_traffic = traffic_df \
    .groupBy(['item_sku_id', 'dt']) \
    .agg(
        F.sum('pv').alias('pv'),
        F.sum('uv').alias('uv')
    )
df_sku_traffic = df_sku_traffic.select(['item_sku_id', 'pv', 'uv', 'dt'])

spark.sql("set hive.exec.dynamic.partition=true")
spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
logger.info('inserting app.app_pa_traffic_dtsku...')
spa_utils.save_result(df_sku_traffic, 'app.app_pa_traffic_dtsku',
                      partitioning_columns=['dt'],
                      write_mode=params['write_mode'],
                      spark=spark, params=params)
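# --- Illustrative sketch (not part of the pipeline above) ---------------------
# traffic_schema is defined elsewhere in this project; the sketch below shows
# one plausible shape consistent with the columns used above
# (item_sku_id, pv, uv, dt). The _sketch suffix marks it as hypothetical;
# the real schema may carry additional columns or different types.
from pyspark.sql.types import StructType, StructField, StringType, LongType

traffic_schema_sketch = StructType([
    StructField('item_sku_id', StringType(), True),
    StructField('pv', LongType(), True),    # page views
    StructField('uv', LongType(), True),    # unique visitors
    StructField('dt', StringType(), True),  # partition date
])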