def save_result(df, table_name, partitioning_columns=None, repartitioning_columns=None,
                write_mode='insert', spark=None, params=None):
    """Persist df to a Hive table as ORC.

    write_mode='save' creates the table via saveAsTable; write_mode='insert' appends
    via insertInto, first dropping any partitions that are about to be rewritten.
    """
    if params is None:
        params = dict()
    # Avoid mutable default arguments; empty lists mean "no (re)partitioning columns"
    partitioning_columns = partitioning_columns or []
    repartitioning_columns = repartitioning_columns or []
    table_name = spa_utils.rename(table_name, params)
    if isinstance(partitioning_columns, str):
        partitioning_columns = [partitioning_columns]
    save_mode = 'overwrite' if params.get('overwrite') == 1 else 'error'
    if write_mode == 'save':
        if len(partitioning_columns) > 0:
            # repartitioning_columns is expected to be non-empty in this branch
            df.repartition(*repartitioning_columns).write.mode(save_mode).partitionBy(
                partitioning_columns).format('orc').saveAsTable(table_name)
        else:
            df.write.mode(save_mode).format('orc').saveAsTable(table_name)
    elif write_mode == 'insert':
        if len(partitioning_columns) > 0:
            # Drop the partitions present in df so the insert effectively replaces them
            rows = df.select(partitioning_columns).distinct().collect()
            queries = []
            for r in rows:
                p_str = ','.join(
                    ["%s='%s'" % (k, r[k]) for k in partitioning_columns])
                queries.append("alter table %s drop if exists partition(%s)"
                               % (table_name, p_str))
            for q in queries:
                spark.sql(q)
            df.repartition(*repartitioning_columns).write.insertInto(
                table_name, overwrite=False)
        else:
            df.write.insertInto(table_name, overwrite=False)
    else:
        raise ValueError('mode "%s" not supported' % write_mode)
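# Example usage (illustrative only: the target table name below is hypothetical and
# not taken from this pipeline; it shows a partitioned insert driven by params):
#
#   save_result(df_sales_dtsku,
#               'app.app_pa_sales_baseline_example',
#               partitioning_columns=['dt'],
#               repartitioning_columns=['dt'],
#               write_mode='insert',
#               spark=spark,
#               params=params)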
# Read the start and end dates for this update window
update_start = params['update_start']
update_end = params['update_end']

# Load each upstream model's output and merge them
# Sales
df_sales_dtsku = spark.sql('''
    SELECT
        *
    FROM
        %s
    WHERE
        dt >= '%s'
        AND dt <= '%s'
    ''' % (spa_utils.rename('app.app_pa_sales_dtsku', params), update_start, update_end))
df_sales_dtsku.cache()

# Stock
df_stock = spark.sql('''
    SELECT
        sku_id,
        dt,
        out_of_stock_flag
    FROM
        %s
    WHERE
        dt >= '%s'
        AND dt <= '%s'
    ''' % (spa_utils.rename('app.app_pa_stock_dtsku', params), update_start, update_end))
df_stock = df_stock.withColumnRenamed('sku_id', 'item_sku_id')
df_stock.cache()
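# Illustrative sketch (an assumption, not part of the original script): if the "merge"
# mentioned above combines daily sales with stock status per SKU, a left join on
# item_sku_id and dt could be used, e.g.:
#
#   df_sales_stock = df_sales_dtsku.join(df_stock, on=['item_sku_id', 'dt'], how='left')
#
# df_sales_stock is a hypothetical name; the actual merge logic lives elsewhere in
# this script.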
if len(sys.argv) >= 4:
    params['write_mode'] = sys.argv[3]

# Sales baseline: use all historical data up to update_end
update_end = params['update_end']
update_start = params['update_origin']

# Load each upstream model's output and merge them
df_sales_dtcid3 = spark.sql('''
    SELECT
        *
    FROM
        %s
    WHERE
        dt = '%s'
    ''' % (spa_utils.rename('app.app_pa_sales_dtcid3', params), update_end))
df_sales_dtcid3 = df_sales_dtcid3.drop('dt')
df_sales_dtcid3.cache()

df_time = spark.sql('''
    SELECT
        *
    FROM
        %s
    WHERE
        dt >= '%s'
        AND dt <= '%s'
    ''' % (spa_utils.rename('app.app_pa_festival_features', params), update_start, update_end))
df_time = df_time.withColumnRenamed('dt', 'date')
df_time.cache()
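# Sketch (an assumption, not the project's implementation): the feature-engineering
# step below calls add_fourier_terms(df, period, column, order), which presumably
# appends Fourier seasonality features, i.e. sin/cos pairs of the given column at
# harmonics 1..order. A minimal stand-alone version could look like the function
# below; the name _fourier_terms_sketch and the generated column names are
# hypothetical and deliberately different so the real helper is not shadowed.
# It assumes pyspark.sql.functions is imported as F, as elsewhere in this script.
import math

def _fourier_terms_sketch(df, period, colname, order):
    for k in range(1, order + 1):
        angle = 2.0 * math.pi * k * F.col(colname) / period
        df = df.withColumn('%s_sin_%d' % (colname, k), F.sin(angle))
        df = df.withColumn('%s_cos_%d' % (colname, k), F.cos(angle))
    return df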
SCHEMA_OUTPUT_CID3 = StructType([
    StructField("date", sql_type.StringType()),
    StructField("item_third_cate_cd", sql_type.StringType()),
    StructField("final_baseline", sql_type.DoubleType())
])

# Load order model B1 features
df = spark.sql('''
    SELECT
        *
    FROM
        %s
    WHERE
        dt = '%s'
    ''' % (spa_utils.rename('app.app_pa_features_dtcid3', params), update_end))
df = df.drop('dt')
df.cache()

df = df.withColumn('log_synthetic_before_prefr_amount',
                   F.log(F.col('synthetic_before_prefr_amount') + 0.0001))
df = add_datediff(df, 'date', update_start)
df = add_fourier_terms(df, 53, 'week_of_year', 3)
df = add_fourier_terms(df, 7, 'day_of_week', 3)
# df = convert_boolean_to_int(df, ['newyear', 'springfestival', 'tombsweepingfestival', 'labourday', 'dragonboatfestival', 'midautumnfestival', 'nationalday', 'h1111mark', 'h618mark', 'h1212mark'])

# Convert the DataFrame to an RDD, group by item_third_cate_cd, and compute the
# sales baseline for each item_third_cate_cd
# result = df.rdd.map(lambda row: ((row['item_third_cate_cd']), row)).groupByKey().flatMap(calculate_baseline_cid3)

fi_df = spark.sql('''
SCHEMA_OUTPUT_SKU = StructType([
    StructField("dt", sql_type.StringType()),
    StructField("item_sku_id", sql_type.LongType()),
    StructField("final_baseline", sql_type.DoubleType()),
    StructField("uplift", sql_type.DoubleType()),
    StructField("uplift_rate", sql_type.DoubleType())
])

# Load order model B1 features
df_features = spark.sql('''
    SELECT
        *
    FROM
        %s
    WHERE
        dt <= '%s'
    ''' % (spa_utils.rename('app.app_pa_features_dtsku', params), update_end))
df_features.cache()

# Load the latest stock status
df_hierarchical_sku = spark.sql('''
    SELECT
        item_sku_id,
        start_date,
        sku_status_cd
    FROM
        %s
    WHERE
        dt = '%s'
    ''' % (spa_utils.rename('app.app_pa_hierarchical_sku', params), update_end))
df_hierarchical_sku = df_hierarchical_sku.filter(
    F.col('sku_status_cd') == '3000').drop('sku_status_cd')
df_hierarchical_sku.cache()

# For each SKU whose most recent status is off-shelf, exclude the data from that
# off-shelf time