def save_result(df,
                table_name,
                partitioning_columns=None,
                repartitioning_columns=None,
                write_mode='insert',
                spark=None,
                params=None):
    """Persist a DataFrame to a Hive table as ORC, either by creating the
    table ('save') or by inserting into an existing one ('insert')."""
    if params is None:
        params = dict()
    # Avoid mutable default arguments; accept a single column name or a list.
    partitioning_columns = partitioning_columns or []
    repartitioning_columns = repartitioning_columns or []
    if isinstance(partitioning_columns, str):
        partitioning_columns = [partitioning_columns]
    if isinstance(repartitioning_columns, str):
        repartitioning_columns = [repartitioning_columns]
    # `spa_utils.rename` maps the logical table name through `params` (project helper).
    table_name = spa_utils.rename(table_name, params)
    # Only overwrite an existing table when params explicitly asks for it.
    save_mode = 'overwrite' if params.get('overwrite') == 1 else 'error'
    if write_mode == 'save':
        if partitioning_columns:
            out = (df.repartition(*repartitioning_columns)
                   if repartitioning_columns else df)
            (out.write.mode(save_mode).partitionBy(partitioning_columns)
                .format('orc').saveAsTable(table_name))
        else:
            df.write.mode(save_mode).format('orc').saveAsTable(table_name)
    elif write_mode == 'insert':
        if partitioning_columns:
            # Drop the partitions that are about to be rewritten, then append,
            # so repeated runs do not duplicate rows.
            rows = df.select(partitioning_columns).distinct().collect()
            queries = []
            for r in rows:
                p_str = ','.join(
                    ["%s='%s'" % (k, r[k]) for k in partitioning_columns])
                queries.append("alter table %s drop if exists partition(%s)" %
                               (table_name, p_str))
            for q in queries:
                spark.sql(q)
            out = (df.repartition(*repartitioning_columns)
                   if repartitioning_columns else df)
            out.write.insertInto(table_name, overwrite=False)
        else:
            df.write.insertInto(table_name, overwrite=False)
    else:
        raise ValueError('mode "%s" not supported' % write_mode)
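# A minimal usage sketch (assumptions: `df_result`, the table name, and the
# partition column are hypothetical; `spark` and `params` come from the caller):
#
#   save_result(df_result,
#               'app.app_pa_result_dtsku',
#               partitioning_columns='dt',
#               repartitioning_columns='dt',
#               write_mode='insert',
#               spark=spark,
#               params=params)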
# Example 2
# Read the start date and end date
update_start = params['update_start']
update_end = params['update_end']

# Read each model's output and merge them
# Sales
df_sales_dtsku = spark.sql('''
SELECT
    *
FROM
    %s
WHERE
    dt >= '%s'
AND dt <= '%s'
''' % (spa_utils.rename('app.app_pa_sales_dtsku', params), update_start, update_end))
df_sales_dtsku.cache()

# Stock
df_stock = spark.sql('''
SELECT
    sku_id, dt, out_of_stock_flag
FROM
    %s
WHERE
    dt >= '%s'
AND dt <= '%s'
''' % (spa_utils.rename('app.app_pa_stock_dtsku', params), update_start, update_end))
df_stock = df_stock\
    .withColumnRenamed('sku_id', 'item_sku_id')
df_stock.cache()
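# A minimal sketch of the merge mentioned above (an assumption: the actual
# join keys and join type are not shown in this snippet):
#
#   df_merged = df_sales_dtsku.join(df_stock, on=['item_sku_id', 'dt'], how='left')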
# Optionally override the write mode from the third command-line argument
# (requires `import sys` at the top of the script).
if len(sys.argv) >= 4:
    params['write_mode'] = sys.argv[3]

# Sales baseline: all historical data up to update_end
update_end = params['update_end']
update_start = params['update_origin']

# Read each model's output and merge them
df_sales_dtcid3 = spark.sql('''
SELECT
    *
FROM
    %s
WHERE
    dt = '%s'
''' % (spa_utils.rename('app.app_pa_sales_dtcid3', params), update_end))
df_sales_dtcid3 = df_sales_dtcid3.drop('dt')
df_sales_dtcid3.cache()

df_time = spark.sql('''
SELECT
    *
FROM
    %s
WHERE
    dt >= '%s'
AND dt <= '%s'
''' % (spa_utils.rename('app.app_pa_festival_features',
                        params), update_start, update_end))
df_time = df_time.withColumnRenamed('dt', 'date')
df_time.cache()
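# A minimal sketch of the merge mentioned above (an assumption: after the
# rename, df_time joins the category-level sales frame on 'date'):
#
#   df_cid3 = df_sales_dtcid3.join(df_time, on='date', how='left')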
# Example 4

from pyspark.sql.types import StructType, StructField
from pyspark.sql import types as sql_type

SCHEMA_OUTPUT_CID3 = StructType([
    StructField("date", sql_type.StringType()),
    StructField("item_third_cate_cd", sql_type.StringType()),
    StructField("final_baseline", sql_type.DoubleType())
])

# Read order model B1
df = spark.sql('''
SELECT
    *
FROM %s
WHERE
    dt = '%s'
''' % (spa_utils.rename('app.app_pa_features_dtcid3', params), update_end))
df = df.drop('dt')
df.cache()

# Log-transform the amount; the +0.0001 offset keeps log() defined at zero.
df = df.withColumn('log_synthetic_before_prefr_amount',
                   F.log(F.col('synthetic_before_prefr_amount') + 0.0001))
df = add_datediff(df, 'date', update_start)
df = add_fourier_terms(df, 53, 'week_of_year', 3)
df = add_fourier_terms(df, 7, 'day_of_week', 3)
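# `add_datediff` and `add_fourier_terms` are project helpers not shown in
# this snippet. A minimal sketch of what they plausibly do, under assumed
# signatures (hypothetical stand-ins, not the project's implementation):

import math
from pyspark.sql import functions as F

def add_datediff_sketch(df, date_col, origin):
    # Day offset of `date_col` relative to a fixed origin date string.
    return df.withColumn('datediff', F.datediff(F.col(date_col), F.lit(origin)))

def add_fourier_terms_sketch(df, period, time_col, n_terms):
    # Encode cyclic seasonality as Fourier pairs sin(2*pi*k*t/period), cos(...).
    for k in range(1, n_terms + 1):
        angle = 2 * math.pi * k * F.col(time_col) / period
        df = (df.withColumn('%s_sin_%d' % (time_col, k), F.sin(angle))
                .withColumn('%s_cos_%d' % (time_col, k), F.cos(angle)))
    return df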

#df = convert_boolean_to_int(df, ['newyear', 'springfestival', 'tombsweepingfestival', 'labourday', 'dragonboatfestival', 'midautumnfestival', 'nationalday', 'h1111mark', 'h618mark', 'h1212mark'])

# Convert the DataFrame to an RDD, group it by item_third_cate_cd, and
# compute the sales baseline for each item_third_cate_cd separately

#result = df.rdd.map(lambda row: ((row['item_third_cate_cd']), row)).groupByKey().flatMap(calculate_baseline_cid3)
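# If the RDD pipeline above were enabled, its output could be brought back
# into a DataFrame using the schema declared earlier (a sketch;
# `calculate_baseline_cid3` is defined elsewhere in the project):
#
#   df_baseline_cid3 = spark.createDataFrame(result, SCHEMA_OUTPUT_CID3)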
SCHEMA_OUTPUT_SKU = StructType([
    StructField("dt", sql_type.StringType()),
    StructField("item_sku_id", sql_type.LongType()),
    StructField("final_baseline", sql_type.DoubleType()),
    StructField("uplift", sql_type.DoubleType()),
    StructField("uplift_rate", sql_type.DoubleType())
])

# Read order model B1
df_features = spark.sql('''
SELECT
    *
FROM %s
WHERE
    dt <= '%s'
''' % (spa_utils.rename('app.app_pa_features_dtsku', params), update_end))
df_features.cache()

# Read the latest stock status
df_hierarchical_sku = spark.sql('''
SELECT
    item_sku_id, start_date, sku_status_cd
FROM %s
WHERE
    dt = '%s'
''' % (spa_utils.rename('app.app_pa_hierarchical_sku', params), update_end))
df_hierarchical_sku = df_hierarchical_sku.filter(
    F.col('sku_status_cd') == '3000').drop('sku_status_cd')
df_hierarchical_sku.cache()

# For each SKU whose most recent status is "delisted" (off-shelf), drop the
# data dated at the delisting time
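# A minimal sketch of that filter (assumptions: `start_date` marks when a
# status took effect, status code '3000' means "delisted", and the frame
# names below are hypothetical):

from pyspark.sql import Window

# Most recent delisting record per SKU.
w = Window.partitionBy('item_sku_id').orderBy(F.col('start_date').desc())
df_last_delisted = (df_hierarchical_sku
                    .withColumn('rn', F.row_number().over(w))
                    .filter(F.col('rn') == 1)
                    .select('item_sku_id', 'start_date'))

# Keep feature rows dated strictly before each SKU's latest delisting date.
df_features_clean = (df_features
                     .join(df_last_delisted, on='item_sku_id', how='left')
                     .filter(F.col('start_date').isNull() |
                             (F.col('dt') < F.col('start_date')))
                     .drop('start_date'))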