def write_aggr2ps(start_dt, name, delta, id_name="epk_id"):
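    """Build features for consecutive `delta`-day windows starting at `start_dt`
    (up to 2021-12-01) and append each window to table `name` via save2ps,
    partitioned by report_dt."""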
    end_dt = start_dt + timedelta(days=delta - 1)
    for _ in tqdm(range((date(2021, 12, 1) - start_dt).days // delta)):
        aggr_days = create_aggr(start_dt, end_dt)
        pv_aggr_days = create_pivot_table(aggr_days)
        pv_aggr_days = pv_aggr_days.withColumn("report_dt", F.lit(end_dt))
        pv_aggr_days = pv_aggr_days.where(F.col(id_name).isNotNull())
        features = add_knowledge_info(spark, pv_aggr_days, start_dt)
        save2ps(features, name, mode="append", partition="report_dt")
        start_dt = start_dt + timedelta(days=delta)
        end_dt = end_dt + timedelta(days=delta)
Example no. 2
from ld_utils.utils import dict_from_file
from ld_utils.spark_utils import create_spark_session, save2ps
import pyspark.sql.functions as func
from datetime import date

spark = create_spark_session('mcc', n_executors=8, n_cores=8)

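# Copy one day of card transactions at a time: the first day (Dec 7) overwrites the
# target table, the remaining days are appended as new day_part partitions.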
for day in range(7, 14):
    current = date(2020, 12, day)
    transactions = spark.table("rozn_custom_rb_smartvista.card_transaction")
    transactions = transactions.select("epk_id", "merchant", "day_part")
    transactions = transactions.where(func.col("day_part") == current)
    transactions = transactions.where(func.col("epk_id").isNotNull())
    if day == 7:
        save2ps(transactions, "transactions4mcc_test", partition='day_part', mode="overwrite")
    else:
        save2ps(transactions, "transactions4mcc_test", partition='day_part', mode="append")
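
# A minimal alternative sketch, not part of the original example: read the whole week in
# one filtered pass and let save2ps write it partitioned by day_part, assuming save2ps
# accepts the same partition/mode arguments used above.
week = spark.table("rozn_custom_rb_smartvista.card_transaction") \
    .select("epk_id", "merchant", "day_part") \
    .where(func.col("day_part").between(date(2020, 12, 7), date(2020, 12, 13))) \
    .where(func.col("epk_id").isNotNull())
save2ps(week, "transactions4mcc_test", partition='day_part', mode="overwrite")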

Example no. 3
from ld_utils.td_utils import create_connection, create_table
from ld_utils.utils import dict_from_file
from ld_utils.spark_utils import create_spark_session, custom_load, make_sql, save2ps

config = dict_from_file("../conf/logins.json")

spark = create_spark_session('sberprime_transfer', n_executors=8, n_cores=8)
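# make_sql presumably builds a plain SELECT over sbx_retail_mp_lm.dm_partner_sales, which
# custom_load then runs against the source database using the credentials in logins.json.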
sql = make_sql("sbx_retail_mp_lm", "dm_partner_sales")
df = custom_load(spark, sql, config)
save2ps(df, 'dm_partner_sales', partition="evt_dt")
Example no. 4

    aggr = spark.table("sklod_dwh_sbx_retail_mp_ext.ft_client_aggr_mnth_epk")\
        .where((F.col("report_dt_part") == first_dt - timedelta(days=1))
               | (F.col("report_dt_part") == pivot_dt - timedelta(days=1)))
    # aggr = aggr.drop(["report_dt", "ctl_loading"])
    kn = spark.table("sklod_dwh_sbx_retail_mp_ext.dm_client_knowledge_epk")\
        .where((F.col("report_dt_part") == first_dt - timedelta(days=1))
               | (F.col("report_dt_part") == pivot_dt - timedelta(days=1)))
    if cities:
        kn = kn.where(F.col('client_city').isin(cities))
    # kn = kn.drop("report_dt", "ctl_loading")
    for column in ["report_dt", "ctl_loading"]:
        aggr = aggr.drop(column)
        kn = kn.drop(column)
    dataset = aggr.join(kn, ["epk_id", "report_dt_part"], how='inner')\
        .join(sales, ["epk_id", "report_dt_part"], how='left')
    dataset = dataset.fillna({"target": 0})

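    # Derive per-class sampling rates from the target counts (budgets of roughly
    # 100k positives and 1.9M negatives before the per-split scaling below).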
    # counts per target class, keyed by target value to avoid relying on groupBy row order
    percents = dataset.groupBy("target").count().toPandas().set_index("target")["count"]
    ones = 100000 / percents[1]
    zeros = 1900000 / percents[0]
    dataset.groupBy("report_dt_part").count().show()
    print(ones, zeros)
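    # Train on the snapshot from the day before first_dt; the day before pivot_dt is
    # held out as the out-of-time (oot) set.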
    train = dataset.where(F.col("report_dt_part") == first_dt - timedelta(days=1))\
        .sampleBy("target", fractions={1: min(ones * 2, 1), 0: min(zeros * 2, 1)})
    train.groupBy("target").count().show()
    delete_table_from_ps(f'{partner}_train', spark=spark)
    save2ps(train, f'{partner}_train')
    oot = dataset.where(F.col("report_dt_part") == pivot_dt - timedelta(days=1))\
        .sampleBy("target", fractions={1: min(ones * 4, 1), 0: min(zeros * 1.5, 1)})
    oot.groupBy("target").count().show()
    delete_table_from_ps(f'{partner}_oot', spark=spark)
    save2ps(oot, f'{partner}_oot')
    print("<END>")
Example no. 5
from ld_utils.td_utils import create_connection, create_table
from ld_utils.utils import dict_from_file
from ld_utils.spark_utils import create_spark_session, custom_load, make_sql, save2ps

config = dict_from_file("../conf/logins.json")

spark = create_spark_session(
    'local kitchen',
    n_executors=16,
)
# sql = make_sql("sbx_retail_mp_lm ", "matched_local_kitchen_1202", columns=["epk_id"])
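# Pull the partner MCC scores joined with their subgroup names and persist the result
# to the sandbox as knowledge_mcc_test.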
sql = '''select t1.*, t2.mcc_subgroup_name, t2.mcc_group_id from sbx_retail_mp_ca_vd.vsiv_autocj_next_mcc_scores_fnl_corr  t1
left join  sbx_retail_mp_dm.ref_mcc_subgroup t2
on  t1.mcc_subgroup_id = t2.mcc_subgroup_id'''
df = custom_load(spark, sql, config)
save2ps(df, 'knowledge_mcc_test')

Example no. 6

import pyspark.sql.functions as F
from ld_utils.utils import dict_from_file
from ld_utils.spark_utils import create_spark_session, save2ps

path2conf = "conf"

spark = create_spark_session('dataset_creation', n_executors=16, n_cores=8, executor_memory=32, driver_memory=64)
partners = dict_from_file(f"{path2conf}/partners.json")
aggr = spark.table("sbx_t_team_mp_cmpn_ds.day_aggr")
aggr = aggr.where(F.col("report_dt") < last_dt)
aggr = aggr.where(F.col("client_city").isin(get_list_of_cities(partner, path2conf=path2conf)))
sales = spark.table("sbx_t_team_mp_cmpn_ds.dm_partner_sales")
sales = sales.where(F.col("partner_name") == partners[partner])
sales = sales.where((F.col("evt_dt") < last_dt) & (F.col("evt_dt") >= first_dt))
sales = sales.withColumn("target", F.lit(1))
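# Shift the sale date back by 3 days to form report_dt, so each sale is matched with
# features computed 3 days before the purchase.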
sales = sales.withColumn("report_dt", F.date_sub('evt_dt', 3))
# sales = sales.withColumnRenamed("evt_dt", "report_dt")
sales = sales.select("epk_id", "report_dt", "target")

# TODO: create a better algorithm for this join
dataset = aggr.join(sales, ['epk_id', "report_dt"], how="left")
dataset.select(F.sum("target").alias('amount_of')).show()
dataset = dataset.fillna({"target": 0})
dataset = dataset.where(F.col("report_dt").isNotNull())
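# Split by report_dt: dates before pivot_dt form the train set, later dates form the
# out-of-time (oot) set; both are down-sampled per class with sampleBy.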
train = dataset.where(F.col("report_dt") < pivot_dt).sampleBy("target", fractions={1: 0.05, 0: 0.95})
# sdf2cluster(train, f'{partner}_train')
spark.sql(f"drop table if exists sbx_t_team_mp_cmpn_ds.{partner}_train")
delete_folder_from_ps(f'{partner}_train')
save2ps(train, f'{partner}_train', partition="report_dt")
oot = dataset.where(F.col("report_dt") >= pivot_dt).sampleBy("target", fractions={1: 0.03, 0: 0.97})
# sdf2cluster(oot, f'{partner}_oot')
spark.sql(f"drop table if exists sbx_t_team_mp_cmpn_ds.{partner}_oot")
delete_folder_from_ps(f'{partner}_oot')
save2ps(oot, f'{partner}_oot', partition="report_dt")
spark.stop()
Example no. 7
# sales = spark.table(target_source).where(F.col("partner_name") == partner_name)
# sales = sales.where((F.col("evt_dt") < last_dt) & (F.col("evt_dt") >= first_dt))
# sales = sales.withColumn("report_dt_part", F.expr("date_sub(evt_dt, dayOfMonth(evt_dt))"))
# sales = sales.select(*join_columns).withColumn("target", F.lit(1))
# join_columns

sales = spark.table(target_source)
sales = sales.withColumn("target", F.lit(1))

# TODO dates as parameter

## ADD FEATURES
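# Join monthly client aggregates and client-knowledge features, both taken from the
# snapshot dated the day before first_dt, onto the sales target by epk_id.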

aggr = spark.table("sklod_dwh_sbx_retail_mp_ext.ft_client_aggr_mnth_epk")\
    .where(F.col("report_dt_part") == first_dt - timedelta(days=1))
# aggr = aggr.drop(["report_dt", "ctl_loading"])
kn = spark.table("sklod_dwh_sbx_retail_mp_ext.dm_client_knowledge_epk")\
    .where(F.col("report_dt_part") == first_dt - timedelta(days=1))
if cities:
    kn = kn.where(F.col('client_city').isin(cities))
# kn = kn.drop("report_dt", "ctl_loading")
for column in ["report_dt", "ctl_loading", "report_dt_part"]:
    aggr = aggr.drop(column)
    kn = kn.drop(column)
dataset = aggr.join(kn, ["epk_id"], how='inner')\
    .join(sales, ["epk_id"], how='left')
dataset = dataset.fillna({"target": 0})

save2ps(dataset, "lkrscore")

spark.stop()