from datetime import date, timedelta

import pyspark.sql.functions as F
from tqdm import tqdm

from ld_utils.spark_utils import save2ps

# NOTE: create_aggr, create_pivot_table, add_knowledge_info and the `spark` session
# are expected to be defined elsewhere in this module.


def write_aggr2ps(start_dt, name, delta, id_name="epk_id"):
    """Build `delta`-day aggregates window by window (up to 2021-12-01) and append them to `name`."""
    end_dt = start_dt + timedelta(days=delta - 1)
    for _ in tqdm(range((date(2021, 12, 1) - start_dt).days // delta)):
        aggr_days = create_aggr(start_dt, end_dt)
        pv_aggr_days = create_pivot_table(aggr_days)
        pv_aggr_days = pv_aggr_days.withColumn("report_dt", F.lit(end_dt))
        pv_aggr_days = pv_aggr_days.where(F.col(id_name).isNotNull())
        features = add_knowledge_info(spark, pv_aggr_days, start_dt)
        save2ps(features, name, mode="append", partition="report_dt")
        start_dt = start_dt + timedelta(days=delta)
        end_dt = end_dt + timedelta(days=delta)
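# Hypothetical invocation (a sketch only; the start date, window size and target table
# name below are assumptions, not values taken from this code):
# write_aggr2ps(start_dt=date(2021, 1, 1), name="day_aggr_pivot", delta=7)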
from ld_utils.utils import dict_from_file
from ld_utils.spark_utils import create_spark_session, save2ps
import pyspark.sql.functions as func
from datetime import date

spark = create_spark_session('mcc', n_executors=8, n_cores=8)

for day in range(7, 14):
    current = date(2020, 12, day)
    transactions = spark.table("rozn_custom_rb_smartvista.card_transaction")
    transactions = transactions.select("epk_id", "merchant", "day_part")
    transactions = transactions.where(func.col("day_part") == current)
    transactions = transactions.where(func.col("epk_id").isNotNull())
    if day == 7:
        save2ps(transactions, "transactions4mcc_test", partition='day_part', mode="overwrite")
    else:
        save2ps(transactions, "transactions4mcc_test", partition='day_part', mode="append")
from ld_utils.td_utils import create_connection, create_table
from ld_utils.utils import dict_from_file
from ld_utils.spark_utils import create_spark_session, custom_load, make_sql, save2ps

config = dict_from_file("../conf/logins.json")
spark = create_spark_session('sberprime_transfer', n_executors=8, n_cores=8)

# Load sbx_retail_mp_lm.dm_partner_sales via custom_load and save it partitioned by evt_dt.
sql = make_sql("sbx_retail_mp_lm", "dm_partner_sales")
df = custom_load(spark, sql, config)
save2ps(df, 'dm_partner_sales', partition="evt_dt")
# Build the train / out-of-time datasets: monthly client aggregates plus client knowledge,
# taken at the day before first_dt and the day before pivot_dt, joined to partner sales.
# `sales`, `first_dt`, `pivot_dt`, `partner` and `cities` come from earlier in this script.
# NOTE (assumption): the opening of this fragment was truncated; `aggr` below is reconstructed
# to mirror the `kn` filter that follows.
aggr = spark.table("sklod_dwh_sbx_retail_mp_ext.ft_client_aggr_mnth_epk")\
    .where((F.col("report_dt_part") == first_dt - timedelta(days=1))
           | (F.col("report_dt_part") == pivot_dt - timedelta(days=1)))
# aggr = aggr.drop(["report_dt", "ctl_loading"])
kn = spark.table("sklod_dwh_sbx_retail_mp_ext.dm_client_knowledge_epk")\
    .where((F.col("report_dt_part") == first_dt - timedelta(days=1))
           | (F.col("report_dt_part") == pivot_dt - timedelta(days=1)))
if cities:
    kn = kn.where(F.col('client_city').isin(cities))
# kn = kn.drop("report_dt", "ctl_loading")
for column in ["report_dt", "ctl_loading"]:
    aggr = aggr.drop(column)
    kn = kn.drop(column)

dataset = aggr.join(kn, ["epk_id", "report_dt_part"], how='inner')\
    .join(sales, ["epk_id", "report_dt_part"], how='left')
dataset = dataset.fillna({"target": 0})

# Class counts, ordered so that row 0 is target == 1 and row 1 is target == 0.
percents = dataset.groupBy("target").count().orderBy(F.col("target").desc()).toPandas()
ones = 100000 / percents["count"].tolist()[0]    # sampling fraction for ~100k positives
zeros = 1900000 / percents["count"].tolist()[1]  # sampling fraction for ~1.9M negatives
dataset.groupBy("report_dt_part").count().show()
print(ones, zeros)

train = dataset.where(F.col("report_dt_part") == first_dt - timedelta(days=1))\
    .sampleBy("target", fractions={1: min(ones * 2, 1), 0: min(zeros * 2, 1)})
train.groupBy("target").count().show()
delete_table_from_ps(f'{partner}_train', spark=spark)
save2ps(train, f'{partner}_train')

oot = dataset.where(F.col("report_dt_part") == pivot_dt - timedelta(days=1))\
    .sampleBy("target", fractions={1: min(ones * 4, 1), 0: min(zeros * 1.5, 1)})
oot.groupBy("target").count().show()
delete_table_from_ps(f'{partner}_oot', spark=spark)
save2ps(oot, f'{partner}_oot')
print("<END>")
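# Note on DataFrame.sampleBy (PySpark API): `fractions` maps each value of the stratification
# column to the fraction of rows kept for that value, which is why the fractions above are
# capped at 1 with min(..., 1). A minimal illustration on assumed toy data:
# toy = spark.createDataFrame([(i, i % 2) for i in range(100)], ["epk_id", "target"])
# toy.sampleBy("target", fractions={1: 1.0, 0: 0.1}, seed=42).groupBy("target").count().show()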
from ld_utils.td_utils import create_connection, create_table
from ld_utils.utils import dict_from_file
from ld_utils.spark_utils import create_spark_session, custom_load, make_sql, save2ps

config = dict_from_file("../conf/logins.json")
spark = create_spark_session(
    'local kitchen',
    n_executors=16,
)

# sql = make_sql("sbx_retail_mp_lm", "matched_local_kitchen_1202", columns=["epk_id"])
sql = '''select t1.*, t2.mcc_subgroup_name, t2.mcc_group_id
         from sbx_retail_mp_ca_vd.vsiv_autocj_next_mcc_scores_fnl_corr t1
         left join sbx_retail_mp_dm.ref_mcc_subgroup t2
             on t1.mcc_subgroup_id = t2.mcc_subgroup_id'''
df = custom_load(spark, sql, config)
save2ps(df, 'knowledge_mcc_test')
# path2conf = "conf"
spark = create_spark_session('dataset_creation',
                             n_executors=16,
                             n_cores=8,
                             executor_memory=32,
                             driver_memory=64,
                             )
partners = dict_from_file(f"{path2conf}/partners.json")

# Daily client aggregates restricted to the partner's cities and to dates before last_dt.
aggr = spark.table("sbx_t_team_mp_cmpn_ds.day_aggr")
aggr = aggr.where(F.col("report_dt") < last_dt)
aggr = aggr.where(F.col("client_city").isin(get_list_of_cities(partner, path2conf=path2conf)))

# Partner sales inside [first_dt, last_dt) become positive targets; report_dt is set
# 3 days before the sale date (evt_dt).
sales = spark.table("sbx_t_team_mp_cmpn_ds.dm_partner_sales")
sales = sales.where(F.col("partner_name") == partners[partner])
sales = sales.where((F.col("evt_dt") < last_dt) & (F.col("evt_dt") >= first_dt))
sales = sales.withColumn("target", F.lit(1))
sales = sales.withColumn("report_dt", F.date_sub('evt_dt', 3))
# sales = sales.withColumnRenamed("evt_dt", "report_dt")
sales = sales.select("epk_id", "report_dt", "target")

# TODO: find a cleaner way to attach the targets
dataset = aggr.join(sales, ['epk_id', "report_dt"], how="left")
dataset.select(F.sum("target").alias('amount_of')).show()
dataset = dataset.fillna({"target": 0})
dataset = dataset.where(F.col("report_dt").isNotNull())

train = dataset.where(F.col("report_dt") < pivot_dt).sampleBy("target", fractions={1: 0.05, 0: 0.95})
# sdf2cluster(train, f'{partner}_train')
spark.sql(f"drop table if exists sbx_t_team_mp_cmpn_ds.{partner}_train")
delete_folder_from_ps(f'{partner}_train')
save2ps(train, f'{partner}_train', partition="report_dt")

oot = dataset.where(F.col("report_dt") >= pivot_dt).sampleBy("target", fractions={1: 0.03, 0: 0.97})
# sdf2cluster(oot, f'{partner}_oot')
spark.sql(f"drop table if exists sbx_t_team_mp_cmpn_ds.{partner}_oot")
delete_folder_from_ps(f'{partner}_oot')
save2ps(oot, f'{partner}_oot', partition="report_dt")

spark.stop()
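# The script above assumes these names are defined earlier (e.g. in a notebook preamble).
# The values and import paths below are illustrative assumptions, not taken from the original:
# from datetime import date
# import pyspark.sql.functions as F
# from ld_utils.utils import dict_from_file
# from ld_utils.spark_utils import create_spark_session, save2ps
# # helpers such as get_list_of_cities and delete_folder_from_ps must also be imported
# path2conf = "../conf"                # hypothetical config directory
# partner = "example_partner"          # hypothetical key into partners.json
# first_dt = date(2021, 1, 1)          # hypothetical start of the observation window
# pivot_dt = date(2021, 5, 1)          # hypothetical train / out-of-time split date
# last_dt = date(2021, 6, 1)           # hypothetical end of the observation window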
# sales = spark.table(target_source).where(F.col("partner_name") == partner_name)
# sales = sales.where((F.col("evt_dt") < last_dt) & (F.col("evt_dt") >= first_dt))
# sales = sales.withColumn("report_dt_part", F.expr("date_sub(evt_dt, dayOfMonth(evt_dt))"))
# sales = sales.select(*join_columns).withColumn("target", F.lit(1))
sales = spark.table(target_source)
sales = sales.withColumn("target", F.lit(1))
# TODO: pass the dates as parameters

## ADD FEATURES
# Monthly client aggregates and client knowledge as of the day before first_dt.
aggr = spark.table("sklod_dwh_sbx_retail_mp_ext.ft_client_aggr_mnth_epk")\
    .where(F.col("report_dt_part") == first_dt - timedelta(days=1))
# aggr = aggr.drop(["report_dt", "ctl_loading"])
kn = spark.table("sklod_dwh_sbx_retail_mp_ext.dm_client_knowledge_epk")\
    .where(F.col("report_dt_part") == first_dt - timedelta(days=1))
if cities:
    kn = kn.where(F.col('client_city').isin(cities))
# kn = kn.drop("report_dt", "ctl_loading")
for column in ["report_dt", "ctl_loading", "report_dt_part"]:
    aggr = aggr.drop(column)
    kn = kn.drop(column)

dataset = aggr.join(kn, ["epk_id"], how='inner').join(sales, ["epk_id"], how='left')
dataset = dataset.fillna({"target": 0})
save2ps(dataset, "lkrscore")
spark.stop()