from ld_utils.spark_utils import create_spark_session, save2ps
import pyspark.sql.functions as func
from datetime import date

spark = create_spark_session('mcc', n_executors=8, n_cores=8)

# Export card transactions for 7-13 December 2020, one day at a time.
# The first day overwrites the target table, the following days are appended.
for day in range(7, 14):
    current = date(2020, 12, day)
    transactions = spark.table("rozn_custom_rb_smartvista.card_transaction")
    transactions = transactions.select("epk_id", "merchant", "day_part")
    transactions = transactions.where(func.col("day_part") == current)
    transactions = transactions.where(func.col("epk_id").isNotNull())
    if day == 7:
        save2ps(transactions, "transactions4mcc_test", partition='day_part', mode="overwrite")
    else:
        save2ps(transactions, "transactions4mcc_test", partition='day_part', mode="append")
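
# Alternative sketch (not part of the original job): the seven per-day loads could be
# collapsed into one read with a date-range filter on day_part and a single overwrite,
# assuming save2ps behaves exactly as in the loop above.
first_day, last_day = date(2020, 12, 7), date(2020, 12, 13)
transactions = spark.table("rozn_custom_rb_smartvista.card_transaction") \
    .select("epk_id", "merchant", "day_part") \
    .where(func.col("day_part").between(first_day, last_day)) \
    .where(func.col("epk_id").isNotNull())
# save2ps(transactions, "transactions4mcc_test", partition='day_part', mode="overwrite")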
from ld_utils.td_utils import create_connection, create_table
from ld_utils.utils import dict_from_file
from ld_utils.spark_utils import create_spark_session, custom_load, make_sql, save2ps

config = dict_from_file("../conf/logins.json")
spark = create_spark_session('sberprime_transfer', n_executors=8, n_cores=8)

# Transfer dm_partner_sales from the source database into the Spark warehouse,
# partitioned by the event date.
sql = make_sql("sbx_retail_mp_lm", "dm_partner_sales")
df = custom_load(spark, sql, config)
save2ps(df, 'dm_partner_sales', partition="evt_dt")
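
# Optional sanity check (not part of the original job): assuming save2ps lands the
# table in the sbx_t_team_mp_cmpn_ds schema, where the other scripts read it from,
# count rows per partition date to confirm the transfer completed.
import pyspark.sql.functions as F

copied = spark.table("sbx_t_team_mp_cmpn_ds.dm_partner_sales")
copied.groupBy("evt_dt").count().orderBy("evt_dt").show()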
from ld_utils.spark_utils import create_spark_session, save2ps
from tqdm import tqdm
from pyspark.sql.types import *
import pyspark.sql.functions as F
from itertools import chain
from datetime import date, timedelta
from ld_utils.utils import get_last_day_of_current_month
import os

spark = create_spark_session(name="day_aggr", n_executors=8, n_cores=8)

# Transaction type codes to keep in the daily aggregation.
chosen_types = [
    737,  # purchase completion at a POS terminal
    787,  # second part of a two-stage credit transaction at a POS terminal
    890,  # funds transfer between accounts within one card
    502,  # purchase paid with bonus points
    501,  # purchase paid partially with bonus points
    511,  # purchase on a fuel card
    782,  # debit leg of a card-to-card transfer
    680,  # purchase via an ePOS terminal
    678,  # purchase completion via an ePOS terminal
    776,  # purchase-with-change transaction at a POS terminal
    700,  # cash withdrawal at an ATM
    699,  # cardless cash withdrawal at a POS terminal
    774,  # POS purchase
]

# Mapping from category names (in Russian) to short feature names.
data = {'Здоровье и красота': 'health',
        'Одежда и аксессуары': 'clothes',
        'Образование': 'education',
        # ... remaining categories truncated in the source
        }
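
# Sketch of how the lookup above is presumably applied downstream (the rest of the
# original script is truncated, so this is an assumption for illustration only).
# Build a literal map column from the dict so category names can be translated in Spark.
mcc_group_map = F.create_map([F.lit(x) for x in chain(*data.items())])

# Placeholder usage; the table and column names below are not taken from the source:
# txn = spark.table("transactions4mcc_test")
# txn = txn.where(F.col("transaction_type").isin(chosen_types))
# txn = txn.withColumn("mcc_group", mcc_group_map[F.col("mcc_group_name")])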
from datetime import date
import pyspark.sql.functions as F
from ld_utils.utils import dict_from_file, get_list_of_cities
from ld_utils.spark_utils import create_spark_session

path2conf = "../conf"
# path2conf = "conf"  # uncomment when running as a plain .py script
partner = 'delivery_club'
partner_name = dict_from_file(f"{path2conf}/partners.json")[partner]
first_dt = date(2020, 12, 1)
pivot_dt = date(2021, 1, 1)
last_dt = date(2021, 2, 1)
# TODO: build the list with all cities?
cities = get_list_of_cities(partner, path2conf=path2conf)
join_columns = ['epk_id', "report_dt_part"]
target_source = "sbx_t_team_mp_cmpn_ds.dm_partner_sales"

## CONNECT TO DB
spark = create_spark_session('sberprime task', n_executors=16, n_cores=8)

# TODO: create features about the ecosystem

## TARGET CREATION
# Build the target for every partner except 'sberprime' and 'level_kitchen'.
partners = [el for el in dict_from_file(f"{path2conf}/partners.json").keys()
            if el not in ['sberprime', 'level_kitchen']]
for partner in partners:
    print(f"<BEGIN> {partner}")
    try:
        cities = get_list_of_cities(partner, path2conf=path2conf)
    except Exception:
        # no city list configured for this partner
        cities = False
    partner_name = dict_from_file(f"{path2conf}/partners.json")[partner]
    sales = spark.table(target_source).where(F.col("partner_name") == partner_name)
    # first purchase date per client
    sales = sales.groupBy("epk_id").agg({"evt_dt": "min"}).withColumnRenamed("min(evt_dt)", "evt_dt")
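    # Hedged addition (not in the original fragment, which ends here): a quick
    # per-partner count of buyers as a progress check while the loop runs.
    print(f"{partner}: {sales.count()} clients with at least one purchase")
    # A more explicit equivalent of the aggregation above, shown for illustration:
    # sales = sales.groupBy("epk_id").agg(F.min("evt_dt").alias("evt_dt"))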
from ld_utils.td_utils import create_connection, create_table
from ld_utils.utils import dict_from_file
from ld_utils.spark_utils import create_spark_session, custom_load, make_sql, save2ps

config = dict_from_file("../conf/logins.json")
spark = create_spark_session(
    'local kitchen',
    n_executors=16,
)

# sql = make_sql("sbx_retail_mp_lm", "matched_local_kitchen_1202", columns=["epk_id"])

# Enrich the MCC scores with subgroup names and group ids from the reference table.
sql = '''select t1.*, t2.mcc_subgroup_name, t2.mcc_group_id
from sbx_retail_mp_ca_vd.vsiv_autocj_next_mcc_scores_fnl_corr t1
left join sbx_retail_mp_dm.ref_mcc_subgroup t2
    on t1.mcc_subgroup_id = t2.mcc_subgroup_id'''

df = custom_load(spark, sql, config)
save2ps(df, 'knowledge_mcc_test')
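
# Equivalent sketch in the DataFrame API (an illustration, not used by the job):
# assuming both tables are also reachable through the Spark metastore, the same
# enrichment could be expressed without a raw SQL string.
scores = spark.table("sbx_retail_mp_ca_vd.vsiv_autocj_next_mcc_scores_fnl_corr")
ref = spark.table("sbx_retail_mp_dm.ref_mcc_subgroup") \
    .select("mcc_subgroup_id", "mcc_subgroup_name", "mcc_group_id")
enriched = scores.join(ref, on="mcc_subgroup_id", how="left")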
from ld_utils.spark_utils import create_spark_session, save2ps, ps2cluster, sdf2cluster, delete_folder_from_ps
from ld_utils.utils import dict_from_file, get_list_of_cities
import pyspark.sql.functions as F
from datetime import date, timedelta

partner = "delivery_club"
last_dt = date(2021, 1, 1)
pivot_dt = date(2020, 12, 1)
first_dt = date(2020, 10, 1)

# TODO: create a function that builds path2conf from the environment
path2conf = "../conf"
# path2conf = "conf"  # uncomment when running as a plain .py script

spark = create_spark_session('dataset_creation',
                             n_executors=16,
                             n_cores=8,
                             executor_memory=32,
                             driver_memory=64,
                             )

partners = dict_from_file(f"{path2conf}/partners.json")

# Daily client aggregates restricted to the partner's cities.
aggr = spark.table("sbx_t_team_mp_cmpn_ds.day_aggr")
aggr = aggr.where(F.col("report_dt") < last_dt)
aggr = aggr.where(F.col("client_city").isin(get_list_of_cities(partner, path2conf=path2conf)))

# Partner sales in the observation window become the positive target,
# attached to the report date three days before the purchase.
sales = spark.table("sbx_t_team_mp_cmpn_ds.dm_partner_sales")
sales = sales.where(F.col("partner_name") == partners[partner])
sales = sales.where((F.col("evt_dt") < last_dt) & (F.col("evt_dt") >= first_dt))
sales = sales.withColumn("target", F.lit(1))
sales = sales.withColumn("report_dt", F.date_sub('evt_dt', 3))
# sales = sales.withColumnRenamed("evt_dt", "report_dt")
sales = sales.select("epk_id", "report_dt", "target")

# TODO: find a cleaner way to build the training dataset
dataset = aggr.join(sales, ['epk_id', "report_dt"], how="left")
dataset.select(F.sum("target").alias('amount_of')).show()
dataset = dataset.fillna({"target": 0})
dataset = dataset.where(F.col("report_dt").isNotNull())
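
# Hedged final step (not present in the original file): save2ps is imported above but
# never called, so presumably the assembled dataset is persisted afterwards; the table
# name "delivery_club_dataset" is a made-up example, not taken from the source.
save2ps(dataset, "delivery_club_dataset", partition="report_dt", mode="overwrite")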