Code Example #1
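Copies a week of SmartVista card transactions into the transactions4mcc_test table, one day per pass, partitioned by day_part.
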
from ld_utils.spark_utils import create_spark_session, save2ps
import pyspark.sql.functions as func
from datetime import date

spark = create_spark_session('mcc', n_executors=8, n_cores=8)

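# Copy card transactions for 2020-12-07 through 2020-12-13, one day per iteration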
for day in range(7, 14):
    current = date(2020, 12, day)
    transactions = spark.table("rozn_custom_rb_smartvista.card_transaction")
    transactions = transactions.select("epk_id", "merchant", "day_part")
    transactions = transactions.where(func.col("day_part") == current)
    transactions = transactions.where(func.col("epk_id").isNotNull())
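    # overwrite the target table on the first day, then append the remaining days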
    if day == 7:
        save2ps(transactions, "transactions4mcc_test", partition='day_part', mode="overwrite")
    else:
        save2ps(transactions, "transactions4mcc_test", partition='day_part', mode="append")

Code Example #2
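Transfers the dm_partner_sales table from the sbx_retail_mp_lm schema into the cluster, partitioned by evt_dt.
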
from ld_utils.utils import dict_from_file
from ld_utils.spark_utils import create_spark_session, custom_load, make_sql, save2ps

config = dict_from_file("../conf/logins.json")

spark = create_spark_session('sberprime_transfer', n_executors=8, n_cores=8)
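# build the extraction SQL for sbx_retail_mp_lm.dm_partner_sales and pull it through the connection configured in logins.json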
sql = make_sql("sbx_retail_mp_lm", "dm_partner_sales")
df = custom_load(spark, sql, config)
save2ps(df, 'dm_partner_sales', partition="evt_dt")
Code Example #3
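Setup for a daily aggregation job: the SmartVista transaction types to keep and a mapping of spending categories to English feature names.
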
from ld_utils.spark_utils import create_spark_session, save2ps
from tqdm import tqdm
from pyspark.sql.types import *
import pyspark.sql.functions as F
from itertools import chain
from datetime import date, timedelta
from ld_utils.utils import get_last_day_of_current_month
import os

spark = create_spark_session(name="day_aggr", n_executors=8, n_cores=8)

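# SmartVista transaction type codes to keep for the daily aggregate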
chosen_types = [
    737,  # purchase completion transaction at a POS terminal
    787,  # second part of a two-stage credit transaction at a POS terminal
    890,  # funds transfer between accounts within a card
    502,  # purchase paid with bonus points
    501,  # purchase with partial use of bonus points
    511,  # purchase with a fuel card
    782,  # debit part of a card-to-card transfer transaction
    680,  # purchase via an ePOS terminal
    678,  # purchase completion via an ePOS terminal
    776,  # purchase with cash back at a POS terminal
    700,  # cash withdrawal at an ATM
    699,  # cardless cash withdrawal at a POS terminal
    774,  # purchase at a POS terminal
]

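# map Russian spending-category names (as they appear in the data) to short English feature names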
data = {'Здоровье и красота': 'health',      # health & beauty
        'Одежда и аксессуары': 'clothes',    # clothes & accessories
        'Образование': 'education',
        # ...
        }
Code Example #4
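Builds targets for partner offers: for every partner in partners.json, the date of each client's first sale is taken from dm_partner_sales.
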
import pyspark.sql.functions as F
from datetime import date
from ld_utils.utils import dict_from_file, get_list_of_cities
from ld_utils.spark_utils import create_spark_session

path2conf = "../conf"
# path2conf = "conf"
partner = 'delivery_club'
partner_name = dict_from_file(f"{path2conf}/partners.json")[partner]
first_dt = date(2020, 12, 1)
pivot_dt = date(2021, 1, 1)
last_dt = date(2021, 2, 1)
# TODO list with all cities?
cities = get_list_of_cities(partner, path2conf=path2conf)
join_columns = ['epk_id', "report_dt_part"]
target_source = "sbx_t_team_mp_cmpn_ds.dm_partner_sales"

## CONNECT TO DB

spark = create_spark_session('sberprime task', n_executors=16, n_cores=8)


# TODO create features about ecosystem
##
## TARGET CREATION
partners = [el for el in dict_from_file(f"{path2conf}/partners.json").keys() if el not in ['sberprime', 'level_kitchen']]
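# for each partner, the target event is the date of the client's first sale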
for partner in partners:
    print(f"<BEGIN> {partner}")
    try:
        cities = get_list_of_cities(partner, path2conf=path2conf)
    except Exception:
        # no city list configured for this partner
        cities = False
    partner_name = dict_from_file(f"{path2conf}/partners.json")[partner]
    sales = spark.table(target_source).where(F.col("partner_name") == partner_name)
    sales = sales.groupBy("epk_id").agg(F.min("evt_dt").alias("evt_dt"))
Code Example #5
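Loads MCC scores enriched with the MCC subgroup reference table and saves the result as knowledge_mcc_test.
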
from ld_utils.utils import dict_from_file
from ld_utils.spark_utils import create_spark_session, custom_load, make_sql, save2ps

config = dict_from_file("../conf/logins.json")

spark = create_spark_session(
    'local kitchen',
    n_executors=16,
)
# sql = make_sql("sbx_retail_mp_lm ", "matched_local_kitchen_1202", columns=["epk_id"])
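# enrich the MCC score table with subgroup names and group ids from the reference dictionary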
sql = '''select t1.*, t2.mcc_subgroup_name, t2.mcc_group_id
from sbx_retail_mp_ca_vd.vsiv_autocj_next_mcc_scores_fnl_corr t1
left join sbx_retail_mp_dm.ref_mcc_subgroup t2
on t1.mcc_subgroup_id = t2.mcc_subgroup_id'''
df = custom_load(spark, sql, config)
save2ps(df, 'knowledge_mcc_test')
Code Example #6
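Assembles a training dataset for delivery_club: daily client aggregates joined with sales labels shifted three days back.
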
from ld_utils.spark_utils import create_spark_session, save2ps, ps2cluster, sdf2cluster, delete_folder_from_ps
from ld_utils.utils import dict_from_file, get_list_of_cities
import pyspark.sql.functions as F
from datetime import date, timedelta

partner = "delivery_club"
last_dt = date(2021, 1, 1)
pivot_dt = date(2020, 12, 1)
first_dt = date(2020, 10, 1)
# TODO: create a function that constructs path2conf from the environment
path2conf = "../conf" # uncomment if run as py
# path2conf = "conf"

spark = create_spark_session('dataset_creation', n_executors=16, n_cores=8, executor_memory=32, driver_memory=64)
partners = dict_from_file(f"{path2conf}/partners.json")
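# daily client-level aggregates: keep only dates before last_dt and the partner's cities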
aggr = spark.table("sbx_t_team_mp_cmpn_ds.day_aggr")
aggr = aggr.where(F.col("report_dt") < last_dt)
aggr = aggr.where(F.col("client_city").isin(get_list_of_cities(partner, path2conf=path2conf)))
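# partner sales within [first_dt, last_dt) become positive labels; report_dt is set 3 days before the sale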
sales = spark.table("sbx_t_team_mp_cmpn_ds.dm_partner_sales")
sales = sales.where(F.col("partner_name") == partners[partner])
sales = sales.where((F.col("evt_dt") < last_dt) & (F.col("evt_dt") >= first_dt))
sales = sales.withColumn("target", F.lit(1))
sales = sales.withColumn("report_dt", F.date_sub('evt_dt', 3))
# sales = sales.withColumnRenamed("evt_dt", "report_dt")
sales = sales.select("epk_id", "report_dt", "target")

# TODO: come up with a better algorithm for this join
dataset = aggr.join(sales, ['epk_id', "report_dt"], how="left")
# sanity check: count positive labels before filling nulls
dataset.select(F.sum("target").alias('amount_of')).show()
dataset = dataset.fillna({"target": 0})
dataset = dataset.where(F.col("report_dt").isNotNull())