7. join delta_df, k_test, kl_divergence
"""

import sys
sys.path.append("/home/boldyrek/mysoft/te/te_reporting/")
from col_stats import *
import config as cfg
from helper_functions import *
from scipy import stats
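# The helpers imported above (start_spark_session, load_df, suffix_and_join_dfs,
# write_to_excel, ...) live in helper_functions.py / col_stats.py, which are not part
# of this listing. A rough, assumed sketch of what the first two plausibly wrap:
#
#   from pyspark.sql import SparkSession
#   def start_spark_session():
#       return SparkSession.builder.appName("te_reporting").getOrCreate()
#   def load_df(path):
#       return start_spark_session().read.parquet(path)  # file format is an assumption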


spark = start_spark_session()

# Step 1: load the train and predict split dataframes
imputed_train = load_df(cfg.SPLIT_TRAIN_PATH)
imputed_predict = load_df(cfg.SPLIT_PRED_PATH)

#imputed_train = test_df
#imputed_predict = test_df

# Step 2: get descriptive statistics for every column
imputed_train_descriptive_stats = get_df_with_descriptive_stats_for_columns(spark, imputed_train)
imputed_predict_descriptive_stats = get_df_with_descriptive_stats_for_columns(spark, imputed_predict)

# Step 3: join the train and predict stats dataframes
joined_descriptive_stats = suffix_and_join_dfs(
    imputed_train_descriptive_stats, imputed_predict_descriptive_stats, 'column_name')

# Compute per-column deltas between the joined train and predict statistics
delta_df = get_delta_descriptive_stats_df(joined_descriptive_stats, '2')
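# A minimal sketch of what get_df_with_descriptive_stats_for_columns might compute.
# The real helper lives in col_stats.py / helper_functions.py (not shown here); the
# implementation below is an assumption for illustration, not the project's code.
from functools import reduce
from pyspark.sql import functions as F

def descriptive_stats_sketch(spark, df):
    """One row per column with its max, min and mean (values cast to double).

    The spark argument is kept only to mirror the original call signature."""
    per_column = [
        df.agg(F.max(F.col(c).cast('double')).alias('max'),
               F.min(F.col(c).cast('double')).alias('min'),
               F.mean(F.col(c).cast('double')).alias('mean'))
          .withColumn('column_name', F.lit(c))
        for c in df.columns
    ]
    return reduce(lambda a, b: a.unionByName(b), per_column)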
# Example no. 2
    return imputation_cols_df


"""
****** MAIN ******
1. Create spark session 
2. Read the file into a dataframe
4. Calculate statistical summary for every column in a dataframe
5. Get imputation approach from the te_constants.py 
6. Join dfs from 4 and 5
7. Save it as an excel tab 
"""

#file_name = "../data/example.csv"
# Step 1 create spark session
spark = start_spark_session()
# Step 2 Read file into df
gen_pre_df = load_df(cfg.GEN_PREPROCESS_PATH)
# Step 4 Calculate statistical summary for every column in a dataframe
columns_summary_stats_df = get_summary_stats_for_every_column(gen_pre_df)
# Step 5 Get imputation approach from the te_constants.py
columns = gen_pre_df.columns
imputation_cols_df = get_column_imputation_approach_df(columns)
# Step 6 Join dfs from 4 and 5
excel_ready_df = columns_summary_stats_df.join(
    imputation_cols_df,
    spark_funcs.col('column') == spark_funcs.col('column_name2'), "left_outer")
# Step 7 Save it as an excel tab
excel_ready_df = excel_ready_df.drop("column_name2")
write_to_excel(excel_ready_df, "zone_5_col_imputation_step_3")
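# A minimal sketch of what get_column_imputation_approach_df (used in Step 5, whose
# truncated tail appears above) might look like. te_constants.py is referenced in the
# docstring but not shown; the imputation_approaches dict parameter below stands in
# for whatever te_constants.py actually defines and is a hypothetical placeholder.
def column_imputation_approach_sketch(spark, columns, imputation_approaches):
    """One row per column: (column_name2, imputation_approach)."""
    rows = [(c, imputation_approaches.get(c, 'none')) for c in columns]
    return spark.createDataFrame(rows, ['column_name2', 'imputation_approach'])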
    """
    put event columns first
    """
    cols = joined_ctu_event_stats_df.columns
    new_order_cols = cols[-2:] + cols[:-2]
    return new_order_cols
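# Usage sketch (assumption, not part of the original fragment): the reordered column
# list would typically be applied as joined_ctu_event_stats_df.select(*new_order_cols)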


"""
***** MAIN *******
"""

#file_name = "../data/example.csv"

spark = start_spark_session()
df = load_df(cfg.PREPROCESS_PATH)
#df = load_df(file_name)
#df = get_imputed_df(cfg.IMPUTATION_TRAIN_PATH)
event_stats_ctus_dfs = []
max_CTU_num = get_max_CTU_num(df)
for CTU_num in range(max_CTU_num):
    print(CTU_num)
    single_CTU_df = df.filter(f"CTU == { CTU_num }")  # get a df with one CTU
    single_CTU_clean_df = get_df_with_dropped_garbage_cols(single_CTU_df)
    ctu_event_stats_df = get_event_stats_df(single_CTU_clean_df, CTU_num)
    event_stats_ctus_dfs.append(ctu_event_stats_df)

joined_ctu_event_stats_df = get_joined_df(event_stats_ctus_dfs)
#print_df (joined_ctu_event_stats_df)
joined_ctu_event_stats_df = split_column_by_underscore(
    joined_ctu_event_stats_df)
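# A minimal sketch of how get_joined_df might combine the per-CTU stat frames.
# The join key name 'column_name' is an assumption; the real helper is defined in
# col_stats.py / helper_functions.py, which are not shown here.
from functools import reduce

def joined_df_sketch(stat_dfs, key='column_name'):
    """Full outer join of the per-CTU event-stat dataframes on a shared key column."""
    return reduce(lambda left, right: left.join(right, on=key, how='outer'), stat_dfs)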
# Example no. 4
    imputed_predict
2. Calculate the number of distinct party_ids
    in imputed_train and in imputed_predict
3. Calculate the party_ids common to both

"""

import sys
sys.path.append("/home/boldyrek/mysoft/te/te_reporting/")
from helper_functions import get_imputed_df, start_spark_session, load_df
from col_stats import *
import config as cfg
from helper_functions import *

spark = start_spark_session()

split_train = load_df(cfg.SPLIT_TRAIN_PATH)
split_predict = load_df(cfg.SPLIT_PRED_PATH)

num_of_train_ids = split_train.select('party_id').distinct().count()

num_of_test_ids = split_predict.select('party_id').distinct().count()

num_of_common_between_train_and_test = split_train.select('party_id').distinct().join(
    split_predict.select('party_id').distinct(),
    ['party_id'], how='inner').select('party_id').count()
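# Illustrative alternative (not part of the original script): the same overlap can be
# computed with PySpark's intersect, which returns the distinct rows common to both.
num_of_common_via_intersect = split_train.select('party_id').intersect(
    split_predict.select('party_id')).count()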

output_df = spark.createDataFrame(
    [[num_of_train_ids, num_of_test_ids, num_of_common_between_train_and_test]],
    [
        "num_of_train_ids", "num_of_test_ids",
        "num_of_common_between_train_and_test"
    ])
    joined_df_min_max = joined_df_min.withColumn("delta_max",
                                                 col("max_pre") - col("max"))
    joined_df_min_max_mean = joined_df_min_max.withColumn(
        "delta_mean",
        col("mean_pre") - col("mean"))

    return joined_df_min_max_mean


"""
**** MAIN *****
"""

spark = start_spark_session()

preprocessing_df = load_df(cfg.PREPROCESS_PATH)
preprocessing_columns_with_stats = get_descriptive_statistics_for_columns(
    preprocessing_df)
preprocessing_cols_stats_df = spark.createDataFrame(
    preprocessing_columns_with_stats, ['column', 'max', 'min', 'mean'])

imputed_df = get_imputed_df(cfg.IMPUTATION_TRAIN_PATH,
                            cfg.IMPUTATION_PREDICT_PATH)
imputed_columns_with_stats = get_descriptive_statistics_for_columns(imputed_df)
imputed_cols_stats_df = spark.createDataFrame(imputed_columns_with_stats,
                                              ['column', 'max', 'min', 'mean'])
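# Sketch of what get_imputed_df might do with two paths: load both imputation splits
# and union them so the statistics cover the full imputed dataset. This is an
# assumption for context; the real helper is defined in helper_functions.py.
def imputed_df_sketch(train_path, predict_path):
    return load_df(train_path).unionByName(load_df(predict_path))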


# Suffix the preprocessing stat columns with '_pre' to distinguish them from the
# imputed stats after the join
preprocessing_cols_stats_df_re = preprocessing_cols_stats_df.select(
    *(col(x).alias(x + '_pre') for x in preprocessing_cols_stats_df.columns))
joined_df = preprocessing_cols_stats_df_re.join(