7. join delta_df, k_test, kl_divergence
"""
import sys
sys.path.append("/home/boldyrek/mysoft/te/te_reporting/")
from col_stats import *
import config as cfg
from helper_functions import *
from scipy import stats
from helper_functions import (start_spark_session, get_imputed_df,
                              suffix_and_join_dfs, write_to_excel, load_df)

spark = start_spark_session()

# Step 1: load dfs.
imputed_train = load_df(cfg.SPLIT_TRAIN_PATH)
imputed_predict = load_df(cfg.SPLIT_PRED_PATH)
#imputed_train = test_df
#imputed_predict = test_df

# Step 2: get descriptive statistics for every column of each split.
imputed_train_descriptive_stats = get_df_with_descriptive_stats_for_columns(spark, imputed_train)
imputed_predict_descriptive_stats = get_df_with_descriptive_stats_for_columns(spark, imputed_predict)

# Step 3: join dfs and compute per-column deltas.
joined_descriptive_stats = suffix_and_join_dfs(
    imputed_train_descriptive_stats, imputed_predict_descriptive_stats, 'column_name')
delta_df = get_delta_descriptive_stats_df(joined_descriptive_stats, '2')
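# suffix_and_join_dfs lives in helper_functions and its real implementation is
# not shown in this file. Purely as an illustrative assumption of its behaviour
# (suffix the predict-side columns, then join on 'column_name'), a hypothetical
# sketch could look like this:
def _suffix_and_join_dfs_sketch(train_stats_df, predict_stats_df, join_col, suffix='2'):
    from pyspark.sql.functions import col
    # Rename every non-key column of the predict-side stats with the suffix.
    renamed = predict_stats_df.select(
        *[col(c).alias(c if c == join_col else c + suffix) for c in predict_stats_df.columns])
    # Left-join so every column of the train-side stats keeps a row.
    return train_stats_df.join(renamed, on=join_col, how='left')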
    return imputation_cols_df


"""
****** MAIN ******
1. Create spark session
2. Read the file into a dataframe
4. Calculate statistical summary for every column in a dataframe
5. Get imputation approach from the te_constants.py
6. Join dfs from 4 and 5
7. Save it as an excel tab
"""
#file_name = "../data/example.csv"

# Step 1: create spark session.
spark = start_spark_session()

# Step 2: read the file into a dataframe.
gen_pre_df = load_df(cfg.GEN_PREPROCESS_PATH)

# Step 4: calculate a statistical summary for every column in the dataframe.
columns_summary_stats_df = get_summary_stats_for_every_column(gen_pre_df)

# Step 5: get the imputation approach for each column from te_constants.py.
columns = gen_pre_df.columns
imputation_cols_df = get_column_imputation_approach_df(columns)

# Step 6: join the dataframes from steps 4 and 5.
excel_ready_df = columns_summary_stats_df.join(
    imputation_cols_df,
    spark_funcs.col('column') == spark_funcs.col('column_name2'),
    "left_outer")

# Step 7: save the result as an excel tab.
excel_ready_df = excel_ready_df.drop("column_name2")
write_to_excel(excel_ready_df, "zone_5_col_imputation_step_3")
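# get_column_imputation_approach_df is defined earlier in this file and reads
# the real mapping from te_constants.py. Only as an illustration of the shape
# it appears to return (one row per column plus its imputation approach, keyed
# by 'column_name2' as used in the join above), a hypothetical sketch:
def _column_imputation_approach_sketch(spark, columns, imputation_map):
    # imputation_map is an assumed dict such as {'age': 'median', ...}.
    rows = [[c, imputation_map.get(c, 'unknown')] for c in columns]
    return spark.createDataFrame(rows, ['column_name2', 'imputation_approach'])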
""" put event event columns first """ cols = joined_ctu_event_stats_df.columns new_order_cols = cols[-2:] + cols[:-2] return new_order_cols """ ***** MAIN ******* """ #file_name = "../data/example.csv" spark = start_spark_session() df = load_df(cfg.PREPROCESS_PATH) #df = load_df(file_name) #df = get_imputed_df(cfg.IMPUTATION_TRAIN_PATH) event_stats_ctus_dfs = [] max_CTU_num = get_max_CTU_num(df) for CTU_num in range(max_CTU_num): print(CTU_num) single_CTU_df = df.filter(f"CTU == { CTU_num }") # get a df with one CTU single_CTU_clean_df = get_df_with_dropped_garbage_cols(single_CTU_df) ctu_event_stats_df = get_event_stats_df(single_CTU_clean_df, CTU_num) event_stats_ctus_dfs.append(ctu_event_stats_df) joined_ctu_event_stats_df = get_joined_df(event_stats_ctus_dfs) #print_df (joined_ctu_event_stats_df) joined_ctu_event_stats_df = split_column_by_underscore( joined_ctu_event_stats_df)
   imputed_predict
2. Calculate the number of party_ids in imputed_train and in imputed_predict
3. Calculate the number of common party_ids
"""
import sys
sys.path.append("/home/boldyrek/mysoft/te/te_reporting/")
from helper_functions import get_imputed_df, start_spark_session, load_df
from col_stats import *
import config as cfg
from helper_functions import *

spark = start_spark_session()

split_train = load_df(cfg.SPLIT_TRAIN_PATH)
split_predict = load_df(cfg.SPLIT_PRED_PATH)

# Distinct party_id counts per split.
num_of_train_ids = split_train.select('party_id').distinct().count()
num_of_test_ids = split_predict.select('party_id').distinct().count()

# Party ids that appear in both the train and the predict splits.
num_of_common_between_train_and_test = split_train.select('party_id').distinct().join(
    split_predict.select('party_id').distinct(),
    ['party_id'], how='inner').select('party_id').count()

output_df = spark.createDataFrame(
    [[num_of_train_ids, num_of_test_ids, num_of_common_between_train_and_test]],
    ["num_of_train_ids", "num_of_test_ids", "num_of_common_between_train_and_test"])
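# Equivalent way to get the overlap count using DataFrame.intersect (standard
# Spark API); shown only as a cross-check of the join-based count above.
num_of_common_check = split_train.select('party_id').distinct().intersect(
    split_predict.select('party_id').distinct()).count()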
    joined_df_min_max = joined_df_min.withColumn(
        "delta_max", col("max_pre") - col("max"))
    joined_df_min_max_mean = joined_df_min_max.withColumn(
        "delta_mean", col("mean_pre") - col("mean"))
    return joined_df_min_max_mean


"""
**** MAIN *****
"""
spark = start_spark_session()

# Descriptive statistics for the preprocessing dataframe.
preprocessing_df = load_df(cfg.PREPROCESS_PATH)
preprocessing_columns_with_stats = get_descriptive_statistics_for_columns(preprocessing_df)
preprocessing_cols_stats_df = spark.createDataFrame(
    preprocessing_columns_with_stats, ['column', 'max', 'min', 'mean'])

# Descriptive statistics for the imputed dataframe.
imputed_df = get_imputed_df(cfg.IMPUTATION_TRAIN_PATH, cfg.IMPUTATION_PREDICT_PATH)
imputed_columns_with_stats = get_descriptive_statistics_for_columns(imputed_df)
imputed_cols_stats_df = spark.createDataFrame(
    imputed_columns_with_stats, ['column', 'max', 'min', 'mean'])

# Suffix the preprocessing stats columns with '_pre' so they can be told apart after the join.
preprocessing_cols_stats_df_re = preprocessing_cols_stats_df.select(
    *(col(x).alias(x + '_pre') for x in preprocessing_cols_stats_df.columns))

joined_df = preprocessing_cols_stats_df_re.join(
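# get_descriptive_statistics_for_columns is a shared helper; its output is
# consumed above as [column, max, min, mean] rows. A hypothetical sketch,
# assuming it simply aggregates each column, could be:
def _descriptive_stats_sketch(df):
    from pyspark.sql import functions as F
    rows = []
    for c in df.columns:
        # One aggregate pass per column: max, min, mean.
        agg_row = df.agg(F.max(c), F.min(c), F.mean(c)).collect()[0]
        rows.append([c, agg_row[0], agg_row[1], agg_row[2]])
    return rows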