def main():
    """Build a small summary table with customer and row counts."""
    spark = start_spark_session()
    imputed_df = get_imputed_df(cfg.IMPUTATION_TRAIN_PATH,
                                cfg.IMPUTATION_PREDICT_PATH)
    num_party_ids = imputed_df.select("party_id").distinct().count()
    num_rows = imputed_df.count()
    num_party_ids_with_positive_outcome = get_num_party_ids_with_positive_outcome(
        imputed_df)
    result = spark.createDataFrame(
        [['Number of customers', num_party_ids],
         ['Number of rows', num_rows],
         ['Number of customers with a positive outcome',
          num_party_ids_with_positive_outcome]],
        ['', 'Value'])
    return result
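# get_num_party_ids_with_positive_outcome() is called above but not defined in
# this file. The function below is only a minimal sketch of what it might look
# like, assuming the positive-outcome flag lives in a column named 'outcome';
# that column name and the _sketch suffix are assumptions, not the original code.
def get_num_party_ids_with_positive_outcome_sketch(imputed_df):
    """Count distinct party IDs that have at least one positive-outcome row."""
    return (imputed_df
            .filter(imputed_df['outcome'] == 1)  # hypothetical outcome column
            .select('party_id')
            .distinct()
            .count())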
def get_test_df():
    """ Creating test data frame """
    spark = start_spark_session()
    return spark.createDataFrame([
        (1, 1, 0), (1, 2, 0), (1, 3, 0), (1, 4, 1), (1, 5, 0),
        (2, 1, 0), (2, 2, 0), (2, 3, 1),
        (3, 1, 1), (3, 2, 1), (3, 3, 1),
        (4, 1, 1), (4, 2, 1), (4, 3, 1), (4, 4, 0),
        (5, 1, 1), (5, 2, 1), (5, 3, 1), (5, 4, 0), (5, 5, 0),
        (6, 1, 1), (6, 2, 1), (6, 3, 1), (6, 4, 0), (6, 5, 0),
        (7, 1, 0), (7, 2, 0), (6, 3, 0), (6, 4, 0), (6, 5, 0),
    ], ['party_id', 'ctu', 'imputed_ctu'])
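# start_spark_session() is imported from helper_functions and not shown in this
# file. A minimal sketch of such a helper, assuming no special Spark
# configuration is needed, could look like this; the app name is an assumption.
def start_spark_session_sketch():
    """Return the active SparkSession, creating one if needed."""
    from pyspark.sql import SparkSession
    return SparkSession.builder.appName("te_reporting").getOrCreate()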
def main():
    """
    1. For every party ID, calculate the number of CTUs imputed.
    2. Calculate the number of distinct CTUs per party ID.
    3. Divide the number of imputed CTUs by the number of distinct CTUs.
    4. Create buckets (0-24%, 25-49%, 50-74%, 75-98%, 99-100%) and assign
       every party ID to the bucket its missing-CTU percentage falls into.
    5. Calculate the proportion of party IDs in each bucket out of the total
       number of party IDs.

    Calculate the proportion of accounts that have more than:
        99% missing: 0.20
        75% missing: 0.40
        50% missing: 0.60
        25% missing: 0.70
    """
    spark = start_spark_session()
    imputed_df = get_imputed_df(cfg.IMPUTATION_TRAIN_PATH,
                                cfg.IMPUTATION_PREDICT_PATH)
    num_imputed_ctus_per_partyid = get_num_imputed_ctus_per_partyid(imputed_df)
    num_distinct_ctus_per_partyid = get_num_distinct_ctus_per_partyid(
        imputed_df)
    joined_num_distinct_imputed_ctus_df = join(num_distinct_ctus_per_partyid,
                                               num_imputed_ctus_per_partyid)
    percentage_of_missing_ctus_per_partyid = \
        get_percentage_of_missing_ctus_per_party_id(
            joined_num_distinct_imputed_ctus_df)
    party_id_and_its_bucket = create_buckets(
        percentage_of_missing_ctus_per_partyid)
    num_partyids_with_missing_ctus_per_backet = get_num_partyids_per_backet(
        party_id_and_its_bucket)
    # Number of distinct party IDs in the imputed data frame.
    total_num_ids = imputed_df.groupby("party_id").count().count()
    result_df = calculate_proportion_of_missing_ctus_per_percentile(
        spark, num_partyids_with_missing_ctus_per_backet, total_num_ids)
    write_to_excel(result_df, "zone_5_ctu_imp_ste_6_miss_ctus")
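# The percentage and bucketing helpers used in main() come from
# helper_functions and are not shown here. The sketch below illustrates steps
# 1-4 of the docstring on the frame returned by get_test_df(); the function
# name, the pct_missing/bucket column names and the bucket edges are
# illustrative assumptions, not the original implementation.
from pyspark.sql import functions as F

def get_party_id_buckets_sketch(imputed_df):
    """Per party ID: percentage of CTUs imputed, mapped to a coarse bucket."""
    pct = (imputed_df
           .groupBy('party_id')
           .agg((100.0 * F.sum('imputed_ctu') /
                 F.countDistinct('ctu')).alias('pct_missing')))
    return pct.withColumn(
        'bucket',
        F.when(F.col('pct_missing') >= 99, '99-100%')
         .when(F.col('pct_missing') >= 75, '75-98%')
         .when(F.col('pct_missing') >= 50, '50-74%')
         .when(F.col('pct_missing') >= 25, '25-49%')
         .otherwise('0-24%'))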
4. Difference (delta) between columns: get_delta_columns_df(joined_df)
5. Calculate the KS statistic and KL divergence
6. KS test
7. Join delta_df, k_test and kl_divergence
"""
import sys
sys.path.append("/home/boldyrek/mysoft/te/te_reporting/")

from scipy import stats

import config as cfg
from col_stats import *
from helper_functions import *
from helper_functions import (start_spark_session, get_imputed_df,
                              suffix_and_join_dfs, write_to_excel, load_df)

spark = start_spark_session()

# Step 1: load the train and predict data frames.
imputed_train = load_df(cfg.SPLIT_TRAIN_PATH)
imputed_predict = load_df(cfg.SPLIT_PRED_PATH)
# imputed_train = test_df
# imputed_predict = test_df

# Step 2: get descriptive statistics for every column.
imputed_train_descriptive_stats = get_df_with_descriptive_stats_for_columns(
    spark, imputed_train)
imputed_predict_descriptive_stats = get_df_with_descriptive_stats_for_columns(
    spark, imputed_predict)

# Step 3: join the two sets of statistics on column_name.
joined_descriptive_stats = suffix_and_join_dfs(
    imputed_train_descriptive_stats, imputed_predict_descriptive_stats,
    'column_name')
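# Steps 5-6 of the docstring rely on helpers that are not shown in this file.
# The sketch below shows one way a per-column KS test and KL divergence could
# be computed with scipy.stats (already imported above) on two 1-D samples;
# the function name and the histogram binning choice are assumptions for
# illustration only, not the original implementation.
import numpy as np

def ks_and_kl_for_column_sketch(train_values, predict_values, bins=20):
    """Return (KS statistic, p-value, KL divergence) for two 1-D samples."""
    ks_stat, p_value = stats.ks_2samp(train_values, predict_values)
    # Histogram both samples on a common grid so the KL divergence is defined.
    edges = np.histogram_bin_edges(
        np.concatenate([train_values, predict_values]), bins=bins)
    p, _ = np.histogram(train_values, bins=edges, density=True)
    q, _ = np.histogram(predict_values, bins=edges, density=True)
    eps = 1e-12  # keep empty bins from producing log(0) or division by zero
    kl = stats.entropy(p + eps, q + eps)
    return ks_stat, p_value, kl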