def main():
    """Build a small summary table with customer and row counts."""
    spark = start_spark_session()
    imputed_df = get_imputed_df(cfg.IMPUTATION_TRAIN_PATH,
                                cfg.IMPUTATION_PREDICT_PATH)
    num_party_ids = imputed_df.select("party_id").distinct().count()
    num_rows = imputed_df.count()
    num_party_ids_with_positive_outcome = get_num_party_ids_with_positive_outcome(
        imputed_df)
    result = spark.createDataFrame(
        [['Number of customers', num_party_ids],
         ['Number of rows', num_rows],
         ['Number of customers with a positive outcome',
          num_party_ids_with_positive_outcome]],
        ['', 'Value'])
    return result
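# get_num_party_ids_with_positive_outcome() is called above but not defined in
# this file. The function below is only a minimal sketch of what it might look
# like, assuming the positive-outcome flag lives in a column named 'outcome';
# that column name and the _sketch suffix are assumptions, not the original code.
def get_num_party_ids_with_positive_outcome_sketch(imputed_df):
    """Count distinct party IDs that have at least one positive-outcome row."""
    return (imputed_df
            .filter(imputed_df['outcome'] == 1)  # hypothetical outcome column
            .select('party_id')
            .distinct()
            .count())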
def get_test_df():
    """ Creating test data frame """
    spark = start_spark_session()
    return spark.createDataFrame([
        (1, 1, 0), (1, 2, 0), (1, 3, 0), (1, 4, 1), (1, 5, 0),
        (2, 1, 0), (2, 2, 0), (2, 3, 1),
        (3, 1, 1), (3, 2, 1), (3, 3, 1),
        (4, 1, 1), (4, 2, 1), (4, 3, 1), (4, 4, 0),
        (5, 1, 1), (5, 2, 1), (5, 3, 1), (5, 4, 0), (5, 5, 0),
        (6, 1, 1), (6, 2, 1), (6, 3, 1), (6, 4, 0), (6, 5, 0),
        (7, 1, 0), (7, 2, 0), (6, 3, 0), (6, 4, 0), (6, 5, 0),
    ], ['party_id', 'ctu', 'imputed_ctu'])
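# start_spark_session() is imported from helper_functions and not shown in this
# file. A minimal sketch of such a helper, assuming no special Spark
# configuration is needed, could look like this; the app name is an assumption.
def start_spark_session_sketch():
    """Return the active SparkSession, creating one if needed."""
    from pyspark.sql import SparkSession
    return SparkSession.builder.appName("te_reporting").getOrCreate()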
def main():
    """
    1. For every party ID, calculate the number of CTUs imputed.
    2. Calculate the number of distinct CTUs per party ID.
    3. Divide the number of imputed CTUs by the number of distinct CTUs.
    4. Create buckets (0-24%, 25-49%, 50-74%, 75-98%, 99-100%) and assign
       every party ID to the bucket its missing-CTU percentage falls into.
    5. Calculate the proportion of party IDs in each bucket out of the total
       number of party IDs.

    Calculate the proportion of accounts that have more than:
        99% missing: 0.20
        75% missing: 0.40
        50% missing: 0.60
        25% missing: 0.70
    """
    spark = start_spark_session()
    imputed_df = get_imputed_df(cfg.IMPUTATION_TRAIN_PATH,
                                cfg.IMPUTATION_PREDICT_PATH)
    num_imputed_ctus_per_partyid = get_num_imputed_ctus_per_partyid(imputed_df)
    num_distinct_ctus_per_partyid = get_num_distinct_ctus_per_partyid(
        imputed_df)
    joined_num_distinct_imputed_ctus_df = join(num_distinct_ctus_per_partyid,
                                               num_imputed_ctus_per_partyid)
    percentage_of_missing_ctus_per_partyid = \
        get_percentage_of_missing_ctus_per_party_id(
            joined_num_distinct_imputed_ctus_df)
    party_id_and_its_bucket = create_buckets(
        percentage_of_missing_ctus_per_partyid)
    num_partyids_with_missing_ctus_per_backet = get_num_partyids_per_backet(
        party_id_and_its_bucket)
    # Number of distinct party IDs in the imputed data frame.
    total_num_ids = imputed_df.groupby("party_id").count().count()
    result_df = calculate_proportion_of_missing_ctus_per_percentile(
        spark, num_partyids_with_missing_ctus_per_backet, total_num_ids)
    write_to_excel(result_df, "zone_5_ctu_imp_ste_6_miss_ctus")
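# The percentage and bucketing helpers used in main() come from
# helper_functions and are not shown here. The sketch below illustrates steps
# 1-4 of the docstring on the frame returned by get_test_df(); the function
# name, the pct_missing/bucket column names and the bucket edges are
# illustrative assumptions, not the original implementation.
from pyspark.sql import functions as F

def get_party_id_buckets_sketch(imputed_df):
    """Per party ID: percentage of CTUs imputed, mapped to a coarse bucket."""
    pct = (imputed_df
           .groupBy('party_id')
           .agg((100.0 * F.sum('imputed_ctu') /
                 F.countDistinct('ctu')).alias('pct_missing')))
    return pct.withColumn(
        'bucket',
        F.when(F.col('pct_missing') >= 99, '99-100%')
         .when(F.col('pct_missing') >= 75, '75-98%')
         .when(F.col('pct_missing') >= 50, '50-74%')
         .when(F.col('pct_missing') >= 25, '25-49%')
         .otherwise('0-24%'))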
4. Difference (delta) between columns: get_delta_columns_df(joined_df)
5. Calculate the KS statistic and KL divergence
6. KS test
7. Join delta_df, k_test and kl_divergence
"""
import sys
sys.path.append("/home/boldyrek/mysoft/te/te_reporting/")

from scipy import stats

import config as cfg
from col_stats import *
from helper_functions import *
from helper_functions import (start_spark_session, get_imputed_df,
                              suffix_and_join_dfs, write_to_excel, load_df)

spark = start_spark_session()

# Step 1: load the train and predict data frames.
imputed_train = load_df(cfg.SPLIT_TRAIN_PATH)
imputed_predict = load_df(cfg.SPLIT_PRED_PATH)
# imputed_train = test_df
# imputed_predict = test_df

# Step 2: get descriptive statistics for every column.
imputed_train_descriptive_stats = get_df_with_descriptive_stats_for_columns(
    spark, imputed_train)
imputed_predict_descriptive_stats = get_df_with_descriptive_stats_for_columns(
    spark, imputed_predict)

# Step 3: join the two sets of statistics on column_name.
joined_descriptive_stats = suffix_and_join_dfs(
    imputed_train_descriptive_stats, imputed_predict_descriptive_stats,
    'column_name')
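# Steps 5-6 of the docstring rely on helpers that are not shown in this file.
# The sketch below shows one way a per-column KS test and KL divergence could
# be computed with scipy.stats (already imported above) on two 1-D samples;
# the function name and the histogram binning choice are assumptions for
# illustration only, not the original implementation.
import numpy as np

def ks_and_kl_for_column_sketch(train_values, predict_values, bins=20):
    """Return (KS statistic, p-value, KL divergence) for two 1-D samples."""
    ks_stat, p_value = stats.ks_2samp(train_values, predict_values)
    # Histogram both samples on a common grid so the KL divergence is defined.
    edges = np.histogram_bin_edges(
        np.concatenate([train_values, predict_values]), bins=bins)
    p, _ = np.histogram(train_values, bins=edges, density=True)
    q, _ = np.histogram(predict_values, bins=edges, density=True)
    eps = 1e-12  # keep empty bins from producing log(0) or division by zero
    kl = stats.entropy(p + eps, q + eps)
    return ks_stat, p_value, kl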