def main():
    """
    1. For every party ID caclulate number of CTU's imputed
    2. Calculate number of distinct CTU's per party id
    3. Devide number of CTU's imputed by distinct number of CTUs
    4. Create buckets 0.00% -24%, 25% - 49%, 50 - 74%, 75% - 98%, 99% -100 
    in every bucket all ther partyid those that fit inside thier backet
    5. Caclulcate proportion of partyids that fit inside the bucket out of total partyids
        Calculate proportion of accounts that have more than:	
        99% missing	0.20
        75% missing	0.40
        50% missing	0.60
        25% missing	0.70

    """
    spark = start_spark_session()
    imputed_df = get_imputed_df(cfg.IMPUTATION_TRAIN_PATH,
                                cfg.IMPUTATION_PREDICT_PATH)

    num_imputed_ctus_per_partyid = get_num_imputed_ctus_per_partyid(imputed_df)
    num_distinct_ctus_per_partyid = get_num_distinct_ctus_per_partyid(imputed_df)
    joined_num_distinct_imputed_ctus_df = join(num_distinct_ctus_per_partyid,
                                               num_imputed_ctus_per_partyid)
    percentage_of_missing_ctus_per_partyid = get_percentage_of_missing_ctus_per_party_id(
        joined_num_distinct_imputed_ctus_df)

    party_id_and_its_bucket = create_buckets(percentage_of_missing_ctus_per_partyid)
    num_partyids_with_missing_ctus_per_bucket = get_num_partyids_per_backet(
        party_id_and_its_bucket)
    total_num_ids = imputed_df.groupby("party_id").count().count()
    result_df = calculate_proportion_of_missing_ctus_per_percentile(
        spark, num_partyids_with_missing_ctus_per_bucket, total_num_ids)
    write_to_excel(result_df, "zone_5_ctu_imp_ste_6_miss_ctus")
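

# The bucketing helpers used in main() above (create_buckets, get_num_partyids_per_backet,
# calculate_proportion_of_missing_ctus_per_percentile, ...) are defined elsewhere in the
# repo and are not shown in this snippet. The function below is only a rough sketch of the
# bucketing described in steps 4-5 of the docstring; the column names "pct_missing" and
# "bucket", the cut-offs, and the function name are assumptions, not the project's actual
# implementation.
from pyspark.sql import functions as F


def bucket_missing_ctus_sketch(pct_missing_df, total_num_ids):
    """Assign each party_id to a missing-CTU bucket and return the share of party IDs per bucket."""
    bucket = (F.when(F.col("pct_missing") < 25, "0%-24%")
               .when(F.col("pct_missing") < 50, "25%-49%")
               .when(F.col("pct_missing") < 75, "50%-74%")
               .when(F.col("pct_missing") < 99, "75%-98%")
               .otherwise("99%-100%"))
    bucketed = pct_missing_df.withColumn("bucket", bucket)
    return (bucketed.groupBy("bucket").count()
            .withColumn("proportion_of_partyids",
                        F.round(F.col("count") / total_num_ids, 2)))

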
def run_twitter_search(keyword, output_file):
    print(keyword.upper())
    counter = 0
    try:
        tso = TwitterSearchOrder() # create a TwitterSearchOrder object
        tso.set_keywords([str(keyword)]) # search for the given keyword
        # tso.set_language('en') # we want to see English tweets only
        tso.set_include_entities(True) # include entity metadata with each tweet
        tso.set_geocode(45.551279, -92.586955, 530, imperial_metric=True) # only tweets within a 530-mile radius of this point

        for tweet in ts.search_tweets_iterable(tso):
            counter = counter + 1
            search_term = keyword
            username = "******" if tweet['user']['screen_name'] is None else tweet['user']['screen_name']
            text = "NONE" if tweet['text'] is None else tweet['text']
            place = "NONE" if tweet['place'] is None else tweet['place']
            if (tweet['coordinates'] is not None):
                lat = tweet['coordinates']['coordinates'][1]
                lng = tweet['coordinates']['coordinates'][0]
            elif (tweet['place'] is not None):
                place_coordinates = tweet['place']['bounding_box']['coordinates']
                sum_lat = 0
                sum_lng = 0
                for pair in place_coordinates[0]:
                    sum_lat += pair[1]
                    sum_lng += pair[0]
                lat = sum_lat / len(place_coordinates[0])
                lng = sum_lng / len(place_coordinates[0])
                place = tweet['place']['full_name']
            else:
                lat = "NONE"
                lng = "NONE"
            location = "NONE" if tweet['user']['location'] is None else tweet['user']['location']
            created_at = "NONE" if tweet['created_at'] is None else tweet['created_at']
            description = "NONE" if tweet['user']['description'] is None else tweet['user']['description']
            verified = "NONE" if tweet['user']['verified'] is None else str(tweet['user']['verified'])
            sentiment_score = vaderSentiment(text.encode('utf-8'))
            compound_sentiment = sentiment_score['compound']
            description_sentiment = vaderSentiment(description.encode('utf-8'))['compound']
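            # Note: this relies on the older module-level vaderSentiment() API. With current
            # releases of the vaderSentiment package, an equivalent (shown here only as an
            # illustration, not what this script imports) would be roughly:
            #   from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
            #   compound_sentiment = SentimentIntensityAnalyzer().polarity_scores(text)['compound']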
            try:
                df.loc[len(df)] = [search_term, username, text, lat, lng, location, created_at, place, description, verified, sentiment_score, compound_sentiment, description_sentiment]
                if((len(df) % 200) == 0):
                    write_to_excel(output_file, 'Sheet1', df)
                    print("_%s %s tweets/%s total" % (counter, keyword.upper(), len(df)))
            except Exception:
                write_to_excel(output_file, str(keyword), df)
            if(counter == 10000):
                return
        write_to_excel(output_file, str(keyword), df)
        print("_______%s tweets saved" % (len(df)))

    except TwitterSearchException as e: # take care of all those ugly errors if there are some
        print(e)
    num_party_ids_with_positive_outcome =\
            imputed_df.where(where_clause).select('party_id').distinct().count()

    return num_party_ids_with_positive_outcome


def main():

    spark = start_spark_session()
    imputed_df = get_imputed_df(cfg.IMPUTATION_TRAIN_PATH,
                                cfg.IMPUTATION_PREDICT_PATH)

    num_party_ids = imputed_df.select("party_id").distinct().count()

    num_rows = imputed_df.count()

    num_party_ids_with_positive_outcome = get_num_party_ids_with_positive_outcome(
        imputed_df)

    result = spark.createDataFrame(
        [['Number of customers', num_party_ids], ['Number of rows', num_rows],
         [
             'Number of customers with a positive outcome',
             num_party_ids_with_positive_outcome
         ]], ['', 'Value'])
    return result


result = main()
write_to_excel(result, "zone_5_model_data_funnel_step_8")
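

# write_to_excel is imported from helper_functions and is not shown in these snippets. A
# minimal sketch of a helper with the same (spark_df, sheet_name) call shape, assuming
# pandas with the openpyxl engine; the output path, the append mode, and the function name
# here are assumptions, not the project's actual implementation.
import pandas as pd


def write_to_excel_sketch(spark_df, sheet_name, path="report.xlsx"):
    """Convert a Spark DataFrame to pandas and write it as one sheet of an Excel workbook."""
    pdf = spark_df.toPandas()
    # mode="a" appends/replaces a sheet in an existing workbook; use mode="w" for a new file.
    with pd.ExcelWriter(path, engine="openpyxl", mode="a", if_sheet_exists="replace") as writer:
        pdf.to_excel(writer, sheet_name=sheet_name, index=False)
# Excel caps sheet names at 31 characters, which is presumably why the tab names in these
# scripts are abbreviated (e.g. "zone_5_model_data_funnel_step_8").
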
delta_df = get_delta_descriptive_stats_df(joined_descriptive_stats, '2' )

# Step 5 ks stats
def get_df_with_ks_stats(imputed_train, imputed_predict):
    columns = imputed_train.schema.names
    col_ks = []
    for col in columns:
        imputed_train_col = imputed_train.select(col).toPandas()[col].tolist()
        imputed_predict_col = imputed_predict.select(col).toPandas()[col].tolist()
        try:
            # ks_2samp returns (statistic, p-value) in that order
            ks = stats.ks_2samp(imputed_train_col, imputed_predict_col)
            kd = str(round(ks[0], 2))
            p_value = str(round(ks[1], 2))
        except Exception as e:
            #print('col ',col ,e)
            p_value = ''
            kd = ''
        col_ks.append((col, p_value, kd))
    ks_stats_df = spark.createDataFrame(col_ks, ['column_name_ks', 'p_value', 'kd'])
    return ks_stats_df

ks_stats_df = get_df_with_ks_stats(imputed_train, imputed_predict)
#ks_stats_df.show()
# Step 7 Join
delta_df = delta_df.join(
    ks_stats_df, col('column_name') == col('column_name_ks')).select(
        'column_name', 'delta_min', 'delta_max', 'delta_mean', 'delta_stddev',
        'delta_median', 'p_value', 'kd')
write_to_excel(delta_df, 'zone_5_split_fea_stats_ste_9')
# Example 5
    return imputation_cols_df


"""
****** MAIN ******
1. Create spark session 
2. Read the file into a dataframe
4. Calculate statistical summary for every column in a dataframe
5. Get imputation approach from the te_constants.py 
6. Join dfs from 4 and 5
7. Save it as an excel tab 
"""

#file_name = "../data/example.csv"
# Step 1 create spark session
spark = start_spark_session()
# Step 2 Read file into df
gen_pre_df = load_df(cfg.GEN_PREPROCESS_PATH)
# Step 4 Calculate statistical summary for every column in a dataframe
columns_summary_stats_df = get_summary_stats_for_every_column(gen_pre_df)
# Step 5 Get imputation approach from the te_constants.py
columns = gen_pre_df.columns
imputation_cols_df = get_column_imputation_approach_df(columns)
# Step 6 Join dfs from 4 and 5
excel_ready_df = columns_summary_stats_df.join(
    imputation_cols_df,
    spark_funcs.col('column') == spark_funcs.col('column_name2'), "left_outer")
# Step 7 Save it as an excel tab
excel_ready_df = excel_ready_df.drop("column_name2")
write_to_excel(excel_ready_df, "zone_5_col_imputation_step_3")
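
# get_column_imputation_approach_df is defined elsewhere in the repo. A minimal sketch of
# what step 5 presumably does, assuming te_constants.py exposes a dict mapping column name
# to imputation approach (the dict argument, the "none" default, and the function name
# below are assumptions; only the "column_name2" output column is taken from the join above):


def get_column_imputation_approach_df_sketch(columns, approach_by_column):
    """Build a (column_name2, imputation_approach) DataFrame for the given columns."""
    rows = [(c, approach_by_column.get(c, "none")) for c in columns]
    return spark.createDataFrame(rows, ["column_name2", "imputation_approach"])
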
# Example 6
                    newLat = coordinates[0]
                    newLng = coordinates[1]
                    counter += 1
                    df_output.loc[len(df_output)] = [
                        tweet.keyword, tweet.username, tweet.text, newLat,
                        newLng, tweet.location, tweet.created_at, tweet.place,
                        tweet.description, tweet.verified,
                        tweet.sentiment_score, tweet.compound_sentiment,
                        tweet.description_sentiment
                    ]
                    print("stored: %s, to go: %s, dns: %s" %
                          (counter, num_left, dns_counter))

                    if ((counter % 200) == 0):
                        try:
                            write_to_excel(output_file, 'Sheet1', df_output)
                            print("%s SAVED" % (counter))
                        except:
                            print("ERROR saving new coordinates")
                            # return
                else:
                    print(
                        "coordinates not found for '%s' - dropping tweet by %s"
                        % (tweet.location, tweet.username))
        else:
            print("no location information - dropping tweet by %s" %
                  (tweet.username))
    else:
        counter += 1
        print("coordinates already found")
        df_output.loc[len(df_output)] = [
            tweet.keyword, tweet.username, tweet.text, tweet.lat, tweet.lng,
            tweet.location, tweet.created_at, tweet.place, tweet.description,
            tweet.verified, tweet.sentiment_score, tweet.compound_sentiment,
            tweet.description_sentiment
        ]
    proportion_of_positives = joined_df.withColumn(
        "proportion_with_positive_target",
        round(col('count(party_id)') / col('count(party_id)_2'), 2)
    ).select('CTU', "proportion_with_positive_target")

    return proportion_of_positives


def add_zero_proportions_to_empty_ctus(proportion_of_positives, ctus_all,
                                       spark):
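    # ctus_all is expected to be an iterable of Row-like objects (hence ctu[0]); for every
    # CTU value that has no row in proportion_of_positives, append a row with proportion 0.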

    for ctu in sorted(ctus_all):
        ctu_query = "ctu == {0}".format(ctu[0])
        ctu_value = proportion_of_positives.where(ctu_query).select(
            'CTU').collect()
        if len(ctu_value) == 0:
            new_df = spark.createDataFrame(
                [[ctu[0], 0]], ['CTU', 'proportion_with_positive_target'])
            proportion_of_positives = proportion_of_positives.union(new_df)
    return proportion_of_positives


imputed_df = get_imputed_df(cfg.IMPUTATION_TRAIN_PATH,
                            cfg.IMPUTATION_PREDICT_PATH)
proportion_of_positives = main(imputed_df, spark)
proportion_of_positives = proportion_of_positives.withColumn(
    'CTU', proportion_of_positives['CTU'].cast(IntegerType()))
proportion_of_positives = proportion_of_positives.orderBy(asc('CTU'))
write_to_excel(proportion_of_positives, "zone_5_ta_def_7_prop_per_ctu")
# Example 8
Descriptive statistics on the number of positive labels across all customers

Minimum    Maximum    Mean    Standard deviation    Median
1          2          1       1.5                   1
"""
import sys
sys.path.append("/home/boldyrek/mysoft/te/te_reporting/")
from helper_functions import start_spark_session, get_imputed_df, load_df, write_to_excel
from col_stats import *
from pyspark.sql.functions import col
import config as cfg

spark = start_spark_session()
imputed_df = get_imputed_df( cfg.IMPUTATION_TRAIN_PATH, cfg.IMPUTATION_PREDICT_PATH )

"""
take the targets where target is equal 1, group by party id, count how many 1 targets
"""
imputed_df_count = imputed_df.where("te_2month = 1").groupBy('party_id').agg({'te_2month' : 'count'})
imputed_df_count_te_2month = imputed_df_count.select("count(te_2month)")
minimum = calc_column_min(imputed_df_count_te_2month)
maximum = calc_column_max(imputed_df_count_te_2month)
mean = calc_column_avg(imputed_df_count_te_2month)
stdev = calc_column_stddev(imputed_df_count_te_2month)
median = calc_column_median(imputed_df_count_te_2month)
positive_label_stats_across_customers_df = spark.createDataFrame(
    [[minimum, maximum, mean, stdev, median]],
    ['minimum', 'maximum', 'mean', 'stdev', 'median'])
write_to_excel(positive_label_stats_across_customers_df, "zone_5_target_definitio_step_7")
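
# The calc_column_* helpers come from col_stats (imported with * above) and are not shown
# here. A rough sketch of how the median helper might work for a single-column Spark
# DataFrame; the function name and the 0.01 relative error are assumptions:


def calc_column_median_sketch(single_col_df):
    """Approximate median of the only column in the DataFrame via approxQuantile."""
    col_name = single_col_df.columns[0]
    return single_col_df.approxQuantile(col_name, [0.5], 0.01)[0]
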

    return new_order_cols


"""
***** MAIN *******
"""

#file_name = "../data/example.csv"

spark = start_spark_session()
df = load_df(cfg.PREPROCESS_PATH)
#df = load_df(file_name)
#df = get_imputed_df(cfg.IMPUTATION_TRAIN_PATH)
event_stats_ctus_dfs = []
max_CTU_num = get_max_CTU_num(df)
for CTU_num in range(max_CTU_num):
    print(CTU_num)
    single_CTU_df = df.filter(f"CTU == { CTU_num }")  # get a df with one CTU
    single_CTU_clean_df = get_df_with_dropped_garbage_cols(single_CTU_df)
    ctu_event_stats_df = get_event_stats_df(single_CTU_clean_df, CTU_num)
    event_stats_ctus_dfs.append(ctu_event_stats_df)

joined_ctu_event_stats_df = get_joined_df(event_stats_ctus_dfs)
#print_df (joined_ctu_event_stats_df)
joined_ctu_event_stats_df = split_column_by_underscore(
    joined_ctu_event_stats_df)
new_order_cols = get_event_cols_first(joined_ctu_event_stats_df)
joined_ctu_event_stats_df = joined_ctu_event_stats_df.select(
    new_order_cols).orderBy('event')
write_to_excel(joined_ctu_event_stats_df, "zone_5_ctu_report_step_5")
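
# get_max_CTU_num is defined elsewhere; presumably something along these lines (a sketch,
# assuming CTU is an integer column). Note that range(max_CTU_num) in the loop above stops
# at max_CTU_num - 1, so if CTU values run from 0 to the maximum inclusive, the highest CTU
# is not reported.


def get_max_CTU_num_sketch(df):
    """Return the largest CTU value in the DataFrame as an int."""
    return int(df.agg({"CTU": "max"}).collect()[0][0])
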
    """
    columns_to_drop = [
        'level_0', 'index', 'Unnamed: 0', '_c0', 'party_id', 'event_date',
        'CTU', 'event_id'
    ]
    df_to_drop = df.select('*')
    df_to_drop = df_to_drop.drop(*columns_to_drop)

    return df_to_drop


"""
*** MAIN ***
"""

spark = start_spark_session()
prepro_df = load_df(cfg.PREPROCESS_PATH)
num_rows = prepro_df.count()
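# For every column, compute the fraction of rows with a non-zero value ("event rate"):
# F.when(...) is null where the condition is false and F.count ignores nulls, so the count
# is the number of rows where the column != 0, divided by the total row count.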
event_rate_df = prepro_df.select([
    (F.count(F.when(prepro_df[c] != 0, c)) / num_rows).alias(c)
    for c in prepro_df.columns
])
event_rate_df_clean = drop_garbage_cols(event_rate_df)
event_rate_df_clean_pd = (event_rate_df_clean.toPandas().transpose().reset_index()
                          .rename(columns={0: 'Column event rate', 'index': 'Column names'}))
event_rate_df_clean_spark = spark.createDataFrame(event_rate_df_clean_pd)
write_to_excel(event_rate_df_clean_spark, "zone_5_stage_I_aggrega_step_2")
"""
**** MAIN *****
"""

spark = start_spark_session()

preprocessing_df = load_df(cfg.PREPROCESS_PATH)
preprocessing_columns_with_stats = get_descriptive_statistics_for_columns(
    preprocessing_df)
preprocessing_cols_stats_df = spark.createDataFrame(
    preprocessing_columns_with_stats, ['column', 'max', 'min', 'mean'])

imputed_df = get_imputed_df(cfg.IMPUTATION_TRAIN_PATH,
                            cfg.IMPUTATION_PREDICT_PATH)
imputed_columns_with_stats = get_descriptive_statistics_for_columns(imputed_df)
imputed_cols_stats_df = spark.createDataFrame(imputed_columns_with_stats,
                                              ['column', 'max', 'min', 'mean'])


# Rename the pre-processing stats columns with a "_pre" suffix so they can be joined with
# the imputed stats below without column-name clashes.
preprocessing_cols_stats_df_re = preprocessing_cols_stats_df.select(
    *(col(x).alias(x + '_pre') for x in preprocessing_cols_stats_df.columns))
joined_df = preprocessing_cols_stats_df_re.join(
    imputed_cols_stats_df,
    preprocessing_cols_stats_df_re.column_pre == imputed_cols_stats_df.column)

delta_columns_df = get_delta_columns_df(joined_df)
delta_columns_df = delta_columns_df.select('column', 'delta_min', 'delta_max',
                                           'delta_mean')
write_to_excel(delta_columns_df, "zone_5_ctu_imputation_step_6")
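
# get_delta_columns_df is defined elsewhere; a rough sketch of the delta computation it
# presumably performs on the joined stats, assuming the min/max/mean values are numeric
# (the direction of the subtraction and the function name are assumptions):
from pyspark.sql import functions as F


def get_delta_columns_df_sketch(joined_df):
    """Add delta_* columns comparing pre-processing stats (suffix _pre) with imputed stats."""
    return (joined_df
            .withColumn("delta_min", F.col("min_pre") - F.col("min"))
            .withColumn("delta_max", F.col("max_pre") - F.col("max"))
            .withColumn("delta_mean", F.col("mean_pre") - F.col("mean")))
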
                print("DO_NOT_SEARCH here")
                dns_counter += 1
            else:
                googleLocation = geocoder.google(tweet.location, key=GOOGLE_API_KEY)
                coordinates = googleLocation.latlng
                if (len(coordinates) == 2):
                    # warning: unless any of the input data has decimal values in it, this will auto-round the coordinates
                    newLat = coordinates[0]
                    newLng = coordinates[1]
                    counter += 1
                    df_output.loc[len(df_output)] = [tweet.keyword, tweet.username, tweet.text, newLat, newLng, tweet.location, tweet.created_at, tweet.place, tweet.description, tweet.verified, tweet.sentiment_score, tweet.compound_sentiment, tweet.description_sentiment]
                    print("stored: %s, to go: %s, dns: %s" % (counter, num_left, dns_counter))

                    if((counter % 200) == 0):
                        try:
                            write_to_excel(output_file, 'Sheet1', df_output)
                            print("%s SAVED" % (counter))
                        except:
                            print("ERROR saving new coordinates")
                            # return
                else:
                    print("coordinates not found for '%s' - dropping tweet by %s" % (tweet.location, tweet.username))   
        else:
            print("no location information - dropping tweet by %s" % (tweet.username))
    else:
        counter += 1
        print("coordinates already found")
        df_output.loc[len(df_output)] = [tweet.keyword, tweet.username, tweet.text, tweet.lat, tweet.lng, tweet.location, tweet.created_at, tweet.place, tweet.description, tweet.verified, tweet.sentiment_score, tweet.compound_sentiment, tweet.description_sentiment]
        print("stored: %s, to go: %s, dns: %s" % (counter, num_left, dns_counter))
        
        if((counter % 200) == 0):