def main(): """ 1. For every party ID caclulate number of CTU's imputed 2. Calculate number of distinct CTU's per party id 3. Devide number of CTU's imputed by distinct number of CTUs 4. Create buckets 0.00% -24%, 25% - 49%, 50 - 74%, 75% - 98%, 99% -100 in every bucket all ther partyid those that fit inside thier backet 5. Caclulcate proportion of partyids that fit inside the bucket out of total partyids Calculate proportion of accounts that have more than: 99% missing 0.20 75% missing 0.40 50% missing 0.60 25% missing 0.70 """ spark = start_spark_session() imputed_df = get_imputed_df(cfg.IMPUTATION_TRAIN_PATH, cfg.IMPUTATION_PREDICT_PATH) num_imputed_ctus_per_partyid = get_num_imputed_ctus_per_partyid(imputed_df) num_distinct_ctus_per_partyid = get_num_distinct_ctus_per_partyid( imputed_df) joined_num_distinct_imputed_ctus_df = join(num_distinct_ctus_per_partyid,\ num_imputed_ctus_per_partyid) percentage_of_missing_ctus_per_partyid = \ get_percentage_of_missing_ctus_per_party_id( joined_num_distinct_imputed_ctus_df ) party_id_and_its_bucket = create_buckets( percentage_of_missing_ctus_per_partyid) num_partyids_with_missing_ctus_per_backet = get_num_partyids_per_backet( party_id_and_its_bucket) total_num_ids = imputed_df.groupby("party_id").count().count() result_df = calculate_proportion_of_missing_ctus_per_percentile ( spark, num_partyids_with_missing_ctus_per_backet, \ total_num_ids ) write_to_excel(result_df, "zone_5_ctu_imp_ste_6_miss_ctus")
def run_twitter_search(keyword, output_file):
    print(keyword.upper())
    counter = 0
    try:
        tso = TwitterSearchOrder()  # create a TwitterSearchOrder object
        tso.set_keywords([str(keyword)])  # define all words we would like to look for
        # tso.set_language('en')  # we want to see English tweets only
        tso.set_include_entities(True)  # include the entity information
        tso.set_geocode(45.551279, -92.586955, 530, imperial_metric=True)

        for tweet in ts.search_tweets_iterable(tso):
            counter = counter + 1
            search_term = keyword
            username = "******" if tweet['user']['screen_name'] is None else tweet['user']['screen_name']
            text = "NONE" if tweet['text'] is None else tweet['text']
            place = "NONE" if tweet['place'] is None else tweet['place']

            if tweet['coordinates'] is not None:
                lat = tweet['coordinates']['coordinates'][1]
                lng = tweet['coordinates']['coordinates'][0]
            elif tweet['place'] is not None:
                # average the corners of the place bounding box to get a point
                place_coordinates = tweet['place']['bounding_box']['coordinates']
                sum_lat = 0
                sum_lng = 0
                for pair in place_coordinates[0]:
                    sum_lat += pair[1]
                    sum_lng += pair[0]
                lat = sum_lat / len(place_coordinates[0])
                lng = sum_lng / len(place_coordinates[0])
                place = tweet['place']['full_name']
            else:
                lat = "NONE"
                lng = "NONE"

            location = "NONE" if tweet['user']['location'] is None else tweet['user']['location']
            created_at = "NONE" if tweet['created_at'] is None else tweet['created_at']
            description = "NONE" if tweet['user']['description'] is None else tweet['user']['description']
            verified = "NONE" if tweet['user']['verified'] is None else str(tweet['user']['verified'])

            sentiment_score = vaderSentiment(text.encode('utf-8'))
            compound_sentiment = sentiment_score['compound']
            description_sentiment = vaderSentiment(description.encode('utf-8'))['compound']

            try:
                df.loc[len(df)] = [search_term, username, text, lat, lng,
                                   location, created_at, place, description,
                                   verified, sentiment_score,
                                   compound_sentiment, description_sentiment]
                if (len(df) % 200) == 0:
                    write_to_excel(output_file, 'Sheet1', df)
                    print("_%s %s tweets/%s total" % (counter, keyword.upper(), len(df)))
            except:
                write_to_excel(output_file, str(keyword), df)

            if counter == 10000:
                return

        write_to_excel(output_file, str(keyword), df)
        print("_______%s tweets saved" % (len(df)))
    except TwitterSearchException as e:  # take care of all those ugly errors if there are some
        print(e)
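# write_to_excel is defined elsewhere in this script; a minimal sketch of the
# behaviour assumed by the calls above (pandas DataFrame -> one sheet of an
# .xlsx file). The signature matches the call sites, but the real helper may
# differ.
def write_to_excel_sketch(output_file, sheet_name, dataframe):
    """Dump the accumulated tweets to a single sheet of an Excel workbook."""
    import pandas as pd

    with pd.ExcelWriter(output_file) as writer:
        dataframe.to_excel(writer, sheet_name=sheet_name, index=False)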
    num_party_ids_with_positive_outcome = \
        imputed_df.where(where_clause).select('party_id').distinct().count()
    return num_party_ids_with_positive_outcome


def main():
    spark = start_spark_session()
    imputed_df = get_imputed_df(cfg.IMPUTATION_TRAIN_PATH,
                                cfg.IMPUTATION_PREDICT_PATH)
    num_party_ids = imputed_df.select("party_id").distinct().count()
    num_rows = imputed_df.count()
    num_party_ids_with_positive_outcome = get_num_party_ids_with_positive_outcome(
        imputed_df)
    result = spark.createDataFrame(
        [['Number of customers', num_party_ids],
         ['Number of rows', num_rows],
         ['Number of customers with a positive outcome',
          num_party_ids_with_positive_outcome]],
        ['', 'Value'])
    return result


result = main()
write_to_excel(result, "zone_5_model_data_funnel_step_8")
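# Sketch of how the positive-outcome count above is likely built, assuming the
# target column is te_2month (the target used by the other reports in this
# repo); the real where_clause is assembled earlier in this file.
def get_num_party_ids_with_positive_outcome_sketch(imputed_df, target_col="te_2month"):
    """Count distinct party_ids that have at least one row with a positive target."""
    return (imputed_df
            .where("{0} = 1".format(target_col))
            .select("party_id")
            .distinct()
            .count())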
delta_df = get_delta_descriptive_stats_df(joined_descriptive_stats, '2')


# Step 5 KS stats
def get_df_with_ks_stats(imputed_train, imputed_predict):
    columns = imputed_train.schema.names
    col_ks = []
    for col_name in columns:
        imputed_train_col = imputed_train.select(col_name).toPandas()[col_name].tolist()
        imputed_predict_col = imputed_predict.select(col_name).toPandas()[col_name].tolist()
        try:
            ks = stats.ks_2samp(imputed_train_col, imputed_predict_col)
            # ks_2samp returns (statistic, p-value) in that order
            kd = str(round(ks[0], 2))
            p_value = str(round(ks[1], 2))
        except Exception as e:
            # print('col ', col_name, e)
            p_value = ''
            kd = ''
        col_ks.append((col_name, p_value, kd))
    ks_stats_df = spark.createDataFrame(col_ks, ['column_name_ks', 'p_value', 'kd'])
    return ks_stats_df


ks_stats_df = get_df_with_ks_stats(imputed_train, imputed_predict)
# ks_stats_df.show()

# Step 7 Join
delta_df = delta_df.join(
    ks_stats_df, col('column_name') == col('column_name_ks')).select(
        'column_name', 'delta_min', 'delta_max', 'delta_mean', 'delta_stddev',
        'delta_median', 'p_value', 'kd')

write_to_excel(delta_df, 'zone_5_split_fea_stats_ste_9')
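# Note on the return order of scipy.stats.ks_2samp: the result is
# (statistic, p-value), which is why kd is taken from ks[0] and p_value from
# ks[1] above. A self-contained check:
def _ks_2samp_order_example():
    from scipy import stats

    result = stats.ks_2samp([0.1, 0.2, 0.3, 0.4], [0.1, 0.25, 0.35, 0.5])
    return result.statistic, result.pvalue  # same values as result[0], result[1]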
    return imputation_cols_df


"""
****** MAIN ******
1. Create spark session
2. Read the file into a dataframe
4. Calculate statistical summary for every column in a dataframe
5. Get imputation approach from the te_constants.py
6. Join dfs from 4 and 5
7. Save it as an excel tab
"""
# file_name = "../data/example.csv"

# Step 1 create spark session
spark = start_spark_session()

# Step 2 Read file into df
gen_pre_df = load_df(cfg.GEN_PREPROCESS_PATH)

# Step 4 Calculate statistical summary for every column in a dataframe
columns_summary_stats_df = get_summary_stats_for_every_column(gen_pre_df)

# Step 5 Get imputation approach from the te_constants.py
columns = gen_pre_df.columns
imputation_cols_df = get_column_imputation_approach_df(columns)

# Step 6 Join dfs from 4 and 5
excel_ready_df = columns_summary_stats_df.join(
    imputation_cols_df,
    spark_funcs.col('column') == spark_funcs.col('column_name2'),
    "left_outer")

# Step 7 Save it as an excel tab
excel_ready_df = excel_ready_df.drop("column_name2")
write_to_excel(excel_ready_df, "zone_5_col_imputation_step_3")
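# Sketch of the shape get_column_imputation_approach_df is assumed to return:
# one row per column with its imputation approach, keyed by "column_name2" so
# the join in Step 6 works. The approaches mapping is hypothetical; the real
# lookup lives in te_constants.py.
def get_column_imputation_approach_df_sketch(columns, approaches):
    rows = [(c, approaches.get(c, "none")) for c in columns]
    return spark.createDataFrame(rows, ["column_name2", "imputation_approach"])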
                    newLat = coordinates[0]
                    newLng = coordinates[1]
                    counter += 1
                    df_output.loc[len(df_output)] = [
                        tweet.keyword, tweet.username, tweet.text, newLat,
                        newLng, tweet.location, tweet.created_at, tweet.place,
                        tweet.description, tweet.verified,
                        tweet.sentiment_score, tweet.compound_sentiment,
                        tweet.description_sentiment
                    ]
                    print("stored: %s, to go: %s, dns: %s" %
                          (counter, num_left, dns_counter))
                    if (counter % 200) == 0:
                        try:
                            write_to_excel(output_file, 'Sheet1', df_output)
                            print("%s SAVED" % (counter))
                        except:
                            print("ERROR saving new coordinates")
                            # return
                else:
                    print("coordinates not found for '%s' - dropping tweet by %s"
                          % (tweet.location, tweet.username))
            else:
                print("no location information - dropping tweet by %s" %
                      (tweet.username))
        else:
            counter += 1
            print("coordinates already found")
            df_output.loc[len(df_output)] = [
    proportion_of_positives = joined_df.withColumn(
        "proportion_with_positive_target",
        round(col('count(party_id)') / col('count(party_id)_2'), 2)).select(
            'CTU', "proportion_with_positive_target")
    return proportion_of_positives


def add_zero_proportions_to_empty_ctus(proportion_of_positives, ctus_all, spark):
    # For any CTU with no positive rows, append a row with a 0 proportion so
    # that every CTU shows up in the report.
    for ctu in sorted(ctus_all):
        ctu_query = "ctu == {0}".format(ctu[0])
        ctu_value = proportion_of_positives.where(ctu_query).select('CTU').collect()
        if len(ctu_value) == 0:
            new_df = spark.createDataFrame(
                [[ctu[0], 0]], ['CTU', 'proportion_with_positive_target'])
            proportion_of_positives = proportion_of_positives.union(new_df)
    return proportion_of_positives


imputed_df = get_imputed_df(cfg.IMPUTATION_TRAIN_PATH, cfg.IMPUTATION_PREDICT_PATH)
proportion_of_positives = main(imputed_df, spark)
proportion_of_positives = proportion_of_positives.withColumn(
    'CTU', proportion_of_positives['CTU'].cast(IntegerType()))
proportion_of_positives = proportion_of_positives.orderBy(asc('CTU'))
write_to_excel(proportion_of_positives, "zone_5_ta_def_7_prop_per_ctu")
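# ctus_all (passed to add_zero_proportions_to_empty_ctus) is assumed to be the
# collected list of distinct CTU rows, so each element is a Row and ctu[0] is
# the CTU number. A sketch of how it is likely produced elsewhere in this file:
def get_all_ctus_sketch(imputed_df):
    return imputed_df.select("CTU").distinct().collect()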
Descriptive statistics on the number of positive labels across all customers:

    Minimum  Maximum  Mean  Standard deviation  Median
    1        2        1     1.5                 1
"""
import sys
sys.path.append("/home/boldyrek/mysoft/te/te_reporting/")

from pyspark.sql.functions import col

from col_stats import *
from helper_functions import get_imputed_df, start_spark_session, load_df, write_to_excel
import config as cfg

spark = start_spark_session()
imputed_df = get_imputed_df(cfg.IMPUTATION_TRAIN_PATH, cfg.IMPUTATION_PREDICT_PATH)

# Take the rows where the target equals 1, group by party_id and count how
# many positive targets each customer has.
imputed_df_count = imputed_df.where("te_2month = 1").groupBy('party_id').agg(
    {'te_2month': 'count'})
imputed_df_count_te_2month = imputed_df_count.select("count(te_2month)")

minimum = calc_column_min(imputed_df_count_te_2month)
maximum = calc_column_max(imputed_df_count_te_2month)
mean = calc_column_avg(imputed_df_count_te_2month)
stdev = calc_column_stddev(imputed_df_count_te_2month)
median = calc_column_median(imputed_df_count_te_2month)

positive_label_stats_across_customers_df = spark.createDataFrame(
    [[minimum, maximum, mean, stdev, median]],
    ['minimum', 'maximum', 'mean', 'stdev', 'median'])

write_to_excel(positive_label_stats_across_customers_df,
               "zone_5_target_definitio_step_7")
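# The calc_column_* helpers come from col_stats; a minimal sketch of the median
# helper assumed here (Spark has no exact median aggregate, so approxQuantile
# with a zero relative error is a common stand-in):
def calc_column_median_sketch(single_column_df):
    column_name = single_column_df.columns[0]
    return single_column_df.approxQuantile(column_name, [0.5], 0.0)[0]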
    return new_order_cols


"""
***** MAIN *******
"""
# file_name = "../data/example.csv"
spark = start_spark_session()
df = load_df(cfg.PREPROCESS_PATH)
# df = load_df(file_name)
# df = get_imputed_df(cfg.IMPUTATION_TRAIN_PATH)

event_stats_ctus_dfs = []
max_CTU_num = get_max_CTU_num(df)
for CTU_num in range(max_CTU_num):
    print(CTU_num)
    single_CTU_df = df.filter(f"CTU == {CTU_num}")  # get a df with one CTU
    single_CTU_clean_df = get_df_with_dropped_garbage_cols(single_CTU_df)
    ctu_event_stats_df = get_event_stats_df(single_CTU_clean_df, CTU_num)
    event_stats_ctus_dfs.append(ctu_event_stats_df)

joined_ctu_event_stats_df = get_joined_df(event_stats_ctus_dfs)
# print_df(joined_ctu_event_stats_df)
joined_ctu_event_stats_df = split_column_by_underscore(joined_ctu_event_stats_df)
new_order_cols = get_event_cols_first(joined_ctu_event_stats_df)
joined_ctu_event_stats_df = joined_ctu_event_stats_df.select(
    new_order_cols).orderBy('event')
write_to_excel(joined_ctu_event_stats_df, "zone_5_ctu_report_step_5")
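# Sketch of what get_max_CTU_num is assumed to return: the largest CTU value in
# the dataframe (whether the loop above should also cover that last CTU depends
# on how the real helper is defined).
def get_max_CTU_num_sketch(df):
    from pyspark.sql import functions as F

    return df.agg(F.max("CTU")).collect()[0][0]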
""" columns_to_drop = [ 'level_0', 'index', 'Unnamed: 0', '_c0', 'party_id', 'event_date', 'CTU', 'event_id' ] df_to_drop = df.select('*') df_to_drop = df_to_drop.drop(*columns_to_drop) return df_to_drop """ *** MAIN *** """ spark = start_spark_session() prepro_df = load_df(cfg.PREPROCESS_PATH) num_rows = prepro_df.count() event_rate_df = prepro_df.select([ (F.count(F.when(prepro_df[c] != 0, c)) / num_rows).alias(c) for c in prepro_df.columns ]) event_rate_df_clean = drop_garbage_cols(event_rate_df) event_rate_df_clean_pd = event_rate_df_clean.toPandas().transpose( ).reset_index().rename(columns={ 0: 'Column event rate ', 'index': 'Column names' }) event_rate_df_clean_spark = spark.createDataFrame(event_rate_df_clean_pd) write_to_excel(event_rate_df_clean_spark, "zone_5_stage_I_aggrega_step_2")
""" **** MAIN ***** """ spark = start_spark_session() preprocessing_df = load_df(cfg.PREPROCESS_PATH) preprocessing_columns_with_stats = get_descriptive_statistics_for_columns( preprocessing_df) preprocessing_cols_stats_df = spark.createDataFrame( preprocessing_columns_with_stats, ['column', 'max', 'min', 'mean']) imputed_df = get_imputed_df(cfg.IMPUTATION_TRAIN_PATH, cfg.IMPUTATION_PREDICT_PATH) imputed_columns_with_stats = get_descriptive_statistics_for_columns(imputed_df) imputed_cols_stats_df = spark.createDataFrame(imputed_columns_with_stats, ['column', 'max', 'min', 'mean']) preprocessing_cols_stats_df_re = preprocessing_cols_stats_df.\ select(*(col(x).alias(x + '_pre') for x in preprocessing_cols_stats_df.columns)) joined_df = preprocessing_cols_stats_df_re.join( imputed_cols_stats_df, preprocessing_cols_stats_df_re.column_pre == imputed_cols_stats_df.column) delta_columns_df = get_delta_columns_df(joined_df) elta_columns_df = delta_columns_df.select('column', 'delta_min', 'delta_max', 'delta_mean') write_to_excel(delta_columns_df, "zone_5_ctu_imputation_step_6")
print("DO_NOT_SEARCH here") dns_counter += 1 else: googleLocation = geocoder.google(tweet.location, key=GOOGLE_API_KEY) coordinates = googleLocation.latlng if (len(coordinates) == 2): # warning: unless any of the input data has decimal values in it, this will auto-round the coordinates newLat = coordinates[0] newLng = coordinates[1] counter += 1 df_output.loc[len(df_output)] = [tweet.keyword, tweet.username, tweet.text, newLat, newLng, tweet.location, tweet.created_at, tweet.place, tweet.description, tweet.verified, tweet.sentiment_score, tweet.compound_sentiment, tweet.description_sentiment] print("stored: %s, to go: %s, dns: %s" % (counter, num_left, dns_counter)) if((counter % 200) == 0): try: write_to_excel(output_file, 'Sheet1', df_output) print("%s SAVED" % (counter)) except: print("ERROR saving new coordinates") # return else: print("coordinates not found for '%s' - dropping tweet by %s" % (tweet.location, tweet.username)) else: print("no location information - dropping tweet by %s" % (tweet.username)) else: counter += 1 print("coordinates already found") df_output.loc[len(df_output)] = [tweet.keyword, tweet.username, tweet.text, tweet.lat, tweet.lng, tweet.location, tweet.created_at, tweet.place, tweet.description, tweet.verified, tweet.sentiment_score, tweet.compound_sentiment, tweet.description_sentiment] print("stored: %s, to go: %s, dns: %s" % (counter, num_left, dns_counter)) if((counter % 200) == 0):