def process_unique_time_distribution(valid_rdd, process_properties, data_dict, stats_collector, acc_dict):
    if process_properties.get('unique_time_distribution', None):
        column_number_list = process_properties['unique_time_distribution']['columns']
        for column_number in column_number_list:
            column_name = data_dict[column_number]['name']
            stats_key = 'unique_time_distribution_' + column_name
            frequency = process_properties['unique_time_distribution'].get('frequency', 1)
            frequency_unit = process_properties['unique_time_distribution'].get('unit', 'hours')
            timestamp_format = process_properties['timestamp_format']
            gap_rdd = (valid_rdd
                       .map(lambda x: (x[column_number],
                                       round_datetime(construct_datetime(x, timestamp_format, data_dict),
                                                      frequency, frequency_unit))))
            gap_df = gap_rdd.toDF().selectExpr('_2 as DATETIME', '_1 as COLUMN')
            stats_collector[stats_key] = (gap_df
                                          .groupBy('DATETIME')
                                          .agg(countDistinct('COLUMN').alias('UNIQUE_COLUMN'))
                                          .rdd
                                          .map(lambda x: [x['DATETIME'], x['UNIQUE_COLUMN']])
                                          .collect())
            print(stats_collector[stats_key])
def get_num_distinct_ctus_per_partyid(imputed_df):
    """ Calculate the number of distinct CTUs per customer """
    # Alias the aggregate directly instead of relying on the auto-generated
    # column name, which differs across Spark versions.
    distinct_ctus_per_partyid = (imputed_df
                                 .groupby('party_id')
                                 .agg(countDistinct("CTU").alias("distinct_ctus"))
                                 .select("party_id", "distinct_ctus"))
    return distinct_ctus_per_partyid
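# Hedged usage sketch for get_num_distinct_ctus_per_partyid: the column names
# "party_id" and "CTU" come from the function above, but the SparkSession
# `spark` and the toy rows are assumptions made purely for illustration.
def _example_distinct_ctus_per_partyid(spark):
    toy_df = spark.createDataFrame(
        [("p1", "a"), ("p1", "b"), ("p2", "a")],
        ["party_id", "CTU"])
    # Expected: p1 -> 2 distinct CTUs, p2 -> 1
    return get_num_distinct_ctus_per_partyid(toy_df)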
def agg_fn(df: DataFrame, group_by_cols):
    gbc = [col(x) for x in group_by_cols]
    assoc_df = (df.groupBy(*gbc).agg(
        countDistinct(col("pmid")).alias("f"),
        count(col("pmid")).alias("N")))
    return assoc_df
def cnt_dates_cnt_ifa(df, OUTPUT_DATA_PATH):
    cnt_dates_cnt_ifa = df.groupBy("c_dates").agg(
        F.countDistinct("ifa").alias('count_ifa'))
    # write to s3: small file
    cnt_dates_cnt_ifa_path = OUTPUT_DATA_PATH + 'cnt_dates_cnt_ifa.csv'
    cnt_dates_cnt_ifa.repartition(1).write.csv(cnt_dates_cnt_ifa_path, header=True, mode='overwrite')
def get_average_num_ps1s(df):
    return df.where(
        col('top_level_url').isin(all_top)
    ).groupBy('top_level_url').agg(countDistinct('ps1').alias("count")).select(
        mean('count').alias('mean'),
        expr('percentile_approx(count, 0.5)').alias('median'),
        stddev('count').alias('std')
    ).show()
def manufacturer_distribution():
    df = sqlContext.read.load('harddrive.csv',
                              delimiter=';',
                              format='com.databricks.spark.csv',
                              header='true',
                              inferSchema='true')
    dg = df.groupBy("MFG").agg(F.countDistinct("model_new"))
    return dg
def process_data(data, begin, date):
    return (data.filter(col('submission_date_s3') > begin)
            .filter(col('submission_date_s3') <= date)
            .groupBy('country')
            .agg((sample_factor * countDistinct('client_id')).alias(active_users_col))
            .select('*',
                    lit(begin).alias(start_date_col),
                    lit(date).alias('submission_date_s3')))
def create_categorical_feature_map(df, categorical_features):
    temp_data = df.agg(*(countDistinct(col(c)).alias(c) for c in df.columns)).collect()[0]
    data_map = {}
    for feature_indx in range(0, len(df.columns)):
        if temp_data.__fields__[feature_indx] in categorical_features:
            data_map[feature_indx] = int(temp_data[feature_indx])
    return data_map
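# Hedged usage sketch for create_categorical_feature_map: the toy DataFrame,
# the SparkSession `spark`, and the feature names are illustrative assumptions.
# The returned map goes from column position to distinct-value count, the shape
# often needed to declare category arity to downstream learners.
def _example_categorical_feature_map(spark):
    toy_df = spark.createDataFrame(
        [("red", 1.0, "small"), ("blue", 2.0, "small"), ("red", 3.0, "large")],
        ["color", "measurement", "size"])
    # Only "color" and "size" are treated as categorical here.
    return create_categorical_feature_map(toy_df, categorical_features=["color", "size"])
    # Expected: {0: 2, 2: 2}  (index 1, "measurement", is skipped)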
def stats(col: str) -> F.Column:
    return F.struct(
        F.min(col).alias('min'),
        F.max(col).alias('max'),
        F.avg(col).alias('avg'),
        F.count(col).alias('count'),
        F.countDistinct(col).alias('countDistinct'),
    )
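# Hedged usage sketch for the stats() helper: `spark` and the toy numbers are
# assumptions. Because stats() returns a single struct column, several columns
# can be profiled in one agg() pass.
def _example_stats(spark):
    toy_df = spark.createDataFrame([(1,), (2,), (2,), (4,)], ["x"])
    row = toy_df.agg(stats("x").alias("x_stats")).first()
    return row["x_stats"]  # Row(min=1, max=4, avg=2.25, count=4, countDistinct=3)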
def calculate_spending_patterns(orderInfoDF):
    # generate entity summary
    usrAggregates = orderInfoDF.groupby(entity) \
        .agg(F.countDistinct(uID).alias("nCustomers"),
             F.sum(spend).alias("spend"),
             F.count(orderID).alias("orders"))
    return usrAggregates
def get_user_demo_metrics(addons_expanded):
    """
    :param addons_expanded: addons_expanded dataframe
    :return: aggregated dataframe by addon_id with user demographic information
             including
             - distribution of users by operating system
             - distribution of users by country
    """
    def source_map(df, extra_filter=""):
        m = F.create_map(
            list(
                chain(*((F.lit(name), F.col(name))
                        for name in df.columns
                        if name != "addon_id")))).alias(extra_filter)
        return m

    client_counts = (addons_expanded.groupBy("addon_id").agg(
        F.countDistinct("client_id").alias("total_clients")).cache())

    os_dist = (addons_expanded.withColumn("os", bucket_os("os"))
               .groupBy("addon_id", "os")
               .agg(F.countDistinct("client_id").alias("os_client_count"))
               .join(client_counts, on="addon_id", how="left")
               .withColumn("os_pct", F.col("os_client_count") / F.col("total_clients"))
               .groupby("addon_id")
               .pivot("os")
               .agg(F.first("os_pct").alias("os_pct")))
    os_dist = os_dist.na.fill(0).select("addon_id", source_map(os_dist, "os_pct"))

    ct_dist = (addons_expanded.withColumn("country", bucket_country("country"))
               .groupBy("addon_id", "country")
               .agg(F.countDistinct("client_id").alias("country_client_count"))
               .join(client_counts, on="addon_id", how="left")
               .withColumn("country_pct", F.col("country_client_count") / F.col("total_clients"))
               .groupBy("addon_id")
               .pivot("country")
               .agg(F.first("country_pct").alias("country_pct")))
    ct_dist = ct_dist.na.fill(0).select("addon_id", source_map(ct_dist, "country_pct"))

    combined_dist = os_dist.join(ct_dist, on="addon_id", how="outer")
    return combined_dist
def calcIdf(corpusDf):
    # get document count
    docCount = corpusDf.agg(f.countDistinct('id'))
    docCount = docCount.collect()[0][0]
    # explode token vector
    corpusDfExplode = (corpusDf.select('id', 'tokens', (f.explode('tokens').alias('indvToken'))))
    # count number of IDs that include each word to get document frequency
    docFreqs = corpusDfExplode.groupBy('indvToken').agg(f.countDistinct('id').alias('df'))
    docFreqs = docFreqs.sort(f.desc('df'))
    # the smoothed log idf below overrides the plain ratio above
    idfDf = docFreqs.withColumn('idf', docFreqs.df / docCount)
    idfDf = docFreqs.withColumn('idf', f.log((docCount + 1) / (docFreqs.df + 1)))
    idfRdd = idfDf.select('indvToken', 'idf').rdd
    return idfRdd
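# Hedged usage sketch for calcIdf: the toy corpus and the SparkSession `spark`
# are assumptions. Each row needs an 'id' and a pre-tokenised 'tokens' array,
# matching the columns the function explodes above.
def _example_calc_idf(spark):
    corpus = spark.createDataFrame(
        [(1, ["spark", "sql"]), (2, ["spark", "rdd"]), (3, ["pandas"])],
        ["id", "tokens"])
    # "spark" appears in 2 of 3 docs -> idf = log((3 + 1) / (2 + 1)) ~ 0.29
    return calcIdf(corpus).collect()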
def test_all(self, measure_columns=None, dimension_columns=None, max_num_levels=40):
    # CommonUtils.create_update_and_save_progress_message(self._dataframe_context,self._scriptWeightDict,self._scriptStages,self._analysisName,"chisquareStats","info",display=False,weightKey="script")
    targetDimension = dimension_columns[0]
    all_dimensions = self._dimension_columns
    all_dimensions = [x for x in all_dimensions if x != targetDimension]
    # if self._analysisDict != {}:
    #     nColsToUse = self._analysisDict[self._analysisName]["noOfColumnsToUse"]
    # else:
    #     nColsToUse = None
    # if nColsToUse != None:
    #     all_dimensions = all_dimensions[:nColsToUse]
    all_measures = self._measure_columns
    df_chisquare_result = DFChiSquareResult()
    # print "df_chisquare_result"*50
    # print df_chisquare_result
    for d in all_dimensions:
        try:
            chisquare_result = self.test_dimension(targetDimension, d)
            df_chisquare_result.add_chisquare_result(targetDimension, d, chisquare_result)
        except Exception as e:
            print(repr(e), d)
            continue
    for m in all_measures:
        try:
            if self._pandas_flag:
                if len(self._data_frame[m].unique()) > self._analysisDict[
                        'Dimension vs. Dimension']['binSetting']['binCardinality']:
                    chisquare_result = self.test_measures(targetDimension, m)
                    df_chisquare_result.add_chisquare_result(targetDimension, m, chisquare_result)
            else:
                if self._data_frame.select(F.countDistinct(m)).collect()[0][0] > self._analysisDict[
                        'Dimension vs. Dimension']['binSetting']['binCardinality']:
                    chisquare_result = self.test_measures(targetDimension, m)
                    df_chisquare_result.add_chisquare_result(targetDimension, m, chisquare_result)
        except Exception as e:
            print(str(e), m)
            continue
    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context, self._scriptWeightDict, self._scriptStages,
        self._analysisName, "completion", "info", display=False, weightKey="script")
    return df_chisquare_result
def calc_retention(heavy_col_name, heavy_co_arr, heavy_co_date, anchor_start, anchor_end):
    """
    Given a dataframe, heavy column name, heavy cutoff, and base week end date,
    returns dataframe with retention for heavy user type for each period.

    Params:
        heavy_col_name (string): name of the column for heavy user
        heavy_co_arr (array numbers): an array of cutoff values to define a heavy user for a date
        heavy_co_date (array dates): an array of dates for the cutoff values
        anchor_start (date): date for the start of the anchor week
        anchor_end (date): date for the end of the anchor week

    Returns dataframe with retention for heavy user type for each period
    """
    # Get heavy users
    for i in range(0, len(heavy_co_date)):
        if i == 0:
            hu_where = ("(submission_date_s3 <= " + heavy_co_date[i] +
                        " and " + heavy_col_name + " >= " + str(heavy_co_arr[i]) + ")")
        else:
            hu_where = (hu_where + " or (submission_date_s3 = " + heavy_co_date[i] +
                        " and " + heavy_col_name + " >= " + str(heavy_co_arr[i]) + ")")
    print(hu_where)
    ms_ret_hu = ms_ret.where(hu_where)

    # Get base week of heavy users
    base_hu = ms_ret_hu \
        .where("submission_date_s3 >= '" + anchor_start + "' and submission_date_s3 <= '" + anchor_end + "'") \
        .select('client_id').distinct()

    # Count unique clients in base week of heavy users
    base_num_clients = base_hu.select('client_id').distinct().count()

    # Get only base week heavy users for the entire time
    ret_hu = ms_ret_hu.join(base_hu, on='client_id')

    # Add period column
    ret_hu = ret_hu.withColumn("anchor", F.lit(base_start)) \
        .withColumn("period", get_period("anchor", "submission_date_s3"))

    # Group by period and count the number of distinct clients
    hu_weekly_counts = ret_hu.groupby("period").agg(
        F.countDistinct("client_id").alias("n_week_clients"))

    # Add column with calculated retention and 95% CI
    hu_retention = (
        hu_weekly_counts
        .withColumn("total_clients", F.lit(base_num_clients))
        .withColumn("retention", F.col("n_week_clients") / F.col("total_clients"))
        # Add a 95% confidence interval based on the normal approximation for a
        # binomial distribution, p ± z * sqrt(p*(1-p)/n).
        # The 95% CI spans the range `retention ± ci_95_semi_interval`.
        .withColumn(
            "ci_95_semi_interval",
            F.lit(1.96) * F.sqrt(
                F.col("retention") * (F.lit(1) - F.col("retention")) / F.col("total_clients"))))
    return hu_retention
def getColumnUniqueCounts(clickDF):
    """
    Parameters
    ----------
    clickDF : class Dataframe
    """
    distvals = clickDF.agg(*(countDistinct(col(c)).alias(c) for c in clickDF.columns))
    print(type(distvals))
    return distvals
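# Hedged usage sketch for getColumnUniqueCounts: the toy click data and `spark`
# are assumptions. The helper returns a single-row DataFrame, so asDict() on its
# first Row gives a convenient {column: distinct_count} mapping.
def _example_column_unique_counts(spark):
    click_df = spark.createDataFrame(
        [("u1", "home"), ("u1", "cart"), ("u2", "home")],
        ["user_id", "page"])
    return getColumnUniqueCounts(click_df).first().asDict()
    # Expected: {'user_id': 2, 'page': 2}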
def _deduplication_todo(columns, df):
    """
    Returns what (columns, as in spark columns) to compute to get the results requested by
    the parameters.

    :param columns:
    :type columns: list
    :param df:
    :type df: DataFrame
    :return: Pyspark columns representing what to compute.
    """
    if columns is None:
        # 1 count distinct, on all columns
        todo = [countDistinct(*[col(c) for c in df.columns])]
    else:
        # multiple count distinct, one column each
        todo = [countDistinct(col(c)) for c in columns]
    return todo
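# Hedged usage sketch for _deduplication_todo: `spark` and the toy frame are
# assumptions. The helper only builds Column expressions; they still have to be
# passed to agg() to be evaluated, as below.
def _example_deduplication_todo(spark):
    toy_df = spark.createDataFrame([(1, "a"), (1, "a"), (2, "b")], ["k", "v"])
    # columns=None -> one countDistinct over all columns (2 distinct rows here)
    all_cols = toy_df.agg(*_deduplication_todo(None, toy_df)).first()[0]
    # explicit columns -> one countDistinct per column
    per_col = toy_df.agg(*_deduplication_todo(["k", "v"], toy_df)).first()
    return all_cols, per_col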
def __init__(self):
    """
    Call the parent constructor.
    """
    super(FeatureUniqueUaTotal, self).__init__()
    self.group_by_aggs = {
        'distinct_ua': F.countDistinct(F.col('client_ua')).cast('float')
    }
def count_unique_request(df_session):
    """
    Determine unique URL visits per session.
    To clarify, count a hit to a unique URL only once per session.
    """
    df_unique_url = df_session.groupby("ip_session_count").agg(
        countDistinct("request_url").alias("count_unique_requests"))
    df_unique_url.show()
    return df_unique_url
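# Hedged usage sketch for count_unique_request: the toy session rows and `spark`
# are assumptions. Repeated hits to the same URL within one session collapse to
# a single count, which is exactly what the docstring above describes.
def _example_count_unique_request(spark):
    sessions = spark.createDataFrame(
        [("s1", "/home"), ("s1", "/home"), ("s1", "/cart"), ("s2", "/home")],
        ["ip_session_count", "request_url"])
    return count_unique_request(sessions)  # s1 -> 2, s2 -> 1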
def calculate_concentration_ratios(editor_interactions):
    concentration_ratio = editor_interactions.groupby('page_id').agg(
        f.countDistinct('event_user_id').alias('num_editors'),
        f.sum('revisions_count').alias('num_revisions')) \
        .withColumn('concentration_ratio', col('num_editors') / col('num_revisions'))
    editor_interactions = editor_interactions.join(concentration_ratio, on='page_id')
    return editor_interactions
def cardinalities(df, cols):
    from functools import reduce
    counts = df.agg(
        F.struct(*[F.countDistinct(F.col(c)).alias(c) for c in cols] +
                 [F.count(F.col(cols[0])).alias('total')]).alias("results")
    ).select("results").collect()[0][0].asDict()
    counts.update({'total': df.count()})
    return counts
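# Hedged usage sketch for cardinalities(): `spark` and the toy data are
# assumptions. Note that the 'total' key computed inside the struct is
# immediately overwritten by a full df.count(), so the returned total also
# includes rows where cols[0] is null.
def _example_cardinalities(spark):
    toy_df = spark.createDataFrame(
        [("a", 1), ("a", 2), (None, 2)], ["k", "v"])
    return cardinalities(toy_df, ["k", "v"])
    # Expected: {'k': 1, 'v': 2, 'total': 3}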
def check_df(df):
    print(
        "Number of sites %d" %
        df.agg(countDistinct(col("visit_id"))).collect()[0]
    )
    print(
        "Number of records with stripped cookies %d" %
        df.agg(count_not_null('original_cookies')).collect()[0]
    )
    if 'content_hash' in df.columns:
        print(
            "Total number of scripts loaded %d" %
            df.agg(count_not_null("content_hash")).collect()[0]
        )
        print(
            "Number of distinct scripts loaded %d" %
            df.agg(countDistinct(col("content_hash"))).collect()[0]
        )
def get_top_addon_names(addons_expanded):
    w = Window().partitionBy("addon_id").orderBy(F.col("n").desc())
    cnts = addons_expanded.groupby("addon_id", "name").agg(
        F.countDistinct("client_id").alias("n"))
    addon_names = (cnts.withColumn("rn", F.row_number().over(w))
                   .where(F.col("rn") == 1)
                   .select("addon_id", "name"))
    return addon_names
def get_class_distribution(df, col_name='subbreddit_display_name'):
    count = df.groupBy(col_name).agg(F.countDistinct('_id'))
    s = count.select(F.sum('count(DISTINCT _id)')).collect()[0][0]
    udf_func = F.udf(lambda col: round(col / s, 4) * 100, FloatType())
    count = count.withColumn('set_part_%', udf_func('count(DISTINCT _id)'))
    return count.show()
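# Hedged alternative sketch for get_class_distribution: the same percentage can
# be computed with built-in column arithmetic instead of a Python UDF, which
# avoids serialising rows out to Python workers. Column names mirror the
# function above; everything else here is an assumption.
def _class_distribution_no_udf(df, col_name='subbreddit_display_name'):
    counts = df.groupBy(col_name).agg(F.countDistinct('_id').alias('n_ids'))
    total = counts.agg(F.sum('n_ids')).first()[0]
    return counts.withColumn('set_part_pct', F.round(F.col('n_ids') / total * 100, 2))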
def foreach_batch_function(df, epoch_id):
    df.groupBy("country").count().orderBy("count", ascending=False) \
        .select(col("country").alias("maxCountry"), col("count")).limit(1).show()
    df.groupBy("country").count().orderBy("count", ascending=True) \
        .select(col("country").alias("minCountry"), col("count")).limit(1).show()
    df.agg(countDistinct(col("first_name")).alias("numberOfUniqueUsers")).show()
def get_addon(data, date, period=7, country_list=None):
    """ Calculate the proportion of WAU that have a "self installed" addon for a specific date

        Parameters:
        data: sample of the main server ping data frame
        date: string, with the format of 'yyyyMMdd'
        period: The number of days before looked at in the analysis
        country_list: a list of country names in string

        Returns:
        a dataframe showing all the information for each date in the period
        - three columns: 'submission_date_s3', 'country', 'pct_Addon'
    """
    data_all = keep_countries_and_all(data, country_list)
    begin = date_plus_x_days(date, -period)

    addon_filter = (~F.col('addon.is_system')) & (~F.col('addon.foreign_install')) &\
                   (~F.col('addon.addon_id').isin(NON_MOZ_TP)) &\
                   (~F.col('addon.addon_id').like('%@mozilla%')) &\
                   (~F.col('addon.addon_id').like('%@shield.mozilla%')) &\
                   (~F.col('addon.addon_id').like('%' + UNIFIED_SEARCH_STR + '%'))

    WAU = data_all\
        .filter("submission_date_s3 <= '{0}' and submission_date_s3 > '{1}'".format(date, begin))\
        .groupBy('country')\
        .agg(F.countDistinct('client_id').alias('WAU'))

    addon_count = data_all\
        .filter("submission_date_s3 <= '{0}' and submission_date_s3 > '{1}'".format(date, begin))\
        .select('submission_date_s3', 'country', 'client_id',
                F.explode('active_addons').alias('addon'))\
        .filter(addon_filter)\
        .groupBy('country')\
        .agg(F.countDistinct('client_id').alias('add_on_count'))

    join_df = WAU.join(addon_count, 'country', how='left')\
        .withColumn("pct_addon", (100.0 * F.col("add_on_count") / F.col("WAU")))\
        .select(F.lit(date).alias('submission_date_s3'), '*')

    return join_df.select('submission_date_s3', 'country', 'pct_addon')
def main(spark, interaction_path):
    '''
    ----------
    spark : SparkSession object
    interaction_path : string, path to the interaction parquet file to load
    user_map_path : string, path to the user map parquet file to load
    '''
    # Load the dataframe
    interaction = spark.read.parquet(interaction_path)
    #user_map = spark.read.parquet(user_map_path)

    # discard rating == 0
    interaction = interaction.filter(interaction['is_read'] == 1)
    #is_read_filtered = interaction.select("user_id").distinct().count()
    #print("number of distinct user ids after discarding is_read == 0: ", is_read_filtered)

    # Get dataframe with users having more than 30 interactions
    df = interaction.groupby('user_id').agg(countDistinct('book_id').alias('num_interaction'))
    df_more30 = df.where(df.num_interaction > 30)
    user_id_ls = df_more30.select('user_id').distinct()
    #print("number of distinct user ids after discarding interactions < 30: ", user_id_ls.count())

    # save filtered interaction data for possible later use
    print("save filtered data")
    df_more30.write.format("parquet").mode("overwrite").save("hdfs:/user/hj1399/filtered.parquet")

    # split 60%, 20%, 20% of downsample based on distinct user id
    seed = 42
    train_user, val_user, test_user = df_more30\
        .select("user_id")\
        .distinct()\
        .randomSplit(weights=[0.6, 0.2, 0.2], seed=seed)

    # join splitted train_user, val_user, test_user with downsample to create training dataset
    # these are df now (!= rdd)
    train = train_user.join(df_more30, 'user_id', 'inner')
    val = val_user.join(df_more30, 'user_id', 'inner')
    test = test_user.join(df_more30, 'user_id', 'inner')

    # take 50% of interactions per user from val and test data and add those to train
    print("start taking half interactions from val and test data and add them to train")
    val_sampled, val_left = split_data(val, fraction=0.5, seed=42)
    test_sampled, test_left = split_data(test, fraction=0.5, seed=42)
    train_df = train.union(val_sampled).union(test_sampled)

    print("number of distinct user ids of train data: ", train_df.select("user_id").distinct().count())
    print("number of distinct user ids of val data: ", val_left.select("user_id").distinct().count())
    print("number of distinct user ids of test data: ", test_left.select("user_id").distinct().count())

    train_df.write.format("parquet").mode("overwrite").save("hdfs:/user/hj1399/train_total.parquet")
    print("train df saved")
    val_left.write.format("parquet").mode("overwrite").save("hdfs:/user/hj1399/val_total.parquet")
    print("val df saved")
    test_left.write.format("parquet").mode("overwrite").save("hdfs:/user/hj1399/test_total.parquet")
    print("test df saved")
def install_flow_events(events):
    def source_map(df, alias, extra_filter=""):
        m = F.create_map(
            list(
                chain(*((F.lit(name.split("_")[0]), F.col(name))
                        for name in df.columns
                        if name != "addon_id" and extra_filter in name)))).alias(alias)
        return m

    install_flow_events = (
        events.select([
            "client_id",
            "submission_date_s3",
            "event_map_values.method",
            "event_method",
            "event_string_value",
            "event_map_values.source",
            "event_map_values.download_time",
            "event_map_values.addon_id",
        ])
        .filter("event_category = 'addonsManager'")
        .filter("event_object = 'extension'")
        .filter("""
            (event_method = 'install' and event_map_values.step = 'download_completed')
            or
            (event_method = 'uninstall')
            """)
        .withColumn(
            "addon_id",
            F.when(F.col("addon_id").isNull(),
                   F.col("event_string_value")).otherwise(F.col("addon_id")),
        )  # uninstalls populate addon_id in a different place
        .drop("event_string_value")
        .groupby("addon_id", "event_method", "source")
        .agg(
            F.avg("download_time").alias("avg_download_time"),
            F.countDistinct("client_id").alias("n_distinct_users"),
        ))

    installs = (install_flow_events.filter("event_method = 'install'")
                .groupBy("addon_id")
                .pivot("source")
                .agg((F.sum("n_distinct_users") * F.lit(100)),
                     F.avg("avg_download_time")))

    uninstalls = (install_flow_events.filter("event_method = 'uninstall'")
                  .groupBy("addon_id")
                  .pivot("source")
                  .agg((F.sum("n_distinct_users") * F.lit(100))))

    agg = (installs.na.fill(0).select(
        "addon_id",
        source_map(installs, "installs", "n_distinct_users"),
        source_map(installs, "download_times", "avg_download_time"),
    ).join(
        uninstalls.na.fill(0).select("addon_id", source_map(uninstalls, "uninstalls")),
        on="addon_id",
        how="full",
    ))
    return agg
def registry_etl(
    spark: SparkSession,
    config: confuse.core.Configuration,
    icd_to_desc_map: pd.DataFrame,
) -> pd.DataFrame:
    registry_claims_bucket = config["buckets"]["registry_claims"].get(str)
    log.info(f"Registry claim bucket: {registry_claims_bucket}")

    log.info("Reading in registry claims data")
    registry_rdd = spark.read.parquet(
        registry_claims_bucket.replace("s3:", "s3a:")).withColumnRenamed(
            "patient_id", "registry_id")

    # Select claims around patient reference date and filter out claims before 2017 to remove ICD9 codes
    registry_rdd = (
        registry_rdd.where(
            # Filters to claims falling before reference date
            F.col("claim_date") < F.date_sub(F.col("reference_date"), 0))
        .where(F.col("claim_date") > F.date_sub(F.col("reference_date"), 1 * 365))  # TODO [Low] Move time length into YAML
        .where(F.col("reference_date") > F.lit("2017-01-01"))  # TODO [Low] Move cut-off date into YAML
    )

    registry_id_count = registry_rdd.select("registry_id").distinct().count()
    log.info(f"Registry ID Count: {registry_id_count}")
    registry_count_min = config["etl"]["registry_count_min"].get(int)

    log.info("ETL of registry data and bringing to pandas")
    registry_df = (registry_rdd.select(
        "registry_id",
        F.explode(F.col("dx_list")).alias("code"))
        .where(F.col("code").isNotNull())
        .groupBy("code")
        .agg(
            F.collect_set(F.col("registry_id")).alias("registry_ids"),
            F.countDistinct(F.col("registry_id")).alias("registry_count"),
        )
        .where(F.col("registry_count") > registry_count_min)
        .withColumn("registry_rate",
                    F.col("registry_count") / F.lit(registry_id_count))).toPandas()

    # TODO [Medium] Move below this into Spark
    log.info("Registry ETL pandas operations")
    registry_df.drop(
        registry_df.index[~registry_df.code.isin(icd_to_desc_map.code)],
        inplace=True)
    # sort_values is not in-place, so reassign the sorted frame
    registry_df = registry_df.sort_values("code").reset_index(drop=True)

    n_total_test = int(
        round(
            registry_df["registry_count"].iloc[0] / registry_df["registry_rate"].iloc[0],
            0,
        ))
    log.info(f"N Total Test: {n_total_test}")
    registry_df["n_total_test"] = n_total_test
    return registry_df
def edges_agg(context, df: DataFrame):
    cols_gb = context.solid_config.get("cols_gb")
    col_date = context.solid_config.get("col_date")
    min_edges = context.solid_config.get("min_edges")
    node_att = context.solid_config.get("node_att")
    weight_distinct_col = context.solid_config.get("weight_distinct_col")
    context.log.debug(
        f"edges_agg: cols_gb={cols_gb}, col_date={col_date}, min_edges={min_edges}"
    )

    edge_weight_col = "edge_weight"
    filter_query = f"{edge_weight_col}>={min_edges}"

    node_att_x = []
    node_att_y = []
    if node_att:
        node_att_x = [f"{c}1" for c in node_att]
        node_att_y = [f"{c}2" for c in node_att]

    collect_set_list = list(
        set(df.columns) - set(cols_gb) - set(node_att_x + node_att_y))

    agg_list = [
        F.countDistinct(F.col(weight_distinct_col)).alias(edge_weight_col),
    ] + [
        F.collect_set(F.col(c).cast("string")).cast("string").alias(f"{c}_att")
        for c in collect_set_list
    ]
    if col_date:
        agg_list += [
            F.min(F.col(col_date).cast("string")).alias(f"{col_date}_min"),
            F.max(F.col(col_date).cast("string")).alias(f"{col_date}_max"),
        ]

    df_edges = df.groupBy(cols_gb).agg(*agg_list).filter(filter_query)

    node_att_dict = {k: v for k, v in zip(cols_gb, [node_att_x, node_att_y])}
    df_nodes = df.select(*cols_gb + node_att_x + node_att_y)
    df_nodes = create_df_node(df_nodes, node_att_dict)
    df_edges_nodes = create_df_node(df_edges, {k: None for k in cols_gb})
    df_nodes = df_nodes.join(
        df_edges_nodes, on="NodeID",
        how="right")  # keep only nodes that are in df_edges

    df_edges = df_edges.cache()
    df_nodes = df_nodes.cache()
    # call count() so the f-strings log row counts rather than bound methods
    context.log.info(
        f"df_edges: count={df_edges.count()}, columns={df_edges.columns}")
    context.log.info(
        f"df_nodes: count={df_nodes.count()}, columns={df_nodes.columns}")

    yield Output(df_edges, "df_edges")
    yield Output(df_nodes, "df_nodes")
def drop_constant_unique_cols(self):
    if not self._pandas_flag:
        cnt = self.data_frame.agg(
            *(F.countDistinct(c).alias(c) for c in self.data_frame.columns)).first()
        self.data_frame = self.data_frame.drop(*[
            c for c in cnt.asDict()
            if cnt[c] == 1 or cnt[c] == self.data_frame.count()
        ])
    else:
        # self.data_frame = self.data_frame.loc[:, (self.data_frame.nunique() != 1)]
        # new_df = self.data_frame.loc[:, (self.data_frame.nunique() != self.data_frame.shape[0])]
        # if self.target not in new_df.columns:
        #     new_df[self.target] = self.data_frame[self.target]
        # self.data_frame = new_df
        self.data_frame = self.data_frame.apply(
            lambda col: pd.to_datetime(col, errors='ignore')
            if col.dtypes == object else col,
            axis=0)
        self.data_frame = self.data_frame.loc[:, (self.data_frame.nunique() != 1)]
        float_df = self.data_frame.loc[:, (self.data_frame.dtypes == 'float64')]
        date_df = self.data_frame[self.datetime_cols]
        int_df = self.data_frame.loc[:, (self.data_frame.dtypes == 'int64')]
        other_df = self.data_frame.loc[:, (
            (self.data_frame.dtypes == 'object') &
            (self.data_frame.nunique() != self.data_frame.shape[0]))]
        new_df = pd.concat([other_df, float_df, int_df, date_df], axis=1)
        # new_df = new_df.loc[:, (new_df.nunique() != new_df.shape[0])]
        # new_df = self.data_frame.loc[:, (self.data_frame.nunique() != self.data_frame.shape[0])]
        # new_df = self.data_frame
        if self.target not in new_df.columns:
            new_df[self.target] = self.data_frame[self.target]
            self.data_frame = new_df
        else:
            self.data_frame = new_df

    self.data_change_dict['dropped_columns_list'] = []

    def dropped_col_update(list_element):
        if list_element["column_name"] not in self.data_frame.columns:
            list_element["dropped_column_flag"] = True
            self.data_change_dict['dropped_columns_list'].append(
                list_element["column_name"])
            self.numeric_cols, self.dimension_cols, self.col_with_nulls = drop_cols_from_list(
                list_element["column_name"], self.numeric_cols,
                self.dimension_cols, self.col_with_nulls)
        else:
            pass
        return list_element

    self.data_change_dict['Column_settings'] = [
        dropped_col_update(list_element)
        for list_element in self.data_change_dict['Column_settings']
    ]
def os_on_date(data, date, period=7, country_list=None):
    """ Gets the distribution of OS usage calculated on the WAU on 1 day.

        Parameters:
        data: Usually the main summary data frame.
        date: day to get the os distribution for the past week.
        period: The number of days to calculate the distribution. By default it finds
                os distribution over a week.
        country_list: The countries to do the analysis. If None then it does it for
                      the whole world.

        Returns: submission_date_s3, country, os, pct_on_os
    """
    data_all = keep_countries_and_all(data, country_list)
    begin = date_plus_x_days(date, -period)

    data_all = data_all.select(
        'client_id', 'submission_date_s3', 'country',
        nice_os(col('os'), col('os_version')).alias('nice_os'))

    # Calculate the WAU
    wau = data_all\
        .filter((col('submission_date_s3') <= date) & (col('submission_date_s3') > begin))\
        .groupBy('country')\
        .agg(countDistinct('client_id').alias('WAU'))

    os_wau = data_all\
        .filter((col('submission_date_s3') <= date) & (col('submission_date_s3') > begin))\
        .groupBy('country', 'nice_os')\
        .agg(countDistinct('client_id').alias('WAU_on_OS'))\
        .select(lit(begin).alias('start_date'),
                lit(date).alias('submission_date_s3'),
                'country', 'WAU_on_OS', 'nice_os')

    res = os_wau.join(wau, 'country', how='left')\
        .select('start_date', 'submission_date_s3', 'country', 'WAU_on_OS', 'nice_os', 'WAU')

    return res.select('submission_date_s3', 'country',
                      col('nice_os').alias('os'),
                      (100.0 * col('WAU_on_OS') / col('WAU')).alias('pct_on_os'))
def main(sc):
    cdetEnclosureData = sc.textFile("file:///scratch/CDETS_Enclosures.txt")
    cdetEnclosureDataClean = cdetEnclosureData.map(clean_row)
    dataDict = cdetEnclosureDataClean.map(make_row)
    cdetEnclosureDataCleanDF = sqlContext.createDataFrame(dataDict)
    cdetEnclosureDataCleanDF.groupBy("CdetID").count().show()
    cdetEnclosureDataCleanDF.agg(countDistinct("CdetID")).show()
    # Call the dumpcr command
    cdetEnclosureData.foreach(get_enclosure_content)
def test_aggregator(self):
    df = self.df
    g = df.groupBy()
    self.assertEqual([99, 100], sorted(g.agg({'key': 'max', 'value': 'count'}).collect()[0]))
    self.assertEqual([Row(**{"AVG(key#0)": 49.5})], g.mean().collect())

    from pyspark.sql import functions
    self.assertEqual((0, u'99'),
                     tuple(g.agg(functions.first(df.key), functions.last(df.value)).first()))
    self.assertTrue(95 < g.agg(functions.approxCountDistinct(df.key)).first()[0])
    self.assertEqual(100, g.agg(functions.countDistinct(df.value)).first()[0])
def process_unique_columns(raw_df, process_properties, data_dict, stats_collector, acc_dict):
    if process_properties.get('unique_columns', None):
        for index in process_properties['unique_columns']:
            unique_count = (raw_df
                            .groupBy()
                            .agg(countDistinct(data_dict[index]['name']).alias('DISTINCT_COUNT'))
                            .collect())
            stats_collector['unique_' + data_dict[index]['name']] = unique_count[0]['DISTINCT_COUNT']
            print(stats_collector['unique_' + data_dict[index]['name']])
def runAggregateFunctions(spark, df1, df2):
    # collect_list, collect_set
    doubledDf1 = df1.union(df1)
    doubledDf1.select(functions.collect_list(doubledDf1["name"])).show(truncate=False)
    doubledDf1.select(functions.collect_set(doubledDf1["name"])).show(truncate=False)

    # count, countDistinct
    doubledDf1.select(functions.count(doubledDf1["name"]),
                      functions.countDistinct(doubledDf1["name"])).show(truncate=False)

    # sum
    df2.printSchema()
    df2.select(sum(df2["price"])).show(truncate=False)

    # grouping, grouping_id
    df2.cube(df2["store"], df2["product"]).agg(sum(df2["amount"]),
                                               grouping(df2["store"])).show(truncate=False)
    df2.cube(df2["store"], df2["product"]).agg(sum(df2["amount"]),
                                               grouping_id(df2["store"], df2["product"])).show(truncate=False)
def process_groupby_unique_columns(raw_df, process_properties, data_dict, stats_collector, acc_dict):
    if process_properties.get('groupby_unique_columns', None):
        for item in process_properties['groupby_unique_columns']:
            groupby_column_name = data_dict[item[0]]['name']
            count_unique_column_name = data_dict[item[1]]['name']
            unique_count = (raw_df
                            .groupBy(groupby_column_name)
                            .agg(countDistinct(count_unique_column_name).alias('DISTINCT_COUNT'))
                            .collect())
            key_name = 'groupby_' + groupby_column_name + '_unique_' + count_unique_column_name
            stats_collector[key_name] = dict()
            for row in unique_count:
                if row[groupby_column_name] not in stats_collector[key_name]:
                    stats_collector[key_name][row[groupby_column_name]] = row['DISTINCT_COUNT']
            print(stats_collector[key_name])
def describe_1d(df, column, nrows, lookup_config=None):
    column_type = df.select(column).dtypes[0][1]
    # TODO: think about implementing analysis for complex
    # data types:
    if ("array" in column_type) or ("struct" in column_type) or ("map" in column_type):
        raise NotImplementedError("Column {c} is of type {t} and cannot be analyzed".format(c=column, t=column_type))

    distinct_count = df.select(column).agg(countDistinct(col(column)).alias("distinct_count")).toPandas()
    non_nan_count = df.select(column).na.drop().select(count(col(column)).alias("count")).toPandas()
    results_data = pd.concat([distinct_count, non_nan_count], axis=1)
    results_data["p_unique"] = results_data["distinct_count"] / float(results_data["count"])
    results_data["is_unique"] = results_data["distinct_count"] == nrows
    results_data["n_missing"] = nrows - results_data["count"]
    results_data["p_missing"] = results_data["n_missing"] / float(nrows)
    results_data["p_infinite"] = 0
    results_data["n_infinite"] = 0
    result = results_data.iloc[0].copy()
    result["memorysize"] = 0
    result.name = column

    if result["distinct_count"] <= 1:
        result = result.append(describe_constant_1d(df, column))
    elif column_type in {"tinyint", "smallint", "int", "bigint"}:
        result = result.append(describe_integer_1d(df, column, result, nrows))
    elif column_type in {"float", "double", "decimal"}:
        result = result.append(describe_float_1d(df, column, result, nrows))
    elif column_type in {"date", "timestamp"}:
        result = result.append(describe_date_1d(df, column))
    elif result["is_unique"] == True:
        result = result.append(describe_unique_1d(df, column))
    else:
        result = result.append(describe_categorical_1d(df, column))

    # Fix to also count MISSING value in the distinct_count field:
    if result["n_missing"] > 0:
        result["distinct_count"] = result["distinct_count"] + 1

    # TODO: check whether it is worth it to
    # implement the "real" mode:
    if result["count"] > result["distinct_count"] > 1:
        try:
            result["mode"] = result["top"]
        except KeyError:
            result["mode"] = 0
    else:
        try:
            result["mode"] = result["value_counts"].index[0]
        except KeyError:
            result["mode"] = 0
        # If an IndexError happens,
        # it is because the column is all NULLs:
        except IndexError:
            result["mode"] = "MISSING"

    if lookup_config:
        lookup_object = lookup_config['object']
        col_name_in_db = lookup_config['col_name_in_db'] if 'col_name_in_db' in lookup_config else None
        try:
            matched, unmatched = lookup_object.lookup(df.select(column), col_name_in_db)
            result['lookedup_values'] = str(matched.count()) + "/" + str(df.select(column).count())
        except Exception:
            result['lookedup_values'] = 'FAILED'
    else:
        result['lookedup_values'] = ''

    return result
def main(input_path, output_path):
    sc = SparkContext(appName='Data_Analysis')

    '''
    Define spark accumulators for counting total records, valid records and empty string checking
    '''
    empty_records = {}  # accumulator check the emptiness of columns
    for column in CHECK_EMPTY_COLUMNS:
        empty_records[column] = sc.accumulator(0)
    total_records = sc.accumulator(0)
    valid_records = sc.accumulator(0)

    # collect all desired statistics
    stats_collector = {}

    # load raw dataset
    raw_rdd = sc.textFile(input_path).map(lambda x: x.split('|'))

    # validate whether data fulfills the definition of the data dictionary
    validate_rdd = raw_rdd.filter(lambda x: data_validate(x, VALIDATION_LIST, total_records, valid_records))

    '''
    Keep only the following columns: 'IMSI', 'EVENT_TYPE', 'CGI', 'DATETIME', 'BBC'
    '''
    # load fixed cell master file for cgi and bbc mapping
    cell_master_dict = {}
    with open(CELL_MASTER_FILE, 'r') as f:
        for line in f:
            line = line.strip()
            line = line.split('|')
            cell_master_dict[line[0]] = line[9]
    transform_rdd = validate_rdd.map(lambda x: data_tranform(x, cell_master_dict, empty_records))

    '''
    Filter out records for JABODETABEK
    '''
    Jabodetabek_rdd = transform_rdd.filter(lambda x: x[-1] == 'JABODETABEK').cache()
    stats_collector['Jabodetabek_records'] = Jabodetabek_rdd.count()
    print('Number of Jabodetabek records: %d' % stats_collector['Jabodetabek_records'])
    stats_collector['total_records'] = total_records.value
    print('Number of total records: %d' % stats_collector['total_records'])
    stats_collector['valid_records'] = valid_records.value
    print('Number of valid records: %d' % stats_collector['valid_records'])

    stats_collector['empty_records'] = {}
    if stats_collector['valid_records'] != 0:
        for column in CHECK_EMPTY_COLUMNS:
            stats_collector['empty_records'][column] = empty_records[column].value
            print('Empty record in column %s is %d, with percentage %.2f' %
                  (column, empty_records[column].value,
                   empty_records[column].value / float(stats_collector['valid_records'])))

    if stats_collector['Jabodetabek_records'] > 0:
        '''
        Generate the aggregate count distribution over time
        '''
        # aggregate count distribution over time per event type
        event_datetime_pair = Jabodetabek_rdd.map(lambda x: ((x[1], round_datetime(x[3], 10, 'minutes')), 1))
        agg_event_datetime_pair = event_datetime_pair.reduceByKey(lambda x, y: x + y).sortByKey(ascending=True)
        stats_collector['event_time_distribution'] = {}
        for item in agg_event_datetime_pair.collect():
            if item[0][0] not in stats_collector['event_time_distribution']:
                stats_collector['event_time_distribution'][item[0][0]] = [[item[0][1], item[1]]]
            else:
                stats_collector['event_time_distribution'][item[0][0]].append([item[0][1], item[1]])
        print('Event time records distribution: %s' % str(stats_collector['event_time_distribution']))

        # overall aggregate count distribution over time
        event_time_distribution = stats_collector['event_time_distribution']
        event_list = event_time_distribution.keys()
        time_series_dict = dict()
        for event_type in event_list:
            for item in event_time_distribution[event_type]:
                if item[0] not in time_series_dict:
                    time_series_dict[item[0]] = item[1]
                else:
                    time_series_dict[item[0]] += item[1]
        time_distribution = [[t, time_series_dict[t]] for t in time_series_dict]
        time_distribution = sorted(time_distribution, key=lambda x: x[0])
        stats_collector['time_distribution'] = time_distribution
        print('Time records distribution: %s' % str(stats_collector['time_distribution']))

        # Group by records according to different event types
        stats_collector['event_distribution'] = [
            [e_type,
             sum([event_time_distribution[e_type][i][1]
                  for i in range(len(event_time_distribution[e_type]))])]
            for e_type in event_list]
        print('Event distribution is: %s' % stats_collector['event_distribution'])

        '''
        Convert RDD to dataframe
        '''
        sql_context = SQLContext(sc)
        fields = [StructField(field_name, StringType(), True) for field_name in SCHEMA_NAMES]
        schema = StructType(fields)
        df = sql_context.createDataFrame(Jabodetabek_rdd, schema).cache()

        '''
        Unique IMSI per hour
        '''
        udf_round_datetime = udf(round_datetime, StringType())
        df = df.select(df['*'], udf_round_datetime(df['DATETIME']).alias('Hour'))
        stats_collector['imsi_per_hour'] = (df.groupBy('Hour')
                                            .agg(countDistinct('IMSI').alias('UNIQUE_IMSI'))
                                            .rdd.map(lambda x: [x['Hour'], x['UNIQUE_IMSI']])
                                            .collect())

        '''
        Number of records per imsi distribution
        '''
        count_by_imsi = df.groupBy('IMSI').count().selectExpr('count as num_records')  # ['IMSI', 'num_records']
        stats_collector['total_imsi'] = count_by_imsi.count()
        print('Total number of imsi is %d' % stats_collector['total_imsi'])
        stats = count_by_imsi.selectExpr('avg(num_records) as mean', 'stddev(num_records) as std').collect()
        mean_value = stats[0]['mean']
        std_value = stats[0]['std']
        upper_limit = mean_value + 3 * std_value  # mean + 3 * standard deviation
        count_by_imsi_filtered = count_by_imsi.filter(count_by_imsi['num_records'] <= upper_limit)
        agg_by_count = count_by_imsi_filtered.groupBy('num_records').count()
        stats_collector['imsi_distribution'] = agg_by_count.rdd.map(lambda x: [x['num_records'], x['count']]).collect()
        print('IMSI distribution is: %s' % stats_collector['imsi_distribution'])

        output_rdd = sc.parallelize([json.dumps(stats_collector)])
        output_rdd.saveAsTextFile(output_path)
    else:
        output_rdd = sc.parallelize([json.dumps(stats_collector)])
        output_rdd.saveAsTextFile(output_path)
        raise ValueError('No record for Jabodetabek Found.')
# COMMAND ----------

filterNonNullDF = nonNullDF.filter((col("firstName") == "") | (col("lastName") == "")).sort(asc("email"))
display(filterNonNullDF)

# COMMAND ----------

# MAGIC %md
# MAGIC **Example aggregations using ``agg()`` and ``countDistinct()``.**

# COMMAND ----------

from pyspark.sql.functions import countDistinct

countDistinctDF = nonNullDF.select("firstName", "lastName")\
  .groupBy("firstName", "lastName")\
  .agg(countDistinct("firstName"))
display(countDistinctDF)

# COMMAND ----------

# MAGIC %md
# MAGIC **Compare the DataFrame and SQL Query Physical Plans**
# MAGIC **(Hint: They should be the same.)**

# COMMAND ----------

countDistinctDF.explain()

# COMMAND ----------
def main_process(raw_rdd, sc, output_path, data_dict, process_properties):
    sql_context = SQLContext(sc)
    stats_collector = dict()
    stats_collector['data_dict'] = data_dict
    stats_collector['process_properties'] = process_properties

    # initialize accumulator
    acc_dict = initial_accumulator_dict(sc, data_dict)

    # data validation process
    valid_rdd = raw_rdd.filter(lambda x: data_validation(x, data_dict, acc_dict))

    if process_properties.get('time_distribution', None):
        if 'timestamp_format' not in process_properties:
            raise Exception(
                'Time distribution cannot be derived without timestamp information specified in properties.')
        frequency = process_properties['time_distribution'].get('frequency', 10)
        frequency_unit = process_properties['time_distribution'].get('unit', 'minutes')
        event_datetime_pair = valid_rdd.map(lambda x: datetime_distribution_pair(x, process_properties, data_dict,
                                                                                 frequency, frequency_unit))
        agg_event_datetime_result = (event_datetime_pair.reduceByKey(lambda x, y: x + y)
                                     .sortByKey(ascending=True)
                                     .collect())
        process_agg_event_datetime_result(agg_event_datetime_result, process_properties, stats_collector, data_dict)
        store_accumulator_dict(acc_dict, stats_collector)

    if process_properties.get('time_gap_distribution', None):
        if 'timestamp_format' not in process_properties:
            raise Exception(
                'Time distribution cannot be derived without timestamp information specified in properties.')
        column_number = process_properties['time_gap_distribution']['index']
        return_range = process_properties['time_gap_distribution'].get('return_std_range', 3)
        stats_method = process_properties['time_gap_distribution'].get('stats_method', 'median')
        resolution = process_properties['time_gap_distribution'].get('resolution', '60')
        threshold = process_properties['time_gap_distribution'].get('threshold', 3600 * 24)
        column_name = data_dict[column_number]['name']
        stats_key = 'gap_distribution_' + column_name
        stats_collector[stats_key] = {}
        agg_result = (valid_rdd.map(lambda x: (x[column_number],
                                               construct_datetime(x, process_properties['timestamp_format'],
                                                                  data_dict)))
                      .groupByKey()
                      .map(lambda x: (x[0], generate_gap_stats(x[1], stats_method, resolution, threshold)))
                      .filter(lambda x: (x[1] != 0) and (x[1] != None))
                      .map(lambda x: (x[1], 1))
                      .reduceByKey(lambda x, y: x + y)
                      .collect())
        mean_value = hist_mean(agg_result)
        std_value = hist_std(agg_result)
        median_value = hist_median(agg_result)
        stats_collector[stats_key]['mean'] = mean_value
        stats_collector[stats_key]['std'] = std_value
        stats_collector[stats_key]['median'] = median_value
        upper_limit = mean_value + return_range * std_value  # mean + return_range * standard deviation
        stats_collector[stats_key]['distribution'] = [[item[0], item[1]] for item in agg_result
                                                      if item[0] <= upper_limit]
        store_accumulator_dict(acc_dict, stats_collector)

    # Test
    if process_properties.get('hourly_time_gap_distribution', None):
        if 'timestamp_format' not in process_properties:
            raise Exception(
                'Time distribution cannot be derived without timestamp information specified in properties.')

        def hour_gap_pair_list(spark_iterable, resolution, threshold):
            assert isinstance(resolution, int) and resolution > 0, \
                'Please provide proper resolution in terms of seconds.'
            if len(spark_iterable) > 1:
                dt_list = [datetime.datetime.strptime(dt, '%Y-%m-%d %H:%M:%S') for dt in spark_iterable]
                dt_list = sorted(dt_list)
                gap_list = []
                for i in range(1, len(dt_list)):
                    gap = (dt_list[i] - dt_list[i - 1]).seconds
                    if gap < threshold:
                        hour = round_datetime(datetime.datetime.strftime(dt_list[i - 1], '%Y-%m-%d %H:%M:%S'),
                                              sample_f=1, unit='hours')
                        gap_list.append(((hour, gap // resolution), 1))
                if len(gap_list) >= 1:
                    return gap_list
                else:
                    return [((0, 0), 1)]
            else:
                return [((0, 0), 1)]

        column_number = process_properties['hourly_time_gap_distribution']['index']
        return_range = process_properties['hourly_time_gap_distribution'].get('return_std_range', 3)
        resolution = process_properties['hourly_time_gap_distribution'].get('resolution', 60)
        threshold = process_properties['hourly_time_gap_distribution'].get('threshold', 3600)
        column_name = data_dict[column_number]['name']
        stats_key = 'hourly_gap_distribution_' + column_name
        stats_collector[stats_key] = dict()
        agg_result = (valid_rdd.map(lambda x: (x[column_number],
                                               construct_datetime(x, process_properties['timestamp_format'],
                                                                  data_dict)))
                      .groupByKey()
                      .flatMap(lambda x: hour_gap_pair_list(x[1], resolution, threshold))
                      .filter(lambda x: (x[0][1] != None) and (x[0][1] != 0))
                      .reduceByKey(lambda x, y: x + y)
                      .collect())
        hourly_agg_result = dict()
        for item in agg_result:
            if item[0][0] not in hourly_agg_result:
                hourly_agg_result[item[0][0]] = {item[0][1]: item[1]}
            else:
                if item[0][1] not in hourly_agg_result[item[0][0]]:
                    hourly_agg_result[item[0][0]][item[0][1]] = item[1]
                else:
                    hourly_agg_result[item[0][0]][item[0][1]] += item[1]
        for hour in hourly_agg_result:
            stats_collector[stats_key][hour] = dict()
            agg_result = [[item[0], item[1]] for item in hourly_agg_result[hour].items()]
            mean_value = hist_mean(agg_result)
            std_value = hist_std(agg_result)
            median_value = hist_median(agg_result)
            stats_collector[stats_key][hour]['mean'] = mean_value
            stats_collector[stats_key][hour]['std'] = std_value
            stats_collector[stats_key][hour]['median'] = median_value
            upper_limit = mean_value + return_range * std_value  # mean + return_range * standard deviation
            stats_collector[stats_key][hour]['distribution'] = [[item[0], item[1]] for item in agg_result
                                                                if item[0] <= upper_limit]

    if process_properties.get('unique_time_distribution', None):
        column_number_list = process_properties['unique_time_distribution']['columns']
        for column_number in column_number_list:
            column_name = data_dict[column_number]['name']
            stats_key = 'unique_time_distribution_' + column_name
            frequency = process_properties['unique_time_distribution'].get('frequency', 1)
            frequency_unit = process_properties['unique_time_distribution'].get('unit', 'hours')
            gap_rdd = (valid_rdd.map(lambda x: (x[column_number],
                                                round_datetime(construct_datetime(x,
                                                                                  process_properties['timestamp_format'],
                                                                                  data_dict),
                                                               frequency, frequency_unit))))
            gap_df = gap_rdd.toDF().selectExpr('_2 as DATETIME', '_1 as COLUMN')
            stats_collector[stats_key] = (gap_df.groupBy('DATETIME')
                                          .agg(countDistinct('COLUMN').alias('UNIQUE_COLUMN'))
                                          .rdd.map(lambda x: [x['DATETIME'], x['UNIQUE_COLUMN']])
                                          .collect())
        store_accumulator_dict(acc_dict, stats_collector)

    if process_properties.get('unique_columns', None):
        if 'raw_df' not in locals():
            raw_df = generate_raw_dataframe(valid_rdd, sql_context, data_dict).cache()
        for index in process_properties['unique_columns']:
            unique_count = (raw_df.groupBy()
                            .agg(countDistinct(data_dict[index]['name']).alias('DISTINCT_COUNT'))
                            .collect())
            stats_collector['unique_' + data_dict[index]['name']] = unique_count[0]['DISTINCT_COUNT']
        store_accumulator_dict(acc_dict, stats_collector)

    if process_properties.get('groupby_unique_columns', None):
        if 'raw_df' not in locals():
            raw_df = generate_raw_dataframe(valid_rdd, sql_context, data_dict).cache()
        for item in process_properties['groupby_unique_columns']:
            groupby_column = item[0]
            groupby_column_name = data_dict[item[0]]['name']
            count_unique_column = item[1]
            count_unique_column_name = data_dict[item[1]]['name']
            unique_count = (raw_df
                            .groupBy(groupby_column_name)
                            .agg(countDistinct(count_unique_column_name).alias('DISTINCT_COUNT'))
                            .collect())
            key_name = 'groupby_' + groupby_column_name + '_unique_' + count_unique_column_name
            stats_collector[key_name] = dict()
            for row in unique_count:
                if row[groupby_column_name] not in stats_collector[key_name]:
                    stats_collector[key_name][row[groupby_column_name]] = row['DISTINCT_COUNT']
        store_accumulator_dict(acc_dict, stats_collector)

    if process_properties.get('groupby_columns', None):
        if 'raw_df' not in locals():
            raw_df = generate_raw_dataframe(valid_rdd, sql_context, data_dict).cache()
        groupby_column_index = process_properties['groupby_columns']
        groupby_column_names = [data_dict[index]['name'] for index in groupby_column_index]
        groupby_stats = raw_df.groupBy(groupby_column_names).count().collect()
        for column in groupby_column_names:
            stats_collector['groupby_' + column] = dict()
        for row in groupby_stats:
            for column in groupby_column_names:
                stats_collector['groupby_' + column][row[column]] = \
                    stats_collector['groupby_' + column].get(row[column], 0) + row['count']
        store_accumulator_dict(acc_dict, stats_collector)

    if process_properties.get('transaction_distribution', None):
        if 'raw_df' not in locals():
            raw_df = generate_raw_dataframe(valid_rdd, sql_context, data_dict).cache()
        column_number = process_properties['transaction_distribution']['index']
        column_name = data_dict[column_number]['name']
        return_range = process_properties['transaction_distribution']['return_std_range']
        stats_key = 'records_distribution_' + column_name
        stats_collector[stats_key] = {}
        agg_by_count = (raw_df
                        .groupBy(column_name).count()
                        .selectExpr('count as num_records')
                        .groupBy('num_records')
                        .count())
        agg_result = agg_by_count.rdd.map(lambda x: [x['num_records'], x['count']]).collect()
        mean_value = hist_mean(agg_result)
        std_value = hist_std(agg_result)
        median_value = hist_median(agg_result)
        stats_collector[stats_key]['mean'] = mean_value
        stats_collector[stats_key]['std'] = std_value
        stats_collector[stats_key]['median'] = median_value
        upper_limit = mean_value + return_range * std_value  # mean + return_range * standard deviation
        stats_collector[stats_key]['distribution'] = [[item[0], item[1]] for item in agg_result
                                                      if item[0] <= upper_limit]
        store_accumulator_dict(acc_dict, stats_collector)

    output_rdd = sc.parallelize([json.dumps(stats_collector)])
    output_rdd.saveAsTextFile(output_path)
def main_process(raw_rdd, sc, output_path, data_dict, process_properties):
    # ------------------------------------------------------------------------------------------------------------------
    # ------------------------Initialization----------------------------------------------------------------------------
    # initialize sql context
    sql_context = SQLContext(sc)
    stats_collector = dict()
    stats_collector['data_dict'] = data_dict
    stats_collector['process_properties'] = process_properties

    # initialize accumulator
    acc_dict = initial_accumulator_dict(sc, data_dict)

    # data validation process
    valid_rdd = raw_rdd.filter(lambda x: data_validation(x, data_dict, acc_dict))

    # ------------------------------------------------------------------------------------------------------------------
    # ------------------------time distribution of number of records----------------------------------------------------
    if process_properties.get('time_distribution', None):
        if 'timestamp_format' not in process_properties:
            raise Exception(
                'Time distribution cannot be derived without timestamp information specified in properties.')
        frequency = process_properties['time_distribution'].get('frequency', 10)
        frequency_unit = process_properties['time_distribution'].get('unit', 'minutes')
        event_datetime_pair = valid_rdd.map(lambda x: datetime_distribution_pair(x, process_properties, data_dict,
                                                                                 frequency, frequency_unit))
        agg_event_datetime_result = (event_datetime_pair.reduceByKey(lambda x, y: x + y)
                                     .sortByKey(ascending=True)
                                     .collect())
        process_agg_event_datetime_result(agg_event_datetime_result, process_properties, stats_collector, data_dict)
        store_accumulator_dict(acc_dict, stats_collector)

    # ------------------------------------------------------------------------------------------------------------------
    # ------------------------distribution of median (mean) of time interval per imsi ----------------------------------
    if process_properties.get('time_gap_distribution', None):
        if 'timestamp_format' not in process_properties:
            raise Exception(
                'Time distribution cannot be derived without timestamp information specified in properties.')
        column_number = process_properties['time_gap_distribution']['index']
        return_range = process_properties['time_gap_distribution'].get('return_std_range', 3)
        stats_method = process_properties['time_gap_distribution'].get('stats_method', 'median')
        resolution = process_properties['time_gap_distribution'].get('resolution', '60')
        threshold = process_properties['time_gap_distribution'].get('threshold', 3600 * 24)
        column_name = data_dict[column_number]['name']
        stats_key = 'gap_distribution_' + column_name
        stats_collector[stats_key] = {}
        agg_result = (valid_rdd.map(lambda x: (x[column_number],
                                               construct_datetime(x, process_properties['timestamp_format'],
                                                                  data_dict)))
                      .groupByKey()
                      .map(lambda x: (x[0], generate_gap_stats(x[1], stats_method, resolution, threshold)))
                      .filter(lambda x: (x[1] != 0) and (x[1] != None))
                      .map(lambda x: (x[1], 1))
                      .reduceByKey(lambda x, y: x + y)
                      .collect())
        mean_value = hist_mean(agg_result)
        std_value = hist_std(agg_result)
        median_value = hist_median(agg_result)
        stats_collector[stats_key]['mean'] = mean_value
        stats_collector[stats_key]['std'] = std_value
        stats_collector[stats_key]['median'] = median_value
        upper_limit = mean_value + return_range * std_value  # mean + return_range * standard deviation
        stats_collector[stats_key]['distribution'] = [[item[0], item[1]] for item in agg_result
                                                      if item[0] <= upper_limit]
        store_accumulator_dict(acc_dict, stats_collector)

    # ------------------------------------------------------------------------------------------------------------------
    # ------------------------hourly time interval distribution --------------------------------------------------------
    if process_properties.get('hourly_time_gap_distribution', None):
        if 'timestamp_format' not in process_properties:
            raise Exception(
                'Time distribution cannot be derived without timestamp information specified in properties.')
        column_number = process_properties['hourly_time_gap_distribution']['index']
        return_range = process_properties['hourly_time_gap_distribution'].get('return_std_range', 3)
        resolution = process_properties['hourly_time_gap_distribution'].get('resolution', 60)
        threshold = process_properties['hourly_time_gap_distribution'].get('threshold', 3600)
        column_name = data_dict[column_number]['name']
        stats_key = 'hourly_gap_distribution_' + column_name
        stats_collector[stats_key] = dict()
        agg_result = (valid_rdd.map(lambda x: (x[column_number],
                                               construct_datetime(x, process_properties['timestamp_format'],
                                                                  data_dict)))
                      .groupByKey()
                      .flatMap(lambda x: hour_gap_pair_list(x[1], resolution, threshold))
                      .filter(lambda x: (x[0][1] != None) and (x[0][1] != 0))
                      .reduceByKey(lambda x, y: x + y)
                      .collect())
        hourly_agg_result = dict()
        for item in agg_result:
            if item[0][0] not in hourly_agg_result:
                hourly_agg_result[item[0][0]] = {item[0][1]: item[1]}
            else:
                if item[0][1] not in hourly_agg_result[item[0][0]]:
                    hourly_agg_result[item[0][0]][item[0][1]] = item[1]
                else:
                    hourly_agg_result[item[0][0]][item[0][1]] += item[1]
        for hour in hourly_agg_result:
            stats_collector[stats_key][hour] = dict()
            agg_result = [[item[0], item[1]] for item in hourly_agg_result[hour].items()]
            mean_value = hist_mean(agg_result)
            std_value = hist_std(agg_result)
            median_value = hist_median(agg_result)
            stats_collector[stats_key][hour]['mean'] = mean_value
            stats_collector[stats_key][hour]['std'] = std_value
            stats_collector[stats_key][hour]['median'] = median_value
            upper_limit = mean_value + return_range * std_value  # mean + return_range * standard deviation
            stats_collector[stats_key][hour]['distribution'] = [[item[0], item[1]] for item in agg_result
                                                                if item[0] <= upper_limit]

    # ------------------------------------------------------------------------------------------------------------------
    # ------------------------time distribution of unique records ------------------------------------------------------
    if process_properties.get('unique_time_distribution', None):
        column_number_list = process_properties['unique_time_distribution']['columns']
        for column_number in column_number_list:
            column_name = data_dict[column_number]['name']
            stats_key = 'unique_time_distribution_' + column_name
            frequency = process_properties['unique_time_distribution'].get('frequency', 1)
            frequency_unit = process_properties['unique_time_distribution'].get('unit', 'hours')
            gap_rdd = (valid_rdd.map(lambda x: (x[column_number],
                                                round_datetime(construct_datetime(x,
                                                                                  process_properties['timestamp_format'],
                                                                                  data_dict),
                                                               frequency, frequency_unit))))
            gap_df = gap_rdd.toDF().selectExpr('_2 as DATETIME', '_1 as COLUMN')
            stats_collector[stats_key] = (gap_df.groupBy('DATETIME')
                                          .agg(countDistinct('COLUMN').alias('UNIQUE_COLUMN'))
                                          .rdd.map(lambda x: [x['DATETIME'], x['UNIQUE_COLUMN']])
                                          .collect())
        store_accumulator_dict(acc_dict, stats_collector)

    # ------------------------------------------------------------------------------------------------------------------
    # ------------------------unique values of columns------------------------------------------------------------------
    if process_properties.get('unique_columns', None):
        if 'raw_df' not in locals():
            raw_df = generate_raw_dataframe(valid_rdd, sql_context, data_dict).cache()
        for index in process_properties['unique_columns']:
            unique_count = (raw_df.groupBy()
                            .agg(countDistinct(data_dict[index]['name']).alias('DISTINCT_COUNT'))
                            .collect())
            stats_collector['unique_' + data_dict[index]['name']] = unique_count[0]['DISTINCT_COUNT']
        store_accumulator_dict(acc_dict, stats_collector)

    if process_properties.get('groupby_unique_columns', None):
        if 'raw_df' not in locals():
            raw_df = generate_raw_dataframe(valid_rdd, sql_context, data_dict).cache()
        for item in process_properties['groupby_unique_columns']:
            groupby_column = item[0]
            groupby_column_name = data_dict[item[0]]['name']
            count_unique_column = item[1]
            count_unique_column_name = data_dict[item[1]]['name']
            unique_count = (raw_df
                            .groupBy(groupby_column_name)
                            .agg(countDistinct(count_unique_column_name).alias('DISTINCT_COUNT'))
                            .collect())
            key_name = 'groupby_' + groupby_column_name + '_unique_' + count_unique_column_name
            stats_collector[key_name] = dict()
            for row in unique_count:
                if row[groupby_column_name] not in stats_collector[key_name]:
                    stats_collector[key_name][row[groupby_column_name]] = row['DISTINCT_COUNT']
        store_accumulator_dict(acc_dict, stats_collector)

    # ------------------------------------------------------------------------------------------------------------------
    # ------------------------total records groupby certain columns-----------------------------------------------------
    if process_properties.get('groupby_columns', None):
        if 'raw_df' not in locals():
            raw_df = generate_raw_dataframe(valid_rdd, sql_context, data_dict).cache()
        groupby_column_index = process_properties['groupby_columns']
        groupby_column_names = [data_dict[index]['name'] for index in groupby_column_index]
        groupby_stats = raw_df.groupBy(groupby_column_names).count().collect()
        for column in groupby_column_names:
            stats_collector['groupby_' + column] = dict()
        for row in groupby_stats:
            for column in groupby_column_names:
                stats_collector['groupby_' + column][row[column]] = \
                    stats_collector['groupby_' + column].get(row[column], 0) + row['count']
        store_accumulator_dict(acc_dict, stats_collector)

    # ------------------------------------------------------------------------------------------------------------------
    # ------------------------distribution of records update per imsi---------------------------------------------------
    if process_properties.get('transaction_distribution', None):
        if 'raw_df' not in locals():
            raw_df = generate_raw_dataframe(valid_rdd, sql_context, data_dict).cache()
        column_number = process_properties['transaction_distribution']['index']
        column_name = data_dict[column_number]['name']
        return_range = process_properties['transaction_distribution']['return_std_range']
        stats_key = 'records_distribution_' + column_name
        stats_collector[stats_key] = {}
        agg_by_count = (raw_df
                        .groupBy(column_name).count()
                        .selectExpr('count as num_records')
                        .groupBy('num_records')
                        .count())
        agg_result = agg_by_count.rdd.map(lambda x: [x['num_records'], x['count']]).collect()
        mean_value = hist_mean(agg_result)
        std_value = hist_std(agg_result)
        median_value = hist_median(agg_result)
        stats_collector[stats_key]['mean'] = mean_value
        stats_collector[stats_key]['std'] = std_value
        stats_collector[stats_key]['median'] = median_value
        upper_limit = mean_value + return_range * std_value  # mean + return_range * standard deviation
        stats_collector[stats_key]['distribution'] = [[item[0], item[1]] for item in agg_result
                                                      if item[0] <= upper_limit]
        store_accumulator_dict(acc_dict, stats_collector)

    # ------------------------------------------------------------------------------------------------------------------
    # ------------------------dump all results to json file-------------------------------------------------------------
    output_rdd = sc.parallelize([json.dumps(stats_collector)])
    output_rdd.saveAsTextFile(output_path)
.load("/data/retail-data/all/*.csv")\ .coalesce(5) df.cache() df.createOrReplaceTempView("dfTable") # COMMAND ---------- from pyspark.sql.functions import count df.select(count("StockCode")).show() # 541909 # COMMAND ---------- from pyspark.sql.functions import countDistinct df.select(countDistinct("StockCode")).show() # 4070 # COMMAND ---------- from pyspark.sql.functions import approx_count_distinct df.select(approx_count_distinct("StockCode", 0.1)).show() # 3364 # COMMAND ---------- from pyspark.sql.functions import first, last df.select(first("StockCode"), last("StockCode")).show() # COMMAND ----------