def get_avg_intensity(data, date, period=7, country_list=None):
    """Calculate the average intensity over the `period` days ending at `date`.

    Intensity for a client-day is active time (active_ticks * 5 seconds)
    divided by total subsession time, capped at 1.

    Parameters:
        data: sample of the main server ping data frame
        date: string, with the format of 'yyyyMMdd'
        period: The number of days before to run the analysis on.
        country_list: a list of country names in string

    Returns:
        a dataframe with three columns:
        'submission_date_s3', 'country', 'avg_intensity'
    """
    data_all = keep_countries_and_all(data, country_list)
    begin = date_plus_x_days(date, -period)

    window_cond = ("submission_date_s3 <= '{0}' and "
                   "submission_date_s3 > '{1}'".format(date, begin))

    # Per-client, per-day totals; drop impossible values up front
    # (subsession_length outside (0, 86400], active_ticks above one day).
    daily = (
        data_all
        .filter(window_cond)
        .filter("subsession_length <= 86400")
        .filter("subsession_length > 0")
        .filter('active_ticks <= 17280')
        .groupBy('country', 'client_id', 'submission_date_s3')
        .agg(F.sum('subsession_length').alias('total_daily_time'),
             F.sum('active_ticks').alias('total_daily_ticks')))

    # Cap totals at one day's worth: 86400 seconds / 17280 ticks of 5 s.
    capped = daily.select(
        'country', 'client_id', 'submission_date_s3',
        F.when(F.col('total_daily_time') > 86400, 86400)
         .otherwise(F.col('total_daily_time'))
         .alias('total_daily_time'),
        F.when(F.col('total_daily_ticks') > 17280, 17280)
         .otherwise(F.col('total_daily_ticks'))
         .alias('total_daily_ticks'))

    # daily_intensity = active seconds / total seconds, capped at 1.
    intensity = (
        capped
        .select('*',
                (col('total_daily_ticks') * 5 / col('total_daily_time'))
                .alias('daily_intensity'))
        .select('country', 'client_id', 'submission_date_s3',
                F.when(F.col('daily_intensity') > 1, 1)
                 .otherwise(F.col('daily_intensity'))
                 .alias('daily_intensity')))

    # Average per client across the window, then across clients per country.
    per_country = (
        intensity
        .groupBy('country', 'client_id')
        .agg(F.avg('daily_intensity').alias('avg_7d_intensity'))
        .groupBy('country')
        .agg(F.avg('avg_7d_intensity').alias('avg_intensity'))
        .select(lit(date).alias('submission_date_s3'), '*'))

    return per_country.orderBy('submission_date_s3', 'country')
def top_10_addons_on_date(data, date, topN, period=7, country_list=None):
    """Get the top N addons by weekly user count, broken down by country.

    Counts distinct users who had each self-installed (non-system,
    non-side-loaded, non-Mozilla) addon active at any point in the
    `period` days ending at `date`.

    Parameters:
        data - The main ping server.
        date - The day you which you want to get the top N addons.
        topN - the number of addons to get.
        period - number of days to use to calculate metric
        country_list - a list of country names in string

    Returns:
        Dataframe containing the share of users using each of the addons,
        with columns:
        submission_date_s3, country, addon_id, addon_name, pct_with_addon
    """
    # Keep only user-installed, non-Mozilla addons.
    addon_filter = (~col('addon.is_system')) & (~col('addon.foreign_install')) & \
        (~col('addon.addon_id').isin(NON_MOZ_TP)) & \
        (~col('addon.addon_id').like('%@mozilla%')) & \
        (~col('addon.addon_id').like('%@shield.mozilla%')) & \
        (~col('addon.addon_id').like('%' + UNIFIED_SEARCH_STR + '%'))

    data_all = keep_countries_and_all(data, country_list)
    begin = date_plus_x_days(date, -period)

    # Weekly active users per country: the denominator for the percentage.
    wau = data_all \
        .filter((col('submission_date_s3') > begin) &
                (col('submission_date_s3') <= date)) \
        .groupBy('country') \
        .agg(lit(date).alias('submission_date_s3'),
             F.countDistinct('client_id').alias('wau'))

    # row_number() already implies the unboundedPreceding..currentRow frame,
    # so no explicit rowsBetween is needed (and newer Spark rejects any
    # other frame for rank functions).
    rank_window = Window.partitionBy('country').orderBy(desc('number_of_users'))

    counts = data_all \
        .select('submission_date_s3', 'country', 'client_id',
                F.explode('active_addons').alias('addon')) \
        .filter((col('submission_date_s3') > begin) &
                (col('submission_date_s3') <= date)) \
        .filter(addon_filter) \
        .select('country', 'client_id', 'addon.addon_id', 'addon.name') \
        .distinct() \
        .groupBy('country', 'addon_id') \
        .agg(F.count('*').alias('number_of_users'),
             F.last('name').alias('name')) \
        .select('*',
                lit(date).alias('submission_date_s3'),
                lit(begin).alias('start_date'),
                F.row_number().over(rank_window).alias('rank')) \
        .filter(col('rank') <= topN)

    return counts.join(F.broadcast(wau), on=['country'], how='left') \
        .select(lit(date).alias('submission_date_s3'), 'country', 'addon_id',
                col('name').alias('addon_name'),
                (100.0 * col('number_of_users') / col('wau'))
                .alias('pct_with_addon'))
def getPAU(data, date, period, sample_factor=1, country_list=None):
    """Calculate the PAU for a given period ending at `date`.

    This function is fast for finding the PAU for a small number of dates.

    Paramaters:
        data - This should be a sample of the main server ping data frame.
        date - The day to calulate PAU for. This is given in epoch time.
        period - The number of days that we count distinct number of users.
                 For example MAU has a period = 28 and YAU has a
                 period = 365.
        sample_factor - the factor to multiply counts by, pre-calculated
                        based on sample
        country_list - A list of countries that we want to calculate the
                       PAU for.

    Output:
        A data frame with 3 columns:
        submission_date_s3, country, PAU(WAU/MAU/YAU).
    """
    # Column names depend on the period (e.g. 'MAU', 'start_date_MAU').
    active_users_col = PERIOD_DESC.get(period, "other")
    start_date_col = 'start_date_' + PERIOD_DESC.get(period, "other")
    begin = date_plus_x_days(date, -period)

    def count_distinct_clients(frame):
        # Distinct clients per country inside (begin, date], scaled up by
        # the sampling factor.
        in_window = (frame
                     .filter(col('submission_date_s3') > begin)
                     .filter(col('submission_date_s3') <= date))
        return (in_window
                .groupBy('country')
                .agg((sample_factor * countDistinct('client_id'))
                     .alias(active_users_col))
                .select('*',
                        lit(begin).alias(start_date_col),
                        lit(date).alias('submission_date_s3')))

    # Always compute the worldwide ('All') rows.
    result = count_distinct_clients(
        data.drop('country').select('*', lit('All').alias('country')))

    # Append per-country rows when a country list was supplied.
    if country_list is not None:
        per_country = count_distinct_clients(
            data.filter(col('country').isin(country_list)))
        result = result.union(per_country)

    return result.select('submission_date_s3', 'country', active_users_col)
def get_daily_avg_session(data, date, period=7, country_list=None):
    """Calculate average daily usage over the `period` days ending at `date`.

    Parameters:
        data: sample of the main server ping data frame
        date: string, with the format of 'yyyyMMdd'
        period: The number of days before to run the analysis on.
        country_list: a list of country names in string

    Returns:
        a dataframe with three columns:
        'submission_date_s3', 'country', 'avg_daily_usage(hours)'
    """
    data_all = keep_countries_and_all(data, country_list)
    begin = date_plus_x_days(date, -period)

    # Per-client daily totals. Drop impossible subsession lengths and cap
    # the daily total at 24 hours (86400 s). Parenthesized chaining avoids
    # the dropped-backslash continuation bug the original had between the
    # two subsession_length filters.
    data_agg = (
        data_all
        .filter("submission_date_s3 <= '{}' and submission_date_s3 > '{}'"
                .format(date, begin))
        .filter("subsession_length <= 86400")
        .filter("subsession_length > 0")
        .groupBy('country', 'client_id', 'submission_date_s3')
        .agg(F.sum('subsession_length').alias('total_daily_time'))
        .select('country', 'client_id', 'submission_date_s3',
                F.when(F.col('total_daily_time') > 86400, 86400)
                 .otherwise(F.col('total_daily_time'))
                 .alias('total_daily_time')))

    # Average per client across the window, then across clients per country.
    country_avg_session = (
        data_agg
        .groupBy('country', 'client_id')
        .agg(F.avg('total_daily_time').alias('client_7d_avg'))
        .groupBy('country')
        .agg(F.avg('client_7d_avg').alias('avg_daily_subsession_length'))
        .select(lit(date).alias('submission_date_s3'), '*'))

    df = country_avg_session.orderBy('submission_date_s3', 'country')
    # Convert seconds to hours for the reported column.
    df = df.withColumn('avg_daily_usage(hours)',
                       df.avg_daily_subsession_length / 3600)
    return df.select('submission_date_s3', 'country', 'avg_daily_usage(hours)')
def new_users(data, date, period=7, country_list=None):
    """Gets the percentage of WAU that are new users.

    A client counts as new when its profile_creation_date falls inside the
    same (begin, date] window as the submission dates.

    Parameters:
        data - This should be the entire main server ping data frame.
        date - data to start calculating for
        period - The number of days before looked at in the analysis
        country_list - A list of countries that we want to calculate the
                       metric for.

    Returns:
        A dataframe with columns: submission_date_s3, country, pct_new_user
    """
    seconds_per_day = 24 * 60 * 60
    cols = ['submission_date_s3', 'client_id', 'profile_creation_date',
            'country']
    wau = getWAU(data, date, country_list)

    # Worldwide rows are labeled 'All'; per-country rows are appended when
    # a country list was supplied.
    df = data.drop('country').select('*', lit('All').alias('country'))
    if country_list is not None:
        df = (
            df.select(cols).union(data.select(cols)
                                      .filter(col('country').isin(country_list))))

    begin = date_plus_x_days(date, -period)
    # profile_creation_date is stored as days since epoch; convert to epoch
    # seconds before formatting as yyyyMMdd for the window comparison.
    new_profiles = (df.filter(df.submission_date_s3 <= date)
                      .filter(df.submission_date_s3 > begin)
                      .withColumn('pcd_str',
                                  from_unixtime(col('profile_creation_date') *
                                                seconds_per_day,
                                                format='yyyyMMdd'))
                      .filter(col('pcd_str') <= date)
                      .filter(col('pcd_str') > begin))

    new_user_counts = (
        new_profiles
        .groupBy('country')
        .agg((countDistinct('client_id')).alias('new_users')))

    # Left join: countries with WAU but no new users keep a null ratio.
    return wau.join(new_user_counts, on=['country'], how='left')\
        .select('submission_date_s3', 'country',
                (100.0 * col('new_users') / col('WAU')).alias('pct_new_user'))
def pct_new_version(data, date, period=7, country_list=None,
                    url=RELEASE_VERSIONS_URL):
    """Calculate the proportion of active users on the latest release version.

    Parameters:
        data: sample of the main server ping data frame
        date: The day to calculate the metric
        period: number of days to use to calculate metric
        country_list: a list of country names in string
        url: path to the json file containing all the firefox release
             information to date

    Returns:
        a dataframe with three columns:
        'submission_date_s3', 'country', 'pct_latest_version'
    """
    data_all = keep_countries_and_all(data, country_list)
    begin = date_plus_x_days(date, -period)
    latest_version = get_latest_version(date, url)

    # NOTE(review): this window is `>= begin`, i.e. period+1 days, unlike
    # the `> begin` used by the sibling weekly metrics — confirm intended.
    # Raw string r'\.' splits on a literal dot (the original '\.' was an
    # invalid escape sequence in a non-raw string).
    data_filtered = data_all.filter("""
        {0} >= '{1}' and {0} <= '{2}'
        """.format("submission_date_s3", begin, date))\
        .withColumn('app_major_version',
                    split('app_version', r'\.').getItem(0))\
        .select('submission_date_s3', 'client_id', 'app_major_version',
                'country')

    WAU = data_filtered.groupBy('country')\
        .agg(F.countDistinct('client_id').alias('WAU'))

    # NOTE(review): this compares version strings lexicographically
    # ('9' >= '10'); verify major versions are zero-safe for this compare.
    WAU_latest = data_filtered\
        .filter(F.col('app_major_version') >= F.lit(latest_version))\
        .groupBy('country')\
        .agg(F.countDistinct('client_id').alias('WAU_is_latest'))

    # Countries with no users on the latest version get 0 instead of null.
    join_df = WAU.join(WAU_latest, 'country', 'left')\
        .withColumn("pct_latest_version",
                    (100.0 * F.col("WAU_is_latest") / F.col("WAU")))\
        .select(F.lit(date).alias('submission_date_s3'),
                'country',
                F.coalesce('pct_latest_version', F.lit(0))
                 .alias('pct_latest_version'))
    return join_df
def os_on_date(data, date, period=7, country_list=None):
    """Gets the distribution of OS usage calculated on the WAU on 1 day.

    Parameters:
        data: Usually the main summary data frame.
        date: day to get the os distribution for the past week.
        period: The number of days to calculate the distibution. By default
                it finds os distribution over a week.
        country_list: The countries to do the analysis. If None then it
                      does it for the whole world.

    Returns:
        submission_date_s3, country, os, pct_on_os
    """
    data_all = keep_countries_and_all(data, country_list)
    begin = date_plus_x_days(date, -period)

    data_all = data_all.select(
        'client_id', 'submission_date_s3', 'country',
        nice_os(col('os'), col('os_version')).alias('nice_os'))

    # Restrict to the (begin, date] window once; both aggregates share it.
    in_window = data_all.filter(
        (col('submission_date_s3') <= date) &
        (col('submission_date_s3') > begin))

    # Weekly active users per country: the denominator for the percentage.
    wau = in_window \
        .groupBy('country') \
        .agg(countDistinct('client_id').alias('WAU'))

    # Weekly active users per (country, OS): the numerator.
    os_wau = in_window \
        .groupBy('country', 'nice_os') \
        .agg(countDistinct('client_id').alias('WAU_on_OS')) \
        .select(lit(begin).alias('start_date'),
                lit(date).alias('submission_date_s3'),
                'country', 'WAU_on_OS', 'nice_os')

    res = os_wau.join(wau, 'country', how='left') \
        .select('start_date', 'submission_date_s3', 'country', 'WAU_on_OS',
                'nice_os', 'WAU')

    return res.select('submission_date_s3', 'country',
                      col('nice_os').alias('os'),
                      (100.0 * col('WAU_on_OS') / col('WAU'))
                      .alias('pct_on_os'))
def get_addon(data, date, period=7, country_list=None):
    """Calculate the proportion of WAU with a "self installed" addon.

    Parameters:
        data: sample of the main server ping data frame
        date: string, with the format of 'yyyyMMdd'
        period: The number of days before looked at in the analysis
        country_list: a list of country names in string

    Returns:
        a dataframe with three columns:
        'submission_date_s3', 'country', 'pct_addon'
    """
    data_all = keep_countries_and_all(data, country_list)
    begin = date_plus_x_days(date, -period)

    window_cond = ("submission_date_s3 <= '{0}' and "
                   "submission_date_s3 > '{1}'".format(date, begin))

    # Exclude system addons, side-loads, and Mozilla-shipped addons.
    self_installed = (
        (~F.col('addon.is_system')) &
        (~F.col('addon.foreign_install')) &
        (~F.col('addon.addon_id').isin(NON_MOZ_TP)) &
        (~F.col('addon.addon_id').like('%@mozilla%')) &
        (~F.col('addon.addon_id').like('%@shield.mozilla%')) &
        (~F.col('addon.addon_id').like('%' + UNIFIED_SEARCH_STR + '%')))

    # Weekly active users per country: the denominator.
    wau = (data_all
           .filter(window_cond)
           .groupBy('country')
           .agg(F.countDistinct('client_id').alias('WAU')))

    # Distinct clients with at least one self-installed addon active.
    addon_count = (data_all
                   .filter(window_cond)
                   .select('submission_date_s3', 'country', 'client_id',
                           F.explode('active_addons').alias('addon'))
                   .filter(self_installed)
                   .groupBy('country')
                   .agg(F.countDistinct('client_id').alias('add_on_count')))

    joined = (wau
              .join(addon_count, 'country', how='left')
              .withColumn("pct_addon",
                          (100.0 * F.col("add_on_count") / F.col("WAU")))
              .select(F.lit(date).alias('submission_date_s3'), '*'))

    return joined.select('submission_date_s3', 'country', 'pct_addon')
def locale_on_date(data, date, topN, period=7, country_list=None):
    """Gets the ratio of the top locales in each country over the last week.

    parameters:
        data: The main ping server
        date: The date to find the locale distribution
        topN: The number of locales to get for each country. Only does the
              top N.
        period: The number of days before looked at in the analyisis
        country_list: The list to find look at in the analysis

    output:
        dataframe with columns:
        ['submission_date_s3', 'country', 'locale', 'pct_on_locale']
    """
    data_all = keep_countries_and_all(data, country_list)
    begin = date_plus_x_days(date, -period)

    in_window = ((col('submission_date_s3') <= date) &
                 (col('submission_date_s3') > begin))

    # Weekly active users per country: the denominator for the ratio.
    wau = (data_all
           .filter(in_window)
           .groupBy('country')
           .agg(countDistinct('client_id').alias('WAU')))

    # Weekly active users per (country, locale): the numerator.
    locale_wau = (data_all
                  .filter(in_window)
                  .groupBy('country', 'locale')
                  .agg(countDistinct('client_id').alias('WAU_on_locale'))
                  .select(lit(begin).alias('start_date'),
                          lit(date).alias('submission_date_s3'),
                          'country', 'WAU_on_locale', 'locale'))

    joined = (locale_wau
              .join(wau, 'country', how='left')
              .select('start_date', 'submission_date_s3', 'country',
                      'WAU_on_locale', 'locale', 'WAU'))

    # Rank locales within each country by user count, keep the top N.
    rank_window = (Window
                   .partitionBy('country', 'submission_date_s3')
                   .orderBy(desc('WAU_on_locale')))

    ranked = joined.select('*',
                           F.row_number().over(rank_window).alias('rank'))
    return (ranked
            .filter(col('rank') <= topN)
            .select('submission_date_s3', 'country', 'locale',
                    (100.0 * col('WAU_on_locale') / col('WAU'))
                    .alias('pct_on_locale')))