def test_average_unique_domains_count(spark):
    """The metric is the mean of each client's own average: (4 + 8) / 2 == 6."""
    pings = [
        # client 1 averages to 4
        {'client_id': '1', 'unique_domains_count': 6},
        {'client_id': '1', 'unique_domains_count': 2},
        # client 2 averages to 8
        {'client_id': '2', 'unique_domains_count': 12},
        {'client_id': '2', 'unique_domains_count': 4},
    ]
    weekly = churn.compute_churn_week(snippets_to_df(spark, pings), week_start_ds)
    first_row = weekly.collect()[0]
    # (4 + 8) / 2 == 6
    assert first_row.unique_domains_count_per_profile == 6
def test_nulled_stub_attribution_medium(nulled_attribution_df):
    """A null attribution medium is bucketed as 'unknown'."""
    weekly = churn.compute_churn_week(nulled_attribution_df, week_start_ds)
    observed = {row.medium for row in weekly.select('medium').distinct().collect()}
    assert observed == {'unknown'}
def test_total_uri_count_aggregated_per_client(spark):
    """total_uri_count values from a client's pings are summed (1 + 2 == 3).

    NOTE(review): this test was previously named
    ``test_total_uri_count_per_client`` — identical to a later definition in
    this module — so pytest only collected the later one and this test
    silently never ran. Renamed so both variants are collected.
    """
    snippets = [{'total_uri_count': 1}, {'total_uri_count': 2}]
    input_df = snippets_to_df(spark, snippets)
    df = churn.compute_churn_week(input_df, week_start_ds)
    rows = df.collect()
    assert rows[0].total_uri_count == 3
def test_total_uri_count_per_client(generate_data):
    """Per-ping total_uri_count scalars roll up into a per-client sum."""
    input_df = generate_data([
        {SPBE + 'total_uri_count': 1},
        {SPBE + 'total_uri_count': 2},
    ])
    weekly = churn.compute_churn_week(input_df, week_start_ds)
    observed = weekly.collect()
    assert observed[0].total_uri_count == 3
def test_cohort_by_channel_aggregates(multi_profile_df):
    """n_profiles and usage_hours sum correctly within a single channel."""
    weekly = churn.compute_churn_week(multi_profile_df, week_start_ds)
    aggregated = (
        weekly
        .groupBy(weekly.channel)
        .agg(F.sum('n_profiles').alias('n_profiles'),
             F.sum('usage_hours').alias('usage_hours'))
        .where(weekly.channel == 'release-cck-mozilla42')
        .collect()
    )
    assert aggregated[0].n_profiles == 2
    assert aggregated[0].usage_hours == 4
def test_multiple_cohort_weeks_exist(multi_profile_df):
    """Profiles spanning several weeks yield current_week values 0, 1, and 2."""
    weekly = churn.compute_churn_week(multi_profile_df, week_start_ds)
    observed = {row.current_week for row in weekly.select('current_week').collect()}
    assert observed == {0, 1, 2}
def test_current_cohort_week_is_zero(single_profile_df):
    """A profile observed in its acquisition week has current_week == 0."""
    weekly = churn.compute_churn_week(single_profile_df, week_start_ds)
    first_row = weekly.collect()[0]
    assert first_row.current_week == 0
def test_simple_string_dimensions_from_snippets(spark):
    """Null string dimensions are replaced with the literal 'unknown'.

    NOTE(review): this test was previously named
    ``test_simple_string_dimensions`` — identical to a later definition in
    this module — so pytest only collected the later one and this test
    silently never ran. Renamed so both variants are collected.
    """
    input_df = snippets_to_df(spark, [{
        'distribution_id': None,
        'default_search_engine': None,
        'locale': None
    }])
    df = churn.compute_churn_week(input_df, week_start_ds)
    rows = df.collect()
    assert rows[0].distribution_id == 'unknown'
    assert rows[0].default_search_engine == 'unknown'
    assert rows[0].locale == 'unknown'
def test_simple_string_dimensions(generate_data):
    """Null string dimensions come out as the literal 'unknown'."""
    input_df = generate_data([{
        'distribution_id': None,
        'default_search_engine': None,
        'locale': None,
    }])
    weekly = churn.compute_churn_week(input_df, week_start_ds)
    first_row = weekly.collect()[0]
    assert first_row.distribution_id == 'unknown'
    assert first_row.default_search_engine == 'unknown'
    assert first_row.locale == 'unknown'
def test_empty_total_uri_count(spark):
    """A null total_uri_count is treated as zero."""
    weekly = churn.compute_churn_week(
        snippets_to_df(spark, [{'total_uri_count': None}]), week_start_ds)
    first_row = weekly.collect()[0]
    assert first_row.total_uri_count == 0
def test_cohort_by_channel_count(multi_profile_df):
    """The release-cck-mozilla42 channel yields exactly two cohort rows."""
    weekly = churn.compute_churn_week(multi_profile_df, week_start_ds)
    matching = weekly.where(weekly.channel == 'release-cck-mozilla42').collect()
    assert len(matching) == 2
def test_profile_usage_length(single_profile_df):
    """Two pings with one hour of usage each roll up to two usage hours."""
    weekly = churn.compute_churn_week(single_profile_df, week_start_ds)
    first_row = weekly.collect()[0]
    assert first_row.usage_hours == 2
def test_latest_submission_from_client_exists(single_profile_df):
    """A single client collapses to exactly one output row."""
    weekly = churn.compute_churn_week(single_profile_df, week_start_ds)
    assert weekly.count() == 1
def test_ignored_submissions_outside_of_period(late_submissions_df):
    """Submissions arriving outside the churn period are dropped entirely."""
    weekly = churn.compute_churn_week(late_submissions_df, week_start_ds)
    assert weekly.count() == 0