def test_multiple_sources_transform(
    effective_version, generate_main_summary_data, generate_new_profile_data
):
    """Combining main-summary and new-profile sources yields one row per
    channel and counts every distinct profile exactly once."""
    main_summary = generate_main_summary_data(
        [{"client_id": "1"}, {"client_id": "3"}]
    )
    new_profile = generate_new_profile_data(
        [{"client_id": "1"}, {"client_id": "2"}, {"client_id": "2"}]
    )

    sources = job.extract(main_summary, new_profile, WEEK_START_DS, 1, 0, False)
    transformed = job.transform(sources, effective_version, WEEK_START_DS)

    # There are two different channels
    assert transformed.count() == 2

    total_profiles = (
        transformed.select(F.sum("n_profiles").alias("n_profiles"))
        .first()
        .n_profiles
    )
    assert total_profiles == 3
def test_multiple_cohort_weeks_exist(multi_profile_df, effective_version):
    """Profiles spanning several weeks produce cohort weeks 0, 1, and 2."""
    transformed = job.transform(multi_profile_df, effective_version, WEEK_START_DS)
    observed_weeks = {
        row.current_week for row in transformed.select("current_week").collect()
    }
    assert observed_weeks == {0, 1, 2}
def test_current_cohort_week_is_zero(single_profile_df, effective_version):
    """A profile observed in the current week lands in cohort week 0."""
    transformed = job.transform(single_profile_df, effective_version, WEEK_START_DS)
    first_row = transformed.collect()[0]
    assert first_row.current_week == 0
def test_cohort_by_channel_aggregates(multi_profile_df, effective_version):
    """Per-channel aggregation sums both n_profiles and usage_hours."""
    transformed = job.transform(multi_profile_df, effective_version, WEEK_START_DS)
    release_rows = (
        transformed.groupBy(transformed.channel)
        .agg(
            F.sum("n_profiles").alias("n_profiles"),
            F.sum("usage_hours").alias("usage_hours"),
        )
        .where(transformed.channel == "release-cck-mozilla42")
        .collect()
    )
    assert release_rows[0].n_profiles == 2
    assert release_rows[0].usage_hours == 4
def test_attribution_from_new_profile(
    effective_version, generate_main_summary_data, generate_new_profile_data
):
    """Attribution is recovered from new-profile pings only when appropriate.

    Scenarios covered, one client id each:
      1: attribution already present in main-summary
      2: new-profile ping with no main-summary companion
      3: null main-summary attribution backfilled from a new-profile ping
      4: new-profile backfill ignored when outside the retention period
      5: an empty attribution struct must not clobber a real value
      6: newer main-ping attribution wins over the new-profile ping
      7: newer new-profile ping unsets the client's attribution
    """
    main_summary = generate_main_summary_data(
        [
            {"client_id": "1", "attribution": {"source": "mozilla.org"}},
            {"client_id": "3", "attribution": None},
            {"client_id": "4", "attribution": None},
            {
                "client_id": "5",
                "attribution": {"source": "mozilla.org"},
                "timestamp": SUBSESSION_START.shift(days=1).timestamp * 10 ** 9,
            },
            {
                "client_id": "6",
                "attribution": {"source": "mozilla.org"},
                "timestamp": SUBSESSION_START.shift(days=1).timestamp * 10 ** 9,
            },
            {"client_id": "7", "attribution": {"source": "mozilla.org"}},
        ]
    )

    def update_attribution(attribution):
        # only updates the attribution section in the environment
        env = copy.deepcopy(data.new_profile_sample["environment"])
        env["settings"]["attribution"] = attribution
        return env

    new_profile = generate_new_profile_data(
        [
            # new profile without a main summary companion
            {
                "client_id": "2",
                "environment": update_attribution({"source": "mozilla.org"}),
            },
            # recover null attribution
            {
                "client_id": "3",
                "environment": update_attribution({"source": "mozilla.org"}),
            },
            # new-profile ping used to recover attribution, but outside of
            # the current retention period
            {
                "client_id": "4",
                "environment": update_attribution({"source": "mozilla.org"}),
                "submission": SUBSESSION_START.shift(days=-7).format("YYYYMMDD"),
            },
            # avoid accidentally overwriting an existing value with an empty structure
            {"client_id": "5", "environment": update_attribution({})},
            # main-pings have higher latency than new-profile pings, so the main
            # ping attribution state will be set correctly.
            # FIX(review): key was "client", not "client_id", so this snippet
            # presumably never targeted client 6 as the comment intends —
            # confirm against generate_new_profile_data's merge behavior.
            {"client_id": "6", "environment": update_attribution(None)},
            # new-profile timestamp is newer than main-ping, so attribution for
            # the client is unset
            {
                "client_id": "7",
                "environment": update_attribution(None),
                "timestamp": SUBSESSION_START.shift(days=1).timestamp * 10 ** 9,
            },
        ]
    )

    sources = job.extract(main_summary, new_profile, WEEK_START_DS, 2, 0, False)
    df = job.transform(sources, effective_version, WEEK_START_DS)

    assert df.where("source='mozilla.org'").agg(F.sum("n_profiles")).first()[0] == 6
def _test_transform(snippets, week_start=WEEK_START_DS):
    # Helper: build a main-summary DataFrame from `snippets` and run the
    # job's transform over it for the given week start date.
    # NOTE(review): `generate_main_summary_data` and `effective_version` are
    # used here as free (module-level) names, but elsewhere in this file they
    # appear as pytest fixtures injected via parameters — confirm module-level
    # equivalents exist, otherwise calling this helper raises NameError.
    return job.transform(
        generate_main_summary_data(snippets), effective_version, week_start
    )
def test_cohort_by_channel_count(multi_profile_df, effective_version):
    """Exactly two output rows belong to the release-cck-mozilla42 channel."""
    transformed = job.transform(multi_profile_df, effective_version, WEEK_START_DS)
    matching_rows = transformed.where(
        transformed.channel == "release-cck-mozilla42"
    ).collect()
    assert len(matching_rows) == 2
def test_profile_usage_length(single_profile_df, effective_version):
    """Two pings with one hour of usage each sum to two usage hours."""
    # there are two pings each with 1 hour of usage
    transformed = job.transform(single_profile_df, effective_version, WEEK_START_DS)
    first_row = transformed.collect()[0]
    assert first_row.usage_hours == 2
def test_latest_submission_from_client_exists(single_profile_df, effective_version):
    """A single client's pings collapse into a single output row."""
    transformed = job.transform(single_profile_df, effective_version, WEEK_START_DS)
    assert transformed.count() == 1
# NOTE(review): this redefines test_attribution_from_new_profile, which also
# appears earlier in this file — Python keeps only the last definition, so
# pytest collects just one of the two. One copy should be removed.
def test_attribution_from_new_profile(effective_version,
                                      generate_main_summary_data,
                                      generate_new_profile_data):
    """Attribution is recovered from new-profile pings only when appropriate.

    Scenarios covered, one client id each:
      1: attribution already present in main-summary
      2: new-profile ping with no main-summary companion
      3: null main-summary attribution backfilled from a new-profile ping
      4: new-profile backfill ignored when outside the retention period
      5: an empty attribution struct must not clobber a real value
      6: newer main-ping attribution wins over the new-profile ping
      7: newer new-profile ping unsets the client's attribution
    """
    main_summary = generate_main_summary_data([
        {
            'client_id': '1',
            'attribution': {
                'source': 'mozilla.org'
            }
        },
        {
            'client_id': '3',
            'attribution': None
        },
        {
            'client_id': '4',
            'attribution': None
        },
        {
            'client_id': '5',
            'attribution': {
                'source': 'mozilla.org'
            },
            'timestamp': SUBSESSION_START.shift(days=1).timestamp * 10**9,
        },
        {
            'client_id': '6',
            'attribution': {
                'source': 'mozilla.org'
            },
            'timestamp': SUBSESSION_START.shift(days=1).timestamp * 10**9,
        },
        {
            'client_id': '7',
            'attribution': {
                'source': 'mozilla.org'
            }
        },
    ])

    def update_attribution(attribution):
        # only updates the attribution section in the environment
        env = copy.deepcopy(data.new_profile_sample['environment'])
        env['settings']['attribution'] = attribution
        return env

    new_profile = generate_new_profile_data([
        # new profile without a main summary companion
        {
            'client_id': '2',
            'environment': update_attribution({'source': 'mozilla.org'})
        },
        # recover null attribution
        {
            'client_id': '3',
            'environment': update_attribution({'source': 'mozilla.org'})
        },
        # new-profile ping used to recover attribution, but outside of
        # the current retention period
        {
            'client_id': '4',
            'environment': update_attribution({'source': 'mozilla.org'}),
            'submission': SUBSESSION_START.shift(days=-7).format("YYYYMMDD"),
        },
        # avoid accidentally overwriting an existing value with an empty structure
        {
            'client_id': '5',
            'environment': update_attribution({})
        },
        # main-pings have higher latency than new-profile pings, so the main
        # ping attribution state will be set correctly.
        # FIX(review): key was 'client', not 'client_id', so this snippet
        # presumably never targeted client 6 as the comment intends —
        # confirm against generate_new_profile_data's merge behavior.
        {
            'client_id': '6',
            'environment': update_attribution(None)
        },
        # new-profile timestamp is newer than main-ping, so attribution for
        # the client is unset
        {
            'client_id': '7',
            'environment': update_attribution(None),
            'timestamp': SUBSESSION_START.shift(days=1).timestamp * 10**9,
        },
    ])

    sources = job.extract(main_summary, new_profile, WEEK_START_DS, 2, 0, False)
    df = job.transform(sources, effective_version, WEEK_START_DS)

    assert df.where("source='mozilla.org'").agg(
        F.sum("n_profiles")).first()[0] == 6