Ejemplo n.º 1
0
def test_multiple_sources_transform(effective_version,
                                    generate_main_summary_data,
                                    generate_new_profile_data):
    """Transform sources extracted from main summary and new-profile data.

    Main summary supplies client ids "1" and "3"; new-profile supplies "1"
    and "2" (the latter twice). The transformed frame must contain two rows
    whose ``n_profiles`` values add up to three.
    """
    main_summary_ids = ["1", "3"]
    new_profile_ids = ["1", "2", "2"]

    # One fresh single-key dict per client id, in the listed order.
    main_summary = generate_main_summary_data(
        [{"client_id": client_id} for client_id in main_summary_ids])
    new_profile = generate_new_profile_data(
        [{"client_id": client_id} for client_id in new_profile_ids])

    sources = job.extract(
        main_summary, new_profile, WEEK_START_DS, 1, 0, False)
    df = job.transform(sources, effective_version, WEEK_START_DS)

    # There are two different channels
    row_count = df.count()
    assert row_count == 2

    totals = df.select(F.sum("n_profiles").alias("n_profiles")).first()
    assert totals.n_profiles == 3
Ejemplo n.º 2
0
def test_multiple_cohort_weeks_exist(multi_profile_df, effective_version):
    """The distinct ``current_week`` values of the result are 0, 1 and 2."""
    transformed = job.transform(multi_profile_df, effective_version, WEEK_START_DS)

    # Collapse the collected column into a set so duplicates and row order
    # do not matter for the comparison.
    observed_weeks = {
        row.current_week
        for row in transformed.select("current_week").collect()
    }

    assert observed_weeks == {0, 1, 2}
Ejemplo n.º 3
0
def test_current_cohort_week_is_zero(single_profile_df, effective_version):
    """The first transformed row of the single-profile input has week 0."""
    transformed = job.transform(single_profile_df, effective_version, WEEK_START_DS)
    first_row = transformed.collect()[0]

    assert first_row.current_week == 0
Ejemplo n.º 4
0
def test_cohort_by_channel_aggregates(multi_profile_df, effective_version):
    """Per-channel sums for 'release-cck-mozilla42': 2 profiles, 4 usage hours."""
    df = job.transform(multi_profile_df, effective_version, WEEK_START_DS)

    # Sum both metrics per channel first, then keep only the channel of interest.
    per_channel = df.groupBy(df.channel).agg(
        F.sum('n_profiles').alias('n_profiles'),
        F.sum('usage_hours').alias('usage_hours'),
    )
    matching = per_channel.where(
        df.channel == 'release-cck-mozilla42').collect()

    totals = matching[0]
    assert totals.n_profiles == 2
    assert totals.usage_hours == 4
Ejemplo n.º 5
0
def test_attribution_from_new_profile(
    effective_version, generate_main_summary_data, generate_new_profile_data
):
    """Check the attribution total when both ping sources are combined.

    Six main-summary rows and six new-profile rows with varying attribution
    values are extracted and transformed; the ``n_profiles`` sum over rows
    matching ``source='mozilla.org'`` must equal 6.
    """
    # Values shared by several rows below.
    next_day_ns = SUBSESSION_START.shift(days=1).timestamp * 10 ** 9
    week_earlier_ds = SUBSESSION_START.shift(days=-7).format("YYYYMMDD")

    def mozilla_org():
        # a fresh dict per row, so no two rows share one attribution object
        return {"source": "mozilla.org"}

    def env_with(attribution):
        # only updates the attribution section in the environment
        environment = copy.deepcopy(data.new_profile_sample["environment"])
        environment["settings"]["attribution"] = attribution
        return environment

    main_summary_rows = [
        {"client_id": "1", "attribution": mozilla_org()},
        {"client_id": "3", "attribution": None},
        {"client_id": "4", "attribution": None},
        {"client_id": "5", "attribution": mozilla_org(), "timestamp": next_day_ns},
        {"client_id": "6", "attribution": mozilla_org(), "timestamp": next_day_ns},
        {"client_id": "7", "attribution": mozilla_org()},
    ]

    new_profile_rows = [
        # new profile without a main summary companion
        {"client_id": "2", "environment": env_with(mozilla_org())},
        # recover null attribution
        {"client_id": "3", "environment": env_with(mozilla_org())},
        # new-profile ping used to recover attribution, but outside of the
        # the current retention period
        {
            "client_id": "4",
            "environment": env_with(mozilla_org()),
            "submission": week_earlier_ds,
        },
        # avoid accidentally overwriting an existing value with an empty structure
        {"client_id": "5", "environment": env_with({})},
        # main-pings have higher latency than new-profile pings, so the main
        # ping attribution state will be set correctly
        # NOTE(review): this row uses the key "client" while every sibling row
        # uses "client_id" -- confirm whether that is intentional.
        {"client": "6", "environment": env_with(None)},
        # new-profile timestamp is newer than main-ping, so attribution for the
        # client is unset
        {
            "client_id": "7",
            "environment": env_with(None),
            "timestamp": next_day_ns,
        },
    ]

    main_summary = generate_main_summary_data(main_summary_rows)
    new_profile = generate_new_profile_data(new_profile_rows)

    sources = job.extract(main_summary, new_profile, WEEK_START_DS, 2, 0, False)
    df = job.transform(sources, effective_version, WEEK_START_DS)

    attributed = df.where("source='mozilla.org'").agg(F.sum("n_profiles"))
    assert attributed.first()[0] == 6
Ejemplo n.º 6
0
 def _test_transform(snippets, week_start=WEEK_START_DS):
     """Generate main summary data from ``snippets`` and run ``job.transform``.

     ``generate_main_summary_data`` and ``effective_version`` are resolved
     from a scope outside this snippet.
     """
     main_summary = generate_main_summary_data(snippets)
     return job.transform(main_summary, effective_version, week_start)
Ejemplo n.º 7
0
def test_cohort_by_channel_count(multi_profile_df, effective_version):
    """Two transformed rows have channel 'release-cck-mozilla42'."""
    transformed = job.transform(multi_profile_df, effective_version, WEEK_START_DS)
    on_channel = transformed.channel == "release-cck-mozilla42"
    matching_rows = transformed.where(on_channel).collect()

    assert len(matching_rows) == 2
Ejemplo n.º 8
0
def test_profile_usage_length(single_profile_df, effective_version):
    """The first transformed row reports ``usage_hours`` equal to 2."""
    # there are two pings each with 1 hour of usage
    transformed = job.transform(single_profile_df, effective_version, WEEK_START_DS)
    first_row = transformed.collect()[0]

    assert first_row.usage_hours == 2
Ejemplo n.º 9
0
def test_latest_submission_from_client_exists(single_profile_df, effective_version):
    """Transforming the single-profile input yields exactly one row."""
    transformed = job.transform(single_profile_df, effective_version, WEEK_START_DS)
    row_count = transformed.count()
    assert row_count == 1
Ejemplo n.º 10
0
def test_attribution_from_new_profile(effective_version,
                                      generate_main_summary_data,
                                      generate_new_profile_data):
    """Check the attribution total when both ping sources are combined.

    Six main-summary rows and six new-profile rows with varying attribution
    values are extracted and transformed; the ``n_profiles`` sum over rows
    matching ``source='mozilla.org'`` must equal 6.
    """
    # Values shared by several rows below.
    later_ts = SUBSESSION_START.shift(days=1).timestamp * 10**9
    earlier_submission = SUBSESSION_START.shift(days=-7).format("YYYYMMDD")

    def from_mozilla():
        # a fresh dict per row, so no two rows share one attribution object
        return {'source': 'mozilla.org'}

    def environment_for(attribution):
        # only updates the attribution section in the environment
        env = copy.deepcopy(data.new_profile_sample['environment'])
        env['settings']['attribution'] = attribution
        return env

    main_rows = [
        {'client_id': '1', 'attribution': from_mozilla()},
        {'client_id': '3', 'attribution': None},
        {'client_id': '4', 'attribution': None},
        {'client_id': '5', 'attribution': from_mozilla(),
         'timestamp': later_ts},
        {'client_id': '6', 'attribution': from_mozilla(),
         'timestamp': later_ts},
        {'client_id': '7', 'attribution': from_mozilla()},
    ]
    main_summary = generate_main_summary_data(main_rows)

    profile_rows = [
        # new profile without a main summary companion
        {'client_id': '2', 'environment': environment_for(from_mozilla())},
        # recover null attribution
        {'client_id': '3', 'environment': environment_for(from_mozilla())},
        # new-profile ping used to recover attribution, but outside of the
        # the current retention period
        {'client_id': '4',
         'environment': environment_for(from_mozilla()),
         'submission': earlier_submission},
        # avoid accidentally overwriting an existing value with an empty structure
        {'client_id': '5', 'environment': environment_for({})},
        # main-pings have higher latency than new-profile pings, so the main
        # ping attribution state will be set correctly
        # NOTE(review): this row uses the key 'client' while every sibling row
        # uses 'client_id' -- confirm whether that is intentional.
        {'client': '6', 'environment': environment_for(None)},
        # new-profile timestamp is newer than main-ping, so attribution for the
        # client is unset
        {'client_id': '7',
         'environment': environment_for(None),
         'timestamp': later_ts},
    ]
    new_profile = generate_new_profile_data(profile_rows)

    sources = job.extract(
        main_summary, new_profile, WEEK_START_DS, 2, 0, False)
    df = job.transform(sources, effective_version, WEEK_START_DS)

    attributed_total = df.where("source='mozilla.org'").agg(
        F.sum("n_profiles")).first()[0]
    assert attributed_total == 6