Code example #1
# Imports assumed throughout these examples (the `spark` argument is a
# pytest fixture providing a SparkSession; `_get_enrollment_view` is a
# helper from the surrounding test suite):
import numpy as np
from pyspark.sql import functions as F
from mozanalysis.experiment import Experiment, TimeLimits


def test_no_analysis_exception_when_shared_parent_dataframe(spark):
    # Check that we don't fall victim to
    # https://issues.apache.org/jira/browse/SPARK-10925
    df = spark.createDataFrame(
        [  # Just need the schema, really
            ['someone', '20190102', 'fake', 1],
        ],
        [
            "client_id",
            "submission_date_s3",
            "branch",
            "some_value",
        ])

    enrollments = df.groupby('client_id', 'branch').agg(
        F.min('submission_date_s3').alias('enrollment_date'))

    exp = Experiment('a-stub', '20180101')

    time_limits = TimeLimits.for_single_analysis_window(
        exp.start_date,
        last_date_full_data='20190522',
        analysis_start_days=28,
        analysis_length_dates=7)

    enrollments = exp._add_analysis_windows_to_enrollments(
        enrollments, time_limits)

    exp._get_results_for_one_data_source(
        enrollments,
        df,
        [F.max(F.col('some_value'))],
    )
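
For context, SPARK-10925 is a Spark bug in which joining two DataFrames that share a parent in their lineage (here, `enrollments` is derived from `df`) can raise an AnalysisException from ambiguous column resolution. A minimal sketch of the pattern the test guards against, assuming only a local SparkSession (all names below are illustrative, not part of mozanalysis):

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[1]').getOrCreate()

parent = spark.createDataFrame([('a', 1)], ['client_id', 'some_value'])
child = parent.groupby('client_id').agg(F.sum('some_value').alias('total'))

# On affected Spark versions, joining a derived DataFrame back onto its
# parent could raise an AnalysisException (SPARK-10925); the test above
# checks that mozanalysis avoids that trap.
parent.join(child, on='client_id').show()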
Code example #2
def test_add_analysis_windows_to_enrollments(spark):
    exp = Experiment('a-stub', '20190101', num_dates_enrollment=8)
    enrollments = exp.get_enrollments(spark,
                                      _get_enrollment_view(slug="a-stub"))
    assert enrollments.count() == 3

    tl = TimeLimits.for_ts(
        first_enrollment_date=exp.start_date,
        last_date_full_data='20190114',
        time_series_period='daily',
        num_dates_enrollment=exp.num_dates_enrollment,
    )
    assert len(tl.analysis_windows) == 7

    new_enrollments = exp._add_analysis_windows_to_enrollments(enrollments, tl)

    nep = new_enrollments.toPandas()
    assert len(nep) == enrollments.count() * len(tl.analysis_windows)

    a = nep[nep['client_id'] == 'aaaa']
    assert len(a) == len(tl.analysis_windows)
    assert (a.mozanalysis_analysis_window_start.sort_values() == np.arange(
        len(tl.analysis_windows))).all()
    assert (a.mozanalysis_analysis_window_end.sort_values() == np.arange(
        len(tl.analysis_windows))).all()
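
The assertions above rely on `_add_analysis_windows_to_enrollments` fanning each enrollment row out to one row per analysis window. A standalone sketch of that fan-out in plain PySpark, assuming a local SparkSession (the `mozanalysis_analysis_window_*` names mirror the columns checked above; this is an illustration, not mozanalysis's actual implementation):

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[1]').getOrCreate()

enrollments = spark.createDataFrame(
    [('aaaa', '20190101')], ['client_id', 'enrollment_date'])
windows = spark.createDataFrame(
    [(i, i) for i in range(7)],  # 7 daily windows, as in the test
    ['mozanalysis_analysis_window_start', 'mozanalysis_analysis_window_end'])

# The cross join yields len(enrollments) * len(windows) rows, matching
# the `len(nep) == enrollments.count() * len(tl.analysis_windows)` check.
exploded = enrollments.crossJoin(windows)
assert exploded.count() == enrollments.count() * windows.count()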
Code example #3
def test_get_results_for_one_data_source(spark):
    exp = Experiment('a-stub', '20190101')

    enrollments = spark.createDataFrame(
        [
            ['aaaa', 'control', '20190101'],
            ['bbbb', 'test', '20190101'],
            ['cccc', 'control', '20190108'],
            ['dddd', 'test', '20190109'],
            ['annie-nodata', 'control', '20190101'],
            ['bob-badtiming', 'test', '20190102'],
            ['carol-gooddata', 'test', '20190101'],
            ['derek-lateisok', 'control', '20190110'],
        ],
        [
            "client_id",
            "branch",
            "enrollment_date",
        ],
    )

    ex_d = {'a-stub': 'fake-branch-lifes-too-short'}
    data_source = spark.createDataFrame(
        [
            # bob-badtiming only has data before/after the analysis window,
            # so those rows should be dropped; bob still appears in the
            # results, but with a value of 0
            ['bob-badtiming', '20190102', ex_d, 1],
            ['bob-badtiming', '20190106', ex_d, 2],
            # carol-gooddata has data on two days (including a dupe day)
            ['carol-gooddata', '20190102', ex_d, 3],
            ['carol-gooddata', '20190102', ex_d, 2],
            ['carol-gooddata', '20190104', ex_d, 6],
            # derek-lateisok has data before and during the analysis window
            ['derek-lateisok', '20190110', ex_d, 1000],
            ['derek-lateisok', '20190111', ex_d, 1],
            # TODO: exercise the last condition on the join
        ],
        [
            "client_id",
            "submission_date_s3",
            "experiments",
            "some_value",
        ],
    )

    time_limits = TimeLimits.for_single_analysis_window(
        exp.start_date,
        last_date_full_data='20190114',
        analysis_start_days=1,
        analysis_length_dates=3)

    enrollments = exp._add_analysis_windows_to_enrollments(
        enrollments, time_limits)

    res = exp._get_results_for_one_data_source(
        enrollments,
        data_source,
        [
            F.coalesce(F.sum(data_source.some_value),
                       F.lit(0)).alias('some_value'),
        ],
    )

    # Check that the dataframe has the correct number of rows
    assert res.count() == enrollments.count()

    # Check that dataless enrollments are handled correctly
    annie_nodata = res.filter(res.client_id == 'annie-nodata')
    assert annie_nodata.count() == 1
    assert annie_nodata.first()['some_value'] == 0

    # Check that early and late data were ignored
    bob_badtiming = res.filter(res.client_id == 'bob-badtiming')
    assert bob_badtiming.count() == 1
    assert bob_badtiming.first()['some_value'] == 0

    # Check that relevant data was included appropriately
    carol_gooddata = res.filter(res.client_id == 'carol-gooddata')
    assert carol_gooddata.count() == 1
    assert carol_gooddata.first()['some_value'] == 11

    derek_lateisok = res.filter(res.client_id == 'derek-lateisok')
    assert derek_lateisok.count() == 1
    assert derek_lateisok.first()['some_value'] == 1

    # Check that it still works for `data_source`s without an experiments map
    res2 = exp._get_results_for_one_data_source(
        enrollments,
        data_source.drop('experiments'),
        [
            F.coalesce(F.sum(data_source.some_value),
                       F.lit(0)).alias('some_value'),
        ],
    )

    assert res2.count() == enrollments.count()
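
The expected values above follow from a window-bounded left join plus a per-client sum: carol-gooddata sums 3 + 2 + 6 = 11, derek-lateisok keeps only the in-window '20190111' row, and annie-nodata coalesces to 0. A rough sketch of that join-then-aggregate shape in plain PySpark (illustrative only; the real join has further conditions, per the TODO in the fixture above):

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master('local[1]').getOrCreate()

enrollments = spark.createDataFrame(
    [('carol-gooddata', '20190102', '20190104'),
     ('annie-nodata', '20190102', '20190104')],
    ['client_id', 'window_start', 'window_end'])

data = spark.createDataFrame(
    [('carol-gooddata', '20190102', 3),
     ('carol-gooddata', '20190102', 2),
     ('carol-gooddata', '20190104', 6),
     ('carol-gooddata', '20190105', 99)],  # outside the window: dropped
    ['client_id', 'submission_date_s3', 'some_value'])

# The left join keeps dataless clients; the date predicates drop rows
# outside each client's analysis window.
joined = enrollments.join(
    data,
    (enrollments.client_id == data.client_id)
    & (data.submission_date_s3 >= enrollments.window_start)
    & (data.submission_date_s3 <= enrollments.window_end),
    'left')

res = joined.groupBy(enrollments.client_id).agg(
    F.coalesce(F.sum(data.some_value), F.lit(0)).alias('some_value'))
res.show()  # annie-nodata -> 0, carol-gooddata -> 11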