Example #1
    def to_summaries(self, feature_slug: str) -> List[statistics.Summary]:
        column_names = ProbeLister.columns_for_scalar(self.name)

        column_exprs = []
        for column_name in column_names:
            column_exprs.append(f"COALESCE({column_name}, 0)")

        ever_used = statistics.Summary(
            Metric(
                f"{feature_slug}_ever_used",
                mozanalysis.metrics.desktop.main,
                f"SUM({' + '.join(column_exprs)}) > 0",
            ),
            statistics.Binomial(),
        )

        sum_metric = Metric(
            f"{feature_slug}_sum",
            mozanalysis.metrics.desktop.main,
            f"SUM({' + '.join(column_exprs)})",
        )

        used_mean = statistics.Summary(sum_metric, statistics.BootstrapMean())
        used_deciles = statistics.Summary(sum_metric, statistics.Deciles())

        return [ever_used, used_mean, used_deciles]
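A usage sketch for the method above (hedged: the enclosing class and the instance below are assumed, since the excerpt starts mid-class, and Summary is taken to expose its metric and statistic as attributes):

# `scalar_telemetry` stands in for an instance of the enclosing class;
# "picture_in_picture" is an illustrative feature slug.
summaries = scalar_telemetry.to_summaries("picture_in_picture")
for summary in summaries:
    # Pairs of Metric ({slug}_ever_used, {slug}_sum) and statistic
    # (Binomial, BootstrapMean, Deciles), as built above.
    print(summary.metric.name, type(summary.statistic).__name__)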
Example #2
class FeatureEventTelemetry:
    kind: ClassVar[str] = "event"
    event_category: str
    event_method: Optional[str] = None
    event_object: Optional[str] = None
    event_value: Optional[str] = None

    def to_summaries(self, feature_slug: str) -> List[statistics.Summary]:
        clauses = [f"event_category = '{self.event_category}'"]
        for k in ("method", "object", "value"):
            if v := getattr(self, f"event_{k}"):
                clauses.append(f"event_{k} = '{v}'")
        predicate = " AND ".join(clauses)

        ever_used = statistics.Summary(
            Metric(
                f"{feature_slug}_ever_used",
                mozanalysis.metrics.desktop.events,
                f"COALESCE(COUNTIF({predicate}), 0) > 0",
            ),
            statistics.Binomial(),
        )

        sum_metric = Metric(
            f"{feature_slug}_sum",
            mozanalysis.metrics.desktop.events,
            f"COALESCE(COUNTIF({predicate}), 0)",
        )

        used_mean = statistics.Summary(sum_metric, statistics.BootstrapMean())
        used_deciles = statistics.Summary(sum_metric, statistics.Deciles())

        return [ever_used, used_mean, used_deciles]
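A usage sketch, assuming the class is instantiated as a plain attrs/dataclass-style record (its decorator falls outside the excerpt). Note that the field values are interpolated directly into SQL, so they must be trusted configuration rather than user input:

# Illustrative category/method values, not from the source.
telemetry = FeatureEventTelemetry(
    event_category="pictureinpicture",
    event_method="create",
)
# Builds the predicate
# "event_category = 'pictureinpicture' AND event_method = 'create'"
# and returns the ever_used / mean / deciles summaries for the feature.
summaries = telemetry.to_summaries("picture_in_picture")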
Example #3
def test_process_metrics(spark):
    exp = Experiment('a-stub', '20190101', num_dates_enrollment=8)
    enrollments = exp.get_enrollments(spark,
                                      _get_enrollment_view(slug="a-stub"))

    ds_df_A = register_data_source_fixture(spark, name='ds_df_A')
    ds_df_B = register_data_source_fixture(spark, name='ds_df_B')

    ds_A = DataSource.from_dataframe('ds_df_A', ds_df_A)
    ds_B = DataSource.from_dataframe('ds_df_B', ds_df_B)

    m1 = Metric.from_col('m1', ds_df_A.numeric_col, ds_A)
    m2 = Metric.from_col('m2', ds_df_A.bool_col, ds_A)
    m3 = Metric.from_col('m3', ds_df_B.numeric_col, ds_B)

    metric_list = [m1, m2, m3]

    exp = Experiment('a-stub', '20190101')

    data_sources_and_metrics = exp._process_metrics(enrollments, metric_list)

    assert len(data_sources_and_metrics) == 2

    assert len(data_sources_and_metrics[ds_df_A]) == 2
    assert len(data_sources_and_metrics[ds_df_B]) == 1

    assert 'numeric_col' in repr(data_sources_and_metrics[ds_df_B][0])
    assert '`m3`' in repr(data_sources_and_metrics[ds_df_B][0])
    assert repr(data_sources_and_metrics[ds_df_B][0]) in {
        "Column<b'numeric_col AS `m3`'>",  # py3
        "Column<numeric_col AS `m3`>",  # py2
    }
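As the assertions above imply, `_process_metrics` returns a mapping keyed by the DataFrame behind each `DataSource`, with one named Spark `Column` per requested metric; a sketch of walking that result:

# ds_df_A maps to the columns for m1 and m2, ds_df_B to the column for m3.
for df, columns in data_sources_and_metrics.items():
    print(df, [str(col) for col in columns])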
Example #4
def _get_metrics(spark):
    ds_df = _get_data_source_df(spark)
    ds = DataSource.from_dataframe('bla_ds', ds_df)

    return {
        'how_many_ones': Metric.from_col(
            'how_many_ones', agg_sum(ds_df.constant_one), ds),
    }
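For reference, a sketch of consuming the fixture above (`spark` is the pytest fixture used throughout these tests):

metrics = _get_metrics(spark)
# from_col wraps a ready-made aggregation Column (here SUM over constant_one)
# rather than a SQL select_expr string.
how_many_ones = metrics['how_many_ones']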
Example #5
def test_process_metrics_dupe_data_source(spark):
    exp = Experiment('a-stub', '20190101', num_dates_enrollment=8)
    enrollments = exp.get_enrollments(spark,
                                      _get_enrollment_view(slug="a-stub"))

    ds_df = register_data_source_fixture(spark, name='ds_df_A')

    ds_1 = DataSource.from_dataframe('ds_df_A', ds_df)
    ds_2 = DataSource.from_dataframe('ds_df_A', ds_df)

    m1 = Metric.from_col('m1', ds_df.numeric_col, ds_1)
    m2 = Metric.from_col('m2', ds_df.bool_col, ds_2)

    metric_list = [m1, m2]

    exp = Experiment('a-stub', '20190101')

    data_sources_and_metrics = exp._process_metrics(enrollments, metric_list)

    assert len(data_sources_and_metrics) == 1

    assert len(data_sources_and_metrics[ds_df]) == 2
Example #6
def _make_metric_list(report):
    metric_list = []
    for metric in report['metrics']:
        try:
            metric_list.append(getattr(desktop, metric))
        except AttributeError:
            print(f'`{metric}` is not a pre-defined Metric; skipping it')
    if 'user_defined_metrics' in report:
        user_defined = report['user_defined_metrics']
        for data_source, data_source_metrics in user_defined.items():
            if not getattr(desktop, data_source, None):
                from_expr = report['user_defined_data_source'][data_source]
                data_source = DataSource(name=data_source,
                                         from_expr=from_expr,
                                         experiments_column_type='native')
            else:
                data_source = getattr(desktop, data_source)
            for key, select_expr in data_source_metrics.items():
                new_metric = Metric(name=key,
                                    data_source=data_source,
                                    select_expr=select_expr)
                metric_list.append(new_metric)

    return metric_list
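A hypothetical `report` mapping that exercises both branches of the function above (key names are taken from the code; the metric names, data source name, and SQL are illustrative only):

report = {
    'metrics': ['active_hours', 'no_such_metric'],  # the second is skipped
    'user_defined_metrics': {
        # data source name -> {metric name -> select_expr}
        'my_custom_ds': {'my_metric': 'SUM(some_col)'},
    },
    'user_defined_data_source': {
        'my_custom_ds': '(SELECT * FROM `project.dataset.table`)',
    },
}
metric_list = _make_metric_list(report)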
Example #7
    from_expr="""(
                SELECT
                    p.*,
                    DATE(p.submission_timestamp) AS submission_date
                FROM `moz-fx-data-shared-prod.{dataset}.metrics` p
            )""",
    client_id_column="client_info.client_id",
    experiments_column_type="glean",
    default_dataset="org_mozilla_ios_firefox",
)

#: Metric: ...
baseline_ping_count = Metric(
    name="baseline_ping_count",
    data_source=baseline,
    select_expr="COUNT(document_id)",
    friendly_name="Baseline pings",
    description="Counts the number of `baseline` pings received from each client.",
)

#: Metric: ...
metric_ping_count = Metric(
    name="metric_ping_count",
    data_source=metrics,
    select_expr="COUNT(document_id)",
    friendly_name="Metrics pings",
    description="Counts the number of `metrics` pings received from each client.",
)

#: Metric: ...
first_run_date = Metric(
Example #8
    def test_metrics(self, client, project_id, static_dataset,
                     temporary_dataset):
        experiment = Experiment(
            experimenter_slug="test-experiment",
            type="rollout",
            status="Live",
            start_date=dt.datetime(2020, 3, 30, tzinfo=pytz.utc),
            end_date=dt.datetime(2020, 6, 1, tzinfo=pytz.utc),
            proposed_enrollment=7,
            branches=[
                Branch(slug="branch1", ratio=0.5),
                Branch(slug="branch2", ratio=0.5)
            ],
            reference_branch="branch2",
            features=[],
            normandy_slug="test-experiment",
        )

        config = AnalysisSpec().resolve(experiment)

        test_clients_daily = DataSource(
            name="clients_daily",
            from_expr=f"`{project_id}.test_data.clients_daily`",
        )

        test_active_hours = Metric(
            name="active_hours",
            data_source=test_clients_daily,
            select_expr=agg_sum("active_hours_sum"),
        )

        config.metrics = {
            AnalysisPeriod.WEEK: [Summary(test_active_hours, BootstrapMean())]
        }

        self.analysis_mock_run(config, static_dataset, temporary_dataset,
                               project_id)

        query_job = client.client.query(f"""
            SELECT
              *
            FROM `{project_id}.{temporary_dataset}.test_experiment_week_1`
            ORDER BY enrollment_date DESC
        """)

        expected_metrics_results = [
            {
                "client_id": "bbbb",
                "branch": "branch2",
                "enrollment_date": datetime.date(2020, 4, 3),
                "num_enrollment_events": 1,
                "analysis_window_start": 0,
                "analysis_window_end": 6,
            },
            {
                "client_id": "aaaa",
                "branch": "branch1",
                "enrollment_date": datetime.date(2020, 4, 2),
                "num_enrollment_events": 1,
                "analysis_window_start": 0,
                "analysis_window_end": 6,
            },
        ]

        for i, row in enumerate(query_job.result()):
            for k, v in expected_metrics_results[i].items():
                assert row[k] == v

        assert (client.client.get_table(
            f"{project_id}.{temporary_dataset}.test_experiment_weekly")
                is not None)
        assert (client.client.get_table(
            f"{project_id}.{temporary_dataset}.statistics_test_experiment_week_1"
        ) is not None)

        stats = client.client.list_rows(
            f"{project_id}.{temporary_dataset}.statistics_test_experiment_week_1"
        ).to_dataframe()

        count_by_branch = stats.query("statistic == 'count'").set_index(
            "branch")
        assert count_by_branch.loc["branch1", "point"] == 1.0
        assert count_by_branch.loc["branch2", "point"] == 1.0

        assert (client.client.get_table(
            f"{project_id}.{temporary_dataset}.statistics_test_experiment_weekly"
        ) is not None)
Example #9
    from_expr="""(
                SELECT
                    *,
                    DATE(submission_timestamp) AS submission_date
                FROM mozdata.activity_stream.events
            )""",
    experiments_column_type="native",
)

#: Metric: ...
active_hours = Metric(
    name="active_hours",
    data_source=clients_daily,
    select_expr=agg_sum("active_hours_sum"),
    friendly_name="Active hours",
    description=dedent("""\
        Measures the amount of time (in 5-second increments) during which
        Firefox received user input from a keyboard or mouse. The Firefox
        window does not need to be focused.
    """),
)

#: Metric: ...
uri_count = Metric(
    name="uri_count",
    data_source=clients_daily,
    select_expr=agg_sum(
        "scalar_parent_browser_engagement_total_uri_count_sum"),
    friendly_name="URIs visited",
    description=dedent("""\
        Counts the total number of URIs visited.
Example #10
    from_expr="""(
                SELECT
                    p.*,
                    DATE(p.submission_timestamp) AS submission_date
                FROM `moz-fx-data-shared-prod.{dataset}.metrics` p
            )""",
    client_id_column="client_info.client_id",
    experiments_column_type="glean",
    default_dataset="org_mozilla_firefox",
)

#: Metric: ...
uri_count = Metric(
    name="uri_count",
    data_source=baseline,
    select_expr=agg_sum("metrics.counter.events_total_uri_count"),
    friendly_name="URIs visited",
    description="Counts the number of URIs each client visited",
)

#: Metric: ...
user_reports_site_issue_count = Metric(
    name="user_reports_site_issue_count",
    data_source=events,
    select_expr="COUNTIF(event.name = 'browser_menu_action' AND " +
    "mozfun.map.get_key('event.extra', 'item') = 'report_site_issue')",
    friendly_name="Site issues reported",
    description=
    "Counts the number of times clients reported an issue with a site.",
)
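The `{dataset}` placeholder in the `from_expr` above is filled in per app when the query is built, with `default_dataset` as the fallback; a sketch, assuming the DataSource exposes mozanalysis's `from_expr_for` helper:

# Falls back to default_dataset ("org_mozilla_firefox").
print(baseline.from_expr_for(None))
# An override, e.g. for a beta-channel dataset (illustrative name).
print(baseline.from_expr_for("org_mozilla_firefox_beta"))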
Example #11
    experiments_column_type="native",
)

cfr = DataSource(
    name='cfr',
    from_expr="""(
                SELECT
                    *,
                    DATE(submission_timestamp) AS submission_date
                FROM `moz-fx-data-derived-datasets`.messaging_system.cfr
            )""",
    experiments_column_type="native",
)

active_hours = Metric(name='active_hours',
                      data_source=clients_daily,
                      select_expr=agg_sum('active_hours_sum'))

uri_count = Metric(name='uri_count',
                   data_source=clients_daily,
                   select_expr=agg_sum(
                       'scalar_parent_browser_engagement_total_uri_count_sum'))

search_count = Metric(name='search_count',
                      data_source=search_clients_daily,
                      select_expr=agg_sum('sap'))

tagged_search_count = Metric(name='tagged_search_count',
                             data_source=search_clients_daily,
                             select_expr=agg_sum('tagged_sap'))
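These Metric definitions feed mozanalysis's query builder; a rough sketch, with the `build_query` signature hedged from the four positional arguments the test further below forwards to it:

from mozanalysis.experiment import Experiment, TimeLimits

exp = Experiment('my-experiment-slug', '20200101', num_dates_enrollment=8)
time_limits = TimeLimits.for_single_analysis_window(
    exp.start_date, '20200301', 0, 6, exp.num_dates_enrollment)
# 'normandy' is assumed to select the desktop enrollment-event source.
sql = exp.build_query([active_hours, uri_count, search_count],
                      time_limits, 'normandy', None)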
Example #12
def test_get_per_client_data_join(spark):
    exp = Experiment('a-stub', '20190101')

    enrollments = spark.createDataFrame(
        [
            ['aaaa', 'control', '20190101'],
            ['bbbb', 'test', '20190101'],
            ['cccc', 'control', '20190108'],
            ['dddd', 'test', '20190109'],
            ['annie-nodata', 'control', '20190101'],
            ['bob-badtiming', 'test', '20190102'],
            ['carol-gooddata', 'test', '20190101'],
            ['derek-lateisok', 'control', '20190110'],
        ],
        [
            "client_id",
            "branch",
            "enrollment_date",
        ],
    )

    ex_d = {'a-stub': 'fake-branch-lifes-too-short'}
    data_source_df = spark.createDataFrame(
        [
            # bob-badtiming only has data before/after the analysis window;
            # the join should ignore it even though _process_data_source_df
            # keeps it (checked below)
            ['bob-badtiming', '20190102', ex_d, 1],
            ['bob-badtiming', '20190106', ex_d, 2],
            # carol-gooddata has data on two days (including a dupe day)
            ['carol-gooddata', '20190102', ex_d, 3],
            ['carol-gooddata', '20190102', ex_d, 2],
            ['carol-gooddata', '20190104', ex_d, 6],
            # derek-lateisok has data before and during the analysis window
            ['derek-lateisok', '20190110', ex_d, 1000],
            ['derek-lateisok', '20190111', ex_d, 1],
            # TODO: exercise the last condition on the join
        ],
        [
            "client_id",
            "submission_date_s3",
            "experiments",
            "some_value",
        ],
    )

    ds = DataSource.from_dataframe('ds', data_source_df)
    metric = Metric.from_col('some_value', agg_sum(data_source_df.some_value),
                             ds)

    res = exp.get_per_client_data(enrollments, [metric],
                                  '20190114',
                                  1,
                                  3,
                                  keep_client_id=True)

    # Check that the dataframe has the correct number of rows
    assert res.count() == enrollments.count()

    # Check that dataless enrollments are handled correctly
    annie_nodata = res.filter(res.client_id == 'annie-nodata')
    assert annie_nodata.count() == 1
    assert annie_nodata.first()['some_value'] == 0

    # Check that early and late data were ignored
    # i.e. check the join, not just _process_data_source_df
    bob_badtiming = res.filter(res.client_id == 'bob-badtiming')
    assert bob_badtiming.count() == 1
    assert bob_badtiming.first()['some_value'] == 0
    # Check that _process_data_source_df didn't do the
    # heavy lifting above
    time_limits = TimeLimits.for_single_analysis_window(
        exp.start_date, '20190114', 1, 3, exp.num_dates_enrollment)
    pds = exp._process_data_source_df(data_source_df, time_limits)
    assert pds.filter(pds.client_id == 'bob-badtiming').select(
        F.sum(pds.some_value).alias('agg_val')).first()['agg_val'] == 3

    # Check that relevant data was included appropriately
    carol_gooddata = res.filter(res.client_id == 'carol-gooddata')
    assert carol_gooddata.count() == 1
    assert carol_gooddata.first()['some_value'] == 11

    derek_lateisok = res.filter(res.client_id == 'derek-lateisok')
    assert derek_lateisok.count() == 1
    assert derek_lateisok.first()['some_value'] == 1

    # Check that it still works for `data_source`s without an experiments map
    ds_df_noexp = data_source_df.drop('experiments')
    ds_noexp = DataSource.from_dataframe('ds_noexp', ds_df_noexp)
    metric_noexp = Metric.from_col('some_value',
                                   agg_sum(ds_df_noexp.some_value), ds_noexp)

    res2 = exp.get_per_client_data(enrollments, [metric_noexp],
                                   '20190114',
                                   1,
                                   3,
                                   keep_client_id=True)

    assert res2.count() == enrollments.count()
Example #13
    def test_metrics(self, client):
        experiment = Experiment(
            slug="test-experiment",
            type="rollout",
            start_date=dt.datetime(2020, 3, 30, tzinfo=pytz.utc),
            end_date=dt.datetime(2020, 6, 1, tzinfo=pytz.utc),
            proposed_enrollment=7,
            variants=[
                Variant(is_control=False, slug="branch1", ratio=0.5),
                Variant(is_control=True, slug="branch2", ratio=0.5),
            ],
            normandy_slug="test-experiment",
        )

        orig = mozanalysis.experiment.Experiment.build_query

        def build_query_test_project(instance, *args, **kwargs):
            # to use the test project and dataset, we need to change the SQL query
            # generated by mozanalysis
            query = orig(instance, args[0], args[1], args[2], args[3])
            query = query.replace("moz-fx-data-shared-prod", self.project_id)
            query = query.replace("telemetry", self.static_dataset)
            return query

        config = AnalysisSpec().resolve(experiment)

        test_clients_daily = DataSource(
            name="clients_daily", from_expr=f"`{self.project_id}.test_data.clients_daily`",
        )

        test_active_hours = Metric(
            name="active_hours",
            data_source=test_clients_daily,
            select_expr=agg_sum("active_hours_sum"),
        )

        config.metrics = {
            AnalysisPeriod.WEEK: [
                Summary(test_active_hours, BootstrapMean(ref_branch_label="branch1"))
            ]
        }

        analysis = Analysis(self.project_id, self.test_dataset, config)

        with mock.patch.object(
            mozanalysis.experiment.Experiment, "build_query", new=build_query_test_project
        ):
            analysis.run(current_date=dt.datetime(2020, 4, 12), dry_run=False)

        query_job = client.query(
            f"""
            SELECT
              *
            FROM `{self.project_id}.{self.test_dataset}.test_experiment_week_1`
            ORDER BY enrollment_date DESC
        """
        )

        expected_metrics_results = [
            {
                "client_id": "bbbb",
                "branch": "branch2",
                "enrollment_date": datetime.date(2020, 4, 3),
                "num_enrollment_events": 1,
                "analysis_window_start": 0,
                "analysis_window_end": 6,
            },
            {
                "client_id": "aaaa",
                "branch": "branch1",
                "enrollment_date": datetime.date(2020, 4, 2),
                "num_enrollment_events": 1,
                "analysis_window_start": 0,
                "analysis_window_end": 6,
            },
        ]

        for i, row in enumerate(query_job.result()):
            for k, v in expected_metrics_results[i].items():
                assert row[k] == v

        assert (
            client.get_table(f"{self.project_id}.{self.test_dataset}.test_experiment_weekly")
            is not None
        )
        assert (
            client.get_table(
                f"{self.project_id}.{self.test_dataset}.statistics_test_experiment_week_1"
            )
            is not None
        )
        assert (
            client.get_table(
                f"{self.project_id}.{self.test_dataset}.statistics_test_experiment_weekly"
            )
            is not None
        )