def test_ts_time_limits_create_not_enough_data():
    """A weekly time series with an 8-day cohort needs at least one full week."""
    params = dict(
        first_enrollment_date="2019-01-01",
        last_date_full_data="2019-01-13",
        time_series_period="weekly",
        num_dates_enrollment=8,
    )
    with pytest.raises(ValueError):
        TimeLimits.for_ts(**params)
def test_time_limits_create5():
    """But not an 8 day window — the 8-day cohort lacks the data."""
    params = dict(
        first_enrollment_date="2019-01-01",
        last_date_full_data="2019-01-14",
        analysis_start_days=0,
        analysis_length_dates=8,
        num_dates_enrollment=8,
    )
    with pytest.raises(ValueError):
        TimeLimits.for_single_analysis_window(**params)
def test_time_limits_has_right_date_in_error_message():
    """The ValueError names the first date on which analysis becomes possible."""
    expected_msg = r"until we have data for 2020-03-30."
    with pytest.raises(ValueError, match=expected_msg):
        TimeLimits.for_single_analysis_window(
            first_enrollment_date="2020-03-03",
            last_date_full_data="2020-03-23",
            analysis_start_days=0,
            analysis_length_dates=21,
            num_dates_enrollment=8,
        )
def test_time_limits_validates():
    # Mainly check that the validation is running at all
    # No need to specify the same checks twice(?)
    # Constructing with no arguments at all is a TypeError.
    with pytest.raises(TypeError):
        TimeLimits()

    # Data required before the enrollment dates fails validation.
    with pytest.raises(AssertionError):
        TimeLimits(
            first_enrollment_date="2019-01-05",
            last_enrollment_date="2019-01-05",
            analysis_windows=(AnalysisWindow(1, 1),),
            first_date_data_required="2019-01-01",  # Before enrollments
            last_date_data_required="2019-01-01",
        )
def test_exposure_signal_query_custom_windows():
    """Custom exposure-signal windows appear in the enrollments query."""
    experiment = Experiment("slug", "2019-01-01", 8, app_id="my_cool_app")
    limits = TimeLimits.for_ts(
        first_enrollment_date="2019-01-01",
        last_date_full_data="2019-03-01",
        time_series_period="weekly",
        num_dates_enrollment=8,
    )
    signal = ExposureSignal(
        name="exposures",
        data_source=mozanalysis.metrics.fenix.baseline,
        select_expr="metrics.counter.events_total_uri_count > 0",
        friendly_name="URI visited exposure",
        description="Exposed when URI visited",
        window_start=1,
        window_end=3,
    )
    enrollment_sql = experiment.build_enrollments_query(
        time_limits=limits,
        enrollments_query_type="glean-event",
        exposure_signal=signal,
    )
    sql_lint(enrollment_sql)

    # The signal's name, predicate, and both custom window bounds must all
    # surface in the generated SQL.
    for fragment in (
        "exposures",
        "metrics.counter.events_total_uri_count > 0",
        "DATE_ADD('2019-01-01', INTERVAL 1 DAY)",
        "DATE_ADD('2019-01-01', INTERVAL 3 DAY)",
    ):
        assert fragment in enrollment_sql
def test_metrics_query_based_on_exposure():
    """Metrics computed on the EXPOSURES basis join on the exposure date."""
    experiment = Experiment("slug", "2019-01-01", 8)
    limits = TimeLimits.for_ts(
        first_enrollment_date="2019-01-01",
        last_date_full_data="2019-03-01",
        time_series_period="weekly",
        num_dates_enrollment=8,
    )
    enrollments_sql = experiment.build_enrollments_query(
        time_limits=limits,
        enrollments_query_type="fenix-fallback",
    )
    sql_lint(enrollments_sql)

    fenix_metrics = [
        obj
        for obj in mozanalysis.metrics.fenix.__dict__.values()
        if isinstance(obj, Metric)
    ]
    metrics_sql = experiment.build_metrics_query(
        metric_list=fenix_metrics,
        time_limits=limits,
        enrollments_table="enrollments",
        analysis_basis=AnalysisBasis.EXPOSURES,
    )
    sql_lint(metrics_sql)
    assert "e.exposure_date" in metrics_sql
def test_process_enrollments(spark):
    """_process_enrollments keeps only enrollees with a full analysis window."""
    experiment = Experiment('a-stub', '20190101')
    enrollments = experiment.get_enrollments(
        spark, _get_enrollment_view(slug="a-stub"))
    assert enrollments.count() == 4

    # With final data collected on '20190114', we have 7 dates of data
    # for 'cccc' enrolled on '20190108' but not for 'dddd' enrolled on
    # '20190109'.
    limits = TimeLimits.for_single_analysis_window(
        first_enrollment_date=experiment.start_date,
        last_date_full_data='20190114',
        analysis_start_days=0,
        analysis_length_dates=7,
        num_dates_enrollment=experiment.num_dates_enrollment,
    )
    assert limits.last_enrollment_date == '20190108'
    assert len(limits.analysis_windows) == 1
    assert limits.analysis_windows[0].end == 6

    processed = experiment._process_enrollments(enrollments, limits)
    assert processed.count() == 3

    # Processing re-aliases the frame as 'enrollments'; the caller's alias
    # must no longer resolve.
    processed = experiment._process_enrollments(
        enrollments.alias('main_summary'), limits)
    assert processed.select(F.col('enrollments.enrollment_date'))
    with pytest.raises(AnalysisException):
        assert processed.select(F.col('main_summary.enrollment_date'))
def test_add_analysis_windows_to_enrollments(spark):
    """Each enrollment row is replicated once per daily analysis window."""
    experiment = Experiment('a-stub', '20190101', num_dates_enrollment=8)
    enrollments = experiment.get_enrollments(
        spark, _get_enrollment_view(slug="a-stub"))
    assert enrollments.count() == 3

    limits = TimeLimits.for_ts(
        first_enrollment_date=experiment.start_date,
        last_date_full_data='20190114',
        time_series_period='daily',
        num_dates_enrollment=experiment.num_dates_enrollment,
    )
    num_windows = len(limits.analysis_windows)
    assert num_windows == 7

    expanded = experiment._add_analysis_windows_to_enrollments(
        enrollments, limits)
    expanded_pd = expanded.toPandas()
    assert len(expanded_pd) == enrollments.count() * num_windows

    # One client should have exactly one row per window, covering
    # windows 0..6 for both start and end columns.
    client_rows = expanded_pd[expanded_pd['client_id'] == 'aaaa']
    assert len(client_rows) == num_windows
    expected = np.arange(num_windows)
    assert (
        client_rows.mozanalysis_analysis_window_start.sort_values() == expected
    ).all()
    assert (
        client_rows.mozanalysis_analysis_window_end.sort_values() == expected
    ).all()
def test_segments_megaquery_not_detectably_malformed():
    """Queries using every default segment and metric pass the SQL linter."""
    experiment = Experiment("slug", "2019-01-01", 8)
    limits = TimeLimits.for_ts(
        first_enrollment_date="2019-01-01",
        last_date_full_data="2019-03-01",
        time_series_period="weekly",
        num_dates_enrollment=8,
    )

    all_segments = [
        obj for obj in msd.__dict__.values() if isinstance(obj, msd.Segment)
    ]
    enrollments_sql = experiment.build_enrollments_query(
        time_limits=limits,
        segment_list=all_segments,
        enrollments_query_type="normandy",
    )
    sql_lint(enrollments_sql)

    all_metrics = [
        obj for obj in mad.__dict__.values() if isinstance(obj, mad.Metric)
    ]
    metrics_sql = experiment.build_metrics_query(
        metric_list=all_metrics,
        time_limits=limits,
        enrollments_table="enrollments",
    )
    sql_lint(metrics_sql)
def test_no_analysis_exception_when_shared_parent_dataframe(spark):
    """Check that we don't fall victim to SPARK-10925.

    https://issues.apache.org/jira/browse/SPARK-10925
    """
    # Just need the schema, really
    parent_df = spark.createDataFrame(
        [
            ['someone', '20190102', 'fake', 1],
        ],
        [
            "client_id",
            "submission_date_s3",
            "branch",
            "some_value",
        ],
    )

    # Derive the enrollments frame from the same parent dataframe.
    enrollments = parent_df.groupby('client_id', 'branch').agg(
        F.min('submission_date_s3').alias('enrollment_date'))

    experiment = Experiment('a-stub', '20180101')
    time_limits = TimeLimits.for_single_analysis_window(
        experiment.start_date,
        last_date_full_data='20190522',
        analysis_start_days=28,
        analysis_length_dates=7)

    enrollments = experiment._add_analysis_windows_to_enrollments(
        enrollments, time_limits)

    # This join would raise AnalysisException if the shared lineage confused
    # Spark's column resolution.
    experiment._get_results_for_one_data_source(
        enrollments,
        parent_df,
        [F.max(F.col('some_value'))],
    )
def test_process_data_source_df(spark):
    """_process_data_source_df clips rows to the dates the analysis needs."""
    start_date = '20190101'
    end_date = '20190114'
    exp_8d = Experiment('experiment-with-8-day-cohort', start_date, 8)
    data_source_df = _get_data_source_df(spark)

    # Are the fixtures sufficiently complicated that we're actually testing
    # things?
    assert _simple_return_agg_date(F.min, data_source_df) < start_date
    assert _simple_return_agg_date(F.max, data_source_df) > end_date

    # Window starting on enrollment day, 3 days long.
    limits_03 = TimeLimits.for_single_analysis_window(
        first_enrollment_date=exp_8d.start_date,
        last_date_full_data=end_date,
        analysis_start_days=0,
        analysis_length_dates=3,
        num_dates_enrollment=exp_8d.num_dates_enrollment,
    )
    assert limits_03.first_date_data_required == start_date
    assert limits_03.last_date_data_required == '20190110'

    clipped_03 = exp_8d._process_data_source_df(data_source_df, limits_03)
    assert _simple_return_agg_date(F.min, clipped_03) == limits_03.first_date_data_required
    assert _simple_return_agg_date(F.max, clipped_03) == limits_03.last_date_data_required

    # Window starting 2 days after enrollment, 3 days long.
    limits_23 = TimeLimits.for_single_analysis_window(
        first_enrollment_date=exp_8d.start_date,
        last_date_full_data=end_date,
        analysis_start_days=2,
        analysis_length_dates=3,
        num_dates_enrollment=exp_8d.num_dates_enrollment,
    )
    assert limits_23.first_date_data_required == add_days(start_date, 2)
    assert limits_23.last_date_data_required == '20190112'

    clipped_23 = exp_8d._process_data_source_df(data_source_df, limits_23)
    assert _simple_return_agg_date(F.min, clipped_23) == limits_23.first_date_data_required
    assert _simple_return_agg_date(F.max, clipped_23) == limits_23.last_date_data_required

    # Processing aliases the frame as 'data_source'; the raw frame has no
    # such alias.
    assert clipped_03.select(F.col('data_source.client_id'))
    with pytest.raises(AnalysisException):
        assert data_source_df.select(F.col('data_source.client_id'))
def test_time_limits_create2():
    """Windows longer than the available data are rejected."""
    base = dict(
        first_enrollment_date="2019-01-01",
        last_date_full_data="2019-01-14",
        analysis_start_days=0,
    )

    # We don't have 14 dates of data for an 8-day cohort:
    with pytest.raises(ValueError):
        TimeLimits.for_single_analysis_window(
            analysis_length_dates=14,
            num_dates_enrollment=8,
            **base,
        )

    # We don't have 15 full dates of data for any users
    with pytest.raises(AssertionError):
        TimeLimits.for_single_analysis_window(
            analysis_length_dates=15,
            **base,
        )
def test_ts_time_limits_create3():
    """15 dates of data and an 8-day cohort yield exactly one weekly window."""
    limits = TimeLimits.for_ts(
        first_enrollment_date='2019-01-01',
        last_date_full_data='2019-01-15',
        time_series_period='weekly',
        num_dates_enrollment=8,
    )
    assert limits.first_enrollment_date == '2019-01-01'
    assert limits.last_enrollment_date == '2019-01-08'
    assert len(limits.analysis_windows) == 1
    window = limits.analysis_windows[0]
    assert window.start == 0
    assert window.end == 6
    assert limits.first_date_data_required == '2019-01-01'
    assert limits.last_date_data_required == '2019-01-14'
def test_time_limits_create6():
    """Of course the flexi-experiment has data for a 1 day window."""
    limits = TimeLimits.for_single_analysis_window(
        first_enrollment_date="2019-01-01",
        last_date_full_data="2019-01-14",
        analysis_start_days=0,
        analysis_length_dates=1,
    )
    assert limits.first_enrollment_date == "2019-01-01"
    assert limits.last_enrollment_date == "2019-01-14"
    assert len(limits.analysis_windows) == 1
    window = limits.analysis_windows[0]
    assert window.start == 0
    assert window.end == 0
    assert limits.first_date_data_required == "2019-01-01"
    assert limits.last_date_data_required == "2019-01-14"
def test_time_limits_create7():
    """If the analysis starts later, so does the data source."""
    limits = TimeLimits.for_single_analysis_window(
        first_enrollment_date="2019-01-01",
        last_date_full_data="2019-01-14",
        analysis_start_days=7,
        analysis_length_dates=1,
    )
    assert limits.first_enrollment_date == "2019-01-01"
    assert limits.last_enrollment_date == "2019-01-07"
    assert len(limits.analysis_windows) == 1
    window = limits.analysis_windows[0]
    assert window.start == 7
    assert window.end == 7
    assert limits.first_date_data_required == "2019-01-08"
    assert limits.last_date_data_required == "2019-01-14"
def test_ts_time_limits_create3():
    """With data through the 15th, an 8-day cohort gets one weekly window."""
    args = dict(
        first_enrollment_date="2019-01-01",
        last_date_full_data="2019-01-15",
        time_series_period="weekly",
        num_dates_enrollment=8,
    )
    limits = TimeLimits.for_ts(**args)

    expected_attrs = {
        "first_enrollment_date": "2019-01-01",
        "last_enrollment_date": "2019-01-08",
        "first_date_data_required": "2019-01-01",
        "last_date_data_required": "2019-01-14",
    }
    for attr, value in expected_attrs.items():
        assert getattr(limits, attr) == value

    assert len(limits.analysis_windows) == 1
    assert limits.analysis_windows[0].start == 0
    assert limits.analysis_windows[0].end == 6
def test_time_limits_create4():
    """Or a 2 day window."""
    limits = TimeLimits.for_single_analysis_window(
        first_enrollment_date="2019-01-01",
        last_date_full_data="2019-01-14",
        analysis_start_days=0,
        analysis_length_dates=2,
        num_dates_enrollment=8,
    )
    assert limits.first_enrollment_date == "2019-01-01"
    assert limits.last_enrollment_date == "2019-01-08"
    assert len(limits.analysis_windows) == 1
    window = limits.analysis_windows[0]
    assert window.start == 0
    assert window.end == 1
    assert limits.first_date_data_required == "2019-01-01"
    assert limits.last_date_data_required == "2019-01-09"
def test_time_limits_create3():
    """For the 8-day cohort we have enough data for a 7 day window."""
    limits = TimeLimits.for_single_analysis_window(
        first_enrollment_date="2019-01-01",
        last_date_full_data="2019-01-14",
        analysis_start_days=0,
        analysis_length_dates=7,
        num_dates_enrollment=8,
    )
    assert limits.first_enrollment_date == "2019-01-01"
    assert limits.last_enrollment_date == "2019-01-08"
    assert len(limits.analysis_windows) == 1
    window = limits.analysis_windows[0]
    assert window.start == 0
    assert window.end == 6
    assert limits.first_date_data_required == "2019-01-01"
    assert limits.last_date_data_required == "2019-01-14"
def validate(self) -> None:
    """Validate the experiment configuration via a BigQuery dry run.

    Derives a single analysis window from the experiment's start/end dates
    (falling back to an arbitrary 21-day window when no end date is set),
    builds the full mozanalysis query over all configured metrics, and
    submits it as a dry run.

    Raises:
        Exception: if the proposed enrollment period is longer than the
            span between the start and end dates.
    """
    self.check_runnable()

    # Enrollment runs for proposed_enrollment days plus the launch day.
    dates_enrollment = self.config.experiment.proposed_enrollment + 1

    if self.config.experiment.end_date is not None:
        end_date = self.config.experiment.end_date
        analysis_length_dates = (
            (end_date - self.config.experiment.start_date).days
            - dates_enrollment
            + 1
        )
    else:
        analysis_length_dates = 21  # arbitrary
        end_date = self.config.experiment.start_date + timedelta(
            days=analysis_length_dates + dates_enrollment - 1)

    if analysis_length_dates < 0:
        # Fix: the original concatenated the slug directly onto the colon
        # ("length:slug"); use lazy %-style logging args with a separator.
        logging.error(
            "Proposed enrollment longer than analysis dates length: %s",
            self.config.experiment.normandy_slug,
        )
        raise Exception("Cannot validate experiment")

    limits = TimeLimits.for_single_analysis_window(
        last_date_full_data=end_date.strftime("%Y-%m-%d"),
        analysis_start_days=0,
        analysis_length_dates=analysis_length_dates,
        first_enrollment_date=self.config.experiment.start_date.strftime(
            "%Y-%m-%d"),
        num_dates_enrollment=dates_enrollment,
    )

    exp = mozanalysis.experiment.Experiment(
        experiment_slug=self.config.experiment.normandy_slug,
        start_date=self.config.experiment.start_date.strftime("%Y-%m-%d"),
    )

    # Union of the metrics configured across all analysis periods.
    metrics = set()
    for v in self.config.metrics.values():
        metrics |= {m.metric for m in v}

    sql = exp.build_query(
        metrics,
        limits,
        "normandy",
        self.config.experiment.enrollment_query,
    )

    dry_run_query(sql)
def test_query_not_detectably_malformed():
    """The generated query for an empty metric list passes the linter."""
    experiment = Experiment('slug', '2019-01-01', 8)
    limits = TimeLimits.for_ts(
        first_enrollment_date='2019-01-01',
        last_date_full_data='2019-03-01',
        time_series_period='weekly',
        num_dates_enrollment=8,
    )
    generated_sql = experiment.build_query(
        metric_list=[],
        time_limits=limits,
        enrollments_query_type='normandy',
    )
    sql_lint(generated_sql)
def test_megaquery_not_detectably_malformed():
    """A query over every default desktop metric passes the linter."""
    experiment = Experiment('slug', '2019-01-01', 8)
    limits = TimeLimits.for_ts(
        first_enrollment_date='2019-01-01',
        last_date_full_data='2019-03-01',
        time_series_period='weekly',
        num_dates_enrollment=8,
    )
    all_metrics = [
        obj for obj in mad.__dict__.values() if isinstance(obj, mad.Metric)
    ]
    generated_sql = experiment.build_query(
        metric_list=all_metrics,
        time_limits=limits,
        enrollments_query_type='normandy',
    )
    sql_lint(generated_sql)
def test_time_limits_create1():
    """When we have complete data for 2019-01-14...

    ...We have 14 dates of data for those who enrolled on the 1st.
    """
    limits = TimeLimits.for_single_analysis_window(
        first_enrollment_date="2019-01-01",
        last_date_full_data="2019-01-14",
        analysis_start_days=0,
        analysis_length_dates=14,
    )
    assert limits.first_enrollment_date == "2019-01-01"
    assert limits.last_enrollment_date == "2019-01-01"
    assert len(limits.analysis_windows) == 1
    window = limits.analysis_windows[0]
    assert window.start == 0
    assert window.end == 13
    assert limits.first_date_data_required == "2019-01-01"
    assert limits.last_date_data_required == "2019-01-14"
def test_firefox_ios_klar_app_id_propagation():
    """An explicit app_id replaces a segment source's default dataset.

    Fix: the original linted ``metrics_sql`` a second time at the end of
    the test; the redundant duplicate call is removed.
    """
    exp = Experiment("slug", "2019-01-01", 8, app_id="my_cool_app")
    tl = TimeLimits.for_ts(
        first_enrollment_date="2019-01-01",
        last_date_full_data="2019-03-01",
        time_series_period="weekly",
        num_dates_enrollment=8,
    )
    sds = SegmentDataSource(
        name="cool_data_source",
        from_expr="`moz-fx-data-shared-prod`.{dataset}.cool_table",
        default_dataset="org_mozilla_ios_klar",
    )
    segment = Segment(
        name="cool_segment",
        select_expr="COUNT(*)",
        data_source=sds,
    )
    enrollments_sql = exp.build_enrollments_query(
        time_limits=tl,
        segment_list=[segment],
        enrollments_query_type="glean-event",
    )
    sql_lint(enrollments_sql)

    metrics_sql = exp.build_metrics_query(
        metric_list=[
            m
            for m in mozanalysis.metrics.klar_ios.__dict__.values()
            if isinstance(m, Metric)
        ],
        time_limits=tl,
        enrollments_table="enrollments",
    )
    sql_lint(metrics_sql)

    # app_id must override the data source's default dataset.
    assert "org_mozilla_ios_klar" not in enrollments_sql
    assert "my_cool_app" in enrollments_sql
def test_query_not_detectably_malformed():
    """Smoke-test the generated SQL for an experiment with no metrics."""
    experiment = Experiment('slug', '2019-01-01', 8)
    limits = TimeLimits.for_ts(
        first_enrollment_date='2019-01-01',
        last_date_full_data='2019-03-01',
        time_series_period='weekly',
        num_dates_enrollment=8,
    )
    generated_sql = experiment.build_query(
        metric_list=[],
        time_limits=limits,
        enrollments_query_type='normandy',
    )
    # This query is actually slightly malformed, due to a trailing comma.
    # We should add a metric here if the linter ever improves.
    sql_lint(generated_sql)
def test_exposure_query():
    """Glean-event enrollments queries include an exposures clause."""
    experiment = Experiment("slug", "2019-01-01", 8, app_id="my_cool_app")
    limits = TimeLimits.for_ts(
        first_enrollment_date="2019-01-01",
        last_date_full_data="2019-03-01",
        time_series_period="weekly",
        num_dates_enrollment=8,
    )
    enrollment_sql = experiment.build_enrollments_query(
        time_limits=limits,
        enrollments_query_type="glean-event",
    )
    sql_lint(enrollment_sql)
    assert "exposures" in enrollment_sql
def dry_run_query(exp_path):
    """Build the mozanalysis query for the report stored under *exp_path*.

    Reads and validates ``report.json`` from the experiment directory,
    then returns the generated SQL string.
    """
    report = validate_schema(op.join(exp_path, "report.json"))
    metric_list = _make_metric_list(report)

    experiment = Experiment(
        experiment_slug=report["experiment_slug"],
        start_date=report["start_date"],
        num_dates_enrollment=report["num_dates_enrollment"],
    )

    # create an archive of the sql generating analysis
    time_limits = TimeLimits.for_single_analysis_window(
        first_enrollment_date=report['start_date'],
        last_date_full_data=report['last_date_full_data'],
        analysis_start_days=report['analysis_start_days'],
        analysis_length_dates=report['analysis_length_days'],
        num_dates_enrollment=report['num_dates_enrollment'],
    )

    return experiment.build_query(
        metric_list=metric_list,
        time_limits=time_limits,
        enrollments_query_type='normandy',
    )
def test_query_not_detectably_malformed_fenix_fallback():
    """Fenix-fallback enrollments and metrics queries pass the linter."""
    experiment = Experiment("slug", "2019-01-01", 8)
    limits = TimeLimits.for_ts(
        first_enrollment_date="2019-01-01",
        last_date_full_data="2019-03-01",
        time_series_period="weekly",
        num_dates_enrollment=8,
    )
    enrollments_sql = experiment.build_enrollments_query(
        time_limits=limits,
        enrollments_query_type="fenix-fallback",
    )
    sql_lint(enrollments_sql)

    metrics_sql = experiment.build_metrics_query(
        metric_list=[],
        time_limits=limits,
        enrollments_table="enrollments",
    )
    sql_lint(metrics_sql)
def test_metrics_query_with_exposure_signal_custom_windows():
    """Custom exposure-signal windows appear in the metrics query."""
    experiment = Experiment("slug", "2019-01-01", 8)
    limits = TimeLimits.for_ts(
        first_enrollment_date="2019-01-01",
        last_date_full_data="2019-03-01",
        time_series_period="weekly",
        num_dates_enrollment=8,
    )
    enrollments_sql = experiment.build_enrollments_query(
        time_limits=limits,
        enrollments_query_type="fenix-fallback",
    )
    sql_lint(enrollments_sql)

    fenix_metrics = [
        obj
        for obj in mozanalysis.metrics.fenix.__dict__.values()
        if isinstance(obj, Metric)
    ]
    signal = ExposureSignal(
        name="exposures",
        data_source=mozanalysis.metrics.fenix.baseline,
        select_expr="metrics.counter.events_total_uri_count > 0",
        friendly_name="URI visited exposure",
        description="Exposed when URI visited",
        window_start=1,
        window_end=3,
    )
    metrics_sql = experiment.build_metrics_query(
        metric_list=fenix_metrics,
        time_limits=limits,
        enrollments_table="enrollments",
        analysis_basis=AnalysisBasis.EXPOSURES,
        exposure_signal=signal,
    )
    sql_lint(metrics_sql)

    # Both custom window bounds must surface in the generated SQL.
    assert "DATE_ADD('2019-01-01', INTERVAL 1 DAY)" in metrics_sql
    assert "DATE_ADD('2019-01-01', INTERVAL 3 DAY)" in metrics_sql
def _get_timelimits_if_ready(
    self, period: AnalysisPeriod, current_date: datetime
) -> Optional[TimeLimits]:
    """
    Returns a TimeLimits instance if experiment is due for analysis.
    Otherwise returns None.
    """
    prior_date = current_date - timedelta(days=1)
    prior_date_str = prior_date.strftime("%Y-%m-%d")
    current_date_str = current_date.strftime("%Y-%m-%d")

    # Enrollment runs for proposed_enrollment days plus the launch day.
    dates_enrollment = self.config.experiment.proposed_enrollment + 1

    if self.config.experiment.start_date is None:
        return None

    time_limits_args = {
        "first_enrollment_date": self.config.experiment.start_date.strftime("%Y-%m-%d"),
        "num_dates_enrollment": dates_enrollment,
    }

    if period != AnalysisPeriod.OVERALL:
        try:
            current_time_limits = TimeLimits.for_ts(
                last_date_full_data=current_date_str,
                time_series_period=period.mozanalysis_label,
                **time_limits_args,
            )
        except ValueError:
            # There are no analysis windows yet.
            # TODO: Add a more specific check.
            return None

        try:
            prior_time_limits = TimeLimits.for_ts(
                last_date_full_data=prior_date_str,
                time_series_period=period.mozanalysis_label,
                **time_limits_args,
            )
        except ValueError:
            # We have an analysis window today, and we didn't yesterday,
            # so we must have just closed our first window.
            return current_time_limits

        if len(current_time_limits.analysis_windows) == len(
            prior_time_limits.analysis_windows
        ):
            # No new data today
            return None

        return current_time_limits

    assert period == AnalysisPeriod.OVERALL
    # Overall analysis runs exactly once, on the day the experiment ends.
    if (
        self.config.experiment.end_date is None
        or self.config.experiment.end_date.date() != current_date.date()
        or self.config.experiment.status != "Complete"
    ):
        return None

    # Fix: a second `end_date is None` check here was unreachable dead code
    # (the guard above already returned None in that case); it was removed.
    analysis_length_dates = (
        (self.config.experiment.end_date - self.config.experiment.start_date).days
        - dates_enrollment
        + 1
    )

    if analysis_length_dates < 0:
        raise errors.EnrollmentLongerThanAnalysisException(
            self.config.experiment.normandy_slug
        )

    return TimeLimits.for_single_analysis_window(
        last_date_full_data=prior_date_str,
        analysis_start_days=0,
        analysis_length_dates=analysis_length_dates,
        **time_limits_args,
    )
def _get_timelimits_if_ready(
        self, period: AnalysisPeriod,
        current_date: datetime) -> Optional[TimeLimits]:
    """
    Returns a TimeLimits instance if experiment is due for analysis.
    Otherwise returns None.
    """
    # "Prior" is the day before the analysis date; comparing window counts
    # between the two tells us whether a new window closed today.
    prior_date = current_date - timedelta(days=1)
    prior_date_str = prior_date.strftime("%Y-%m-%d")
    current_date_str = current_date.strftime("%Y-%m-%d")

    # Experiments with no enrollment period cannot be analyzed.
    if not self.config.experiment.proposed_enrollment:
        self.logger.info("Skipping %s; no enrollment period", self.config.experiment.slug)
        return None

    # Enrollment covers proposed_enrollment days plus the launch day itself.
    dates_enrollment = self.config.experiment.proposed_enrollment + 1

    if self.config.experiment.start_date is None:
        return None

    # Shared keyword arguments for both the time-series and the
    # single-window TimeLimits constructors below.
    time_limits_args = {
        "first_enrollment_date": self.config.experiment.start_date.strftime("%Y-%m-%d"),
        "num_dates_enrollment": dates_enrollment,
    }

    if period != AnalysisPeriod.OVERALL:
        try:
            current_time_limits = TimeLimits.for_ts(
                last_date_full_data=current_date_str,
                time_series_period=period.adjective,
                **time_limits_args,
            )
        except ValueError:
            # There are no analysis windows yet.
            # TODO: Add a more specific check.
            return None

        try:
            prior_time_limits = TimeLimits.for_ts(
                last_date_full_data=prior_date_str,
                time_series_period=period.adjective,
                **time_limits_args,
            )
        except ValueError:
            # We have an analysis window today, and we didn't yesterday,
            # so we must have just closed our first window.
            return current_time_limits

        if len(current_time_limits.analysis_windows) == len(
                prior_time_limits.analysis_windows):
            # No new data today
            return None

        return current_time_limits

    # Period is OVERALL
    # NOTE(review): assumes end_date is directly comparable to prior_date
    # (same datetime type) — the overall analysis runs only on the day
    # after the experiment ends; confirm against callers.
    if self.config.experiment.end_date != prior_date:
        return None

    # Overall window spans everything after enrollment closes.
    return TimeLimits.for_single_analysis_window(
        last_date_full_data=prior_date_str,
        analysis_start_days=0,
        analysis_length_dates=(self.config.experiment.end_date -
                               self.config.experiment.start_date).days -
        dates_enrollment + 1,
        **time_limits_args,
    )