def for_ts(
    cls,
    first_enrollment_date,
    last_date_full_data,
    time_series_period,
    num_dates_enrollment,
):
    """Build a ``TimeLimits`` covering consecutive daily or weekly windows.

    Args:
        first_enrollment_date (str): First date on which enrollment
            events were received; the start date of the experiment.
        last_date_full_data (str): The most recent date for which we
            have complete data, e.g. '2019-03-22'. To ignore all data
            collected after a certain date (e.g. when the experiment
            recipe was deactivated), pass that date here.
        time_series_period: 'daily' or 'weekly'.
        num_dates_enrollment (int): Take this many days of client
            enrollments. Mandatory, because it determines the number
            of points in the time series.

    Returns:
        ``cls(...)`` with one analysis window per complete period that
        fits between the last enrollment date and
        ``last_date_full_data``.

    Raises:
        ValueError: If the period is not 'daily' or 'weekly', if
            ``num_dates_enrollment`` is not positive, or if not even one
            complete period of data is available.
    """
    if time_series_period == "daily":
        days_per_window = 1
    elif time_series_period == "weekly":
        days_per_window = 7
    else:
        raise ValueError(
            "Unsupported time series period {}".format(time_series_period)
        )

    if num_dates_enrollment <= 0:
        raise ValueError("Number of enrollment dates must be a positive number")

    final_enrollment = add_days(first_enrollment_date, num_dates_enrollment - 1)

    # Number of dates for which the last-enrolled client has data
    # (both endpoints inclusive, hence the +1).
    days_available = date_sub(last_date_full_data, final_enrollment) + 1
    period_count = days_available // days_per_window
    if period_count <= 0:
        raise ValueError("Insufficient data")

    # Back-to-back windows: [0, L-1], [L, 2L-1], ...
    windows = tuple(
        AnalysisWindow(begin, begin + days_per_window - 1)
        for begin in range(0, period_count * days_per_window, days_per_window)
    )

    last_needed = add_days(final_enrollment, windows[-1].end)

    return cls(
        first_enrollment_date=first_enrollment_date,
        last_enrollment_date=final_enrollment,
        first_date_data_required=first_enrollment_date,
        last_date_data_required=last_needed,
        analysis_windows=windows,
    )
def _validate_first_date_data_required(self, attribute, value):
    """Check that ``first_date_data_required`` is consistent.

    It must not be later than ``last_date_data_required``, and it must
    equal ``first_enrollment_date`` shifted by the smallest ``start``
    among ``analysis_windows``.

    ``attribute`` and ``value`` are accepted but unused; the check reads
    the fields directly from ``self``.
    """
    assert self.first_date_data_required <= self.last_date_data_required

    earliest_window_start = min(window.start for window in self.analysis_windows)
    expected_first_date = add_days(
        self.first_enrollment_date, earliest_window_start
    )
    assert self.first_date_data_required == expected_first_date
def _calculate_metrics(
    self,
    exp: mozanalysis.experiment.Experiment,
    time_limits: TimeLimits,
    period: AnalysisPeriod,
    dry_run: bool,
):
    """Compute the metrics of one analysis period for an experiment.

    Only the last analysis window of ``time_limits`` is queried. When
    ``dry_run`` is true the query is built but not executed.

    Returns the name of the BigQuery table the results are written to.
    """
    windows = time_limits.analysis_windows
    window = len(windows)
    final_window = windows[-1]

    # TODO: Add this functionality to TimeLimits.
    # Narrow the limits to the final window only.
    final_window_limits = attr.evolve(
        time_limits,
        analysis_windows=[final_window],
        first_date_data_required=add_days(
            time_limits.first_enrollment_date, final_window.start
        ),
    )

    res_table_name = self._table_name(period.value, window)

    metrics = {summary.metric for summary in self.config.metrics[period]}
    sql = exp.build_query(
        metrics,
        final_window_limits,
        "normandy",
        self.config.experiment.enrollment_query,
        self.config.experiment.segments,
    )

    # NOTE(review): the source lost its indentation; this layout assumes
    # that executing the query and publishing the view both belong to the
    # non-dry-run branch -- confirm against the original file.
    if dry_run:
        logger.info(
            "Dry run; not actually calculating %s metrics for %s",
            period.value,
            self.config.experiment.normandy_slug,
        )
        return res_table_name

    logger.info(
        "Executing query for %s (%s)",
        self.config.experiment.normandy_slug,
        period.value,
    )
    self.bigquery.execute(sql, res_table_name)
    self._publish_view(period)
    return res_table_name
def test_process_data_source_df(spark):
    """``_process_data_source_df`` trims data to the TimeLimits' date range.

    Two single-window limits are checked (starting 0 and 2 days after
    enrollment, both 3 dates long), then the ``data_source`` alias is
    checked to exist only on the processed frame.
    """
    start_date = '20190101'
    end_date = '20190114'
    exp_8d = Experiment('experiment-with-8-day-cohort', start_date, 8)
    data_source_df = _get_data_source_df(spark)

    # Are the fixtures sufficiently complicated that we're actually
    # testing things? They must extend beyond both ends of the range.
    assert _simple_return_agg_date(F.min, data_source_df) < start_date
    assert _simple_return_agg_date(F.max, data_source_df) > end_date

    def check_window(analysis_start_days, expected_first, expected_last):
        """Build limits, verify them, and verify the processed frame."""
        limits = TimeLimits.for_single_analysis_window(
            first_enrollment_date=exp_8d.start_date,
            last_date_full_data=end_date,
            analysis_start_days=analysis_start_days,
            analysis_length_dates=3,
            num_dates_enrollment=exp_8d.num_dates_enrollment,
        )
        assert limits.first_date_data_required == expected_first
        assert limits.last_date_data_required == expected_last

        processed = exp_8d._process_data_source_df(data_source_df, limits)
        assert (
            _simple_return_agg_date(F.min, processed)
            == limits.first_date_data_required
        )
        assert (
            _simple_return_agg_date(F.max, processed)
            == limits.last_date_data_required
        )
        return processed

    proc_ds = check_window(0, start_date, '20190110')
    check_window(2, add_days(start_date, 2), '20190112')

    # The processed frame can be addressed through the 'data_source'
    # alias; the raw frame cannot.
    assert proc_ds.select(F.col('data_source.client_id'))
    with pytest.raises(AnalysisException):
        assert data_source_df.select(F.col('data_source.client_id'))
def _get_data_source_df(spark):
    """Create a fixture DataFrame: one row per client per date.

    Two clients ('aaaa' in 'control', 'bbbb' in 'test') each get a row
    for 32 consecutive dates starting at '20181215'.
    """
    dates = [add_days('20181215', offset) for offset in range(32)]

    data_rows = []
    for client, branch in (('aaaa', 'control'), ('bbbb', 'test')):
        for submission_date_s3 in dates:
            data_rows.append(
                [client, submission_date_s3, {'a-stub': branch}, 1.]
            )

    column_names = [
        "client_id",
        "submission_date_s3",
        "experiments",
        "constant_one",
    ]
    return spark.createDataFrame(data_rows, column_names)
def for_single_analysis_window(
    cls,
    first_enrollment_date,
    last_date_full_data,
    analysis_start_days,
    analysis_length_dates,
    num_dates_enrollment=None,
):
    """Return a ``TimeLimits`` instance with a single analysis window.

    Args:
        first_enrollment_date (str): First date on which enrollment
            events were received; the start date of the experiment.
        last_date_full_data (str): The most recent date for which we
            have complete data, e.g. '2019-03-22'. If you want to ignore
            all data collected after a certain date (e.g. when the
            experiment recipe was deactivated), then do that here.
        analysis_start_days (int): the start of the analysis window,
            measured in 'days since the client enrolled'. We ignore data
            collected outside this analysis window.
        analysis_length_dates (int): the length of the analysis window,
            measured in days.
        num_dates_enrollment (int, optional): Only include this many
            days of enrollments. If ``None`` then use the maximum number
            of days as determined by the metric's analysis window and
            ``last_date_full_data``. Typically ``7n+1``, e.g. ``8``. The
            factor ``7`` removes weekly seasonality, and the ``+1``
            accounts for the fact that enrollment typically starts a few
            hours before UTC midnight.

    Returns:
        ``cls(...)`` whose ``analysis_windows`` is a 1-tuple holding the
        requested window.

    Raises:
        ValueError: If the requested enrollment period plus analysis
            window needs data from after ``last_date_full_data``.
    """
    # Window bounds are inclusive, hence the -1 on the end.
    analysis_window = AnalysisWindow(
        analysis_start_days, analysis_start_days + analysis_length_dates - 1
    )

    if num_dates_enrollment is None:
        # Enroll for as long as possible while still leaving a complete
        # analysis window for the last-enrolled clients.
        last_enrollment_date = add_days(last_date_full_data, -analysis_window.end)
    else:
        last_enrollment_date = add_days(
            first_enrollment_date, num_dates_enrollment - 1
        )

    first_date_data_required = add_days(
        first_enrollment_date, analysis_window.start
    )
    last_date_data_required = add_days(last_enrollment_date, analysis_window.end)

    if last_date_data_required > last_date_full_data:
        raise ValueError(
            "You said you wanted {} dates of enrollment, "
            "and need data from the {}th day after enrollment. "
            "For that, you need to wait until we have data for {}.".format(
                num_dates_enrollment,
                analysis_window.end,
                last_date_data_required,
            )
        )

    return cls(
        first_enrollment_date=first_enrollment_date,
        last_enrollment_date=last_enrollment_date,
        first_date_data_required=first_date_data_required,
        last_date_data_required=last_date_data_required,
        analysis_windows=(analysis_window,),
    )
def _validate_last_date_data_required(self, attribute, value):
    """Check that ``last_date_data_required`` is consistent.

    It must equal ``last_enrollment_date`` shifted by the largest
    ``end`` among ``analysis_windows``.

    ``attribute`` and ``value`` are accepted but unused; the check reads
    the fields directly from ``self``.
    """
    latest_window_end = max(window.end for window in self.analysis_windows)
    expected_last_date = add_days(self.last_enrollment_date, latest_window_end)
    assert self.last_date_data_required == expected_last_date
def test_add_days():
    """``add_days`` handles zero, positive and negative offsets,
    including a step back across a year boundary."""
    base_date = '2019-01-01'
    expected_by_offset = {
        0: '2019-01-01',
        1: '2019-01-02',
        -1: '2018-12-31',
    }
    for offset, expected in expected_by_offset.items():
        assert add_days(base_date, offset) == expected
def calculate_metrics(
    self,
    exp: mozanalysis.experiment.Experiment,
    time_limits: TimeLimits,
    period: AnalysisPeriod,
    analysis_basis: AnalysisBasis,
    dry_run: bool,
):
    """
    Calculate metrics for a specific experiment.

    Only the last analysis window of ``time_limits`` is queried, and only
    metrics whose ``analysis_bases`` match ``analysis_basis`` are included.

    Args:
        exp: Experiment object used to build the metrics query.
        time_limits: Time limits; its last analysis window is the one used.
        period: Analysis period; selects ``self.config.metrics[period]`` and
            is part of the result table name.
        analysis_basis: Analysis basis used for metric filtering, the table
            name and the published view.
        dry_run: If true, log and skip the computation.

    Returns the BigQuery table results are written to.
    """
    # The number of windows is passed to the table-name helper.
    window = len(time_limits.analysis_windows)
    last_analysis_window = time_limits.analysis_windows[-1]
    # TODO: Add this functionality to TimeLimits.
    # Narrow the limits to the last window, moving the first required
    # date to that window's start.
    last_window_limits = attr.evolve(
        time_limits,
        analysis_windows=[last_analysis_window],
        first_date_data_required=add_days(
            time_limits.first_enrollment_date, last_analysis_window.start
        ),
    )
    res_table_name = self._table_name(period.value, window, analysis_basis=analysis_basis)
    normalized_slug = bq_normalize_name(self.config.experiment.normandy_slug)
    # NOTE(review): the source lost its indentation; this layout assumes
    # everything from the "Executing query" log through publishing the
    # view belongs to the non-dry-run branch -- confirm against the
    # original file.
    if dry_run:
        logger.info(
            "Dry run; not actually calculating %s metrics for %s",
            period.value,
            self.config.experiment.normandy_slug,
        )
    else:
        logger.info(
            "Executing query for %s (%s)",
            self.config.experiment.normandy_slug,
            period.value,
        )
        enrollments_table_name = f"enrollments_{normalized_slug}"
        exposure_signal = None
        if self.config.experiment.exposure_signal:
            # if a custom exposure signal has been defined in the config, we'll
            # need to pass it into the metrics computation
            exposure_signal = (
                self.config.experiment.exposure_signal.to_mozanalysis_exposure_signal(
                    last_window_limits
                )
            )
        # A metric is kept when its analysis_bases either equals the
        # requested basis or contains it.
        metrics_sql = exp.build_metrics_query(
            {
                m.metric.to_mozanalysis_metric()
                for m in self.config.metrics[period]
                if m.metric.analysis_bases == analysis_basis
                or analysis_basis in m.metric.analysis_bases
            },
            last_window_limits,
            enrollments_table_name,
            analysis_basis,
            exposure_signal,
        )
        self.bigquery.execute(metrics_sql, res_table_name)
        self._publish_view(period, analysis_basis=analysis_basis.value)
    return res_table_name
def get_enrollments(self, spark, study_type='pref_flip', end_date=None,
                    debug_dupes=False):
    """Return a DataFrame of enrolled clients.

    This works for pref-flip and addon studies.

    The underlying queries are different for pref-flip vs addon
    studies, because as of 2019/04/02, branch information isn't
    reliably available in the ``events`` table for addon experiments:
    branch may be NULL for all enrollments. The enrollment information
    for them is most reliably available in
    ``telemetry_shield_study_parquet``. Once this issue is resolved, we
    will probably start using normandy events for all desktop studies.
    Ref: https://bugzilla.mozilla.org/show_bug.cgi?id=1536644

    Args:
        spark: The spark context.
        study_type (str): One of the following strings:

            * 'pref_flip'
            * 'addon'

            or a callable that accepts a spark context as an argument
            and returns a Spark DataFrame containing all enrollment
            events ever conducted using that method, with columns
            ``client_id``, ``experiment_slug``, ``branch``,
            ``enrollment_date``, and ``addon_version`` if it's
            relevant.
        end_date (str, optional): Ignore enrollments after this date:
            for faster queries on stale experiments. If you set
            ``num_dates_enrollment`` then do not set this; at best it
            would be redundant, at worst it's contradictory.
        debug_dupes (bool, optional): Include a column ``num_events``
            giving the number of enrollment events associated with the
            ``client_id`` and ``branch``.

    Returns:
        A Spark DataFrame of enrollment data. One row per enrollment.
        Columns:

            * client_id (str)
            * enrollment_date (str): e.g. '20190329'
            * branch (str)
            * num_events (int, optional)

    Raises:
        ValueError: If ``study_type`` is not recognized, if the
            experiment has an ``addon_version`` but the enrollment data
            has no ``addon_version`` column, or if both ``end_date`` and
            the experiment's ``num_dates_enrollment`` are set.
    """
    if callable(study_type):
        enrollments = study_type(spark)
    elif study_type == 'pref_flip':
        enrollments = self._get_enrollments_view_normandy(spark)
    elif study_type == 'addon':
        enrollments = self._get_enrollments_view_addon(spark)
    # elif study_type == 'glean':
    #     raise NotImplementedError
    else:
        raise ValueError("Unrecognized study_type: {}".format(study_type))

    enrollments = enrollments.filter(
        enrollments.enrollment_date >= self.start_date
    ).filter(
        enrollments.experiment_slug == self.experiment_slug
    )

    if self.addon_version:
        if "addon_version" not in enrollments.columns:
            # Use a '{}' placeholder: a '%s' placeholder is left untouched
            # by str.format, which hid the offending study_type.
            raise ValueError(
                "Experiment constructed with an addon_version but your "
                "study_type ({}) is incompatible with addon versions."
                .format(study_type)
            )

        enrollments = enrollments.filter(
            enrollments.addon_version == self.addon_version
        ).drop(enrollments.addon_version)

    if self.num_dates_enrollment is not None:
        if end_date is not None:
            raise ValueError(
                "Don't specify both 'end_date' and "
                "'num_dates_enrollment'; you might contradict yourself."
            )
        # The enrollment period is inclusive of its first date, hence -1.
        enrollments = enrollments.filter(
            enrollments.enrollment_date <= add_days(
                self.start_date, self.num_dates_enrollment - 1
            )
        )
    elif end_date is not None:
        enrollments = enrollments.filter(
            enrollments.enrollment_date <= end_date
        )

    # Deduplicate enrollment events. Optionally keep track of what
    # had to be deduplicated. Deduplicating a client who enrolls in
    # multiple branches is left as an exercise to the reader :|
    aggregations = [
        F.min(enrollments.enrollment_date).alias('enrollment_date')
    ]
    if debug_dupes:
        aggregations.append(
            F.count(enrollments.enrollment_date).alias('num_events')
        )

    enrollments = enrollments.groupBy(
        enrollments.client_id, enrollments.branch
    ).agg(*aggregations)

    enrollments.cache()

    return enrollments
def test_add_days():
    """``add_days`` handles zero, positive and negative offsets,
    including a step back across a year boundary."""
    base_date = "2019-01-01"
    cases = (
        (0, "2019-01-01"),
        (1, "2019-01-02"),
        (-1, "2018-12-31"),
    )
    for offset, expected in cases:
        assert add_days(base_date, offset) == expected