def _get_last_data_date(self, last_date_full_data, req_dates_of_data):
    """Return the date of the final used datum."""
    if self.num_dates_enrollment is None:
        assert last_date_full_data >= add_days(
            self.start_date, req_dates_of_data - 1), \
            "No users have had time to convert yet"
        return last_date_full_data

    last_required_data_date = add_days(
        self.start_date,
        self.num_dates_enrollment - 1 + req_dates_of_data - 1)

    # If I did the math right, this should be equivalent to the
    # check in _get_last_enrollment_date()
    assert last_required_data_date <= last_date_full_data, \
        "You said you wanted {} dates of enrollment, ".format(
            self.num_dates_enrollment
        ) + "but your conversion time of {} days won't have ".format(
            req_dates_of_data
        ) + "complete data until we have the data for {}.".format(
            last_required_data_date
        )

    return last_required_data_date
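# `add_days` is used throughout this module but isn't defined in this
# section. Below is a minimal sketch of the assumed helper, given that
# dates are 'yyyyMMdd' strings per the docstrings; the real
# implementation may differ.
from datetime import datetime, timedelta

def add_days(date_string, n_days):
    """Return the 'yyyyMMdd' date string `n_days` after `date_string`.

    `n_days` may be negative. Illustrative sketch only.
    """
    shifted = datetime.strptime(date_string, '%Y%m%d') \
        + timedelta(days=n_days)
    return shifted.strftime('%Y%m%d')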
def _check_windows(self, today, min_days_per_user):
    """Check that the conversion window dates make sense.

    We need `min_days_per_user` days of post-enrollment data per user.
    This places limits on how early we can run certain analyses.
    This method calculates and presents these limits.
    """
    slack = 1  # 1 day of slack: assume yesterday's data is not present
    last_enrollment_date = add_days(today, -1 - min_days_per_user - slack)

    if self.num_days_enrollment is not None:
        official_last_enrollment_date = add_days(
            self.start_date, self.num_days_enrollment - 1)
        assert last_enrollment_date >= official_last_enrollment_date, \
            "You said you wanted {} days of enrollment, ".format(
                self.num_days_enrollment
            ) + "but your conversion window of {} days won't have ".format(
                min_days_per_user
            ) + "complete data until we have the data for {}.".format(
                add_days(official_last_enrollment_date,
                         1 + min_days_per_user)
            )
        last_enrollment_date = official_last_enrollment_date

    print("Taking enrollments between {} and {}".format(
        self.start_date, last_enrollment_date))

    assert self.start_date <= last_enrollment_date, \
        "No users have had time to convert yet"
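# Worked example of the windows check above; all values are invented.
# With today = '20190322' and min_days_per_user = 7, the latest
# enrollment whose conversion window has complete data is
# 1 + 7 + 1 = 9 days earlier:
assert add_days('20190322', -1 - 7 - 1) == '20190313'
# If the experiment planned num_days_enrollment = 7 starting on
# '20190301', enrollment was scheduled to end on '20190307', which is
# earlier than '20190313', so the assertion in `_check_windows` passes.
assert add_days('20190301', 7 - 1) == '20190307'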
def get_enrollments_addon_exp(self, spark):
    """Temporary alternative to `get_enrollments` for addon experiments.

    As of 2019/04/02, branch information isn't reliably available in
    the `events` table for addon experiments: branch may be NULL for
    all enrollments. The enrollment information for them is most
    reliably available in `telemetry_shield_study_parquet`.

    Ref: https://bugzilla.mozilla.org/show_bug.cgi?id=1536644
    """
    tssp = spark.table('telemetry_shield_study_parquet')

    enrollments = tssp.filter(
        tssp.submission >= self.start_date
    ).filter(
        tssp.payload.data.study_state == 'enter'
    ).filter(
        tssp.payload.study_name == self.experiment_slug
    )

    if self.num_days_enrollment is not None:
        enrollments = enrollments.filter(
            tssp.submission < add_days(
                self.start_date, self.num_days_enrollment))

    # TODO: should we also call `broadcast()`? Should we cache?
    # TODO: should we filter clients enrolled multiple times?
    return enrollments.select(
        enrollments.client_id,
        enrollments.submission.alias('enrollment_date'),
        enrollments.payload.branch.alias('branch'),
    )
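# Hypothetical usage for an addon study. The slug, dates, and the
# `Experiment` constructor signature below are assumptions for
# illustration; this section only shows that the class exposes
# `experiment_slug`, `start_date`, and `num_days_enrollment`.
experiment = Experiment(
    experiment_slug='addon-example-study',  # made-up slug
    start_date='20190101',
    num_days_enrollment=7,
)
addon_enrollments = experiment.get_enrollments_addon_exp(spark)
# Resulting columns: client_id, enrollment_date, branch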
def get_enrollments(self, spark):
    """Return a DataFrame of enrolled clients.

    This works for pref-flip studies. As of 2019/04/02, branch
    information isn't reliably available in the `events` table for
    addon experiments: branch may be NULL for all enrollments. The
    enrollment information for them is most reliably available in
    `telemetry_shield_study_parquet`. So use
    `get_enrollments_addon_exp()` for addon experiments until the
    underlying issue is resolved.

    Ref: https://bugzilla.mozilla.org/show_bug.cgi?id=1536644
    """
    events = spark.table('events')

    enrollments = events.filter(
        events.submission_date_s3 >= self.start_date
    ).filter(
        events.event_category == 'normandy'
    ).filter(
        events.event_method == 'enroll'
    ).filter(
        events.event_string_value == self.experiment_slug
    )

    if self.num_days_enrollment is not None:
        enrollments = enrollments.filter(
            events.submission_date_s3 < add_days(
                self.start_date, self.num_days_enrollment))

    # TODO: should we also call `broadcast()`? Should we cache?
    # TODO: should we filter clients enrolled multiple times?
    return enrollments.select(
        enrollments.client_id,
        enrollments.submission_date_s3.alias('enrollment_date'),
        enrollments.event_map_values.branch.alias('branch'),
    )
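# Hypothetical usage for a pref-flip study; the slug and dates are
# invented, and the constructor signature is an assumption, as above.
experiment = Experiment(
    experiment_slug='pref-flip-example-study',
    start_date='20190101',
    num_days_enrollment=7,
)
enrollments = experiment.get_enrollments(spark)
enrollments.groupBy('branch').count().show()  # sanity-check branch sizes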
def _filter_df_for_conv_window(self, df, today, conv_window_start_days,
                               conv_window_length_days):
    """Return the df, filtered to the relevant dates.

    This should not affect the results - it should just speed
    things up.
    """
    if self.num_days_enrollment is not None:
        # Ignore data after the conversion window of the last enrollment
        df = df.filter(df.submission_date_s3 <= add_days(
            self.start_date,
            self.num_days_enrollment + conv_window_start_days
            + conv_window_length_days))

    # Ignore data before the conversion window of the first enrollment
    return df.filter(df.submission_date_s3 >= add_days(
        self.start_date, conv_window_start_days))
def get_enrollments(self, spark):
    events = spark.table('events')

    enrollments = events.filter(
        events.submission_date_s3 >= self.start_date
    ).filter(
        events.event_category == 'normandy'
    ).filter(
        events.event_method == 'enroll'
    ).filter(
        events.event_string_value == self.experiment_slug
    )

    if self.num_days_enrollment is not None:
        enrollments = enrollments.filter(
            events.submission_date_s3 < add_days(
                self.start_date, self.num_days_enrollment
            )
        )

    # TODO: should we also call `broadcast()`? Should we cache?
    return enrollments.select(
        enrollments.client_id,
        enrollments.submission_date_s3.alias('enrollment_date'),
        enrollments.event_map_values.branch.alias('branch'),
    )
def _get_last_enrollment_date(self, last_date_full_data, req_dates_of_data):
    """Return the date of the final used enrollment.

    We need `req_dates_of_data` days of post-enrollment data per user.
    This and `last_date_full_data` put constraints on the enrollment
    period. This method checks these constraints are feasible, and
    compatible with any manually supplied enrollment period.

    Args:
        last_date_full_data (str): The most recent date for which we
            have complete data, e.g. '20190322'. If you want to ignore
            all data collected after a certain date (e.g. when the
            experiment recipe was deactivated), then do that here.
        req_dates_of_data (int): The minimum number of dates of
            post-enrollment data required to have data for the user
            for the entire conversion window.
    """
    last_enrollment_with_data = add_days(
        last_date_full_data, -(req_dates_of_data - 1))

    if self.num_dates_enrollment is None:
        assert last_enrollment_with_data >= self.start_date, \
            "No users have had time to convert yet"
        return last_enrollment_with_data
    else:
        intended_last_enrollment = self._get_scheduled_max_enrollment_date()
        assert last_enrollment_with_data >= intended_last_enrollment, \
            "You said you wanted {} dates of enrollment, ".format(
                self.num_dates_enrollment
            ) + "but your conversion window of {} days won't have ".format(
                req_dates_of_data
            ) + "complete data until we have the data for {}.".format(
                add_days(intended_last_enrollment, req_dates_of_data - 1)
            )
        return intended_last_enrollment
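# Worked example of the constraint above; values are invented. If
# complete data runs through '20190322' and each user needs 7 dates of
# post-enrollment data (with the enrollment date counting as the first),
# the last enrollment that already has full data is 6 days earlier:
assert add_days('20190322', -(7 - 1)) == '20190316'
# A planned enrollment period must therefore end on or before
# '20190316' for the assertion in `_get_last_enrollment_date` to pass.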
def filter_df_for_conv_window(self, df, last_date_full_data,
                              conv_window_start_days,
                              conv_window_length_days):
    """Return the df, filtered to the relevant dates.

    This should not affect the results - it should just speed
    things up.
    """
    return df.filter(
        # Ignore data before the conversion window of the first enrollment
        df.submission_date_s3 >= add_days(
            self.start_date, conv_window_start_days)
    ).filter(
        # Ignore data after the conversion window of the last enrollment,
        # and data after the specified `last_date_full_data`
        df.submission_date_s3 <= self._get_last_data_date(
            last_date_full_data,
            conv_window_start_days + conv_window_length_days)
    )
def get_per_client_data(self, enrollments, df, metric_list, today,
                        conv_window_start_days, conv_window_length_days,
                        keep_client_id=False):
    """Return a DataFrame containing per-client metric values.

    Args:
    - enrollments: A spark DataFrame of enrollments, like the one
      returned by `self.get_enrollments()`.
    - df: A spark DataFrame containing the data needed to calculate
      the metrics. Could be `main_summary` or `clients_daily`.
      _Don't_ use `experiments`; as of 2019/04/02 it drops data
      collected after people self-unenroll, so unenrolling users
      will appear to churn.
    - metric_list: A list of columns that aggregate and compute
      metrics, e.g.
      `[F.coalesce(F.sum(df.metric_name), F.lit(0)).alias('metric_name')]`
    - today: A string representing the most recent day for which we
      have incomplete data, e.g. '20190322'.
    - conv_window_start_days: the start of the conversion window,
      measured in 'days since the user enrolled'. We ignore data
      collected outside this conversion window.
    - conv_window_length_days: the length of the conversion window,
      measured in days.
    - keep_client_id: Whether to return a `client_id` column. Defaults
      to False to reduce memory usage of the results.
    """
    self._check_windows(
        today, conv_window_start_days + conv_window_length_days)

    # TODO: can/should we switch from submission_date_s3 to when the
    # events actually happened?
    res = enrollments.filter(
        # Ignore clients that might convert in the future
        # TODO: print debug info if it throws out enrollments
        # when `num_days_enrollment is not None`?
        enrollments.enrollment_date < add_days(
            today, -1 - conv_window_length_days - conv_window_start_days)
    ).join(
        self._filter_df_for_conv_window(
            df, today, conv_window_start_days, conv_window_length_days),
        [
            # TODO: would it be faster if we enforce a join on sample_id?
            enrollments.client_id == df.client_id,

            # TODO: once we can rely on
            # `df.experiments[self.experiment_slug]` existing even
            # after unenrollment, we could start joining on branch to
            # reduce problems associated with split client_ids.

            # Do a quick pass aiming to efficiently filter out lots
            # of rows:
            enrollments.enrollment_date <= df.submission_date_s3,

            # Now do a more thorough pass filtering out irrelevant data:
            # TODO: is there a more efficient way to do this?
            (
                (
                    F.unix_timestamp(df.submission_date_s3, 'yyyyMMdd')
                    - F.unix_timestamp(
                        enrollments.enrollment_date, 'yyyyMMdd')
                ) / (24 * 60 * 60)
            ).between(
                conv_window_start_days,
                conv_window_start_days + conv_window_length_days),

            # Try to filter out data from the enrollment day that was
            # collected before the client actually enrolled. If the
            # client enrolled and unenrolled on the same day then this
            # will also filter out that day's post-unenrollment data,
            # but that's probably the smallest and most innocuous evil
            # on the menu.
            (
                (enrollments.enrollment_date != df.submission_date_s3)
                | (~F.isnull(df.experiments[self.experiment_slug]))
            ),
        ],
        'left'
    ).groupBy(
        enrollments.client_id, enrollments.branch
    ).agg(
        *(metric_list
          + self._get_telemetry_sanity_check_metrics(enrollments, df))
    )

    if keep_client_id:
        return res

    return res.drop(enrollments.client_id)
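# Hypothetical end-to-end usage. The table and column names below
# (`clients_daily`, `active_hours_sum`) are assumptions for
# illustration, as are the `experiment` and `enrollments` objects from
# the sketches above.
import pyspark.sql.functions as F

df = spark.table('clients_daily')
metric_list = [
    F.coalesce(F.sum(df.active_hours_sum), F.lit(0)).alias('active_hours'),
]
per_client = experiment.get_per_client_data(
    enrollments, df, metric_list,
    today='20190322',
    conv_window_start_days=0,
    conv_window_length_days=7,
)
# One row per (client, branch), ready to aggregate into per-branch
# statistics.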
def _get_scheduled_max_enrollment_date(self):
    """Return the last enrollment date, according to the plan."""
    assert self.num_dates_enrollment is not None
    return add_days(self.start_date, self.num_dates_enrollment - 1)
def get_per_client_data(self, enrollments, df, metric_list, today,
                        conv_window_start_days, conv_window_length_days,
                        keep_client_id=False):
    """Return a DataFrame containing per-client metric values.

    Args:
    - enrollments: A spark DataFrame of enrollments, like the one
      returned by `self.get_enrollments()`.
    - df: A spark DataFrame containing the data needed to calculate
      the metrics. Could be `main_summary` or `clients_daily`.
    - metric_list: A list of columns that aggregate and compute
      metrics, e.g.
      `[F.coalesce(F.sum(df.metric_name), F.lit(0)).alias('metric_name')]`
    - today: A string representing the most recent day for which we
      have incomplete data, e.g. '20190322'.
    - conv_window_start_days: the start of the conversion window,
      measured in 'days since the user enrolled'. We ignore data
      collected outside this conversion window.
    - conv_window_length_days: the length of the conversion window,
      measured in days.
    - keep_client_id: Whether to return a `client_id` column. Defaults
      to False to reduce memory usage of the results.
    """
    # TODO: can/should we switch from submission_date_s3 to when the
    # events actually happened?
    res = enrollments.filter(
        # Ignore clients that might convert in the future
        # TODO: print debug info if it throws out enrollments
        # when `num_days_enrollment is not None`?
        enrollments.enrollment_date < add_days(
            today, -1 - conv_window_length_days - conv_window_start_days)
    ).join(
        # TODO: coarsely filter `df` by conv window and enrollment dates
        df,
        [
            # TODO: would it be faster if we enforce a join on sample_id?
            enrollments.client_id == df.client_id,
            enrollments.branch == df.branch,

            # Do a quick pass aiming to efficiently filter out lots
            # of rows:
            enrollments.enrollment_date <= df.submission_date_s3,

            # Now do a more thorough pass filtering out irrelevant data:
            # TODO: is there a more efficient way to do this?
            (
                (
                    F.unix_timestamp(df.submission_date_s3, 'yyyyMMdd')
                    - F.unix_timestamp(
                        enrollments.enrollment_date, 'yyyyMMdd')
                ) / (24 * 60 * 60)
            ).between(
                conv_window_start_days,
                conv_window_start_days + conv_window_length_days
            ),
        ],
        'left'
    ).groupBy(
        enrollments.client_id, enrollments.branch
    ).agg(
        *metric_list
    )

    if keep_client_id:
        return res

    return res.drop(enrollments.client_id)