def test_multiple_sources_transform(
    effective_version, generate_main_summary_data, generate_new_profile_data
):
    """Profiles from both sources are aggregated together by transform."""
    main_summary = generate_main_summary_data(
        [{"client_id": "1"}, {"client_id": "3"}]
    )
    new_profile = generate_new_profile_data(
        [{"client_id": "1"}, {"client_id": "2"}, {"client_id": "2"}]
    )

    sources = job.extract(main_summary, new_profile, WEEK_START_DS, 1, 0, False)
    df = job.transform(sources, effective_version, WEEK_START_DS)

    # There are two different channels
    assert df.count() == 2

    total_profiles = (
        df.select(F.sum("n_profiles").alias("n_profiles")).first().n_profiles
    )
    assert total_profiles == 3
def test_extract_main_summary(spark, generate_main_summary_data):
    """A lone main_summary row survives extraction with an empty new_profile."""
    empty_new_profile = spark.createDataFrame([], data.new_profile_schema)

    df = job.extract(
        generate_main_summary_data(None),
        empty_new_profile,
        WEEK_START_DS,
        1,
        0,
        False,
    )

    assert df.count() == 1
def test_extract_new_profile(spark, generate_new_profile_data):
    """Extracting a lone new-profile ping leaves main-summary-only fields null."""
    empty_main_summary = spark.createDataFrame([], data.main_summary_schema)

    df = job.extract(
        empty_main_summary,
        generate_new_profile_data([dict()]),
        WEEK_START_DS,
        1,
        0,
        False,
    )

    assert df.count() == 1

    row = df.first()
    # Columns that only exist in main_summary come through as nulls.
    assert row["subsession_length"] is None
    expected_creation_date = (
        data.new_profile_sample["environment"]["profile"]["creation_date"]
    )
    assert row["profile_creation_date"] == expected_creation_date
    assert row["scalar_parent_browser_engagement_total_uri_count"] is None
def test_ignored_submissions_outside_of_period(spark, generate_main_summary_data):
    """Pings submitted too late or started before the window are dropped."""
    # All pings within 17 days of the submission start date are valid.
    # However, only pings with ssd within the 7 day retention period
    # are used for computation. Generate pings for this case.
    late_submission = data.generate_dates(SUBSESSION_START, submission_offset=18)
    early_subsession = data.generate_dates(SUBSESSION_START.replace(days=-7))

    late_submissions_df = generate_main_summary_data(
        [late_submission, early_subsession]
    )
    empty_new_profile = spark.createDataFrame([], data.new_profile_schema)

    df = job.extract(
        late_submissions_df, empty_new_profile, WEEK_START_DS, 7, 10, False
    )

    assert df.count() == 0
def main(start_date, path, input_bucket, input_prefix, period, slack, sample):
    """Run the retention job end to end: extract, transform, and save.

    ``start_date`` (YYYYMMDD) is shifted back by ``slack`` days before being
    used as the week start, to allow late-arriving pings to settle.
    """
    spark = SparkSession.builder.appName('retention').getOrCreate()
    # Date arithmetic in this job assumes UTC throughout.
    spark.conf.set('spark.sql.session.timeZone', 'UTC')

    start_ds = utils.format_date(
        arrow.get(start_date, utils.DS_NODASH), utils.DS_NODASH, -slack
    )

    main_summary = (
        spark.read
        .option('mergeSchema', 'true')
        .parquet(utils.format_spark_path(input_bucket, input_prefix))
    )
    new_profile = spark.read.parquet(
        "s3://net-mozaws-prod-us-west-2-pipeline-data/"
        "telemetry-new-profile-parquet/v1/"
    )

    extracted = churn_job.extract(
        main_summary, new_profile, start_ds, period, slack, sample
    )
    retention = transform(extracted, start_ds)
    save(retention, path)
def test_attribution_from_new_profile(
    effective_version, generate_main_summary_data, generate_new_profile_data
):
    """Attribution missing from main pings is backfilled from new-profile pings."""

    def with_attribution(attribution):
        # Copy the sample environment and replace only settings.attribution.
        env = copy.deepcopy(data.new_profile_sample["environment"])
        env["settings"]["attribution"] = attribution
        return env

    next_day_ns = SUBSESSION_START.shift(days=1).timestamp * 10 ** 9

    main_summary = generate_main_summary_data(
        [
            {"client_id": "1", "attribution": {"source": "mozilla.org"}},
            {"client_id": "3", "attribution": None},
            {"client_id": "4", "attribution": None},
            {
                "client_id": "5",
                "attribution": {"source": "mozilla.org"},
                "timestamp": next_day_ns,
            },
            {
                "client_id": "6",
                "attribution": {"source": "mozilla.org"},
                "timestamp": next_day_ns,
            },
            {"client_id": "7", "attribution": {"source": "mozilla.org"}},
        ]
    )

    new_profile = generate_new_profile_data(
        [
            # new profile without a main summary companion
            {
                "client_id": "2",
                "environment": with_attribution({"source": "mozilla.org"}),
            },
            # recover null attribution
            {
                "client_id": "3",
                "environment": with_attribution({"source": "mozilla.org"}),
            },
            # new-profile ping used to recover attribution, but outside of
            # the current retention period
            {
                "client_id": "4",
                "environment": with_attribution({"source": "mozilla.org"}),
                "submission": SUBSESSION_START.shift(days=-7).format("YYYYMMDD"),
            },
            # avoid accidentally overwriting an existing value with an empty
            # structure
            {"client_id": "5", "environment": with_attribution({})},
            # main-pings have higher latency than new-profile pings, so the
            # main ping attribution state will be set correctly.
            # NOTE(review): this row uses the key "client", not "client_id" —
            # looks like a typo, but the asserted total below depends on the
            # current behavior; confirm before changing.
            {"client": "6", "environment": with_attribution(None)},
            # new-profile timestamp is newer than main-ping, so attribution
            # for the client is unset
            {
                "client_id": "7",
                "environment": with_attribution(None),
                "timestamp": next_day_ns,
            },
        ]
    )

    sources = job.extract(main_summary, new_profile, WEEK_START_DS, 2, 0, False)
    df = job.transform(sources, effective_version, WEEK_START_DS)

    assert df.where("source='mozilla.org'").agg(F.sum("n_profiles")).first()[0] == 6
def test_attribution_from_new_profile(effective_version, generate_main_summary_data,
                                      generate_new_profile_data):
    """Attribution can be recovered from new-profile pings when absent upstream."""
    next_day_ts = SUBSESSION_START.shift(days=1).timestamp * 10**9

    main_rows = [
        {'client_id': '1', 'attribution': {'source': 'mozilla.org'}},
        {'client_id': '3', 'attribution': None},
        {'client_id': '4', 'attribution': None},
        {'client_id': '5', 'attribution': {'source': 'mozilla.org'},
         'timestamp': next_day_ts},
        {'client_id': '6', 'attribution': {'source': 'mozilla.org'},
         'timestamp': next_day_ts},
        {'client_id': '7', 'attribution': {'source': 'mozilla.org'}},
    ]
    main_summary = generate_main_summary_data(main_rows)

    def update_attribution(attribution):
        # Swap out only the settings.attribution section of the sample env.
        env = copy.deepcopy(data.new_profile_sample['environment'])
        env['settings']['attribution'] = attribution
        return env

    new_profile_rows = [
        # new profile without a main summary companion
        {'client_id': '2',
         'environment': update_attribution({'source': 'mozilla.org'})},
        # recover null attribution
        {'client_id': '3',
         'environment': update_attribution({'source': 'mozilla.org'})},
        # new-profile ping used to recover attribution, but outside of
        # the current retention period
        {'client_id': '4',
         'environment': update_attribution({'source': 'mozilla.org'}),
         'submission': SUBSESSION_START.shift(days=-7).format("YYYYMMDD")},
        # avoid accidentally overwriting an existing value with an empty
        # structure
        {'client_id': '5', 'environment': update_attribution({})},
        # main-pings have higher latency than new-profile pings, so the main
        # ping attribution state will be set correctly.
        # NOTE(review): key here is 'client', not 'client_id' — possibly a
        # typo; the asserted total depends on current behavior, so confirm
        # before changing it.
        {'client': '6', 'environment': update_attribution(None)},
        # new-profile timestamp is newer than main-ping, so attribution for
        # the client is unset
        {'client_id': '7', 'environment': update_attribution(None),
         'timestamp': next_day_ts},
    ]
    new_profile = generate_new_profile_data(new_profile_rows)

    sources = job.extract(main_summary, new_profile, WEEK_START_DS, 2, 0, False)
    df = job.transform(sources, effective_version, WEEK_START_DS)

    assert df.where("source='mozilla.org'").agg(
        F.sum('n_profiles')).first()[0] == 6