def test_searches_by_country(generate_data):
    """Check the per-country search totals of the daily rollup.

    Three profiles (two in US, one in CA) each report two searches. The
    daily rollup is expected to contain three rows, with a summed
    ``search_count`` of 4 for US and 2 for CA.
    """
    # (client_id, country, engine) -- every profile reports count=2.
    profiles = [
        ('profile_0', 'US', "hooli"),
        ('profile_1', 'US', "altavista"),
        ('profile_2', 'CA', "altavista"),
    ]
    snippets = [
        {
            'client_id': client_id,
            'country': country,
            'search_counts': [search_row(engine=engine, count=2)],
        }
        for client_id, country, engine in profiles
    ]
    rollup = search_rollups.transform(generate_data(snippets), "daily")

    def searches_in(geo):
        # Sum search_count over the rollup rows belonging to one country.
        in_country = rollup.where(F.col("country") == geo)
        return in_country.select(F.sum("search_count")).first()[0]

    assert rollup.count() == 3
    for geo, expected_total in (("US", 4), ("CA", 2)):
        assert searches_in(geo) == expected_total
def test_multiple_clients_multiple_search_engines(generate_data):
    """Check rollup totals when two clients each use two engines.

    The daily rollup is expected to contain two rows, a summed
    ``search_count`` of 42 overall, and 21 for the ``hooli`` provider.
    """
    # (client_id, hooli count, altavista count)
    usage = [
        ('profile_0', 18, 3),
        ('profile_1', 3, 18),
    ]
    snippets = [
        {
            'client_id': client_id,
            'search_counts': [
                search_row(engine="hooli", count=hooli_count),
                search_row(engine="altavista", count=altavista_count),
            ],
        }
        for client_id, hooli_count, altavista_count in usage
    ]
    rollup = search_rollups.transform(generate_data(snippets), "daily")

    assert rollup.count() == 2

    overall = rollup.select(F.sum("search_count")).first()[0]
    assert overall == 42

    hooli_rows = rollup.where("search_provider='hooli'")
    hooli_total = hooli_rows.select(F.sum("search_count")).first()[0]
    assert hooli_total == 21
def test_single_client_shares_multiple_countries(generate_data):
    """Check that ``profile_share`` sums to 1.0 across two countries.

    Two snippets differ only in ``country``; the daily rollup is expected
    to contain two rows whose ``profile_share`` values add up to 1.0.
    """
    # NOTE(review): no client_id is given, so both snippets presumably fall
    # back to the same default client from generate_data -- confirm.
    snippets = [{'country': code} for code in ('US', 'CA')]
    rollup = search_rollups.transform(generate_data(snippets), "daily")

    assert rollup.count() == 2

    total_share = rollup.select(F.sum("profile_share")).first()[0]
    assert total_share == 1.0
def test_filter_incontent_searches(generate_data):
    """Check that in-content searches are left out of the daily rollup.

    Three snippets carry one search row each, from different sources; the
    summed ``search_count`` of the rollup is expected to be 2.
    """
    # NOTE(review): the expected total of 2 presumably relies on search_row
    # defaulting to a count of 1 -- confirm against its definition.
    sources = (
        "in-content",   # no
        "contextmenu",  # yes
        "abouthome",    # yes
    )
    snippets = [
        {'search_counts': [search_row(source=source)]}
        for source in sources
    ]
    rollup = search_rollups.transform(generate_data(snippets), "daily")

    # in-content search should be filtered
    counted = rollup.select(F.sum("search_count")).first()[0]
    assert counted == 2
def test_single_client_multiple_search_engines(generate_data):
    """Check rollup totals for snippets that mix two search engines.

    The daily rollup is expected to contain exactly one ``hooli`` row and
    a summed ``search_count`` of 7 across all rows.
    """
    two_engine_snippet = {
        'search_counts': [
            search_row(engine="hooli", count=2),
            search_row(engine="altavista", count=4),
        ]
    }
    # NOTE(review): the expected total of 7 presumably relies on search_row
    # defaulting to a count of 1 here -- confirm against its definition.
    one_engine_snippet = {'search_counts': [search_row("altavista")]}

    frame = generate_data([two_engine_snippet, one_engine_snippet])
    rollup = search_rollups.transform(frame, "daily")

    hooli_rows = rollup.where("search_provider='hooli'")
    assert hooli_rows.count() == 1

    overall = rollup.select(F.sum("search_count")).first()[0]
    assert overall == 7
def test_null_row(generate_data):
    """Check the placeholder values the daily rollup emits for null input.

    A single snippet with its dimensions and ``search_counts`` set to
    ``None`` is rolled up; the first row whose country is not ``US`` is
    expected to carry the placeholder values asserted below.
    """
    # everything except client_id is null
    nulled_fields = (
        'country',
        'default_search_engine',
        'distribution_id',
        'locale',
        'search_counts',
    )
    snippets = [dict.fromkeys(nulled_fields)]

    rollup = search_rollups.transform(generate_data(snippets), "daily")
    row = rollup.where("country<>'US'").first()

    # (rollup column, expected placeholder), checked in this order.
    expectations = (
        ("country", "XX"),
        ("search_provider", "NO_SEARCHES"),
        ("default_provider", "NO_DEFAULT"),
        ("locale", "xx"),
        ("distribution_id", "MOZILLA"),
        ("search_count", 0),
    )
    for column, placeholder in expectations:
        assert getattr(row, column) == placeholder
def test_transform_excludes_profile_shares_for_monthly(generate_data):
    """Check that the monthly rollup has no ``profile_share`` column."""
    monthly = search_rollups.transform(generate_data(None), mode="monthly")
    column_names = monthly.columns
    assert "profile_share" not in column_names