def test_clean_input_country(generate_data):
    """Exactly one of the two input rows should end up with country "Other".

    The input holds one row with country "INVALID" and one with "US";
    presumably the "INVALID" row is the one remapped by ``clean_input``.
    """
    rows = [{"country": "INVALID"}, {"country": "US"}]
    cleaned = topline.clean_input(generate_data(rows), start_ds, end_ds)

    is_other = F.col("country") == "Other"
    assert cleaned.where(is_other).count() == 1
def test_clean_input_profile_creation(generate_data):
    """Summed ``profile_creation_date`` after cleaning equals one day of seconds.

    Inputs are the values 1 and -1. The expected total suggests the value 1
    is scaled to seconds and the negative value contributes nothing to the
    sum -- NOTE(review): confirm against ``topline.clean_input``.
    """
    rows = [{"profile_creation_date": value} for value in (1, -1)]
    cleaned = topline.clean_input(generate_data(rows), start_ds, end_ds)

    # A global (ungrouped) aggregate yields a single row with a single column.
    summed_row = cleaned.select("profile_creation_date").groupBy().sum().first()
    assert summed_row[0] == topline.seconds_per_day
def test_clean_input_date_range(generate_data):
    """Only two of the four dated rows should survive ``clean_input``.

    Presumably the survivors are those that fall inside the
    ``start_ds``/``end_ds`` window, which is defined outside this test.
    """
    date_strings = ("20170601", "20170602", "20170501", "20170608")
    rows = [generate_dates(ds) for ds in date_strings]

    cleaned = topline.clean_input(generate_data(rows), start_ds, end_ds)
    assert cleaned.count() == 2
def test_deduplicate_documents(dataframe_factory):
    """Three rows carrying two distinct ``document_id`` values yield two rows."""
    rows = [{"document_id": doc_id} for doc_id in ("1", "2", "2")]
    frame = dataframe_factory.create_dataframe(
        rows, default_sample, schema=schema
    )

    cleaned = topline.clean_input(frame, start_ds, end_ds)
    assert cleaned.count() == 2
def test_clean_input_os(generate_data):
    """Four of the eight OS strings become "Other"; four distinct values remain."""
    os_names = [
        # first group of 4
        "Windows_NT",
        "WINNT",
        "Darwin",
        "Linux",
        # second group of 4
        "xWindows",
        "Mac",
        "SUN",
        "BaDsTrInG",
    ]
    rows = [{"os": name} for name in os_names]
    cleaned = topline.clean_input(generate_data(rows), start_ds, end_ds)

    other_count = cleaned.where(F.col("os") == "Other").count()
    assert other_count == 4

    distinct_count = cleaned.select("os").distinct().count()
    assert distinct_count == 4
def test_clean_input_hours(generate_data):
    """Summed ``hours`` after cleaning is exactly 1.0.

    Inputs are one hour, 181 days and minus one day of subsession length.
    The expected total suggests only the one-hour row contributes --
    NOTE(review): confirm the bounds applied by ``topline.clean_input``.
    """
    lengths = [
        topline.seconds_per_hour,
        181 * topline.seconds_per_day,
        -1 * topline.seconds_per_day,
    ]
    rows = [{"subsession_length": length} for length in lengths]
    cleaned = topline.clean_input(generate_data(rows), start_ds, end_ds)

    # A global (ungrouped) aggregate yields a single row with a single column.
    summed_row = cleaned.select("hours").groupBy().sum().first()
    assert summed_row[0] == 1.0