Exemple #1
0
def test_clean_input_country(generate_data):
    """Expect exactly one "Other" country row after cleaning two inputs.

    The input rows carry the countries "INVALID" and "US".  Presumably the
    unrecognised value is the one that gets relabelled -- confirm against
    ``topline.clean_input``.
    """
    rows = [{"country": code} for code in ("INVALID", "US")]
    cleaned = topline.clean_input(generate_data(rows), start_ds, end_ds)

    is_other = F.col("country") == "Other"
    other_count = cleaned.where(is_other).count()
    assert other_count == 1
def test_clean_input_profile_creation(generate_data):
    """Summed ``profile_creation_date`` must equal ``topline.seconds_per_day``.

    The input rows carry ``profile_creation_date`` values of 1 and -1.
    """
    rows = [{"profile_creation_date": value} for value in (1, -1)]
    cleaned = topline.clean_input(generate_data(rows), start_ds, end_ds)

    # groupBy() without columns aggregates over the whole frame, so the
    # result is a single row whose first field is the column total.
    summed_row = cleaned.select("profile_creation_date").groupBy().sum().first()
    assert summed_row[0] == topline.seconds_per_day
Exemple #3
0
def test_clean_input_date_range(generate_data):
    """Four dated rows go in; exactly two rows are expected back.

    Presumably only the dates inside ``start_ds``..``end_ds`` survive --
    confirm against the module-level values of those bounds.
    """
    date_strings = ("20170601", "20170602", "20170501", "20170608")
    rows = [generate_dates(date_string) for date_string in date_strings]

    cleaned = topline.clean_input(generate_data(rows), start_ds, end_ds)
    row_count = cleaned.count()
    assert row_count == 2
def test_deduplicate_documents(dataframe_factory):
    """Three rows with two distinct ``document_id`` values yield two rows."""
    document_ids = ("1", "2", "2")
    rows = [{"document_id": doc_id} for doc_id in document_ids]
    frame = dataframe_factory.create_dataframe(
        rows,
        default_sample,
        schema=schema,
    )

    cleaned = topline.clean_input(frame, start_ds, end_ds)
    remaining = cleaned.count()
    assert remaining == 2
Exemple #5
0
def test_clean_input_os(generate_data):
    """Eight OS strings in; four "Other" rows and four distinct labels out."""
    # First group of four -- presumably the names clean_input recognises.
    first_group = ["Windows_NT", "WINNT", "Darwin", "Linux"]
    # Second group of four -- presumably the ones relabelled "Other"; confirm.
    second_group = ["xWindows", "Mac", "SUN", "BaDsTrInG"]

    rows = [{"os": os_name} for os_name in first_group + second_group]
    cleaned = topline.clean_input(generate_data(rows), start_ds, end_ds)

    other_rows = cleaned.where(F.col("os") == "Other")
    assert other_rows.count() == 4

    distinct_labels = cleaned.select("os").distinct()
    assert distinct_labels.count() == 4
Exemple #6
0
def test_clean_input_hours(generate_data):
    """The cleaned ``hours`` column must sum to exactly 1.0.

    The input rows carry ``subsession_length`` values of
    ``seconds_per_hour``, ``181 * seconds_per_day`` and
    ``-1 * seconds_per_day``.
    """
    lengths = (
        topline.seconds_per_hour,
        181 * topline.seconds_per_day,
        -1 * topline.seconds_per_day,
    )
    rows = [{"subsession_length": length} for length in lengths]
    cleaned = topline.clean_input(generate_data(rows), start_ds, end_ds)

    # groupBy() without columns aggregates over the whole frame, so the
    # result is a single row whose first field is the column total.
    summed_row = cleaned.select("hours").groupBy().sum().first()
    assert summed_row[0] == 1.0