def test_multiple_dates_fails(generate_data, tmpdir):
    """Check that a non-batch backfill raises when the input has two dates.

    The input frame is built from two snippets carrying different ``date``
    values, and ``batch`` is not passed, so the call is expected to raise
    ``RuntimeError``.
    """
    output_dir = str(tmpdir)
    dated_rows = [{"date": day} for day in ("2016-01-01", "2016-01-08")]
    frame = generate_data(dated_rows)

    with pytest.raises(RuntimeError):
        backfill.backfill_topline_summary(frame, output_dir, overwrite=True)
Example #2
0
def test_multiple_dates_fails(spark, tmpdir):
    """Check that a non-batch backfill raises when the input has two dates.

    The frame is built with ``snippets_to_df`` from two snippets that differ
    only in ``date``; since ``batch`` is not passed, ``RuntimeError`` is the
    expected outcome.
    """
    first_row = {'date': '2016-01-01'}
    second_row = {'date': '2016-01-08'}
    frame = snippets_to_df(
        spark, [first_row, second_row], default_sample, historical_schema
    )
    output_dir = str(tmpdir)

    with pytest.raises(RuntimeError):
        backfill.backfill_topline_summary(frame, output_dir, overwrite=True)
def test_multiple_dates_batch(generate_data, tmpdir):
    """Check that a batch backfill over two dates writes two output entries.

    After the backfill runs with ``batch=True``, the output directory must
    contain exactly two entries whose names start with ``report_start``.
    """
    output_dir = str(tmpdir)
    dated_rows = [{"date": day} for day in ("2016-01-01", "2016-01-08")]
    frame = generate_data(dated_rows)

    backfill.backfill_topline_summary(
        frame, output_dir, batch=True, overwrite=True
    )

    # Count only the entries named after the report start; anything else
    # in the directory is ignored.
    partition_count = sum(
        1 for entry in os.listdir(output_dir)
        if entry.startswith("report_start")
    )
    assert partition_count == 2
Example #4
0
def test_multiple_dates_batch(spark, tmpdir):
    """Check that a batch backfill over two dates writes two output entries.

    The frame is built with ``snippets_to_df`` from two snippets that differ
    only in ``date``. With ``batch=True`` the output directory must end up
    with exactly two entries whose names start with ``report_start``.
    """
    dated_rows = [{'date': '2016-01-01'}, {'date': '2016-01-08'}]
    frame = snippets_to_df(
        spark, dated_rows, default_sample, historical_schema
    )
    output_dir = str(tmpdir)

    backfill.backfill_topline_summary(
        frame, output_dir, batch=True, overwrite=True
    )

    # Collect only the entries named after the report start; other
    # directory contents are not counted.
    report_entries = []
    for entry in os.listdir(output_dir):
        if entry.startswith('report_start'):
            report_entries.append(entry)
    assert len(report_entries) == 2
def test_excludes_rows_containing_all(spark, generate_data, tmpdir):
    """Check that the backfill output has no rows whose value is 'all'.

    One input snippet sets each of ``geo``, ``os`` and ``channel`` to
    ``'all'``; after the backfill, the parquet output read back from disk
    must contain no row with ``'all'`` in any of those columns.
    """
    checked_columns = ('geo', 'os', 'channel')

    rows = [{column: 'all'} for column in checked_columns]
    rows.append({})  # There must be a single data point
    frame = generate_data(rows)

    output_dir = str(tmpdir.join('test/mode=weekly/'))
    backfill.backfill_topline_summary(frame, output_dir, overwrite=True)

    written = spark.read.parquet(output_dir)
    for column in checked_columns:
        condition = "{} = 'all'".format(column)
        assert written.where(condition).count() == 0