Example 1
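These tests come from a larger test module; the names they rely on (os, F, CliRunner, churn, week_start_ds) are defined in that module's preamble. A rough reconstruction might look like the following, where the churn import path and the week_start_ds value are assumptions rather than details taken from the snippets themselves:

import os

import pyspark.sql.functions as F
from click.testing import CliRunner

from mozetl.churn import churn  # assumed import path for the churn job under test

week_start_ds = '20170101'  # hypothetical start-of-week date; the real tests define their own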
def test_cli(spark, multi_profile_df, tmpdir, monkeypatch):
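    """Run the churn job end to end against a local main_summary and
    verify the partitioning and contents of the output dataset."""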

    # use the file:// prefix so the job reads and writes the local filesystem instead of S3
    def mock_format_spark_path(bucket, prefix):
        return 'file://{}/{}'.format(bucket, prefix)

    monkeypatch.setattr(churn, 'format_spark_path', mock_format_spark_path)

    # write `main_summary` to disk
    bucket = str(tmpdir)
    input_prefix = 'main_summary/v3'
    output_prefix = 'churn/v2'

    path = churn.format_spark_path(bucket, input_prefix)

    multi_profile_df.write.partitionBy('submission_date_s3').parquet(path)

    runner = CliRunner()
    args = [
        week_start_ds,
        bucket,
        '--input-bucket',
        bucket,
        '--input-prefix',
        input_prefix,
        '--no-lag',  # week_start_ds already accounts for the lag time
    ]
    result = runner.invoke(churn.main, args)
    assert result.exit_code == 0

    # check that the files are correctly partitioned
    folder = os.path.join(bucket, output_prefix)
    assert any(item.startswith('week_start') for item in os.listdir(folder))

    # check that there is only a single partition
    folder = os.path.join(folder, 'week_start={}'.format(week_start_ds))
    assert len([
        item for item in os.listdir(folder) if item.endswith('.parquet')
    ]) == 1

    # check that the dataset conforms to expected output
    path = churn.format_spark_path(bucket, output_prefix)
    df = spark.read.parquet(path)
    rows = (
        df.groupBy(df.channel)
        .agg(
            F.sum('n_profiles').alias('n_profiles'),
            F.sum('usage_hours').alias('usage_hours'))
        .where(df.channel == 'release-cck-mozilla42')
        .collect())
    assert rows[0].n_profiles == 2
    assert rows[0].usage_hours == 4
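
Both tests monkeypatch churn.format_spark_path so that Spark reads and writes paths under tmpdir rather than S3. The production helper presumably just joins a bucket and key prefix into an s3:// URI, roughly along these lines (a minimal sketch under that assumption, not the module's actual code):

def format_spark_path(bucket, prefix):
    # assumed behaviour: point Spark at S3 instead of the local
    # filesystem used by the mocked version in the tests above
    return 's3://{}/{}'.format(bucket, prefix)
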
Example 2
def test_cli_fails_on_missing_input(spark, multi_profile_df, tmpdir,
                                    monkeypatch):
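    """Verify that the CLI exits with an error when a column required by
    the job is missing from the input dataset."""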

    # use the file:// prefix so the job reads and writes the local filesystem instead of S3
    def mock_format_spark_path(bucket, prefix):
        return 'file://{}/{}'.format(bucket, prefix)

    monkeypatch.setattr(churn, 'format_spark_path', mock_format_spark_path)

    # write `main_summary` to disk
    bucket = str(tmpdir)
    input_prefix = 'main_summary/v3'

    path = churn.format_spark_path(bucket, input_prefix)

    # drop a column the job depends on, so the run is expected to fail
    (multi_profile_df
        .drop('country')
        .write.partitionBy('submission_date_s3')
        .parquet(path))

    runner = CliRunner()
    args = [
        week_start_ds,
        bucket,
        '--input-bucket',
        bucket,
        '--input-prefix',
        input_prefix,
        '--no-lag',  # week_start_ds already accounts for the lag time
    ]
    result = runner.invoke(churn.main, args)
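    # with this version of click, an unhandled exception inside the
    # command surfaces in CliRunner as an exit code of -1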
    assert result.exit_code == -1