Example #1
def test_mobile_aggregation_cli_avro(tmp_path, spark, aggregates_rdd,
                                     avro_testing_files):
    output = str(tmp_path / "output")

    result = CliRunner().invoke(
        run_mobile,
        [
            "--date",
            d.SUBMISSION_DATE_1.strftime('%Y%m%d'),
            "--output",
            output,
            "--num-partitions",
            10,
            "--source",
            "avro",
            "--avro-prefix",
            avro_testing_files,
        ],
        catch_exceptions=False,
    )

    assert result.exit_code == 0

    expect = get_aggregates_dataframe(spark, aggregates_rdd)
    actual = spark.read.parquet(output)

    assert expect.count() > 0 and actual.count() > 0
    assert expect.count() == actual.count()
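These count assertions only check row totals. A stricter comparison can diff the two DataFrames directly with standard PySpark operations (a sketch; it passes only if the rows themselves match, not just their number):

# Empty differences in both directions mean the frames contain the same
# multiset of rows (exceptAll preserves duplicate counts).
assert expect.exceptAll(actual).count() == 0
assert actual.exceptAll(expect).count() == 0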
Example #2
def run_mobile(date, output, num_partitions, source, project_id, dataset_id,
               avro_prefix):
    spark = SparkSession.builder.getOrCreate()
    # Overwrite only the partitions being written; other submission_date
    # partitions already present in the output directory are left intact.
    spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

    print(f"Running job for {date}")
    agg_metrics = mobile.aggregate_metrics(
        spark.sparkContext,
        date,
        num_partitions=num_partitions,
        source=source,
        project_id=project_id,
        dataset_id=dataset_id,
        avro_prefix=avro_prefix,
    )
    aggs = mobile.get_aggregates_dataframe(spark, agg_metrics)
    mobile.write_parquet(aggs, output)
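The tests in the other examples drive run_mobile through Click's CliRunner, so the function must be wrapped as a Click command. A minimal sketch of option declarations consistent with the flags used in the tests (the option names come from the tests themselves; the types, defaults, and the "moztelemetry" choice are assumptions, not the project's actual definitions):

import click


@click.command()
@click.option("--date", required=True, help="Submission date, YYYYMMDD.")
@click.option("--output", required=True, help="Destination for the Parquet output.")
@click.option("--num-partitions", type=int, default=10000)  # default is assumed
@click.option("--source", type=click.Choice(["moztelemetry", "bigquery", "avro"]),
              default="moztelemetry")  # choice list and default partly assumed
@click.option("--project-id", default=None, help="BigQuery project (source=bigquery).")
@click.option("--dataset-id", default=None, help="BigQuery dataset (source=bigquery).")
@click.option("--avro-prefix", default=None, help="Input path prefix (source=avro).")
def run_mobile(date, output, num_partitions, source, project_id, dataset_id,
               avro_prefix):
    ...  # body as shown above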
Example #3
def test_mobile_aggregation_cli_bigquery(tmp_path, spark, aggregates_rdd,
                                         bq_testing_table):
    output = str(tmp_path / "output")

    result = CliRunner().invoke(
        run_mobile,
        [
            "--date",
            d.SUBMISSION_DATE_1.strftime('%Y%m%d'),
            "--output",
            output,
            "--num-partitions",
            "10",
            "--source",
            "bigquery",
            "--project-id",
            os.environ["PROJECT_ID"],
            "--dataset-id",
            "pytest_mozaggregator_test",
        ],
        catch_exceptions=False,
    )

    assert result.exit_code == 0
    # The job writes one directory per submission date, so the partition
    # for the requested day must exist in the output.
    partition = f"submission_date={d.SUBMISSION_DATE_1.strftime('%Y%m%d')}"
    assert partition in os.listdir(output)

    expect = get_aggregates_dataframe(spark, aggregates_rdd)
    actual = spark.read.parquet(output)

    assert expect.count() > 0 and actual.count() > 0
    assert expect.count() == actual.count()
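The partition-directory assertion works because write_parquet lays the output out in Hive-style submission_date=YYYYMMDD directories (the dynamic partitionOverwriteMode set in Example #2 targets the same layout). Spark surfaces that directory name as a regular column on read, so a single day can be selected with partition pruning, roughly like this:

# Only files under submission_date=<day> are scanned for this read.
day = d.SUBMISSION_DATE_1.strftime("%Y%m%d")
single_day = spark.read.parquet(output).where(f"submission_date = '{day}'")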
Example #4
def test_mobile_aggregation_cli(tmp_path, monkeypatch, spark, aggregates_rdd):
    output = str(tmp_path / "output")

    # Stub standing in for mozaggregator.mobile.Dataset so that records()
    # yields locally generated pings instead of hitting a real data store.
    class Dataset:
        @staticmethod
        def from_source(*args, **kwargs):
            return Dataset()

        def where(self, *args, **kwargs):
            return self

        def records(self, *args, **kwargs):
            return spark.sparkContext.parallelize(d.generate_mobile_pings())

    monkeypatch.setattr("mozaggregator.mobile.Dataset", Dataset)

    result = CliRunner().invoke(
        run_mobile,
        [
            "--date",
            # this date is ignored because we are monkeypatching the dataset
            "20190901",
            "--output",
            output,
            "--num-partitions",
            10,
        ],
        catch_exceptions=False,
    )

    assert result.exit_code == 0

    expect = get_aggregates_dataframe(spark, aggregates_rdd)
    actual = spark.read.parquet(output)

    assert expect.count() > 0 and actual.count() > 0
    assert expect.count() == actual.count()
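All four tests rely on spark and aggregates_rdd fixtures defined elsewhere in the suite. For orientation, a session-scoped Spark fixture of the usual shape (a sketch only; the project's actual conftest.py may configure more, and aggregates_rdd is not reproduced here):

import pytest
from pyspark.sql import SparkSession


@pytest.fixture(scope="session")
def spark():
    # One local-mode session shared across the whole test run.
    session = (
        SparkSession.builder
        .master("local[*]")
        .appName("mozaggregator-tests")
        .getOrCreate()
    )
    yield session
    session.stop()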