Code example #1
    @classmethod
    def setUpClass(cls):
        """Set up the database once for the test run."""
        cls.sc = pyspark.SparkContext(master="local[*]")
        raw_pings = list(generate_pings())
        aggregates = _aggregate_metrics(cls.sc.parallelize(raw_pings),
                                        num_reducers=10)
        submit_aggregates(aggregates)
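A class-level setup like this is usually paired with a teardown that releases the SparkContext; a minimal sketch of what that could look like (an assumption, not part of the original example):

    @classmethod
    def tearDownClass(cls):
        # Stop the SparkContext created in setUpClass so that later test
        # classes can start their own local context.
        cls.sc.stop()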
Code example #2
    def setUp(self):
        self.sc = pyspark.SparkContext(master="local[1]")
        self.raw_pings = self.sc.parallelize(list(dataset.generate_pings()))

        result, self.main_processed_count, self.main_ignored_count, self.crash_processed_count, self.crash_ignored_count = compare_crashes(
            self.sc, self.raw_pings, COMPARABLE_DIMENSIONS, DIMENSION_NAMES)
        self.crash_rate_aggregates = result.collect()
Code example #3
def test_count(build_id_aggregates, submission_date_aggregates):
    pings = list(d.generate_pings())
    num_build_ids = len(d.ping_dimensions["build_id"])
    assert (len(pings) /
            d.NUM_PINGS_PER_DIMENSIONS == len(build_id_aggregates))
    assert (len(pings) / d.NUM_PINGS_PER_DIMENSIONS /
            num_build_ids == len(submission_date_aggregates))
Code example #4
    def test_count(self):
        pings = list(d.generate_pings())
        num_build_ids = len(d.ping_dimensions["build_id"])
        self.assertEqual(len(pings) / d.NUM_PINGS_PER_DIMENSIONS,
                         len(self.build_id_aggs))
        self.assertEqual(
            len(pings) / d.NUM_PINGS_PER_DIMENSIONS / num_build_ids,
            len(self.submission_date_aggs))
Code example #5
def setup_module():
    global aggregates
    global sc

    sc = pyspark.SparkContext(master="local[*]")
    raw_pings = list(generate_pings())
    aggregates = _aggregate_metrics(sc.parallelize(raw_pings), num_reducers=10)
    submit_aggregates(aggregates)
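setup_module leaves the module-level SparkContext running; if the surrounding test module needs to release it, a matching teardown could look like this sketch (assumed, not shown in the original):

def teardown_module():
    # Stop the SparkContext created in setup_module once the module's
    # tests have finished.
    sc.stop()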
Code example #6
    def test_count(self):
        pings = list(d.generate_pings())
        num_build_ids = len(d.ping_dimensions["build_id"])
        self.assertEqual(len(pings) / d.NUM_PINGS_PER_DIMENSIONS,
                         len(self.build_id_aggs))
        self.assertEqual(
            len(pings) / d.NUM_PINGS_PER_DIMENSIONS / num_build_ids,
            len(self.submission_date_aggs))
Code example #7
def setup_module():
    global aggregates
    global sc

    sc = pyspark.SparkContext(master="local[*]")
    raw_pings = list(generate_pings())
    aggregates = _aggregate_metrics(sc.parallelize(raw_pings), num_reducers=10)
    submit_aggregates(aggregates)
Code example #8
    @classmethod
    def setUpClass(cls):
        """Set up the database once for the test run."""
        clear_db()

        cls.sc = pyspark.SparkContext(master="local[*]")
        raw_pings = list(generate_pings())
        aggregates = _aggregate_metrics(cls.sc.parallelize(raw_pings),
                                        num_reducers=10)
        submit_aggregates(aggregates)
Code example #9
    def setUp(self):
        self.sc = pyspark.SparkContext(master="local[1]")
        self.raw_pings = self.sc.parallelize(list(dataset.generate_pings()))

        result, self.main_processed_count, self.main_ignored_count, self.crash_processed_count, self.crash_ignored_count = compare_crashes(
            self.sc,
            self.raw_pings,
            COMPARABLE_DIMENSIONS, DIMENSION_NAMES
        )
        self.crash_rate_aggregates = result.collect()
Code example #10
    def setUp(self):
        logger = logging.getLogger("py4j")
        logger.setLevel(logging.ERROR)

        self.sc = pyspark.SparkContext(master="local[*]")
        raw_pings = list(d.generate_pings())
        build_id_aggs, submission_date_aggs = (
            _aggregate_metrics(self.sc.parallelize(raw_pings), num_reducers=10))
        self.build_id_aggs = build_id_aggs.collect()
        self.submission_date_aggs = submission_date_aggs.collect()
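Since setUp creates a fresh SparkContext per test, a matching tearDown would normally stop it; a minimal sketch under that assumption:

    def tearDown(self):
        # Release the per-test SparkContext created in setUp.
        self.sc.stop()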
Code example #11
    def setUp(self):
        logger = logging.getLogger("py4j")
        logger.setLevel(logging.ERROR)

        self.sc = pyspark.SparkContext(master="local[*]")
        raw_pings = list(d.generate_pings())
        build_id_aggs, submission_date_aggs = (
            _aggregate_metrics(self.sc.parallelize(raw_pings), num_reducers=10))
        self.build_id_aggs = build_id_aggs.collect()
        self.submission_date_aggs = submission_date_aggs.collect()
Code example #12
def setup_module():
    global build_id_aggregates
    global submission_date_aggregates

    logger = logging.getLogger("py4j")
    logger.setLevel(logging.ERROR)

    sc = pyspark.SparkContext(master="local[*]")
    raw_pings = list(d.generate_pings())
    build_id_aggregates, submission_date_aggregates = _aggregate_metrics(
        sc.parallelize(raw_pings), num_reducers=10)
    build_id_aggregates = build_id_aggregates.collect()
    submission_date_aggregates = submission_date_aggregates.collect()

    # Note: most tests are based on the build-id aggregates as the aggregation
    # code is the same for both scenarios.
    sc.stop()
Code example #13
def setup_module():
    global build_id_aggregates
    global submission_date_aggregates

    logger = logging.getLogger("py4j")
    logger.setLevel(logging.ERROR)

    sc = pyspark.SparkContext(master="local[*]")
    raw_pings = list(d.generate_pings())
    build_id_aggregates, submission_date_aggregates = _aggregate_metrics(
        sc.parallelize(raw_pings), num_reducers=10)
    build_id_aggregates = build_id_aggregates.collect()
    submission_date_aggregates = submission_date_aggregates.collect()

    # Note: most tests are based on the build-id aggregates as the aggregation
    # code is the same for both scenarios.
    sc.stop()
Code example #14
def bq_testing_table():
    bq_client = bigquery.Client()

    project_id = os.environ["PROJECT_ID"]
    dataset_id = f"{project_id}.pytest_mozaggregator_test"
    bq_client.delete_dataset(dataset_id,
                             delete_contents=True,
                             not_found_ok=True)
    bq_client.create_dataset(dataset_id)

    schema = bq_client.schema_from_json(
        os.path.join(os.path.dirname(__file__), "decoded.1.bq"))
    # use load_table instead of insert_rows to avoid eventual consistency guarantees
    df = [format_payload_bytes_decoded(ping) for ping in generate_pings()]
    mobile_df = [
        format_payload_bytes_decoded_mobile(ping)
        for ping in generate_mobile_pings()
    ]

    # result set to be yielded are (table_name, fully-qualified path) pairs
    results = []
    # create the relevant tables
    for table_name, df in [
        ("main_v4", df),
        ("saved_session_v4", df),
        ("mobile_metrics_v1", mobile_df),
    ]:
        table_id = f"{dataset_id}.telemetry_telemetry__{table_name}"
        table = bigquery.table.Table(table_id, schema)
        table.time_partitioning = bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY,
            field="submission_timestamp")
        bq_client.create_table(table)
        bq_client.load_table_from_json(
            df, table,
            job_config=bigquery.job.LoadJobConfig(schema=schema)).result()

        results.append((table_name, table_id))

    yield results

    bq_client.delete_dataset(dataset_id,
                             delete_contents=True,
                             not_found_ok=True)
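Because the function yields its results and then cleans up, it reads like the body of a yield-style pytest fixture. Assuming it is registered with @pytest.fixture (for example in a conftest.py), a consuming test could look like the following sketch; the test name and assertion are illustrative, not part of the original:

def test_testing_tables_created(bq_testing_table):
    # The fixture yields (table_name, fully-qualified table_id) pairs
    # for the three tables created above.
    names = {name for name, _ in bq_testing_table}
    assert names == {"main_v4", "saved_session_v4", "mobile_metrics_v1"}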
Code example #15
def aggregates(sc):
    logger = logging.getLogger("py4j")
    logger.setLevel(logging.ERROR)

    raw_pings = list(d.generate_pings())
    return _aggregate_metrics(sc.parallelize(raw_pings), num_reducers=10)
Code example #16
def raw_pings():
    return list(d.generate_pings())
Code example #17
    def records(self, *args, **kwargs):
        return spark.sparkContext.parallelize(generate_pings())
Code example #18
def aggregates(sc):
    raw_pings = list(generate_pings())
    aggregates = _aggregate_metrics(sc.parallelize(raw_pings), num_reducers=10)
    submit_aggregates(aggregates)
    return aggregates
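The aggregates fixtures above receive sc as a fixture argument, so a SparkContext fixture is assumed to be defined elsewhere in the suite; a minimal sketch of how such a fixture could look:

import pytest
import pyspark


@pytest.fixture(scope="session")
def sc():
    # One local SparkContext shared by the whole test session.
    context = pyspark.SparkContext(master="local[*]")
    yield context
    context.stop()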
Code example #19
def run_job(spark_context, sql_context, submission_date_range, use_test_data=False):
    """
    Compute crash aggregates for the specified submission date range,
    and upload the result to S3.
    """
    start_date = datetime.strptime(submission_date_range[0], "%Y%m%d").date()
    end_date = datetime.strptime(submission_date_range[1], "%Y%m%d").date()

    schema = types.StructType([
        types.StructField(
            "activity_date",
            types.StringType(),
            nullable=False
        ),
        types.StructField(
            "dimensions",
            types.MapType(types.StringType(), types.StringType(), True),
            nullable=False
        ),
        types.StructField(
            "stats",
            types.MapType(types.StringType(), types.DoubleType(), True),
            nullable=False
        ),
    ])

    current_date = start_date
    while current_date <= end_date:
        # useful statements for testing the program
        if use_test_data:
            # use test pings; very good for debugging the uploading process
            sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "test"))
            import dataset
            pings = spark_context.parallelize(list(dataset.generate_pings()))
        else:
            pings = retrieve_crash_data(
                spark_context,
                current_date.strftime("%Y%m%d"),
                COMPARABLE_DIMENSIONS, FRACTION
            )

        result, main_processed_count, main_ignored_count, crash_processed_count, crash_ignored_count = compare_crashes(
            spark_context,
            pings,
            COMPARABLE_DIMENSIONS, DIMENSION_NAMES
        )
        result = result.coalesce(1)  # put everything into a single partition
        df = sql_context.createDataFrame(result, schema)
        print("SUCCESSFULLY COMPUTED CRASH AGGREGATES FOR {}".format(current_date))

        # upload the dataframe as Parquet to S3
        s3_result_url = (
            "s3n://telemetry-parquet/crash_aggregates/v1/submission_date={}".format(
                current_date
            )
        )
        df.write.parquet(s3_result_url)

        print("SUCCESSFULLY UPLOADED CRASH AGGREGATES FOR {} TO S3:".format(current_date))
        print("{} main pings processed, {} main pings ignored".format(main_processed_count.value, main_ignored_count.value))
        print("{} crash pings processed, {} crash pings ignored".format(crash_processed_count.value, crash_ignored_count.value))

        current_date += timedelta(days=1)

    print("========================================")
    print("JOB COMPLETED SUCCESSFULLY")
    print("========================================")
Code example #20
def run_job(spark_context,
            sql_context,
            submission_date_range,
            use_test_data=False):
    """
    Compute crash aggregates for the specified submission date range,
    and upload the result to S3.
    """
    start_date = datetime.strptime(submission_date_range[0], "%Y%m%d").date()
    end_date = datetime.strptime(submission_date_range[1], "%Y%m%d").date()

    schema = types.StructType([
        types.StructField("activity_date", types.StringType(), nullable=False),
        types.StructField("dimensions",
                          types.MapType(types.StringType(), types.StringType(),
                                        True),
                          nullable=False),
        types.StructField("stats",
                          types.MapType(types.StringType(), types.DoubleType(),
                                        True),
                          nullable=False),
    ])

    current_date = start_date
    while current_date <= end_date:
        # useful statements for testing the program
        if use_test_data:
            # use test pings; very good for debugging the uploading process
            sys.path.append(
                os.path.join(os.path.dirname(os.path.abspath(__file__)), "..",
                             "test"))
            import dataset
            pings = spark_context.parallelize(list(dataset.generate_pings()))
        else:
            pings = retrieve_crash_data(spark_context,
                                        current_date.strftime("%Y%m%d"),
                                        COMPARABLE_DIMENSIONS, FRACTION)

        result, main_processed_count, main_ignored_count, crash_processed_count, crash_ignored_count = compare_crashes(
            spark_context, pings, COMPARABLE_DIMENSIONS, DIMENSION_NAMES)
        result = result.coalesce(1)  # put everything into a single partition
        df = sql_context.createDataFrame(result, schema)
        print("SUCCESSFULLY COMPUTED CRASH AGGREGATES FOR {}".format(
            current_date))

        # upload the dataframe as Parquet to S3
        s3_result_url = (
            "s3n://telemetry-parquet/crash_aggregates/v1/submission_date={}".
            format(current_date))
        df.write.parquet(s3_result_url)

        print("SUCCESSFULLY UPLOADED CRASH AGGREGATES FOR {} TO S3:".format(
            current_date))
        print("{} main pings processed, {} main pings ignored".format(
            main_processed_count.value, main_ignored_count.value))
        print("{} crash pings processed, {} crash pings ignored".format(
            crash_processed_count.value, crash_ignored_count.value))

        current_date += timedelta(days=1)

    print("========================================")
    print("JOB COMPLETED SUCCESSFULLY")
    print("========================================")
Code example #21
def test_count():
    pings = list(d.generate_pings())
    num_build_ids = len(d.ping_dimensions["build_id"])
    assert len(pings) / d.NUM_PINGS_PER_DIMENSIONS == len(build_id_aggregates)
    assert (len(pings) / d.NUM_PINGS_PER_DIMENSIONS / num_build_ids ==
            len(submission_date_aggregates))