@classmethod
def setUpClass(cls):
    """Set up the database once for the test run."""
    cls.sc = pyspark.SparkContext(master="local[*]")
    raw_pings = list(generate_pings())
    aggregates = _aggregate_metrics(cls.sc.parallelize(raw_pings), num_reducers=10)
    submit_aggregates(aggregates)
def setUp(self):
    self.sc = pyspark.SparkContext(master="local[1]")
    self.raw_pings = self.sc.parallelize(list(dataset.generate_pings()))
    (result, self.main_processed_count, self.main_ignored_count,
     self.crash_processed_count, self.crash_ignored_count) = compare_crashes(
        self.sc, self.raw_pings, COMPARABLE_DIMENSIONS, DIMENSION_NAMES)
    self.crash_rate_aggregates = result.collect()
def test_count(build_id_aggregates, submission_date_aggregates):
    pings = list(d.generate_pings())
    num_build_ids = len(d.ping_dimensions["build_id"])
    assert len(pings) / d.NUM_PINGS_PER_DIMENSIONS == len(build_id_aggregates)
    assert (len(pings) / d.NUM_PINGS_PER_DIMENSIONS / num_build_ids
            == len(submission_date_aggregates))
def test_count(self):
    pings = list(d.generate_pings())
    num_build_ids = len(d.ping_dimensions["build_id"])
    self.assertEqual(len(pings) / d.NUM_PINGS_PER_DIMENSIONS, len(self.build_id_aggs))
    self.assertEqual(
        len(pings) / d.NUM_PINGS_PER_DIMENSIONS / num_build_ids,
        len(self.submission_date_aggs))
def setup_module():
    global aggregates
    global sc

    sc = pyspark.SparkContext(master="local[*]")
    raw_pings = list(generate_pings())
    aggregates = _aggregate_metrics(sc.parallelize(raw_pings), num_reducers=10)
    submit_aggregates(aggregates)
@classmethod
def setUpClass(cls):
    """Set up the database once for the test run."""
    clear_db()
    cls.sc = pyspark.SparkContext(master="local[*]")
    raw_pings = list(generate_pings())
    aggregates = _aggregate_metrics(cls.sc.parallelize(raw_pings), num_reducers=10)
    submit_aggregates(aggregates)
def setUp(self):
    logger = logging.getLogger("py4j")
    logger.setLevel(logging.ERROR)

    self.sc = pyspark.SparkContext(master="local[*]")
    raw_pings = list(d.generate_pings())
    build_id_aggs, submission_date_aggs = _aggregate_metrics(
        self.sc.parallelize(raw_pings), num_reducers=10)
    self.build_id_aggs = build_id_aggs.collect()
    self.submission_date_aggs = submission_date_aggs.collect()
def setup_module():
    global build_id_aggregates
    global submission_date_aggregates

    logger = logging.getLogger("py4j")
    logger.setLevel(logging.ERROR)

    sc = pyspark.SparkContext(master="local[*]")
    raw_pings = list(d.generate_pings())
    build_id_aggregates, submission_date_aggregates = _aggregate_metrics(
        sc.parallelize(raw_pings), num_reducers=10)
    build_id_aggregates = build_id_aggregates.collect()
    submission_date_aggregates = submission_date_aggregates.collect()

    # Note: most tests are based on the build-id aggregates, as the aggregation
    # code is the same for both scenarios.
    sc.stop()
def bq_testing_table():
    bq_client = bigquery.Client()

    project_id = os.environ["PROJECT_ID"]
    dataset_id = f"{project_id}.pytest_mozaggregator_test"
    bq_client.delete_dataset(dataset_id, delete_contents=True, not_found_ok=True)
    bq_client.create_dataset(dataset_id)

    schema = bq_client.schema_from_json(
        os.path.join(os.path.dirname(__file__), "decoded.1.bq"))
    # use load_table instead of insert_rows to avoid eventual consistency guarantees
    main_rows = [format_payload_bytes_decoded(ping) for ping in generate_pings()]
    mobile_rows = [
        format_payload_bytes_decoded_mobile(ping) for ping in generate_mobile_pings()
    ]

    # the yielded results are (table_name, fully-qualified table id) pairs
    results = []
    # create the relevant tables
    for table_name, rows in [
        ("main_v4", main_rows),
        ("saved_session_v4", main_rows),
        ("mobile_metrics_v1", mobile_rows),
    ]:
        table_id = f"{dataset_id}.telemetry_telemetry__{table_name}"
        table = bigquery.table.Table(table_id, schema)
        table.time_partitioning = bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY, field="submission_timestamp")
        bq_client.create_table(table)
        bq_client.load_table_from_json(
            rows, table, job_config=bigquery.job.LoadJobConfig(schema=schema)).result()
        results.append((table_name, table_id))

    yield results

    bq_client.delete_dataset(dataset_id, delete_contents=True, not_found_ok=True)
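A minimal usage sketch for the fixture above, assuming it is registered with @pytest.fixture; the test name and the row-count query are illustrative, not taken from the project:

def test_tables_are_populated(bq_testing_table):
    # hypothetical consumer: verify each loaded table contains rows
    from google.cloud import bigquery

    client = bigquery.Client()
    for table_name, table_id in bq_testing_table:
        result = client.query(
            "SELECT COUNT(*) AS n FROM `{}`".format(table_id)).result()
        assert next(iter(result)).n > 0, table_name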
def aggregates(sc):
    logger = logging.getLogger("py4j")
    logger.setLevel(logging.ERROR)

    raw_pings = list(d.generate_pings())
    return _aggregate_metrics(sc.parallelize(raw_pings), num_reducers=10)
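Fixtures like aggregates(sc) above take a SparkContext argument named sc, which pytest would resolve from another fixture; a minimal conftest.py sketch under that assumption (the scope and app setup are illustrative):

import pyspark
import pytest

@pytest.fixture(scope="session")
def sc():
    # session-scoped local SparkContext shared by the aggregation fixtures
    spark_context = pyspark.SparkContext(master="local[*]")
    yield spark_context
    spark_context.stop()  # shut down the JVM-backed context after the run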
def raw_pings():
    return list(d.generate_pings())
def records(self, *args, **kwargs):
    return spark.sparkContext.parallelize(generate_pings())
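records() above references a module-level spark SparkSession; a sketch of how such a session might be created, where the app name and builder options are assumptions:

from pyspark.sql import SparkSession

# assumed module-level session backing records(); options are illustrative
spark = (SparkSession.builder
         .master("local[*]")
         .appName("mozaggregator-tests")
         .getOrCreate())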
def aggregates(sc):
    raw_pings = list(generate_pings())
    aggregates = _aggregate_metrics(sc.parallelize(raw_pings), num_reducers=10)
    submit_aggregates(aggregates)
    return aggregates
def run_job(spark_context, sql_context, submission_date_range, use_test_data=False):
    """Compute crash aggregates for the specified submission date range,
    and upload the result to S3.
    """
    start_date = datetime.strptime(submission_date_range[0], "%Y%m%d").date()
    end_date = datetime.strptime(submission_date_range[1], "%Y%m%d").date()

    schema = types.StructType([
        types.StructField("activity_date", types.StringType(), nullable=False),
        types.StructField(
            "dimensions",
            types.MapType(types.StringType(), types.StringType(), True),
            nullable=False),
        types.StructField(
            "stats",
            types.MapType(types.StringType(), types.DoubleType(), True),
            nullable=False),
    ])

    current_date = start_date
    while current_date <= end_date:
        # useful statements for testing the program
        if use_test_data:
            # use test pings; very good for debugging the uploading process
            sys.path.append(
                os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "test"))
            import dataset
            pings = spark_context.parallelize(list(dataset.generate_pings()))
        else:
            pings = retrieve_crash_data(
                spark_context, current_date.strftime("%Y%m%d"),
                COMPARABLE_DIMENSIONS, FRACTION)

        (result, main_processed_count, main_ignored_count,
         crash_processed_count, crash_ignored_count) = compare_crashes(
            spark_context, pings, COMPARABLE_DIMENSIONS, DIMENSION_NAMES)
        result = result.coalesce(1)  # put everything into a single partition
        df = sql_context.createDataFrame(result, schema)
        print("SUCCESSFULLY COMPUTED CRASH AGGREGATES FOR {}".format(current_date))

        # upload the dataframe as Parquet to S3
        s3_result_url = (
            "s3n://telemetry-parquet/crash_aggregates/v1/submission_date={}".format(
                current_date))
        df.write.parquet(s3_result_url)

        print("SUCCESSFULLY UPLOADED CRASH AGGREGATES FOR {} TO S3:".format(current_date))
        print("{} main pings processed, {} main pings ignored".format(
            main_processed_count.value, main_ignored_count.value))
        print("{} crash pings processed, {} crash pings ignored".format(
            crash_processed_count.value, crash_ignored_count.value))

        current_date += timedelta(days=1)

    print("========================================")
    print("JOB COMPLETED SUCCESSFULLY")
    print("========================================")
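A hedged entry-point sketch for driving run_job; the CLI shape, app name, and SQLContext construction are assumptions, not the project's actual launcher:

import sys
import pyspark
from pyspark.sql import SQLContext

if __name__ == "__main__":
    # e.g. spark-submit run_crash_aggregates.py 20160301 20160307
    spark_context = pyspark.SparkContext(appName="crash-aggregates")
    sql_context = SQLContext(spark_context)
    run_job(spark_context, sql_context, (sys.argv[1], sys.argv[2]))
    spark_context.stop()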
def test_count():
    pings = list(d.generate_pings())
    num_build_ids = len(d.ping_dimensions["build_id"])
    assert len(pings) / d.NUM_PINGS_PER_DIMENSIONS == len(build_id_aggregates)
    assert (len(pings) / d.NUM_PINGS_PER_DIMENSIONS / num_build_ids
            == len(submission_date_aggregates))