@classmethod
def setUpClass(cls):
    """Set up the database once for the test run."""
    cls.sc = pyspark.SparkContext(master="local[*]")
    raw_pings = list(generate_pings())
    aggregates = _aggregate_metrics(cls.sc.parallelize(raw_pings), num_reducers=10)
    submit_aggregates(aggregates)
def setUp(self):
    self.sc = pyspark.SparkContext(master="local[1]")
    self.raw_pings = self.sc.parallelize(list(dataset.generate_pings()))
    (result, self.main_processed_count, self.main_ignored_count,
     self.crash_processed_count, self.crash_ignored_count) = compare_crashes(
        self.sc, self.raw_pings, COMPARABLE_DIMENSIONS, DIMENSION_NAMES)
    self.crash_rate_aggregates = result.collect()
def test_count(build_id_aggregates, submission_date_aggregates):
    pings = list(d.generate_pings())
    num_build_ids = len(d.ping_dimensions["build_id"])
    assert len(pings) / d.NUM_PINGS_PER_DIMENSIONS == len(build_id_aggregates)
    assert (len(pings) / d.NUM_PINGS_PER_DIMENSIONS / num_build_ids
            == len(submission_date_aggregates))
def test_count(self):
    pings = list(d.generate_pings())
    num_build_ids = len(d.ping_dimensions["build_id"])
    self.assertEqual(len(pings) / d.NUM_PINGS_PER_DIMENSIONS, len(self.build_id_aggs))
    self.assertEqual(
        len(pings) / d.NUM_PINGS_PER_DIMENSIONS / num_build_ids,
        len(self.submission_date_aggs))
def setup_module():
    global aggregates
    global sc

    sc = pyspark.SparkContext(master="local[*]")
    raw_pings = list(generate_pings())
    aggregates = _aggregate_metrics(sc.parallelize(raw_pings), num_reducers=10)
    submit_aggregates(aggregates)
@classmethod
def setUpClass(cls):
    """Set up the database once for the test run."""
    clear_db()
    cls.sc = pyspark.SparkContext(master="local[*]")
    raw_pings = list(generate_pings())
    aggregates = _aggregate_metrics(cls.sc.parallelize(raw_pings), num_reducers=10)
    submit_aggregates(aggregates)
def setUp(self):
    logger = logging.getLogger("py4j")
    logger.setLevel(logging.ERROR)

    self.sc = pyspark.SparkContext(master="local[*]")
    raw_pings = list(d.generate_pings())
    build_id_aggs, submission_date_aggs = _aggregate_metrics(
        self.sc.parallelize(raw_pings), num_reducers=10)
    self.build_id_aggs = build_id_aggs.collect()
    self.submission_date_aggs = submission_date_aggs.collect()
def setup_module():
    global build_id_aggregates
    global submission_date_aggregates

    logger = logging.getLogger("py4j")
    logger.setLevel(logging.ERROR)

    sc = pyspark.SparkContext(master="local[*]")
    raw_pings = list(d.generate_pings())
    build_id_aggregates, submission_date_aggregates = _aggregate_metrics(
        sc.parallelize(raw_pings), num_reducers=10)
    build_id_aggregates = build_id_aggregates.collect()
    submission_date_aggregates = submission_date_aggregates.collect()

    # Note: most tests are based on the build-id aggregates, as the aggregation
    # code is the same for both scenarios.
    sc.stop()
def bq_testing_table():
    bq_client = bigquery.Client()

    project_id = os.environ["PROJECT_ID"]
    dataset_id = f"{project_id}.pytest_mozaggregator_test"
    bq_client.delete_dataset(dataset_id, delete_contents=True, not_found_ok=True)
    bq_client.create_dataset(dataset_id)

    schema = bq_client.schema_from_json(
        os.path.join(os.path.dirname(__file__), "decoded.1.bq"))
    # use load_table instead of insert_rows to avoid eventual consistency guarantees
    main_rows = [format_payload_bytes_decoded(ping) for ping in generate_pings()]
    mobile_rows = [
        format_payload_bytes_decoded_mobile(ping) for ping in generate_mobile_pings()
    ]

    # the yielded results are (table_name, fully-qualified table id) pairs
    results = []
    # create the relevant tables
    for table_name, rows in [
        ("main_v4", main_rows),
        ("saved_session_v4", main_rows),
        ("mobile_metrics_v1", mobile_rows),
    ]:
        table_id = f"{dataset_id}.telemetry_telemetry__{table_name}"
        table = bigquery.table.Table(table_id, schema)
        table.time_partitioning = bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY, field="submission_timestamp")
        bq_client.create_table(table)
        bq_client.load_table_from_json(
            rows, table, job_config=bigquery.job.LoadJobConfig(schema=schema)).result()
        results.append((table_name, table_id))

    yield results

    bq_client.delete_dataset(dataset_id, delete_contents=True, not_found_ok=True)
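A minimal usage sketch for the fixture above, assuming it is registered with @pytest.fixture; the test name and the row-count query are illustrative, not taken from the project:

def test_tables_are_populated(bq_testing_table):
    # hypothetical consumer: verify each loaded table contains rows
    from google.cloud import bigquery

    client = bigquery.Client()
    for table_name, table_id in bq_testing_table:
        result = client.query(
            "SELECT COUNT(*) AS n FROM `{}`".format(table_id)).result()
        assert next(iter(result)).n > 0, table_name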
def aggregates(sc):
    logger = logging.getLogger("py4j")
    logger.setLevel(logging.ERROR)

    raw_pings = list(d.generate_pings())
    return _aggregate_metrics(sc.parallelize(raw_pings), num_reducers=10)
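Fixtures like aggregates(sc) above take a SparkContext argument named sc, which pytest would resolve from another fixture; a minimal conftest.py sketch under that assumption (the scope and app setup are illustrative):

import pyspark
import pytest

@pytest.fixture(scope="session")
def sc():
    # session-scoped local SparkContext shared by the aggregation fixtures
    spark_context = pyspark.SparkContext(master="local[*]")
    yield spark_context
    spark_context.stop()  # shut down the JVM-backed context after the run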
def raw_pings():
    return list(d.generate_pings())
def records(self, *args, **kwargs):
    return spark.sparkContext.parallelize(generate_pings())
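records() above references a module-level spark SparkSession; a sketch of how such a session might be created, where the app name and builder options are assumptions:

from pyspark.sql import SparkSession

# assumed module-level session backing records(); options are illustrative
spark = (SparkSession.builder
         .master("local[*]")
         .appName("mozaggregator-tests")
         .getOrCreate())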
def aggregates(sc):
    raw_pings = list(generate_pings())
    aggregates = _aggregate_metrics(sc.parallelize(raw_pings), num_reducers=10)
    submit_aggregates(aggregates)
    return aggregates
def run_job(spark_context, sql_context, submission_date_range, use_test_data=False):
    """Compute crash aggregates for the specified submission date range,
    and upload the result to S3.
    """
    start_date = datetime.strptime(submission_date_range[0], "%Y%m%d").date()
    end_date = datetime.strptime(submission_date_range[1], "%Y%m%d").date()

    schema = types.StructType([
        types.StructField("activity_date", types.StringType(), nullable=False),
        types.StructField(
            "dimensions",
            types.MapType(types.StringType(), types.StringType(), True),
            nullable=False),
        types.StructField(
            "stats",
            types.MapType(types.StringType(), types.DoubleType(), True),
            nullable=False),
    ])

    current_date = start_date
    while current_date <= end_date:
        # useful statements for testing the program
        if use_test_data:
            # use test pings; very good for debugging the uploading process
            sys.path.append(
                os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "test"))
            import dataset
            pings = spark_context.parallelize(list(dataset.generate_pings()))
        else:
            pings = retrieve_crash_data(
                spark_context, current_date.strftime("%Y%m%d"),
                COMPARABLE_DIMENSIONS, FRACTION)

        (result, main_processed_count, main_ignored_count,
         crash_processed_count, crash_ignored_count) = compare_crashes(
            spark_context, pings, COMPARABLE_DIMENSIONS, DIMENSION_NAMES)
        result = result.coalesce(1)  # put everything into a single partition
        df = sql_context.createDataFrame(result, schema)
        print("SUCCESSFULLY COMPUTED CRASH AGGREGATES FOR {}".format(current_date))

        # upload the dataframe as Parquet to S3
        s3_result_url = (
            "s3n://telemetry-parquet/crash_aggregates/v1/submission_date={}".format(
                current_date))
        df.write.parquet(s3_result_url)

        print("SUCCESSFULLY UPLOADED CRASH AGGREGATES FOR {} TO S3:".format(current_date))
        print("{} main pings processed, {} main pings ignored".format(
            main_processed_count.value, main_ignored_count.value))
        print("{} crash pings processed, {} crash pings ignored".format(
            crash_processed_count.value, crash_ignored_count.value))

        current_date += timedelta(days=1)

    print("========================================")
    print("JOB COMPLETED SUCCESSFULLY")
    print("========================================")
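A hedged entry-point sketch for driving run_job; the CLI shape, app name, and SQLContext construction are assumptions, not the project's actual launcher:

import sys
import pyspark
from pyspark.sql import SQLContext

if __name__ == "__main__":
    # e.g. spark-submit run_crash_aggregates.py 20160301 20160307
    spark_context = pyspark.SparkContext(appName="crash-aggregates")
    sql_context = SQLContext(spark_context)
    run_job(spark_context, sql_context, (sys.argv[1], sys.argv[2]))
    spark_context.stop()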
def test_count():
    pings = list(d.generate_pings())
    num_build_ids = len(d.ping_dimensions["build_id"])
    assert len(pings) / d.NUM_PINGS_PER_DIMENSIONS == len(build_id_aggregates)
    assert (len(pings) / d.NUM_PINGS_PER_DIMENSIONS / num_build_ids
            == len(submission_date_aggregates))