def test_where_exact_match():
    dataset = Dataset('test-bucket', ['dim1', 'dim2'], prefix='prefix/')
    new_dataset = dataset.where(dim1='my-value')

    assert new_dataset is not dataset
    assert list(new_dataset.clauses.keys()) == ['dim1']

    condition = new_dataset.clauses['dim1']
    assert condition('my-value')
def test_scan_multiple_params():
    dataset = Dataset('test-bucket', ['dim1', 'dim2'], prefix='prefix/')
    new_dataset = dataset.where(dim1='myvalue')

    assert new_dataset is not dataset
    assert list(new_dataset.clauses.keys()) == ['dim1']

    condition = new_dataset.clauses['dim1']
    assert condition('myvalue')
def test_where():
    dataset = Dataset('test-bucket', ['dim1', 'dim2'], prefix='prefix/')
    clause = lambda x: True
    new_dataset = dataset.where(dim1=clause)

    assert new_dataset is not dataset
    assert new_dataset.clauses == {'dim1': clause}
def aggregate_metrics(sc, channels, submission_date, main_ping_fraction=1,
                      fennec_ping_fraction=1, num_reducers=10000):
    """
    Returns the build-id and submission date aggregates for a given submission date.

    :param sc: A SparkContext instance
    :param channels: Either the name of a channel or a list/tuple of names
    :param submission_date: The submission date for which the data will be aggregated
    :param main_ping_fraction: An approximate fraction of main pings to consider for aggregation
    :param fennec_ping_fraction: An approximate fraction of Fennec pings to consider for aggregation
    """
    if not isinstance(channels, (tuple, list)):
        channels = [channels]

    channels = set(channels)

    pings = Dataset.from_source('telemetry') \
        .where(appUpdateChannel=lambda x: x in channels,
               submissionDate=submission_date,
               docType='main',
               sourceVersion='4',
               appName=lambda x: x != 'Fennec') \
        .records(sc, sample=main_ping_fraction)

    fennec_pings = Dataset.from_source('telemetry') \
        .where(appUpdateChannel=lambda x: x in channels,
               submissionDate=submission_date,
               docType='saved_session',
               sourceVersion='4',
               appName='Fennec') \
        .records(sc, sample=fennec_ping_fraction)

    all_pings = pings.union(fennec_pings)
    return _aggregate_metrics(all_pings)
def aggregate_metrics(sc, channels, submission_date, main_ping_fraction=1,
                      fennec_ping_fraction=1, num_reducers=10000):
    """
    Returns the build-id and submission date aggregates for a given submission date.

    :param sc: A SparkContext instance
    :param channels: Either the name of a channel or a list/tuple of names
    :param submission_date: The submission date for which the data will be aggregated
    :param main_ping_fraction: An approximate fraction of main pings to consider for aggregation
    :param fennec_ping_fraction: An approximate fraction of Fennec pings to consider for aggregation
    :param num_reducers: The number of reducers passed through to `_aggregate_metrics`
    """
    if not isinstance(channels, (tuple, list)):
        channels = [channels]

    channels = set(channels)

    pings = Dataset.from_source('telemetry') \
        .where(appUpdateChannel=lambda x: x in channels,
               submissionDate=submission_date,
               docType='main',
               sourceVersion='4',
               appName=lambda x: x != 'Fennec') \
        .records(sc, sample=main_ping_fraction)

    fennec_pings = Dataset.from_source('telemetry') \
        .where(appUpdateChannel=lambda x: x in channels,
               submissionDate=submission_date,
               docType='saved_session',
               sourceVersion='4',
               appName='Fennec') \
        .records(sc, sample=fennec_ping_fraction)

    all_pings = pings.union(fennec_pings)
    return _aggregate_metrics(all_pings, num_reducers)
def test_select():
    dataset1 = Dataset('test-bucket', ['dim1', 'dim2']).select('field1', 'field2')
    dataset2 = Dataset('test-bucket', ['dim1', 'dim2']).select('field1', field2='field2')
    dataset3 = Dataset('test-bucket', ['dim1', 'dim2']).select(field1='field1', field2='field2')

    assert dataset1.selection == {
        'field1': 'field1',
        'field2': 'field2',
    }
    assert dataset1.selection == dataset2.selection == dataset3.selection

    dataset4 = Dataset('test-bucket', ['dim1', 'dim2']).select('field1', field2='f2', field3='f3')
    assert dataset4.selection == {
        'field1': 'field1',
        'field2': 'f2',
        'field3': 'f3',
    }

    dataset5 = dataset4.select('field4', field5='f5')
    assert dataset5.selection == {
        'field1': 'field1',
        'field2': 'f2',
        'field3': 'f3',
        'field4': 'field4',
        'field5': 'f5',
    }
def test_where_wrong_dimension():
    dataset = Dataset('test-bucket', ['dim1', 'dim2'], prefix='prefix/')
    clause = lambda x: True

    with pytest.raises(Exception) as exc_info:
        new_dataset = dataset.where(dim3=clause)

    assert str(exc_info.value) == 'The dimension dim3 doesn\'t exist'
def test_where_dupe_dimension():
    clause = lambda x: True
    dataset = Dataset('test-bucket', ['dim1', 'dim2'], prefix='prefix/',
                      clauses={'dim1': clause})

    with pytest.raises(Exception) as exc_info:
        new_dataset = dataset.where(dim1=clause)

    assert str(exc_info.value) == 'There should be only one clause for dim1'
def test_scan_multiple_where_params(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir1/another-dir/key2'] = 'value2'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store).where(dim1='dir1', dim2='subdir1')
    summaries = dataset.summaries(spark_context)

    expected_key = 'dir1/subdir1/key1'
    assert summaries == [{'key': expected_key, 'size': len(store.store[expected_key])}]
def test_records_limit_and_sample(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    for i in range(1, 100 + 1):
        key = 'dir{}/subdir{}/key{}'.format(*[i] * 3)
        value = 'value{}'.format(i)
        store.store[key] = value

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    records = dataset.records(spark_context, decode=lambda x: x, limit=5, sample=0.9)

    assert records.count() == 5
def test_scan_no_dimensions():
    dataset = Dataset('test-bucket', ['dim1', 'dim2'], prefix='prefix/')
    with futures.ProcessPoolExecutor(1) as executor:
        folders = dataset._scan([], ['prefix/'], {}, executor)
    assert folders == ['prefix/']
def test_records(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    records = dataset.records(spark_context, decode=lambda x: x)
    records = sorted(records.collect())

    assert records == [b'value1', b'value2']
def test_records(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    records = dataset.records(spark_context, decode=lambda x: x)
    records = records.collect()

    assert records == [b'value1', b'value2']
def test_records_limit(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    for i in range(1, 100 + 1):
        key = 'dir{}/subdir{}/key{}'.format(*[i] * 3)
        value = 'value{}'.format(i)
        store.store[key] = value

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    records = dataset.records(spark_context, decode=lambda x: x, limit=5)

    assert records.count() == 5
def test_scan_with_prefix():
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['prefix1/dir1/subdir1/key1'] = 'value1'
    store.store['prefix2/dir2/another-dir/key2'] = 'value2'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'],
                      clauses={'dim1': lambda x: x == 'dir1'}, store=store)
    with futures.ProcessPoolExecutor(1) as executor:
        folders = dataset._scan(['dim1', 'dim2'], ['prefix1/'], dataset.clauses, executor)
    assert list(folders) == ['prefix1/dir1/']
def aggregate_metrics(
    sc,
    channels,
    submission_date,
    main_ping_fraction=1,
    fennec_ping_fraction=1,
    num_reducers=10000,
    source="moztelemetry",
    project_id=None,
    dataset_id=None,
    avro_prefix=None,
):
    """
    Returns the build-id and submission date aggregates for a given submission date.

    :param sc: A SparkContext instance
    :param channels: Either the name of a channel or a list/tuple of names
    :param submission_date: The submission date for which the data will be aggregated
    :param main_ping_fraction: An approximate fraction of main pings to consider for aggregation
    :param fennec_ping_fraction: An approximate fraction of Fennec pings to consider for aggregation
    :param num_reducers: The number of reducers passed through to `_aggregate_metrics`
    :param source: One of "moztelemetry" (default), "bigquery", or "avro"
    :param project_id: The BigQuery project id (used when source is "bigquery")
    :param dataset_id: The BigQuery dataset id (used when source is "bigquery")
    :param avro_prefix: The Avro path prefix (used when source is "avro")
    """
    if not isinstance(channels, (tuple, list)):
        channels = [channels]

    if source == "bigquery" and project_id and dataset_id:
        dataset = BigQueryDataset()
        pings = dataset.load(project_id, dataset_id, "main", submission_date,
                             channels, "normalized_app_name <> 'Fennec'")
        fennec_pings = dataset.load(project_id, dataset_id, "saved_session",
                                    submission_date, channels,
                                    "normalized_app_name = 'Fennec'")
    elif source == "avro" and avro_prefix:
        dataset = BigQueryDataset()
        pings = dataset.load_avro(avro_prefix, "main", submission_date, channels,
                                  "normalized_app_name <> 'Fennec'")
        fennec_pings = dataset.load_avro(avro_prefix, "saved_session", submission_date,
                                         channels, "normalized_app_name = 'Fennec'")
    else:
        pings = Dataset.from_source('telemetry') \
            .where(appUpdateChannel=lambda x: x in channels,
                   submissionDate=submission_date,
                   docType='main',
                   sourceVersion='4',
                   appName=lambda x: x != 'Fennec') \
            .records(sc, sample=main_ping_fraction)

        fennec_pings = Dataset.from_source('telemetry') \
            .where(appUpdateChannel=lambda x: x in channels,
                   submissionDate=submission_date,
                   docType='saved_session',
                   sourceVersion='4',
                   appName='Fennec') \
            .records(sc, sample=fennec_ping_fraction)

    all_pings = pings.union(fennec_pings)
    return _aggregate_metrics(all_pings, num_reducers)
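# A minimal usage sketch for the aggregate_metrics variant above, assuming a local
# SparkContext and that the function lives in a module named `aggregator`; the module
# name, channel, date, and sampling fractions below are illustrative assumptions,
# not taken from the original code.
from pyspark import SparkContext

from aggregator import aggregate_metrics  # hypothetical module path


def run_aggregation_example():
    sc = SparkContext.getOrCreate()
    try:
        # With the default source ("moztelemetry"), main pings and Fennec
        # saved_session pings are read from the telemetry Dataset, unioned,
        # and then aggregated by build-id and submission date.
        return aggregate_metrics(
            sc,
            channels="nightly",          # a single channel name or a list/tuple of names
            submission_date="20190601",  # YYYYMMDD submission date to aggregate
            main_ping_fraction=0.01,     # small sample fractions keep the example cheap
            fennec_ping_fraction=0.01,
        )
    finally:
        sc.stop()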
def test_records_summaries(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store, max_concurrency=1)
    records = dataset.records(spark_context, decode=lambda x: x,
                              summaries=[{'key': 'dir1/subdir1/key1', 'size': len('value1')}])
    records = records.collect()

    assert records == [b'value1']
def test_summaries_with_limit(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store, max_concurrency=1)
    summaries = dataset.summaries(spark_context, 1)

    assert len(summaries) == 1
    assert summaries[0]['key'] in store.store
def test_scan_no_clause():
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    key = 'dir1/dir2/key1'
    value = 'value1'
    store.store[key] = value

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    with futures.ProcessPoolExecutor(1) as executor:
        folders = dataset._scan(['dim1', 'subdir'], ['prefix'], {}, executor)
    assert list(folders) == ['prefix']
def test_scan_with_clause():
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/another-dir/key2'] = 'value2'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'],
                      clauses={'dim1': lambda x: x == 'dir1'}, store=store)
    with futures.ProcessPoolExecutor(1) as executor:
        folders = dataset._scan(['dim1', 'dim2'], [''], dataset.clauses, executor)
    assert list(folders) == ['dir1/']
def test_select_keep_state():
    """Test that calling select only mutates the selection of a dataset"""
    dataset_before = Dataset('test-bucket', ['dim1', 'dim2']).where(dim1=True)
    dataset_after = dataset_before.select('field1', 'field2')

    assert dataset_before.selection != dataset_after.selection
    assert dataset_before.bucket == dataset_after.bucket
    assert dataset_before.schema == dataset_after.schema
    assert dataset_before.store == dataset_after.store
    assert dataset_before.prefix == dataset_after.prefix
    assert dataset_before.clauses == dataset_after.clauses
def test_summaries_with_limit():
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    summaries = list(dataset._summaries(1))

    assert len(summaries) == 1
    assert summaries[0]['key'] in store.store
def test_records_print_output(spark_context, capsys):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    for i in range(1, 100 + 1):
        key = 'dir{}/subdir{}/key{}'.format(*[i] * 3)
        value = 'value{}'.format(i)
        store.store[key] = value

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    dataset.records(spark_context, decode=lambda x: x)

    out, err = capsys.readouterr()
    assert out.rstrip() == "fetching 0.00066MB in 100 files..."
def test_records_print_output(spark_context, capsys):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    for i in range(1, 100+1):
        key = 'dir{}/subdir{}/key{}'.format(*[i]*3)
        value = 'value{}'.format(i)
        store.store[key] = value

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    dataset.records(spark_context, decode=lambda x: x)

    out, err = capsys.readouterr()
    assert out.rstrip() == "fetching 0.00066MB in 100 files..."
def test_records_summaries(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    records = dataset.records(spark_context, decode=lambda x: x,
                              summaries=[{'key': 'dir1/subdir1/key1', 'size': len('value1')}])
    records = records.collect()

    assert records == ['value1']
def test_summaries_with_limit(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    summaries = dataset.summaries(spark_context, 1)

    assert len(summaries) == 1
    assert summaries[0]['key'] in store.store
def test_select_dupe_properties():
    dataset = Dataset('test-bucket', ['dim1', 'dim2']).select('field1')

    with pytest.raises(Exception) as exc_info:
        dataset.select('field1')

    assert str(exc_info.value) == 'The property field1 has already been selected'

    with pytest.raises(Exception) as exc_info:
        dataset.select(field1='keyword_field')

    assert str(exc_info.value) == 'The property field1 has already been selected'
def test_sanitized_dimensions(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir_1/subdir1/key1'] = 'value1'
    store.store['dir_1/subdir2/key2'] = 'value2'
    store.store['dir_2/subdir3/key3'] = 'value3'
    store.store['dir_3/subdir4/key4'] = 'value4'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store).where(dim1="dir-1")
    summaries = dataset.summaries(spark_context)

    assert len(summaries) == 2
def test_records_many_groups(spark_context, monkeypatch):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    for i in range(1, spark_context.defaultParallelism + 2):
        store.store['dir1/subdir1/key{}'.format(i)] = 'value{}'.format(i)

    # create one group per item
    monkeypatch.setattr(moztelemetry.dataset, '_group_by_size', lambda x: [[y] for y in x])

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    records = dataset.records(spark_context, decode=lambda x: x)
    records = records.collect()

    assert records == ['value{}'.format(i) for i in range(1, spark_context.defaultParallelism + 2)]
def test_records_object(spark_context):
    expect = {"uid": 1}
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['key'] = json.dumps(expect)

    ds = Dataset(bucket_name, None, store=store, max_concurrency=1)
    row = ds.records(spark_context, decode=decode).first()

    assert isinstance(row, dict)
    assert row == expect
def run_spinner_etl(sc):
    nightly_build_channels = ["nightly", "aurora"]
    sample_size = 1.0

    probe_available = datetime(2016, 9, 8)
    look_back_date = datetime.today() - timedelta(days=180)
    start_date = max(probe_available, look_back_date).strftime("%Y%m%d")
    end_date = datetime.today().strftime("%Y%m%d")

    def appBuildId_filter(b):
        return ((b.startswith(start_date) or b > start_date) and
                (b.startswith(end_date) or b < end_date))

    print "Start Date: {}, End Date: {}".format(start_date, end_date)

    build_results = {}

    for build_type in nightly_build_channels:
        # Bug 1341340 - if we're looking for pings from before 20161012, we need to query
        # old infra.
        old_infra_pings = Dataset.from_source("telemetry-oldinfra") \
            .where(docType='main') \
            .where(submissionDate=lambda b: b < "20161201") \
            .where(appBuildId=appBuildId_filter) \
            .where(appUpdateChannel=build_type) \
            .records(sc, sample=sample_size)

        new_infra_pings = Dataset.from_source("telemetry") \
            .where(docType='main') \
            .where(submissionDate=lambda b: (b.startswith("20161201") or b > "20161201")) \
            .where(appBuildId=appBuildId_filter) \
            .where(appUpdateChannel=build_type) \
            .records(sc, sample=sample_size)

        pings = old_infra_pings.union(new_infra_pings)
        build_results[build_type] = get_short_and_long_spinners(pings)

    s3_client = boto3.client('s3')
    for result_key, results in build_results.iteritems():
        filename = "severities_by_build_id_%s.json" % result_key
        results_json = json.dumps(results, ensure_ascii=False)

        with open(filename, 'w') as f:
            f.write(results_json)

        s3_client.upload_file(
            filename,
            'telemetry-public-analysis-2',
            'spinner-severity-generator/data/{}'.format(filename)
        )
def test_records_selection(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    key = 'dir1/subdir1/key1'
    value = '{"a": {"b": { "c": "value"}}}'
    store.store[key] = value

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store).select(field='a.b.c')
    records = dataset.records(spark_context, decode=decode)
    assert records.collect() == [{'field': 'value'}]

    # Check that concatenating `select`s works as expected
    records = dataset.select(field2='a.b').records(spark_context, decode=decode)
    assert records.collect() == [{'field': 'value', 'field2': {'c': 'value'}}]
def test_records_many_groups(spark_context, monkeypatch):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    for i in range(1, spark_context.defaultParallelism + 2):
        store.store['dir1/subdir1/key{}'.format(i)] = 'value{}'.format(i)

    # create one group per item
    monkeypatch.setattr(moztelemetry.dataset, '_group_by_size_greedy', lambda x, _: [[y] for y in x])

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    records = dataset.records(spark_context, decode=lambda x: x)
    records = records.collect()

    assert records == ['value{}'.format(i) for i in range(1, spark_context.defaultParallelism + 2)]
def test_summaries():
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    # materialize the summaries so they can be both counted and iterated
    summaries = list(dataset._summaries())

    assert len(summaries) == 2

    for item in summaries:
        assert item['key'] in store.store
        assert item['size'] == len(store.store[item['key']])
def test_summaries(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store, max_concurrency=1)
    summaries = dataset.summaries(spark_context)

    assert len(summaries) == 2

    for item in summaries:
        assert item['key'] in store.store
        assert item['size'] == len(store.store[item['key']])
def test_summaries(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = 'value1'
    store.store['dir2/subdir2/key2'] = 'value2'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    summaries = dataset.summaries(spark_context)

    assert len(summaries) == 2

    for item in summaries:
        assert item['key'] in store.store
        assert item['size'] == len(store.store[item['key']])
def run_spinner_etl(sc):
    sample_size = 1.0
    start_date = (datetime.today() - timedelta(days=180)).strftime("%Y%m%d")
    end_date = datetime.today().strftime("%Y%m%d")

    def appBuildId_filter(b):
        return b >= start_date and (b.startswith(end_date) or b < end_date)

    print("Start Date: {}, End Date: {}".format(start_date, end_date))

    results = {}

    pings = (
        Dataset.from_source("telemetry")
        .where(docType="main")
        .where(submissionDate=lambda b: b >= start_date)
        .where(appBuildId=appBuildId_filter)
        .where(appUpdateChannel="nightly")
        .records(sc, sample=sample_size)
    )

    results = get_short_and_long_spinners(pings)

    s3_client = boto3.client("s3")
    filename = "severities_by_build_id_nightly.json"
    with open(filename, "w") as f:
        f.write(json.dumps(results, ensure_ascii=False, cls=CustomEncoder))

    s3_client.upload_file(
        filename,
        "telemetry-public-analysis-2",
        "spinner-severity-generator/data/{}".format(filename),
    )
def dataset():
    bucket_name = 'test_bucket'
    store = InMemoryStore(bucket_name)
    store.store['dir1/subdir1/key1'] = json.dumps({'foo': 1})
    store.store['dir2/subdir2/key2'] = json.dumps({'foo': 2})
    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)
    return dataset
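# The helper above reads like a pytest fixture body; a minimal sketch of how it could
# be registered and consumed. The fixture registration and the consuming test are
# assumptions for illustration, not part of the original snippet.
import pytest

# Register the helper as a fixture so tests can request it by the name `dataset`.
dataset = pytest.fixture(dataset)


def test_dataset_store_is_seeded(dataset):
    # pytest injects the Dataset built by the fixture above; its in-memory store
    # was seeded with two JSON blobs, so two summaries are expected.
    assert len(list(dataset._summaries())) == 2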
def extract(sc, submission_date, sample=0.01):
    landfill = (
        Dataset.from_source("landfill")
        .where(submissionDate=submission_date)
        .records(sc, sample=sample)
    )
    return landfill
def get_data(sc, config, date):
    date_str = date.strftime("%Y%m%d")
    pings = (
        Dataset.from_source("telemetry")
        .where(docType='crash')
        .where(submissionDate=lambda b: b.startswith(date_str))
        .where(appUpdateChannel=config['channel'])
        .records(sc, sample=config['sample_size'])
    )

    properties = [
        "environment/system/os/name",
        "environment/system/os/version",
        "application/architecture",
        "application/buildId",
        "payload/processType",
        "payload/stackTraces/crash_info/crashing_thread",
        "payload/stackTraces/threads",
        "payload/stackTraces/modules",
    ]

    try:
        pings_props = get_pings_properties(pings, properties, with_processes=True)
    except ValueError:
        return None

    return pings_props.map(map_to_hang_format)
def test_apply_selection():
    dataset = Dataset('test-bucket', ['dim1', 'dim2']).select('field1.field2')
    json_obj = {'field1': {'field2': 'value'}}
    assert dataset._apply_selection(json_obj) == {'field1.field2': 'value'}

    dataset = Dataset('test-bucket', ['dim1', 'dim2']).select(field='field1.field2')
    assert dataset._apply_selection(json_obj) == {'field': 'value'}

    dataset = Dataset('test-bucket', ['dim1', 'dim2']).select(field='foo.bar')
    assert dataset._apply_selection(json_obj) == {'field': None}
def get_data(sc):
    pings = Dataset.from_source("telemetry").where(
        docType='main',
        submissionDate=(date.today() - timedelta(1)).strftime("%Y%m%d"),
        appUpdateChannel="nightly"
    ).records(sc, sample=0.1)

    return get_pings_properties(pings, ["clientId", "environment/system/os/name"])
def submit(docType, channel, timestamp, group_by):
    spark = SparkSession.builder.getOrCreate()
    sc = spark.sparkContext

    rdd = Dataset.from_source("telemetry").where(
        docType=docType,
        appUpdateChannel=channel,
        submissionDate=timestamp
    ).records(sc, group_by=group_by)

    print("Number of rows of RDD: {}".format(rdd.count()))
    print("Number of partitions used: {}".format(rdd.getNumPartitions()))

    sc.stop()
def test_prefix_slash(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    store.store['a/b/dir1/subdir1/key1'] = 'value1'
    store.store['a/b/dir2/subdir2/key2'] = 'value2'
    store.store['x/b/dir3/subdir3/key3'] = 'value3'
    store.store['a/c/dir4/subdir4/key4'] = 'value4'

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store,
                      prefix='a/b', max_concurrency=1)
    summaries = dataset.summaries(spark_context)

    assert len(summaries) == 2

    for item in summaries:
        assert item['key'] in store.store
        assert item['size'] == len(store.store[item['key']])

    # be sure "where" still works
    summaries_filtered = dataset.where(dim1='dir1').summaries(spark_context)

    assert len(summaries_filtered) == 1
    assert summaries_filtered[0]['key'] == 'a/b/dir1/subdir1/key1'
def test_records_sample(spark_context):
    bucket_name = 'test-bucket'
    store = InMemoryStore(bucket_name)
    for i in range(1, 100 + 1):
        key = 'dir{}/subdir{}/key{}'.format(*[i] * 3)
        value = 'value{}'.format(i)
        store.store[key] = value

    dataset = Dataset(bucket_name, ['dim1', 'dim2'], store=store)

    records_1 = dataset.records(spark_context, decode=lambda x: x, sample=0.1, seed=None).collect()
    assert len(records_1) == 10

    records_2 = dataset.records(spark_context, decode=lambda x: x, sample=0.1, seed=None).collect()
    # The sampling seed is different, so we have two different samples.
    assert sorted(records_1) != sorted(records_2)

    records_1 = dataset.records(spark_context, decode=lambda x: x, sample=0.1).collect()
    records_2 = dataset.records(spark_context, decode=lambda x: x, sample=0.1).collect()
    # Same seed, same sample.
    assert sorted(records_1) == sorted(records_2)
def test_dataset_from_source(my_mock_s3, monkeypatch):
    meta_bucket_name = 'net-mozaws-prod-us-west-2-pipeline-metadata'

    bucket = boto3.resource('s3').Bucket(meta_bucket_name)
    bucket.create()
    store = S3Store(meta_bucket_name)

    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    with open(os.path.join(data_dir, 'sources.json'), 'rb') as f:
        store.upload_file(f, '', 'sources.json')
    with open(os.path.join(data_dir, 'schema.json'), 'rb') as f:
        store.upload_file(f, 'telemetry-2/', 'schema.json')
        f.seek(0)
        expected_dimensions = json.loads(f.read().decode('utf-8'))['dimensions']

    dimensions = [dim['field_name'] for dim in expected_dimensions]
    assert Dataset.from_source('telemetry').schema == dimensions
def aggregate_metrics(sc, begin, end=None, num_partitions=10000):
    """
    Returns the build-id and submission date aggregates for a given submission date.

    :param sc: A SparkContext instance
    :param begin: A string for the beginning date, in form "YYYYMMDD"
    :param end: An optional string for the end date, in form "YYYYMMDD". If not provided,
                metrics will only be aggregated for the date provided with `begin`.
    :param num_partitions: An optional value to be passed to `aggregateByKey`.
    """
    if end is None:
        end = begin

    pings = (Dataset.from_source('telemetry')
             .where(docType='mobile_metrics',
                    submissionDate=lambda x: begin <= x <= end)
             .records(sc))

    return _aggregate_metrics(pings, num_partitions)
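# A hedged example of calling the mobile_metrics variant above over a date range;
# the module name `mobile_aggregator` and the dates are illustrative assumptions.
from pyspark import SparkContext

from mobile_aggregator import aggregate_metrics  # hypothetical module path

sc = SparkContext.getOrCreate()
# Aggregate a whole week of mobile_metrics pings; `end` is inclusive, and omitting
# it would aggregate only the `begin` date.
weekly_aggregates = aggregate_metrics(sc, begin="20190601", end="20190607")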
# COMMAND ----------

from moztelemetry.dataset import Dataset
import pandas as pd
from pyspark.sql import Row
from pyspark.sql import functions as f
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, IntegerType, DoubleType, LongType, MapType
from statsmodels.stats.weightstats import DescrStatsW

EXPERIMENT_ID = "prefflip-webrender-v1-2-1492568"
EXPERIMENT_ID_2 = "prefflip-webrender-v1-3-1492568"
PARTITIONS = [s.replace("-", "_") for s in (EXPERIMENT_ID, EXPERIMENT_ID_2)]

# COMMAND ----------

Dataset.from_source("telemetry-cohorts")

# COMMAND ----------

to_summarize = {
    "composite_time": "payload.processes.gpu.histograms.COMPOSITE_TIME",
    "content_frame_time": "payload.processes.gpu.histograms.CONTENT_FRAME_TIME",
    "content_frame_time_svg": "payload.processes.gpu.histograms.CONTENT_FRAME_TIME_WITH_SVG",
    "content_frame_time_reason": "payload.processes.gpu.histograms.CONTENT_FRAME_TIME_REASON",
    "content_frame_time_without_upload": "payload.processes.gpu.histograms.CONTENT_FRAME_TIME_WITHOUT_UPLOAD",
    "content_paint_time": "payload.processes.content.histograms.CONTENT_PAINT_TIME",
    "tab_switch_composite": "payload.histograms.FX_TAB_SWITCH_COMPOSITE_E10S_MS",
    "content_full_paint_time": "payload.processes.gpu.histograms.CONTENT_FULL_PAINT_TIME",
    "page_load_ms": "payload.histograms.FX_PAGE_LOAD_MS_2",
}
def test_scan_no_dimensions():
    dataset = Dataset('test-bucket', ['dim1', 'dim2'], prefix='prefix/')
    with futures.ProcessPoolExecutor(1) as executor:
        folders = dataset._scan([], ['prefix/'], {}, executor)
    assert folders == ['prefix/']